[X86][AVX10.2] Remove YMM rounding from VCVT2PS2PHX #132397
Conversation
@llvm/pr-subscribers-llvm-ir @llvm/pr-subscribers-mc

Author: Phoebe Wang (phoebewang)

Changes

Ref: https://cdrdv2.intel.com/v1/dl/getContent/784343

Patch is 22.68 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/132397.diff

14 Files Affected:
diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td
index ea0d6df4a33c2..ef6bd77ae93ab 100644
--- a/clang/include/clang/Basic/BuiltinsX86.td
+++ b/clang/include/clang/Basic/BuiltinsX86.td
@@ -5006,7 +5006,7 @@ let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<
}
let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
- def vcvt2ps2phx256_mask : X86Builtin<"_Vector<16, _Float16>(_Vector<8, float>, _Vector<8, float>, _Vector<16, _Float16>, unsigned short, _Constant int)">;
+ def vcvt2ps2phx256_mask : X86Builtin<"_Vector<16, _Float16>(_Vector<8, float>, _Vector<8, float>, _Vector<16, _Float16>, unsigned short)">;
}
let Features = "avx10.2-512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
diff --git a/clang/lib/Headers/avx10_2convertintrin.h b/clang/lib/Headers/avx10_2convertintrin.h
index f89ba6e830c49..19d91d41f7bde 100644
--- a/clang/lib/Headers/avx10_2convertintrin.h
+++ b/clang/lib/Headers/avx10_2convertintrin.h
@@ -178,8 +178,7 @@ _mm_maskz_cvtx2ps_ph(__mmask8 __U, __m128 __A, __m128 __B) {
static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_cvtx2ps_ph(__m256 __A,
__m256 __B) {
return (__m256h)__builtin_ia32_vcvt2ps2phx256_mask(
- (__v8sf)__A, (__v8sf)__B, (__v16hf)_mm256_setzero_ph(), (__mmask16)(-1),
- _MM_FROUND_CUR_DIRECTION);
+ (__v8sf)__A, (__v8sf)__B, (__v16hf)_mm256_setzero_ph(), (__mmask16)(-1));
}
/// Convert two 256-bit vectors, \a __A and \a __B, containing packed
@@ -223,8 +222,7 @@ static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_cvtx2ps_ph(__m256 __A,
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_cvtx2ps_ph(__m256h __W, __mmask16 __U, __m256 __A, __m256 __B) {
return (__m256h)__builtin_ia32_vcvt2ps2phx256_mask(
- (__v8sf)__A, (__v8sf)__B, (__v16hf)__W, (__mmask16)__U,
- _MM_FROUND_CUR_DIRECTION);
+ (__v8sf)__A, (__v8sf)__B, (__v16hf)__W, (__mmask16)__U);
}
/// Convert two 256-bit vectors, \a __A and \a __B, containing packed
@@ -266,142 +264,9 @@ _mm256_mask_cvtx2ps_ph(__m256h __W, __mmask16 __U, __m256 __A, __m256 __B) {
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtx2ps_ph(__mmask16 __U, __m256 __A, __m256 __B) {
return (__m256h)__builtin_ia32_vcvt2ps2phx256_mask(
- (__v8sf)__A, (__v8sf)__B, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U,
- _MM_FROUND_CUR_DIRECTION);
+ (__v8sf)__A, (__v8sf)__B, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U);
}
-/// Convert two 256-bit vectors, \a __A and \a __B, containing packed
-/// single-precision (32-bit) floating-point elements to a 256-bit vector
-/// containing FP16 elements. Rounding mode \a __R needs to be provided.
-///
-/// \code{.operation}
-/// FOR i := 0 to 15
-/// IF i < 8
-/// dst.fp16[i] := convert_fp32_to_fp16(__B.fp32[i])
-/// ELSE
-/// dst.fp16[i] := convert_fp32_to_fp16(__A.fp32[i - 8])
-/// FI
-/// ENDFOR
-///
-/// dst[MAX:256] := 0
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VCVT2PS2PHX instruction.
-///
-/// \param __A
-/// A 256-bit vector of [8 x float].
-/// \param __B
-/// A 256-bit vector of [8 x float].
-/// \param __R
-/// Rounding mode. Valid inputs are: _MM_FROUND_CUR_DIRECTION or
-/// result of bitwise or of _MM_FROUND_NO_EXC with at most one of the following:
-/// _MM_FROUND_TO_NEAREST_INT, _MM_FROUND_TO_NEG_INF, _MM_FROUND_TO_POS_INF,
-/// _MM_FROUND_TO_ZERO.
-/// \returns
-/// A 256-bit vector of [16 x fp16]. Lower elements correspond to the
-/// (converted) elements from \a __B; higher order elements correspond to the
-/// (converted) elements from \a __A.
-#define _mm256_cvtx_round2ps_ph(__A, __B, __R) \
- ((__m256h)__builtin_ia32_vcvt2ps2phx256_mask( \
- (__v8sf)(__A), (__v8sf)(__B), (__v16hf)_mm256_undefined_ph(), \
- (__mmask16)(-1), (const int)(__R)))
-
-/// Convert two 256-bit vectors, \a __A and \a __B, containing packed
-/// single-precision (32-bit) floating-point elements to a 256-bit vector
-/// containing FP16 elements. Merging mask \a __U is used to determine if given
-/// element should be taken from \a __W instead. Rounding mode \a __R needs to
-/// be provided.
-///
-/// \code{.operation}
-/// FOR i := 0 to 15
-/// IF __U[i]
-/// IF i < 8
-/// dst.fp16[i] := convert_fp32_to_fp16(__B.fp32[i])
-/// ELSE
-/// dst.fp16[i] := convert_fp32_to_fp16(__A.fp32[i - 8])
-/// FI
-/// ELSE
-/// dst.fp16[i] := __W.fp16[i]
-/// FI
-/// ENDFOR
-///
-/// dst[MAX:256] := 0
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VCVT2PS2PHX instruction.
-///
-/// \param __W
-/// A 256-bit vector of [16 x fp16].
-/// \param __U
-/// A 16-bit merging mask.
-/// \param __A
-/// A 256-bit vector of [8 x float].
-/// \param __B
-/// A 256-bit vector of [8 x float].
-/// \param __R
-/// Rounding mode. Valid inputs are: _MM_FROUND_CUR_DIRECTION or
-/// result of bitwise or of _MM_FROUND_NO_EXC with at most one of the following:
-/// _MM_FROUND_TO_NEAREST_INT, _MM_FROUND_TO_NEG_INF, _MM_FROUND_TO_POS_INF,
-/// _MM_FROUND_TO_ZERO.
-/// \returns
-/// A 256-bit vector of [16 x fp16]. Lower elements correspond to the
-/// (converted) elements from \a __B; higher order elements correspond to the
-/// (converted) elements from \a __A. If corresponding mask bit is not set, then
-/// element from \a __W is taken instead.
-#define _mm256_mask_cvtx_round2ps_ph(__W, __U, __A, __B, __R) \
- ((__m256h)__builtin_ia32_vcvt2ps2phx256_mask( \
- (__v8sf)(__A), (__v8sf)(__B), (__v16hf)(__W), (__mmask16)(__U), (const int)(__R)))
-
-/// Convert two 256-bit vectors, \a __A and \a __B, containing packed
-/// single-precision (32-bit) floating-point elements to a 256-bit vector
-/// containing FP16 elements. Zeroing mask \a __U is used to determine if given
-/// element should be zeroed instead. Rounding mode \a __R needs to be provided.
-///
-/// \code{.operation}
-/// FOR i := 0 to 15
-/// IF __U[i]
-/// IF i < 8
-/// dst.fp16[i] := convert_fp32_to_fp16(__B.fp32[i])
-/// ELSE
-/// dst.fp16[i] := convert_fp32_to_fp16(__A.fp32[i - 8])
-/// FI
-/// ELSE
-/// dst.fp16[i] := 0
-/// FI
-/// ENDFOR
-///
-/// dst[MAX:256] := 0
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VCVT2PS2PHX instruction.
-///
-/// \param __U
-/// A 16-bit zeroing mask.
-/// \param __A
-/// A 256-bit vector of [8 x float].
-/// \param __B
-/// A 256-bit vector of [8 x float].
-/// \param __R
-/// Rounding mode. Valid inputs are: _MM_FROUND_CUR_DIRECTION or
-/// result of bitwise or of _MM_FROUND_NO_EXC with at most one of the following:
-/// _MM_FROUND_TO_NEAREST_INT, _MM_FROUND_TO_NEG_INF, _MM_FROUND_TO_POS_INF,
-/// _MM_FROUND_TO_ZERO.
-/// \returns
-/// A 256-bit vector of [16 x fp16]. Lower elements correspond to the
-/// (converted) elements from \a __B; higher order elements correspond to the
-/// (converted) elements from \a __A. If corresponding mask bit is not set,
-/// then zero is taken instead.
-#define _mm256_maskz_cvtx_round2ps_ph(__U, __A, __B, __R) \
- ((__m256h)__builtin_ia32_vcvt2ps2phx256_mask( \
- (__v8sf)(__A), (__v8sf)(__B), (__v16hf)(_mm256_setzero_ph()), \
- (__mmask16)(__U), (const int)(__R)))
-
/// Convert 128-bit vector \a __B containing packed FP16 floating-point elements
/// to FP8 E5M2 numbers, using conversion biases stored in lower 8 bits of each
/// 16-bit integer stored in \a __B.
diff --git a/clang/lib/Sema/SemaX86.cpp b/clang/lib/Sema/SemaX86.cpp
index e54a278225f1c..6a10d3a20b1bd 100644
--- a/clang/lib/Sema/SemaX86.cpp
+++ b/clang/lib/Sema/SemaX86.cpp
@@ -314,7 +314,6 @@ bool SemaX86::CheckBuiltinRoundingOrSAE(unsigned BuiltinID, CallExpr *TheCall) {
case X86::BI__builtin_ia32_vfmulcph512_mask:
case X86::BI__builtin_ia32_vfcmulcsh_mask:
case X86::BI__builtin_ia32_vfcmulcph512_mask:
- case X86::BI__builtin_ia32_vcvt2ps2phx256_mask:
case X86::BI__builtin_ia32_vcvt2ps2phx512_mask:
ArgNum = 4;
HasRC = true;
diff --git a/clang/test/CodeGen/X86/avx10_2convert-builtins.c b/clang/test/CodeGen/X86/avx10_2convert-builtins.c
index 92ce2ec7e7846..31dd0ecc381ef 100644
--- a/clang/test/CodeGen/X86/avx10_2convert-builtins.c
+++ b/clang/test/CodeGen/X86/avx10_2convert-builtins.c
@@ -41,24 +41,6 @@ __m256h test_mm256_maskz_cvtx2ps_ph(__mmask16 __U, __m256 __A, __m256 __B) {
return _mm256_maskz_cvtx2ps_ph(__U, __A, __B);
}
-__m256h test_mm256_cvtx_round2ps_ph(__m256 __A, __m256 __B) {
- // CHECK-LABEL: @test_mm256_cvtx_round2ps_ph(
- // CHECK: call <16 x half> @llvm.x86.avx10.mask.vcvt2ps2phx.256(
- return _mm256_cvtx_round2ps_ph(__A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
-}
-
-__m256h test_mm256_mask_cvtx_round2ps_ph(__m256h __W, __mmask8 __U, __m256 __A, __m256 __B) {
- // CHECK-LABEL: @test_mm256_mask_cvtx_round2ps_ph(
- // CHECK: call <16 x half> @llvm.x86.avx10.mask.vcvt2ps2phx.256(
- return _mm256_mask_cvtx_round2ps_ph(__W, __U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
-}
-
-__m256h test_mm256_maskz_cvtx_round2ps_ph(__mmask8 __U, __m256 __A, __m256 __B) {
- // CHECK-LABEL: @test_mm256_maskz_cvtx_round2ps_ph(
- // CHECK: call <16 x half> @llvm.x86.avx10.mask.vcvt2ps2phx.256(
- return _mm256_maskz_cvtx_round2ps_ph(__U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
-}
-
__m128i test_mm_cvtbiasph_bf8(__m128i __A, __m128h __B) {
// CHECK-LABEL: @test_mm_cvtbiasph_bf8(
// CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtbiasph2bf8128(
diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td
index 4fcf2ff8f38df..7bbbb2c451a01 100644
--- a/llvm/include/llvm/IR/IntrinsicsX86.td
+++ b/llvm/include/llvm/IR/IntrinsicsX86.td
@@ -7026,8 +7026,8 @@ def int_x86_avx10_mask_vcvt2ps2phx_128 : ClangBuiltin<"__builtin_ia32_vcvt2ps2ph
DefaultAttrsIntrinsic<[llvm_v8f16_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v8f16_ty, llvm_i8_ty],
[IntrNoMem]>;
def int_x86_avx10_mask_vcvt2ps2phx_256 : ClangBuiltin<"__builtin_ia32_vcvt2ps2phx256_mask">,
- DefaultAttrsIntrinsic<[llvm_v16f16_ty], [llvm_v8f32_ty, llvm_v8f32_ty, llvm_v16f16_ty, llvm_i16_ty, llvm_i32_ty],
- [IntrNoMem, ImmArg<ArgIndex<4>>]>;
+ DefaultAttrsIntrinsic<[llvm_v16f16_ty], [llvm_v8f32_ty, llvm_v8f32_ty, llvm_v16f16_ty, llvm_i16_ty],
+ [IntrNoMem]>;
def int_x86_avx10_mask_vcvt2ps2phx_512 : ClangBuiltin<"__builtin_ia32_vcvt2ps2phx512_mask">,
DefaultAttrsIntrinsic<[llvm_v32f16_ty], [llvm_v16f32_ty, llvm_v16f32_ty, llvm_v32f16_ty, llvm_i32_ty, llvm_i32_ty],
[IntrNoMem, ImmArg<ArgIndex<4>>]>;
diff --git a/llvm/lib/Target/X86/X86InstrAVX10.td b/llvm/lib/Target/X86/X86InstrAVX10.td
index b368a5299f907..561b6972a680d 100644
--- a/llvm/lib/Target/X86/X86InstrAVX10.td
+++ b/llvm/lib/Target/X86/X86InstrAVX10.td
@@ -771,12 +771,6 @@ multiclass avx10_cvt2ps2ph<bits<8> opc, string OpcodeStr,
_SrcVTInfo.info128>,
EVEX_V128, EVEX_CD8<32, CD8VF>;
}
-
- let Predicates = [HasAVX10_2], hasEVEX_U = 1 in {
- defm Z256 : avx10_cvt2ps2ph_rc<opc, OpcodeStr, sched.YMM,
- _SrcVTInfo.info256, _DstVTInfo.info256,
- OpNodeRnd>;
- }
}
defm VCVT2PS2PHX : avx10_cvt2ps2ph<0x67, "vcvt2ps2phx",
diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
index 31c2bfb8f71c2..0955284662f44 100644
--- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
@@ -440,7 +440,7 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx10_mask_vcvt2ps2phx_128, INTR_TYPE_2OP_MASK,
X86ISD::VFPROUND2, 0),
X86_INTRINSIC_DATA(avx10_mask_vcvt2ps2phx_256, INTR_TYPE_2OP_MASK,
- X86ISD::VFPROUND2, X86ISD::VFPROUND2_RND),
+ X86ISD::VFPROUND2, 0),
X86_INTRINSIC_DATA(avx10_mask_vcvt2ps2phx_512, INTR_TYPE_2OP_MASK,
X86ISD::VFPROUND2, X86ISD::VFPROUND2_RND),
X86_INTRINSIC_DATA(avx10_mask_vcvtbiasph2bf8128, TRUNCATE2_TO_REG,
diff --git a/llvm/test/CodeGen/X86/avx10_2convert-intrinsics.ll b/llvm/test/CodeGen/X86/avx10_2convert-intrinsics.ll
index fe2bfb7b44691..90e2146cc2c0b 100644
--- a/llvm/test/CodeGen/X86/avx10_2convert-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx10_2convert-intrinsics.ll
@@ -50,7 +50,7 @@ define <16 x half> @test_int_x86_avx10_vcvt2ps2phx256(<8 x float> %A, <8 x float
; CHECK: # %bb.0:
; CHECK-NEXT: vcvt2ps2phx %ymm1, %ymm0, %ymm0 # encoding: [0x62,0xf2,0x7d,0x28,0x67,0xc1]
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
- %ret = call <16 x half> @llvm.x86.avx10.mask.vcvt2ps2phx.256(<8 x float> %A, <8 x float> %B, <16 x half> zeroinitializer, i16 -1, i32 4)
+ %ret = call <16 x half> @llvm.x86.avx10.mask.vcvt2ps2phx.256(<8 x float> %A, <8 x float> %B, <16 x half> zeroinitializer, i16 -1)
ret <16 x half> %ret
}
@@ -66,7 +66,7 @@ define <16 x half> @test_int_x86_avx10_vcvt2ps2phx256_mask(<16 x half> %W, i16 %
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
; X86-NEXT: vcvt2ps2phx %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x75,0x29,0x67,0xc2]
; X86-NEXT: retl # encoding: [0xc3]
- %ret = call <16 x half> @llvm.x86.avx10.mask.vcvt2ps2phx.256(<8 x float> %A, <8 x float> %B, <16 x half> %W, i16 %U, i32 4)
+ %ret = call <16 x half> @llvm.x86.avx10.mask.vcvt2ps2phx.256(<8 x float> %A, <8 x float> %B, <16 x half> %W, i16 %U)
ret <16 x half> %ret
}
@@ -82,52 +82,11 @@ define <16 x half> @test_int_x86_avx10_vcvt2ps2phx256_maskz(<16 x half> %W, i16
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
; X86-NEXT: vcvt2ps2phx %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x67,0xc2]
; X86-NEXT: retl # encoding: [0xc3]
- %ret = call <16 x half> @llvm.x86.avx10.mask.vcvt2ps2phx.256(<8 x float> %A, <8 x float> %B, <16 x half> zeroinitializer, i16 %U, i32 4)
+ %ret = call <16 x half> @llvm.x86.avx10.mask.vcvt2ps2phx.256(<8 x float> %A, <8 x float> %B, <16 x half> zeroinitializer, i16 %U)
ret <16 x half> %ret
}
-define <16 x half> @test_int_x86_avx10_vcvt2ps2phx256_round(<8 x float> %A, <8 x float> %B) {
-; CHECK-LABEL: test_int_x86_avx10_vcvt2ps2phx256_round:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vcvt2ps2phx {rz-sae}, %ymm1, %ymm0, %ymm0 # encoding: [0x62,0xf2,0x79,0x78,0x67,0xc1]
-; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
- %ret = call <16 x half> @llvm.x86.avx10.mask.vcvt2ps2phx.256(<8 x float> %A, <8 x float> %B, <16 x half> zeroinitializer, i16 -1, i32 11)
- ret <16 x half> %ret
-}
-
-define <16 x half> @test_int_x86_avx10_vcvt2ps2phx256_round_mask(<16 x half> %W, i16 %U, <8 x float> %A, <8 x float> %B) {
-; X64-LABEL: test_int_x86_avx10_vcvt2ps2phx256_round_mask:
-; X64: # %bb.0:
-; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
-; X64-NEXT: vcvt2ps2phx {rz-sae}, %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x71,0x79,0x67,0xc2]
-; X64-NEXT: retq # encoding: [0xc3]
-;
-; X86-LABEL: test_int_x86_avx10_vcvt2ps2phx256_round_mask:
-; X86: # %bb.0:
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
-; X86-NEXT: vcvt2ps2phx {rz-sae}, %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x71,0x79,0x67,0xc2]
-; X86-NEXT: retl # encoding: [0xc3]
- %ret = call <16 x half> @llvm.x86.avx10.mask.vcvt2ps2phx.256(<8 x float> %A, <8 x float> %B, <16 x half> %W, i16 %U, i32 11)
- ret <16 x half> %ret
-}
-
-define <16 x half> @test_int_x86_avx10_vcvt2ps2phx256_round_maskz(i16 %U, <8 x float> %A, <8 x float> %B) {
-; X64-LABEL: test_int_x86_avx10_vcvt2ps2phx256_round_maskz:
-; X64: # %bb.0:
-; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
-; X64-NEXT: vcvt2ps2phx {rz-sae}, %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x79,0xf9,0x67,0xc1]
-; X64-NEXT: retq # encoding: [0xc3]
-;
-; X86-LABEL: test_int_x86_avx10_vcvt2ps2phx256_round_maskz:
-; X86: # %bb.0:
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
-; X86-NEXT: vcvt2ps2phx {rz-sae}, %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x79,0xf9,0x67,0xc1]
-; X86-NEXT: retl # encoding: [0xc3]
- %ret = call <16 x half> @llvm.x86.avx10.mask.vcvt2ps2phx.256(<8 x float> %A, <8 x float> %B, <16 x half> zeroinitializer, i16 %U, i32 11)
- ret <16 x half> %ret
-}
-
-declare <16 x half> @llvm.x86.avx10.mask.vcvt2ps2phx.256(<8 x float>, <8 x float>, <16 x half>, i16, i32)
+declare <16 x half> @llvm.x86.avx10.mask.vcvt2ps2phx.256(<8 x float>, <8 x float>, <16 x half>, i16)
define <16 x i8> @test_int_x86_avx10_vcvtbiasph2bf8128(<16 x i8> %A, <8 x half> %B) nounwind {
; CHECK-LABEL: test_int_x86_avx10_vcvtbiasph2bf8128:
diff --git a/llvm/test/MC/Disassembler/X86/avx10.2convert-32.txt b/llvm/test/MC/Disassembler/X86/avx10.2convert-32.txt
index 3b66fa1da5275..282c5dd087563 100644
--- a/llvm/test/MC/Disassembler/X86/avx10.2convert-32.txt
+++ b/llvm/test/MC/Disassembler/X86/avx10.2convert-32.txt
@@ -5,18 +5,10 @@
# INTEL: vcvt2ps2phx ymm2, ymm3, ymm4
0x62,0xf2,0x65,0x28,0x67,0xd4
-# ATT: vcvt2ps2phx {rn-sae}, %ymm4, %ymm3, %ymm2
-# INTEL: vcvt2ps2phx ymm2, ymm3, ymm4, {rn-sae}
-0x62,0xf2,0x61,0x18,0x67,0xd4
-
# ATT: vcvt2ps2phx %ymm4, %ymm3, %ymm2 {%k7}
# INTEL: vcvt2ps2phx ymm2 {k7}, ymm3, ymm4
0x62,0xf2,0x65,0x2f,0x67,0xd4
-# ATT: vcvt2ps2phx {rz-sae}, %ymm4, %ymm3, %ymm2 {%k7} {z}
-# INTEL: vcvt2ps2phx ymm2 {k7} {z}, ymm3, ymm4, {rz-sae}
-0x62,0xf2,0x61,0xff,0x67,0xd4
-
# ATT: vcvt2ps2phx %zmm4, %zmm3, %zmm2
# INTEL: vcvt2ps2phx zmm2, zmm3, zmm4
0x62,0xf2,0x65,0x48,0x67,0xd4
diff --git a/llvm/test/MC/Disassembler/X86/avx10.2convert-64.txt b/llvm/test/MC/Disassembler/X86/avx10.2convert-64.txt
index 611a584df87cf..6060a1a78e279 100644
--- a/llvm/test/MC/Disassembler/X86/avx10.2convert-64.txt
+++ b/llvm/test/MC/Disassembler/X86/avx10.2convert-64.txt
@@ -5,18 +5,10 @@
# INTEL: vcvt2ps2phx ymm22, ymm23, ymm24
0x62,0x82,0x45,0x20,0x67,0xf0
-# ATT: vcvt2ps2phx {rn-sae}, %ymm24, %ymm23, %ymm22
-# INTEL: vcvt2ps2phx ymm22, ymm23, ymm24, {rn-sae}
-0x62,0x82,0x41,0x10,0x67,0xf0
-
# ATT: vcvt2ps2phx %ymm24, %ymm23, %ymm22 {%k7}
# INTEL: vcvt2ps2phx ymm22 {k7}, ymm23, ymm24
0x62,0x82,0x45,0x27,0x67,0xf0
-# ATT: vcvt2ps2phx {rz-sae}, %ymm24, %ymm23, %ymm22 {%k7} {z}
-# INTEL: vcvt2ps2phx ymm22 {k7} {z}, ymm23, ymm24, {rz-sae}
-0x62,0x82,0x41,0xf7,0x67,0xf0
-
# ATT: vcvt2ps2phx %zmm24, %zmm23, %zmm22
# INTEL: vcvt2ps2phx zmm22, zmm23, zmm24
0x62,0x82,0x45,0x40,0x67,0xf0
diff --git a/llvm/test/MC/X86/avx10.2convert-32-att.s b/llvm/test/MC/X86/avx10.2convert-32-att.s
index 940279388e6ac..522a3c63a03ba 100644
--- a/llvm/test/MC/X86/avx10.2convert-32-att.s
+++ b/llvm/test/MC/X86/avx10.2convert-32-att.s
@@ -4,18 +4,10 @@
// CHECK: encoding: [0x62,0xf2,0x65,0x28,0x67,0xd4]
vcvt2ps2phx %ymm4, %ymm3, %ymm2
-// CHECK: vcvt2ps2phx {rn-sae}, %ymm4, %ymm3, %ymm2
-// CHECK: encoding: [0x62,0xf2,0x61,0x18,0x67,0xd4]
- vcvt2ps2phx {rn-sae}, %ymm4, %ymm3, %ymm2
-
// CHECK: vcvt2ps2phx %ymm4, %ymm3, %ymm2 {%k7}
// CHECK: encoding: [0x62,0xf2,0x65,0x2f,0x67,0xd4]
vcvt2ps2phx %ymm4, %ymm3, %ymm2 {%k7}
-// CHECK: vcvt2ps2phx {rz-sae}, %ymm4, %ymm3, %ymm2 {%k7} {z}
-// CHECK: encoding: [0x62,0xf2,0x61,0xff,0x67,0xd4]
- vcvt2ps2phx {rz-sae}, %ymm4, %ymm3, %ymm2 {%k7} {z}
-
// CHECK: vcvt2ps2phx %zmm4, %zmm3, %zmm2
// CHECK: encoding: [0x62,0xf2,0x65,0x48,0x67,0xd4]
vcvt2ps2phx %zmm4, %zmm3, %zmm2
diff --git a/llvm/test/MC/X86/avx10.2convert-32-intel.s b/llvm/test/MC/X86/avx10.2convert-32-intel.s
index 52a02f7ff963c..bc60953ce38ad 100644
--- a/llvm/test/MC/X86/avx10.2convert-32-intel.s
+++ b/llvm/test/MC/X...
[truncated]
@llvm/pr-subscribers-backend-x86 Author: Phoebe Wang (phoebewang)

@llvm/pr-subscribers-clang Author: Phoebe Wang (phoebewang)
LGTM
Ref: https://cdrdv2.intel.com/v1/dl/getContent/784343
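For context, a minimal usage sketch (not part of the patch) of the 256-bit intrinsics that survive this change. It assumes an AVX10.2-capable clang and a flag along the lines of -mavx10.2-256, both of which are assumptions based on the feature string in BuiltinsX86.td; the intrinsic names themselves come from the modified header above. With the rounding variants removed, the 256-bit forms convert using the current rounding mode, and embedded rounding remains available only through the 512-bit builtin, as the IntrinsicsX86.td hunk shows.

// Usage sketch only -- assumes <immintrin.h> from an AVX10.2-capable clang
// (e.g. built with -mavx10.2-256); illustrative, not part of this patch.
#include <immintrin.h>

__m256h convert_pair(__m256 a, __m256 b) {
  // No rounding argument anymore: the conversion follows the current
  // rounding mode (the old builtin call passed _MM_FROUND_CUR_DIRECTION).
  return _mm256_cvtx2ps_ph(a, b);
}

__m256h convert_pair_merge(__m256h w, __mmask16 u, __m256 a, __m256 b) {
  // Merge masking: elements whose mask bit is clear are taken from w.
  return _mm256_mask_cvtx2ps_ph(w, u, a, b);
}

__m256h convert_pair_zero(__mmask16 u, __m256 a, __m256 b) {
  // Zero masking: elements whose mask bit is clear become zero.
  return _mm256_maskz_cvtx2ps_ph(u, a, b);
}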