From 4101bafc6ec15a1749ba0291cf6eb9071d9d99c0 Mon Sep 17 00:00:00 2001 From: Daniel Kiss Date: Tue, 6 Aug 2024 11:26:43 +0200 Subject: [PATCH 01/36] Add DS_Store to the git ignore. (#339) --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 1639fde7..271193c4 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ tmp pdfs -tex2pdf* \ No newline at end of file +tex2pdf* +.DS_Store \ No newline at end of file From ede45987c799c92d2724a88c28184518a7daf16e Mon Sep 17 00:00:00 2001 From: CarolineConcatto Date: Wed, 14 Aug 2024 10:34:33 +0100 Subject: [PATCH 02/36] Remove request for preprocessor guards from header files. (#321) Remove request for preprocessor guards from header files. Co-authored-by: rsandifo-arm Co-authored-by: Sander de Smalen --- main/acle.md | 63 +++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 50 insertions(+), 13 deletions(-) diff --git a/main/acle.md b/main/acle.md index 34b1283d..adefa8f1 100644 --- a/main/acle.md +++ b/main/acle.md @@ -923,8 +923,8 @@ and: to the more specific header files below. These intrinsics are in the C implementation namespace and begin with double underscores. It is unspecified whether they are available without the header being -included. The `__ARM_ACLE` macro should be tested before including the -header: +included. When `__ARM_ACLE` is defined to `1`, the header file is +guaranteed to be available. ``` c #ifdef __ARM_ACLE @@ -937,8 +937,9 @@ header: `` is provided to define the scalar 16-bit floating point arithmetic intrinsics. As these intrinsics are in the user namespace, an implementation would not normally define them until the header is -included. The `__ARM_FEATURE_FP16_SCALAR_ARITHMETIC` feature macro -should be tested before including the header: +included. When `__ARM_FEATURE_FP16_SCALAR_ARITHMETIC` is defined to `1`, +the header file is available regardless of the context in which the macro +is evaluated. 
``` c #ifdef __ARM_FEATURE_FP16_SCALAR_ARITHMETIC @@ -951,8 +952,9 @@ should be tested before including the header: `` is provided to define the 16-bit brain floating point arithmetic intrinsics. As these intrinsics are in the user namespace, an implementation would not normally define them until the header is -included. The `__ARM_FEATURE_BF16` feature macro -should be tested before including the header: +included. When `__ARM_FEATURE_BF16` is defined to `1`, the header file is +guaranteed to be available regardless of the context in which the macro +is evaluated. ``` c #ifdef __ARM_FEATURE_BF16 @@ -973,8 +975,10 @@ instructions available are conversion intrinsics between `bfloat16_t` and intrinsics](#advanced-simd-neon-intrinsics) and associated [data types](#vector-data-types). As these intrinsics and data types are in the user namespace, an implementation would not normally define them -until the header is included. The `__ARM_NEON` macro should be tested -before including the header: +until the header is included. When `__ARM_NEON` is defined to `1`, +the header file is available regardless of the context in which the macro is +evaluated. + ``` c #ifdef __ARM_NEON @@ -995,8 +999,8 @@ to be included, if the header files are available: `` defines data types and intrinsics for SVE and its extensions; see [SVE language extensions and intrinsics](#sve-language-extensions-and-intrinsics) for details. -You should test the `__ARM_FEATURE_SVE` macro before including the -header: +When `__ARM_FEATURE_SVE` is defined to `1`, the header file is available +regardless of the context in which the macro is evaluated. ``` c #ifdef __ARM_FEATURE_SVE @@ -1015,7 +1019,7 @@ Including `` also includes the following header files: `` defines intrinsics for moving data between Neon and SVE vector types; see [NEON-SVE Bridge](#neon-sve-bridge) -for details. The `__ARM_NEON_SVE_BRIDGE` macro should be tested +for details. 
The `__ARM_NEON_SVE_BRIDGE` macro should be tested before including the header: ``` c @@ -1057,8 +1061,8 @@ change or be extended in the future. `` declares functions and defines intrinsics for SME and its extensions; see [SME language extensions and intrinsics](#sme-language-extensions-and-intrinsics) -for details. The `__ARM_FEATURE_SME` macro should be tested before -including the header: +for details. When `__ARM_FEATURE_SME` is defined to `1`, the header file is +available regardless of the context in which the macro is evaluated. ``` c #ifdef __ARM_FEATURE_SME @@ -1068,6 +1072,39 @@ including the header: Including `` also includes [``](#arm_sve.h). +### Predefined feature macros and header files + +Evaluating a feature macro returns the availability of intrinsics and inline +assembly for that feature, but no assumptions should be made on the order or +context in which the preprocessor macros are evaluated. For example: + +``` c + __attribute__((target("+sve"))) + void foo() { + #ifdef __ARM_FEATURE_SVE + // The user should make no assumptions that the target attribute + // has enabled the __ARM_FEATURE_SVE macro. + #endif +} +``` + +The compiler may add additional restrictions to the intrinsics beyond what is +captured by the ACLE macros depending on the context in which the intrinsics +are used. For example: + +``` c + #include + void foo(svbool_t pg, void *ptr, uint32_t slice_base) { + #ifdef __ARM_FEATURE_SME + svst1_hor_za8(0, slice_base, pg, ptr); + #endif + } +``` + +If `__ARM_FEATURE_SME` evaluates to `true` the SME intrinsic `svst1_hor_za8` +is available, but `foo` may still fail to compile because the call does not +occur in a [streaming statement](#streaming-statement). + ## Attributes GCC-style attributes are provided to annotate types, objects and From 92bc9582a78e39fc86bffe433a9f92a651564188 Mon Sep 17 00:00:00 2001 From: Daniel Kiss Date: Mon, 19 Aug 2024 12:36:40 +0200 Subject: [PATCH 03/36] [FMV] Add __FUNCTION_MULTI_VERSIONING_SUPPORT_LEVEL. 
(#301) --- main/acle.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/main/acle.md b/main/acle.md index adefa8f1..72b13f3f 100644 --- a/main/acle.md +++ b/main/acle.md @@ -400,6 +400,8 @@ Armv8.4-A [[ARMARMv84]](#ARMARMv84). Support is added for the Dot Product intrin * Added a requirement for function version declaration in Function Multi Versioning. * Fixed some rendering issues in the online Markdown documentation and fixed a misplaced anchor. +* Added `__FUNCTION_MULTI_VERSIONING_SUPPORT_LEVEL` to indicate the support + level of the [Function Multi Versioning](#function-multi-versioning). ### References @@ -2572,6 +2574,15 @@ following: versioning mechanism described in this section is supported by the compiler and it is enabled. +`__FUNCTION_MULTI_VERSIONING_SUPPORT_LEVEL` is defined to the currently supported +version of the ACLE. The value and the format are the same as the `__ARM_ACLE`. + +For example, it can be implemented as: + +``` c +#define __FUNCTION_MULTI_VERSIONING_SUPPORT_LEVEL __ARM_ACLE_VERSION(2024, 3, 0) +``` + ### Name mangling The `"default"` version is mangled with `".default"` on top of the From e98f3badb6742b8a2fd058368bde1b23e1a75209 Mon Sep 17 00:00:00 2001 From: rsandifo-arm Date: Tue, 20 Aug 2024 12:26:14 +0100 Subject: [PATCH 04/36] Add Alpha support for SME2.1 (#309) --- main/acle.md | 484 +++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 416 insertions(+), 68 deletions(-) diff --git a/main/acle.md b/main/acle.md index 72b13f3f..96957a27 100644 --- a/main/acle.md +++ b/main/acle.md @@ -402,6 +402,8 @@ Armv8.4-A [[ARMARMv84]](#ARMARMv84). Support is added for the Dot Product intrin a misplaced anchor. * Added `__FUNCTION_MULTI_VERSIONING_SUPPORT_LEVEL` to indicate the support level of the [Function Multi Versioning](#function-multi-versioning). +* Added [**Alpha**](#current-status-and-anticipated-changes) + support for SME2.1 (FEAT_SME2p1). 
### References @@ -1906,23 +1908,31 @@ intrinsics are available. This implies that the following macros are nonzero: #### Scalable Matrix Extension (SME) -The specification for SME is in -[**Beta** state](#current-status-and-anticipated-changes) and may -change or be extended in the future. +The specification for SME2.1 is in +[**Alpha** state](#current-status-and-anticipated-changes) and the +specification for the rest of SME is in +[**Beta** state](#current-status-and-anticipated-changes). The +specifications may change or be extended in the future. + +ACLE provides [features](#sme-language-extensions-and-intrinsics) +for accessing the Scalable Matrix Extension (SME). Each revision +of SME has an associated preprocessor macro, given in the table below: -`__ARM_FEATURE_SME` is defined to 1 if there is hardware support -for the FEAT_SME instructions and if the associated [ACLE -features](#sme-language-extensions-and-intrinsics) are available. -This implies that `__ARM_FEATURE_SVE` is nonzero. +| **Feature** | **Macro** | +| ----------- | -------------------------- | +| FEAT_SME | __ARM_FEATURE_SME | +| FEAT_SME2 | __ARM_FEATURE_SME2 | +| FEAT_SME2p1 | __ARM_FEATURE_SME2p1 | + +Each macro is defined if there is hardware support for the associated +architecture feature and if all of the [ACLE +features](#sme-language-extensions-and-intrinsics) that are conditional +on the macro are supported. In addition, `__ARM_FEATURE_LOCALLY_STREAMING` is defined to 1 if the [`arm_locally_streaming`](#arm_locally_streaming) attribute is available. -`__ARM_FEATURE_SME2` is defined to 1 if the FEAT_SME2 instructions -are available and if the associated [ACLE -features](#sme-language-extensions-and-intrinsics) are supported. - #### M-profile Vector Extension `__ARM_FEATURE_MVE` is defined as a bitmap to indicate M-profile Vector @@ -1974,6 +1984,16 @@ instructions from Armv8.2-A are supported and intrinsics targeting them are available. 
This implies that `__ARM_FEATURE_FP16_SCALAR_ARITHMETIC` is defined to a nonzero value. +#### Half-precision floating-point SME intrinsics + +The specification for SME2.1 is in +[**Alpha** state](#current-status-and-anticipated-changes) and may change or be +extended in the future. + +`__ARM_FEATURE_SME_F16F16` is defined to `1` if there is hardware support +for the SME2 half-precision (FEAT_SME_F16F16) instructions and if their +associated intrinsics are available. + #### Brain 16-bit floating-point support `__ARM_BF16_FORMAT_ALTERNATIVE` is defined to 1 if the Arm @@ -1999,6 +2019,32 @@ See [Half-precision brain floating-point](#half-precision-brain-floating-point) for details of half-precision brain floating-point types. +#### Non-widening brain 16-bit floating-point support + +The specification for B16B16 is in +[**Alpha** state](#current-status-and-anticipated-changes) and may change or be +extended in the future. + +`__ARM_FEATURE_SVE_B16B16` is defined to `1` if there is hardware support +for the FEAT_SVE_B16B16 instructions and if their associated intrinsics +are available. Specifically, if this macro is defined to `1`, then: + +* the SVE subset of the FEAT_SVE_B16B16 intrinsics are available in + [non-streaming statements](#non-streaming-statement) + if `__ARM_FEATURE_SVE` is nonzero. + +* the SVE subset of the FEAT_SVE_B16B16 intrinsics are available in + [streaming-compatible statements](#streaming-compatible-statement) + if `__ARM_FEATURE_SME` is nonzero. + +* all FEAT_SVE_B16B16 intrinsics are available in + [streaming statements](#streaming-statement) if `__ARM_FEATURE_SME` + is nonzero. + +`__ARM_FEATURE_SME_B16B16` is defined to `1` if there is hardware support +for the FEAT_SME_B16B16 instructions and if their associated intrinsics +are available. + ### Cryptographic extensions #### “Crypto” extension @@ -2392,10 +2438,13 @@ be found in [[BA]](#BA). 
| [`__ARM_FEATURE_SM4`](#sm4-extension) | SM4 Crypto extension (Arm v8.4-A, optional Armv8.2-A, Armv8.3-A) | 1 | | [`__ARM_FEATURE_SME`](#scalable-matrix-extension-sme) | Scalable Matrix Extension (FEAT_SME) | 1 | | [`__ARM_FEATURE_SME2`](#scalable-matrix-extension-sme) | Scalable Matrix Extension (FEAT_SME2) | 1 | +| [`__ARM_FEATURE_SME_B16B16`](#non-widening-brain-16-bit-floating-point-support) | Non-widening brain 16-bit floating-point SME intrinsics (FEAT_SME_B16B16) | 1 | +| [`__ARM_FEATURE_SME_F16F16`](#half-precision-floating-point-sme-intrinsics) | Half-precision floating-point SME intrinsics (FEAT_SME_F16F16) | 1 | | [`__ARM_FEATURE_SME_F64F64`](#double-precision-floating-point-outer-product-intrinsics) | Double precision floating-point outer product intrinsics (FEAT_SME_F64F64) | 1 | | [`__ARM_FEATURE_SME_I16I64`](#16-bit-to-64-bit-integer-widening-outer-product-intrinsics) | 16-bit to 64-bit integer widening outer product intrinsics (FEAT_SME_I16I64) | 1 | | [`__ARM_FEATURE_SME_LOCALLY_STREAMING`](#scalable-matrix-extension-sme) | Support for the `arm_locally_streaming` attribute | 1 | | [`__ARM_FEATURE_SVE`](#scalable-vector-extension-sve) | Scalable Vector Extension (FEAT_SVE) | 1 | +| [`__ARM_FEATURE_SVE_B16B16`](#non-widening-brain-16-bit-floating-point-support) | Non-widening brain 16-bit floating-point intrinsics (FEAT_SVE_B16B16) | 1 | | [`__ARM_FEATURE_SVE_BF16`](#brain-16-bit-floating-point-support) | SVE support for the 16-bit brain floating-point extension (FEAT_BF16) | 1 | | [`__ARM_FEATURE_SVE_BITS`](#scalable-vector-extension-sve) | The number of bits in an SVE vector, when known in advance | 256 | | [`__ARM_FEATURE_SVE_MATMUL_FP32`](#multiplication-of-32-bit-floating-point-matrices) | 32-bit floating-point matrix multiply extension (FEAT_F32MM) | 1 | @@ -8683,8 +8732,8 @@ The specification for B16B16 is in [**Alpha** state](#current-status-and-anticipated-changes) and may change or be extended in the future. 
-The instructions in this section are available when __ARM_FEATURE_B16B16 is -non-zero. +The instructions in this section are available when `__ARM_FEATURE_SVE_B16B16` +is non-zero. #### BFADD, BFSUB @@ -8755,6 +8804,7 @@ BFloat16 floating-point maximum/minimum number (predicated). ``` #### BFMLA, BFMLS + BFloat16 floating-point fused multiply add or sub vectors. ``` c @@ -10202,17 +10252,16 @@ Replacing `_hor` with `_ver` gives the associated vertical forms. __arm_streaming __arm_inout("za"); ``` -#### FMOPA (non-widening) +#### BFMOPA, FMOPA (non-widening) ``` c + // Variants are also available for: + // _za16[_bf16]_m (only if __ARM_FEATURE_SME_B16B16 != 0) + // _za16[_f16]_m (only if __ARM_FEATURE_SME_F16F16 != 0) + // _za64[_f64]_m (only if __ARM_FEATURE_SME_F64F64 != 0) void svmopa_za32[_f32]_m(uint64_t tile, svbool_t pn, svbool_t pm, svfloat32_t zn, svfloat32_t zm) __arm_streaming __arm_inout("za"); - - // Only if __ARM_FEATURE_SME_F64F64 != 0 - void svmopa_za64[_f64]_m(uint64_t tile, svbool_t pn, svbool_t pm, - svfloat64_t zn, svfloat64_t zm) - __arm_streaming __arm_inout("za"); ``` #### BFMOPS, FMOPS (widening), SMOPS, UMOPS @@ -10245,17 +10294,16 @@ Replacing `_hor` with `_ver` gives the associated vertical forms. 
__arm_streaming __arm_inout("za"); ``` -#### FMOPS (non-widening) +#### BFMOPS, FMOPS (non-widening) ``` c + // Variants are also available for: + // _za16[_bf16]_m (only if __ARM_FEATURE_SME_B16B16 != 0) + // _za16[_f16]_m (only if __ARM_FEATURE_SME_F16F16 != 0) + // _za64[_f64]_m (only if __ARM_FEATURE_SME_F64F64 != 0) void svmops_za32[_f32]_m(uint64_t tile, svbool_t pn, svbool_t pm, svfloat32_t zn, svfloat32_t zm) __arm_streaming __arm_inout("za"); - - // Only if __ARM_FEATURE_SME_F64F64 != 0 - void svmops_za64[_f64]_m(uint64_t tile, svbool_t pn, svbool_t pm, - svfloat64_t zn, svfloat64_t zm) - __arm_streaming __arm_inout("za"); ``` #### RDSVL @@ -10473,12 +10521,14 @@ Multi-vector add svint8x4_t svadd[_single_s8_x4](svint8x4_t zdn, svint8_t zm) __arm_streaming; ``` -#### ADD, SUB, FADD, FSUB (accumulate into ZA) +#### ADD, SUB, BFADD, BFSUB, FADD, FSUB (accumulate into ZA) Multi-vector add/sub and accumulate into ZA ``` c // Variants are available for: + // _za16[_bf16] (only if __ARM_FEATURE_SME_B16B16 != 0) + // _za16[_f16] (only if __ARM_FEATURE_SME_F16F16 != 0) // _za32[_f32] // _za32[_s32] // _za32[_u32] @@ -10490,6 +10540,8 @@ Multi-vector add/sub and accumulate into ZA // Variants are available for: + // _za16[_bf16] (only if __ARM_FEATURE_SME_B16B16 != 0) + // _za16[_f16] (only if __ARM_FEATURE_SME_F16F16 != 0) // _za32[_f32] // _za32[_s32] // _za32[_u32] @@ -10501,6 +10553,8 @@ Multi-vector add/sub and accumulate into ZA // Variants are available for: + // _za16[_bf16] (only if __ARM_FEATURE_SME_B16B16 != 0) + // _za16[_f16] (only if __ARM_FEATURE_SME_F16F16 != 0) // _za32[_f32] // _za32[_s32] // _za32[_u32] @@ -10512,6 +10566,8 @@ Multi-vector add/sub and accumulate into ZA // Variants are available for: + // _za16[_bf16] (only if __ARM_FEATURE_SME_B16B16 != 0) + // _za16[_f16] (only if __ARM_FEATURE_SME_F16F16 != 0) // _za32[_f32] // _za32[_s32] // _za32[_u32] @@ -10531,6 +10587,16 @@ Multi-vector floating-point convert from single-precision to 
interleaved half-pr svbfloat16_t svcvtn_bf16[_f32_x2](svfloat32x2_t zn) __arm_streaming; ``` +#### FCVTL + +Multi-vector floating-point convert from half-precision to deinterleaved +single-precision. + +``` + // Only if __ARM_FEATURE_SME_F16F16 != 0 + svfloat32x2_t svcvtl_f32[_f16_x2](svfloat16_t zn) __arm_streaming; +``` + #### FCVT, BFCVT, FCVTZS, FCVTZU, SCVTF, UCVTF Multi-vector convert to/from floating-point. @@ -10546,6 +10612,9 @@ Multi-vector convert to/from floating-point. // Variants are also available for _f32[_u32_x4], _s32[_f32_x4] and _u32[_f32_x4] svfloat32x4_t svcvt_f32[_s32_x4](svint32x4_t zn) __arm_streaming; + + // Only if __ARM_FEATURE_SME_F16F16 != 0 + svfloat32x2_t svcvt_f32[_f16_x2](svfloat16_t zn) __arm_streaming; ``` #### SQCVT, SQCVTU, UQCVT @@ -10792,12 +10861,14 @@ Bitwise exclusive NOR population count outer product and accumulate/subtract __arm_streaming __arm_inout("za"); ``` -#### FMLA, FMLS (single) +#### BFMLA, BFMLS, FMLA, FMLS (single) Multi-vector floating-point fused multiply-add/subtract ``` c // Variants are available for: + // _za16[_bf16] (only if __ARM_FEATURE_SME_B16B16 != 0) + // _za16[_f16] (only if __ARM_FEATURE_SME_F16F16 != 0) // _za32[_f32] // _za64[_f64] (only if __ARM_FEATURE_SME_F64F64 != 0) void svmla[_single]_za32[_f32]_vg1x2(uint32_t slice, svfloat32x2_t zn, @@ -10806,6 +10877,8 @@ Multi-vector floating-point fused multiply-add/subtract // Variants are available for: + // _za16[_bf16] (only if __ARM_FEATURE_SME_B16B16 != 0) + // _za16[_f16] (only if __ARM_FEATURE_SME_F16F16 != 0) // _za32[_f32] // _za64[_f64] (only if __ARM_FEATURE_SME_F64F64 != 0) void svmla[_single]_za32[_f32]_vg1x4(uint32_t slice, svfloat32x4_t zn, @@ -10814,6 +10887,8 @@ Multi-vector floating-point fused multiply-add/subtract // Variants are available for: + // _za16[_bf16] (only if __ARM_FEATURE_SME_B16B16 != 0) + // _za16[_f16] (only if __ARM_FEATURE_SME_F16F16 != 0) // _za32[_f32] // _za64[_f64] (only if __ARM_FEATURE_SME_F64F64 != 0) 
void svmls[_single]_za32[_f32]_vg1x2(uint32_t slice, svfloat32x2_t zn, @@ -10822,6 +10897,8 @@ Multi-vector floating-point fused multiply-add/subtract // Variants are available for: + // _za16[_bf16] (only if __ARM_FEATURE_SME_B16B16 != 0) + // _za16[_f16] (only if __ARM_FEATURE_SME_F16F16 != 0) // _za32[_f32] // _za64[_f64] (only if __ARM_FEATURE_SME_F64F64 != 0) void svmls[_single]_za32[_f32]_vg1x4(uint32_t slice, svfloat32x4_t zn, @@ -10829,12 +10906,14 @@ Multi-vector floating-point fused multiply-add/subtract __arm_streaming __arm_inout("za"); ``` -#### FMLA, FMLS (multi) +#### BFMLA, BFMLS, FMLA, FMLS (multi) Multi-vector floating-point fused multiply-add/subtract ``` c // Variants are available for: + // _za16[_bf16] (only if __ARM_FEATURE_SME_B16B16 != 0) + // _za16[_f16] (only if __ARM_FEATURE_SME_F16F16 != 0) // _za32[_f32] // _za64[_f64] (only if __ARM_FEATURE_SME_F64F64 != 0) void svmla_za32[_f32]_vg1x2(uint32_t slice, svfloat32x2_t zn, @@ -10843,6 +10922,8 @@ Multi-vector floating-point fused multiply-add/subtract // Variants are available for: + // _za16[_bf16] (only if __ARM_FEATURE_SME_B16B16 != 0) + // _za16[_f16] (only if __ARM_FEATURE_SME_F16F16 != 0) // _za32[_f32] // _za64[_f64] (only if __ARM_FEATURE_SME_F64F64 != 0) void svmla_za32[_f32]_vg1x4(uint32_t slice, svfloat32x4_t zn, @@ -10851,6 +10932,8 @@ Multi-vector floating-point fused multiply-add/subtract // Variants are available for: + // _za16[_bf16] (only if __ARM_FEATURE_SME_B16B16 != 0) + // _za16[_f16] (only if __ARM_FEATURE_SME_F16F16 != 0) // _za32[_f32] // _za64[_f64] (only if __ARM_FEATURE_SME_F64F64 != 0) void svmls_za32[_f32]_vg1x2(uint32_t slice, svfloat32x2_t zn, @@ -10859,6 +10942,8 @@ Multi-vector floating-point fused multiply-add/subtract // Variants are available for: + // _za16[_bf16] (only if __ARM_FEATURE_SME_B16B16 != 0) + // _za16[_f16] (only if __ARM_FEATURE_SME_F16F16 != 0) // _za32[_f32] // _za64[_f64] (only if __ARM_FEATURE_SME_F64F64 != 0) void 
svmls_za32[_f32]_vg1x4(uint32_t slice, svfloat32x4_t zn, @@ -10866,12 +10951,14 @@ Multi-vector floating-point fused multiply-add/subtract __arm_streaming __arm_inout("za"); ``` -#### FMLA, FMLS (indexed) +#### BFMLA. BFMLS, FMLA, FMLS (indexed) Multi-vector floating-point fused multiply-add/subtract ``` c // Variants are available for: + // _za16[_bf16] (only if __ARM_FEATURE_SME_B16B16 != 0) + // _za16[_f16] (only if __ARM_FEATURE_SME_F16F16 != 0) // _za32[_f32] // _za64[_f64] (only if __ARM_FEATURE_SME_F64F64 != 0) void svmla_lane_za32[_f32]_vg1x2(uint32_t slice, svfloat32x2_t zn, @@ -10880,6 +10967,8 @@ Multi-vector floating-point fused multiply-add/subtract // Variants are available for: + // _za16[_bf16] (only if __ARM_FEATURE_SME_B16B16 != 0) + // _za16[_f16] (only if __ARM_FEATURE_SME_F16F16 != 0) // _za32[_f32] // _za64[_f64] (only if __ARM_FEATURE_SME_F64F64 != 0) void svmla_lane_za32[_f32]_vg1x4(uint32_t slice, svfloat32x4_t zn, @@ -10888,6 +10977,8 @@ Multi-vector floating-point fused multiply-add/subtract // Variants are available for: + // _za16[_bf16] (only if __ARM_FEATURE_SME_B16B16 != 0) + // _za16[_f16] (only if __ARM_FEATURE_SME_F16F16 != 0) // _za32[_f32] // _za64[_f64] (only if __ARM_FEATURE_SME_F64F64 != 0) void svmls_lane_za32[_f32]_vg1x2(uint32_t slice, svfloat32x2_t zn, @@ -10896,6 +10987,8 @@ Multi-vector floating-point fused multiply-add/subtract // Variants are available for: + // _za16[_bf16] (only if __ARM_FEATURE_SME_B16B16 != 0) + // _za16[_f16] (only if __ARM_FEATURE_SME_F16F16 != 0) // _za32[_f32] // _za64[_f64] (only if __ARM_FEATURE_SME_F64F64 != 0) void svmls_lane_za32[_f32]_vg1x4(uint32_t slice, svfloat32x4_t zn, @@ -11287,114 +11380,214 @@ Multi-vector multiply-subtract long long (widening) __arm_streaming __arm_inout("za"); ``` -#### SMAX, SMIN, UMAX, UMIN, FMAX, FMIN (single) +#### SMAX, SMIN, UMAX, UMIN, BFMAX, BFMIN, FMAX, FMIN (single) Multi-vector min/max ``` c - // Variants are also available for _single_s8_x2, 
_single_u8_x2, - // _single_s16_x2, _single_u16_x2, _single_s32_x2, _single_u32_x2, - // _single_f32_x2, _single_s64_x2, _single_u64_x2 and _single_f64_x2 + // Variants are also available for: + // _single_s8_x2 + // _single_u8_x2, + // _single_bf16_x2 (only if __ARM_FEATURE_SVE_B16B16 != 0) + // _single_s16_x2 + // _single_u16_x2 + // _single_s32_x2 + // _single_u32_x2, + // _single_f32_x2 + // _single_s64_x2 + // _single_u64_x2 + // _single_f64_x2 svfloat16x2_t svmax[_single_f16_x2](svfloat16x2_t zdn, svfloat16_t zm) __arm_streaming; - // Variants are also available for _single_s8_x4, _single_u8_x4, - // _single_s16_x4, _single_u16_x4, _single_s32_x4, _single_u32_x4, - // _single_f32_x4, _single_s64_x4, _single_u64_x4 and _single_f64_x4 + // Variants are also available for: + // _single_s8_x4 + // _single_u8_x4, + // _single_bf16_x4 (only if __ARM_FEATURE_SVE_B16B16 != 0) + // _single_s16_x4 + // _single_u16_x4 + // _single_s32_x4 + // _single_u32_x4, + // _single_f32_x4 + // _single_s64_x4 + // _single_u64_x4 + // _single_f64_x4 svfloat16x4_t svmax[_single_f16_x4](svfloat16x4_t zdn, svfloat16_t zm) __arm_streaming; - // Variants are also available for _single_s8_x2, _single_u8_x2, - // _single_s16_x2, _single_u16_x2, _single_s32_x2, _single_u32_x2, - // _single_f32_x2, _single_s64_x2, _single_u64_x2 and _single_f64_x2 + // Variants are also available for: + // _single_s8_x2 + // _single_u8_x2, + // _single_bf16_x2 (only if __ARM_FEATURE_SVE_B16B16 != 0) + // _single_s16_x2 + // _single_u16_x2 + // _single_s32_x2 + // _single_u32_x2, + // _single_f32_x2 + // _single_s64_x2 + // _single_u64_x2 + // _single_f64_x2 svfloat16x2_t svmin[_single_f16_x2](svfloat16x2_t zdn, svfloat16_t zm) __arm_streaming; - // Variants are also available for _single_s8_x4, _single_u8_x4, - // _single_s16_x4, _single_u16_x4, _single_s32_x4, _single_u32_x4, - // _single_f32_x4, _single_s64_x4, _single_u64_x4 and _single_f64_x4 + // Variants are also available for: + // _single_s8_x4 + // 
_single_u8_x4, + // _single_bf16_x4 (only if __ARM_FEATURE_SVE_B16B16 != 0) + // _single_s16_x4 + // _single_u16_x4 + // _single_s32_x4 + // _single_u32_x4, + // _single_f32_x4 + // _single_s64_x4 + // _single_u64_x4 + // _single_f64_x4 svfloat16x4_t svmin[_single_f16_x4](svfloat16x4_t zdn, svfloat16_t zm) __arm_streaming; ``` -#### SMAX, SMIN, UMAX, UMIN, FMAX, FMIN (multi) +#### SMAX, SMIN, UMAX, UMIN, BFMAX, BFMIN, FMAX, FMIN (multi) Multi-vector min/max ``` c - // Variants are also available for _s8_x2, _u8_x2, _s16_x2, _u16_x2, - // _s32_x2, _u32_x2, _f32_x2, _s64_x2, _u64_x2 and _f64_x2 + // Variants are also available for: + // _s8_x2 + // _u8_x2 + // _bf16_x2 (only if __ARM_FEATURE_SVE_B16B16 != 0) + // _s16_x2 + // _u16_x2, + // _s32_x2 + // _u32_x2 + // _f32_x2 + // _s64_x2 + // _u64_x2 + // _f64_x2 svfloat16x2_t svmax[_f16_x2](svfloat16x2_t zdn, svfloat16x2_t zm) __arm_streaming; - // Variants are also available for _s8_x4, _u8_x4, _s16_x4, _u16_x4, - // _s32_x4, _u32_x4, _f32_x4, _s64_x4, _u64_x4 and _f64_x4 + // Variants are also available for: + // _s8_x4 + // _u8_x4 + // _bf16_x4 (only if __ARM_FEATURE_SVE_B16B16 != 0) + // _s16_x4 + // _u16_x4, + // _s32_x4 + // _u32_x4 + // _f32_x4 + // _s64_x4 + // _u64_x4 + // _f64_x4 svfloat16x4_t svmax[_f16_x4](svfloat16x4_t zdn, svfloat16x4_t zm) __arm_streaming; - // Variants are also available for _s8_x2, _u8_x2, _s16_x2, _u16_x2, - // _s32_x2, _u32_x2, _f32_x2, _s64_x2, _u64_x2 and _f64_x2 + // Variants are also available for: + // _s8_x2 + // _u8_x2 + // _bf16_x2 (only if __ARM_FEATURE_SVE_B16B16 != 0) + // _s16_x2 + // _u16_x2, + // _s32_x2 + // _u32_x2 + // _f32_x2 + // _s64_x2 + // _u64_x2 + // _f64_x2 svfloat16x2_t svmin[_f16_x2](svfloat16x2_t zdn, svfloat16x2_t zm) __arm_streaming; - // Variants are also available for _s8_x4, _u8_x4, _s16_x4, _u16_x4, - // _s32_x4, _u32_x4, _f32_x4, _s64_x4,_u64_x4 and _f64_x4 + // Variants are also available for: + // _s8_x4 + // _u8_x4 + // _bf16_x4 (only if 
__ARM_FEATURE_SVE_B16B16 != 0) + // _s16_x4 + // _u16_x4, + // _s32_x4 + // _u32_x4 + // _f32_x4 + // _s64_x4 + // _u64_x4 + // _f64_x4 svfloat16x4_t svmin[_f16_x4](svfloat16x4_t zdn, svfloat16x4_t zm) __arm_streaming; ``` -#### FMAXNM, FMINNM (single) +#### BFMAXNM, BFMINNM, FMAXNM, FMINNM (single) Multi-vector floating point min/max number ``` c - // Variants are also available for _single_f32_x2 and _single_f64_x2 + // Variants are also available for: + // _single_bf16_x2 (only if __ARM_FEATURE_SVE_B16B16 != 0) + // _single_f32_x2 + // _single_f64_x2 svfloat16x2_t svmaxnm[_single_f16_x2](svfloat16x2_t zdn, svfloat16_t zm) __arm_streaming; - // Variants are also available for _single_f32_x4 and _single_f64_x4 + // Variants are also available for: + // _single_bf16_x4 (only if __ARM_FEATURE_SVE_B16B16 != 0) + // _single_f32_x4 + // _single_f64_x4 svfloat16x4_t svmaxnm[_single_f16_x4](svfloat16x4_t zdn, svfloat16_t zm) __arm_streaming; - // Variants are also available for _single_f32_x2 and _single_f64_x2 + // Variants are also available for: + // _single_bf16_x2 (only if __ARM_FEATURE_SVE_B16B16 != 0) + // _single_f32_x2 + // _single_f64_x2 svfloat16x2_t svminnm[_single_f16_x2](svfloat16x2_t zdn, svfloat16_t zm) __arm_streaming; - // Variants are also available for _single_f32_x4 and _single_f64_x4 + // Variants are also available for: + // _single_bf16_x4 (only if __ARM_FEATURE_SVE_B16B16 != 0) + // _single_f32_x4 + // _single_f64_x4 svfloat16x4_t svminnm[_single_f16_x4](svfloat16x4_t zdn, svfloat16_t zm) __arm_streaming; ``` -#### FMAXNM, FMINNM (multi) +#### BFMAXNM, BFMINNM, FMAXNM, FMINNM (multi) Multi-vector floating point min/max number ``` c - // Variants are also available for _f32_x2 and _f64_x2 + // Variants are also available for: + // _bf16_x2 (only if __ARM_FEATURE_SVE_B16B16 != 0) + // _f32_x2 + // _f64_x2 svfloat16x2_t svmaxnm[_f16_x2](svfloat16x2_t zdn, svfloat16x2_t zm) __arm_streaming; - // Variants are also available for _f32_x4 and _f64_x4 + 
// Variants are also available for: + // _bf16_x4 (only if __ARM_FEATURE_SVE_B16B16 != 0) + // _f32_x4 + // _f64_x4 svfloat16x4_t svmaxnm[_f16_x4](svfloat16x4_t zdn, svfloat16x4_t zm) __arm_streaming; - // Variants are also available for _f32_x2 and _f64_x2 + // Variants are also available for: + // _bf16_x2 (only if __ARM_FEATURE_SVE_B16B16 != 0) + // _f32_x2 + // _f64_x2 svfloat16x2_t svminnm[_f16_x2](svfloat16x2_t zdn, svfloat16x2_t zm) __arm_streaming; - // Variants are also available for _f32_x4 and _f64_x4 + // Variants are also available for: + // _bf16_x4 (only if __ARM_FEATURE_SVE_B16B16 != 0) + // _f32_x4 + // _f64_x4 svfloat16x4_t svminnm[_f16_x4](svfloat16x4_t zdn, svfloat16x4_t zm) __arm_streaming; ``` @@ -11584,22 +11777,40 @@ Move multi-vectors to/from ZA __arm_streaming __arm_inout("za"); ``` -#### UCLAMP, SCLAMP, FCLAMP +#### UCLAMP, SCLAMP, BFCLAMP, FCLAMP Multi-vector clamp to minimum/maximum vector ``` c - // Variants are also available for _single_s8_x2, _single_u8_x2, - // _single_s16_x2, _single_u16_x2, _single_s32_x2, _single_u32_x2, - // _single_f32_x2, _single_s64_x2, _single_u64_x2 and _single_f64_x2 + // Variants are also available for: + // _single_s8_x2 + // _single_u8_x2, + // _single_bf16_x2 (only if __ARM_FEATURE_SVE_B16B16 != 0) + // _single_s16_x2 + // _single_u16_x2 + // _single_s32_x2 + // _single_u32_x2, + // _single_f32_x2 + // _single_s64_x2 + // _single_u64_x2 + // _single_f64_x2 svfloat16x2_t svclamp[_single_f16_x2](svfloat16x2_t zd, svfloat16_t zn, svfloat16_t zm) __arm_streaming; - // Variants are also available for _single_s8_x4, _single_u8_x4, - // _single_s16_x4, _single_u16_x4, _single_s32_x4, _single_u32_x4, - // _single_f32_x4, _single_s64_x4, _single_u64_x4 and _single_f64_x4 + // Variants are also available for: + // _single_s8_x4 + // _single_u8_x4, + // _single_bf16_x4 (only if __ARM_FEATURE_SVE_B16B16 != 0) + // _single_s16_x4 + // _single_u16_x4 + // _single_s32_x4 + // _single_u32_x4, + // _single_f32_x4 + // 
_single_s64_x4 + // _single_u64_x4 + // _single_f64_x4 svfloat16x4_t svclamp[_single_f16_x4](svfloat16x4_t zd, svfloat16_t zn, svfloat16_t zm) __arm_streaming; @@ -11821,6 +12032,143 @@ element types. svint8x4_t svuzpq[_s8_x4](svint8x4_t zn) __arm_streaming; ``` +### SME2.1 instruction intrinsics + +The specification for SME2.1 is in +[**Alpha** state](#current-status-and-anticipated-changes) and may change or be +extended in the future. + +The intrinsics in this section are defined by the header file +[``](#arm_sme.h) when `__ARM_FEATURE_SME2p1` is defined. + +#### MOVAZ (tile to vector, single) + +Move and zero ZA tile slice to vector register. + +``` + // And similarly for u8. + svint8_t svreadz_hor_za8_s8(uint64_t tile, uint32_t slice) + __arm_streaming __arm_inout("za"); + + // And similarly for u16, bf16 and f16. + svint16_t svreadz_hor_za16_s16(uint64_t tile, uint32_t slice) + __arm_streaming __arm_inout("za"); + + // And similarly for u32 and f32. + svint32_t svreadz_hor_za32_s32(uint64_t tile, uint32_t slice) + __arm_streaming __arm_inout("za"); + + // And similarly for u64 and f64. + svint64_t svreadz_hor_za64_s64(uint64_t tile, uint32_t slice) + __arm_streaming __arm_inout("za"); + + // And similarly for s16, s32, s64, u8, u16, u32, u64, bf16, f16, f32, f64 + svint8_t svreadz_hor_za128_s8(uint64_t tile, uint32_t slice) + __arm_streaming __arm_inout("za"); + + // And similarly for u8. + svint8_t svreadz_ver_za8_s8(uint64_t tile, uint32_t slice) + __arm_streaming __arm_inout("za"); + + // And similarly for u16, bf16 and f16. + svint16_t svreadz_ver_za16_s16(uint64_t tile, uint32_t slice) + __arm_streaming __arm_inout("za"); + + // And similarly for u32 and f32. + svint32_t svreadz_ver_za32_s32(uint64_t tile, uint32_t slice) + __arm_streaming __arm_inout("za"); + + // And similarly for u64 and f64. 
+ svint64_t svreadz_ver_za64_s64(uint64_t tile, uint32_t slice) + __arm_streaming __arm_inout("za"); + + // And similarly for s16, s32, s64, u8, u16, u32, u64, bf16, f16, f32, f64 + svint8_t svreadz_ver_za128_s8(uint64_t tile, uint32_t slice) + __arm_streaming __arm_inout("za"); +``` + +#### MOVAZ (tile to vector, multiple) + +Move and zero multiple ZA tile slices to vector registers + +``` c + // Variants are also available for _za8_u8, _za16_s16, _za16_u16, + // _za16_f16, _za16_bf16, _za32_s32, _za32_u32, _za32_f32, + // _za64_s64, _za64_u64 and _za64_f64 + svint8x2_t svreadz_hor_za8_s8_vg2(uint64_t tile, uint32_t slice) + __arm_streaming __arm_inout("za"); + + + // Variants are also available for _za8_u8, _za16_s16, _za16_u16, + // _za16_f16, _za16_bf16, _za32_s32, _za32_u32, _za32_f32, + // _za64_s64, _za64_u64 and _za64_f64 + svint8x4_t svreadz_hor_za8_s8_vg4(uint64_t tile, uint32_t slice) + __arm_streaming __arm_inout("za"); + + + // Variants are also available for _za8_u8, _za16_s16, _za16_u16, + // _za16_f16, _za16_bf16, _za32_s32, _za32_u32, _za32_f32, + // _za64_s64, _za64_u64 and _za64_f64 + svint8x2_t svreadz_ver_za8_s8_vg2(uint64_t tile, uint32_t slice) + __arm_streaming __arm_inout("za"); + + + // Variants are also available for _za8_u8, _za16_s16, _za16_u16, + // _za16_f16, _za16_bf16, _za32_s32, _za32_u32, _za32_f32, + // _za64_s64, _za64_u64 and _za64_f64 + svint8x4_t svreadz_ver_za8_s8_vg4(uint64_t tile, uint32_t slice) + __arm_streaming __arm_inout("za"); +``` + +#### MOVAZ (array to vector) + +Move and zero multiple ZA single-vector groups to vector registers + +``` + // Variants are also available for _za8_u8, _za16_s16, _za16_u16, + // _za16_f16, _za16_bf16, _za32_s32, _za32_u32, _za32_f32, + // _za64_s64, _za64_u64 and _za64_f64 + svint8x2_t svreadz_za8_s8_vg1x2(uint32_t slice) + __arm_streaming __arm_inout("za"); + + + // Variants are also available for _za8_u8, _za16_s16, _za16_u16, + // _za16_f16, _za16_bf16, _za32_s32, _za32_u32, 
_za32_f32, + // _za64_s64, _za64_u64 and _za64_f64 + svint8x4_t svreadz_za8_s8_vg1x4(uint32_t slice) + __arm_streaming __arm_inout("za"); +``` + +#### ZERO (vector groups) + +Zero ZA vector groups + +``` + void svzero_za64_vg1x2(uint32_t slice) + __arm_streaming __arm_inout("za"); + + void svzero_za64_vg1x4(uint32_t slice) + __arm_streaming __arm_inout("za"); + + void svzero_za64_vg2x1(uint32_t slice) + __arm_streaming __arm_inout("za"); + + void svzero_za64_vg2x2(uint32_t slice) + __arm_streaming __arm_inout("za"); + + void svzero_za64_vg2x4(uint32_t slice) + __arm_streaming __arm_inout("za"); + + void svzero_za64_vg4x1(uint32_t slice) + __arm_streaming __arm_inout("za"); + + void svzero_za64_vg4x2(uint32_t slice) + __arm_streaming __arm_inout("za"); + + void svzero_za64_vg4x4(uint32_t slice) + __arm_streaming __arm_inout("za"); +``` + ### Streaming-compatible versions of standard routines ACLE provides the following streaming-compatible functions, From 6f11d88deec56fd13d86d39ff4e187022928a846 Mon Sep 17 00:00:00 2001 From: Victor Campos Date: Tue, 3 Sep 2024 10:17:21 +0100 Subject: [PATCH 05/36] Fix incorrect naming of __ARM_FEATURE_SVE2p1 (#338) In one instance, it was incorrectly named `__ARM_FEATURE_SVE2.1`. --- main/acle.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main/acle.md b/main/acle.md index 96957a27..cca7631c 100644 --- a/main/acle.md +++ b/main/acle.md @@ -12197,7 +12197,7 @@ extended in the future. The functions in this section are defined by either the header file [``](#arm_sve.h) or [``](#arm_sme.h) -when `__ARM_FEATURE_SVE2.1` or `__ARM_FEATURE_SME2` is defined, respectively. +when `__ARM_FEATURE_SVE2p1` or `__ARM_FEATURE_SME2` is defined, respectively. These intrinsics can only be called from non-streaming code if `__ARM_FEATURE_SVE2p1` is defined. 
They can only be called from streaming code From ddfc04800edad91bfcefa37200618da21ed63e12 Mon Sep 17 00:00:00 2001 From: Lukacma Date: Tue, 3 Sep 2024 10:17:51 +0100 Subject: [PATCH 06/36] Fix formatting error from #321 (#341) --- main/acle.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main/acle.md b/main/acle.md index cca7631c..6951d9cc 100644 --- a/main/acle.md +++ b/main/acle.md @@ -1087,7 +1087,7 @@ context in which the preprocessor macros are evaluated. For example: void foo() { #ifdef __ARM_FEATURE_SVE // The user should make no assumptions that the target attribute - // has enabled the __ARM_FEATURE_SVE macro. + // has enabled the __ARM_FEATURE_SVE macro. #endif } ``` From e938350b3034512912eb2655402324750cb8c1ef Mon Sep 17 00:00:00 2001 From: Momchil Velikov Date: Tue, 3 Sep 2024 15:37:41 +0100 Subject: [PATCH 07/36] Intrinsics for absolute minimum and maximum, and table lookup (#324) * Intrinsics for absolute minimum and maximum, and table lookup --- main/acle.md | 148 +++++++++++++++++- neon_intrinsics/advsimd.md | 77 ++++++++- neon_intrinsics/advsimd.template.md | 4 +- tools/intrinsic_db/advsimd.csv | 81 +++++++++- tools/intrinsic_db/advsimd_classification.csv | 62 +++++++- 5 files changed, 363 insertions(+), 9 deletions(-) diff --git a/main/acle.md b/main/acle.md index 6951d9cc..692baa3d 100644 --- a/main/acle.md +++ b/main/acle.md @@ -405,6 +405,11 @@ Armv8.4-A [[ARMARMv84]](#ARMARMv84). Support is added for the Dot Product intrin * Added [**Alpha**](#current-status-and-anticipated-changes) support for SME2.1 (FEAT_SME2p1). +* Added specifications for floating-point absolute minimum + and maximum intrinsics (FEAT_FAMINMAX). + +* Added specifications for table lookup intrinsics (FEAT_LUT, FEAT_SME_LUTv2). + ### References This document refers to the following documents. @@ -2124,6 +2129,22 @@ support for the SVE2 SM4 (FEAT_SVE_SM4) instructions and if the associated ACLE intrinsics are available. 
This implies that `__ARM_FEATURE_SM4` and `__ARM_FEATURE_SVE2` are both nonzero. +### Floating-point absolute minimum and maximum extension + +`__ARM_FEATURE_FAMINMAX` is defined to 1 if there is hardware support for +floating-point absolute minimum and maximum instructions (FEAT_FAMINMAX) +and if the associated ACLE intrinsics are available. + +### Lookup table extensions + +`__ARM_FEATURE_LUT` is defined to 1 if there is hardware support for +lookup table instructions with 2-bit and 4-bit indices (FEAT_LUT) +and if the associated ACLE intrinsics are available. + +`__ARM_FEATURE_SME_LUTv2` is defined to 1 if there is hardware support for +lookup table instructions with 4-bit indices and 8-bit elements (FEAT_SME_LUTv2) +and if the associated ACLE intrinsics are available. + ### Other floating-point and vector extensions #### Fused multiply-accumulate (FMA) @@ -2411,12 +2432,14 @@ be found in [[BA]](#BA). | [`__ARM_FEATURE_DIRECTED_ROUNDING`](#directed-rounding) | Directed Rounding | 1 | | [`__ARM_FEATURE_DOTPROD`](#availability-of-dot-product-intrinsics) | Dot product extension (ARM v8.2-A) | 1 | | [`__ARM_FEATURE_DSP`](#dsp-instructions) | DSP instructions (Arm v5E) (32-bit-only) | 1 | +| [`__ARM_FEATURE_FAMINMAX`](#floating-point-absolute-minimum-and-maximum-extension) | Floating-point absolute minimum and maximum extension | 1 | | [`__ARM_FEATURE_FMA`](#fused-multiply-accumulate-fma) | Floating-point fused multiply-accumulate | 1 | | [`__ARM_FEATURE_FP16_FML`](#fp16-fml-extension) | FP16 FML extension (Arm v8.4-A, optional Armv8.2-A, Armv8.3-A) | 1 | | [`__ARM_FEATURE_FRINT`](#availability-of-armv8.5-a-floating-point-rounding-intrinsics) | Floating-point rounding extension (Arm v8.5-A) | 1 | | [`__ARM_FEATURE_IDIV`](#hardware-integer-divide) | Hardware Integer Divide | 1 | | [`__ARM_FEATURE_JCVT`](#javascript-floating-point-conversion) | Javascript conversion (ARMv8.3-A) | 1 | | [`__ARM_FEATURE_LDREX`](#ldrexstrex) *(Deprecated)* | Load/store exclusive 
instructions | 0x0F | +| [`__ARM_FEATURE_LUT`](#lookup-table-extensions) | Lookup table extensions (FEAT_LUT) | 1 | | [`__ARM_FEATURE_MATMUL_INT8`](#availability-of-armv8.6-a-integer-matrix-multiply-intrinsics) | Integer Matrix Multiply extension (Armv8.6-A, optional Armv8.2-A, Armv8.3-A, Armv8.4-A, Armv8.5-A) | 1 | | [`__ARM_FEATURE_MEMORY_TAGGING`](#memory-tagging) | Memory Tagging (Armv8.5-A) | 1 | | [`__ARM_FEATURE_MOPS`](#memcpy-family-of-memory-operations-standarization-instructions---mops) | `memcpy`, `memset`, and `memmove` family of operations standardization instructions | 1 | @@ -2443,6 +2466,7 @@ be found in [[BA]](#BA). | [`__ARM_FEATURE_SME_F64F64`](#double-precision-floating-point-outer-product-intrinsics) | Double precision floating-point outer product intrinsics (FEAT_SME_F64F64) | 1 | | [`__ARM_FEATURE_SME_I16I64`](#16-bit-to-64-bit-integer-widening-outer-product-intrinsics) | 16-bit to 64-bit integer widening outer product intrinsics (FEAT_SME_I16I64) | 1 | | [`__ARM_FEATURE_SME_LOCALLY_STREAMING`](#scalable-matrix-extension-sme) | Support for the `arm_locally_streaming` attribute | 1 | +| [`__ARM_FEATURE_SME_LUTv2`](#lookup-table-extensions) | Lookup table extensions (FEAT_SME_LUTv2) | 1 | | [`__ARM_FEATURE_SVE`](#scalable-vector-extension-sve) | Scalable Vector Extension (FEAT_SVE) | 1 | | [`__ARM_FEATURE_SVE_B16B16`](#non-widening-brain-16-bit-floating-point-support) | Non-widening brain 16-bit floating-point intrinsics (FEAT_SVE_B16B16) | 1 | | [`__ARM_FEATURE_SVE_BF16`](#brain-16-bit-floating-point-support) | SVE support for the 16-bit brain floating-point extension (FEAT_BF16) | 1 | @@ -9125,6 +9149,73 @@ Interleave elements from halves of each pair of quadword vector segments. 
svuint8_t svzipq2[_u8](svuint8_t zn, svuint8_t zm); ``` +### SVE2 maximum and minimum absolute value + +The intrinsics in this section are defined by the header file +[``](#arm_sve.h) when either `__ARM_FEATURE_SVE2` or +`__ARM_FEATURE_SME2` is defined to 1, and `__ARM_FEATURE_FAMINMAX` +is defined to 1. + +#### FAMAX + +Floating-point absolute maximum (predicated). +``` c + // Variants are also available for: _f32 and _f64 + svfloat16_t svamax[_f16]_m(svbool_t pg, svfloat16_t zn, svfloat16_t zm); + svfloat16_t svamax[_f16]_x(svbool_t pg, svfloat16_t zn, svfloat16_t zm); + svfloat16_t svamax[_f16]_z(svbool_t pg, svfloat16_t zn, svfloat16_t zm); + + // Variants are also available for: _f32 and _f64 + svfloat16_t svamax[_n_f16]_m(svbool_t pg, svfloat16_t zn, float16_t zm); + svfloat16_t svamax[_n_f16]_x(svbool_t pg, svfloat16_t zn, float16_t zm); + svfloat16_t svamax[_n_f16]_z(svbool_t pg, svfloat16_t zn, float16_t zm); +``` + +#### FAMIN + +Floating-point absolute minimum (predicated). +``` c + // Variants are also available for: _f32 and _f64 + svfloat16_t svamin[_f16]_m(svbool_t pg, svfloat16_t zn, svfloat16_t zm); + svfloat16_t svamin[_f16]_x(svbool_t pg, svfloat16_t zn, svfloat16_t zm); + svfloat16_t svamin[_f16]_z(svbool_t pg, svfloat16_t zn, svfloat16_t zm); + + // Variants are also available for: _f32 and _f64 + svfloat16_t svamin[_n_f16]_m(svbool_t pg, svfloat16_t zn, float16_t zm); + svfloat16_t svamin[_n_f16]_x(svbool_t pg, svfloat16_t zn, float16_t zm); + svfloat16_t svamin[_n_f16]_z(svbool_t pg, svfloat16_t zn, float16_t zm); +``` + +### SVE2 lookup table + +The intrinsics in this section are defined by the header file +[``](#arm_sve.h) when either `__ARM_FEATURE_SVE2` or +`__ARM_FEATURE_SME2` is defined to 1, and `__ARM_FEATURE_LUT` +is defined to 1. + +#### LUTI2 + +Lookup table read with 2-bit indices. 
+```c + // Variant is also available for: _u8 + svint8_t svluti2_lane[_s8](svint8_t table, svuint8_t indices, uint64_t imm_idx); + + // Variants are also available for: _u16, _f16 and _bf16 + svint16_t svluti2_lane[_s16](svint16_t table, svuint8_t indices, uint64_t imm_idx); +``` + +#### LUTI4 + +Lookup table read with 4-bit indices. +```c + // Variant is also available for: _u8 + svint8_t svluti4_lane[_s8](svint8_t table, svuint8_t indices, uint64_t imm_idx); + + // Variants are also available for: _u16, _f16, _bf16 + svint16_t svluti4_lane[_s16](svint16_t table, svuint8_t indices, uint64_t imm_idx); + svint16_t svluti4_lane[_s16_x2](svint16x2_t table, svuint8_t indices, uint64_t imm_idx); +``` + # SME language extensions and intrinsics The specification for SME is in @@ -12714,7 +12805,62 @@ While (resulting in predicate tuple) // _b64[_s64]_x2, _b8[_u64]_x2, _b16[_u64]_x2, _b32[_u64]_x2 and // _b64[_u64]_x2 svboolx2_t svwhilelt_b8[_s64]_x2(int64_t rn, int64_t rm); - ``` +``` + + +### SME2 maximum and minimum absolute value + +The intrinsics in this section are defined by the header file +[``](#arm_sme.h) when `__ARM_FEATURE_SME2` is defined to 1 +and `__ARM_FEATURE_FAMINMAX` is defined to 1. + +#### FAMAX + +Absolute maximum. +``` c + // Variants are also available for: + // [_f32_x2], [_f64_x2], + // [_f16_x4], [_f32_x4] and [_f64_x4] + svfloat16x2_t svamax[_f16_x2](svfloat16x2_t zd, svfloat16x2_t zm) __arm_streaming; +``` + +#### FAMIN + +Absolute minimum. +``` c + // Variants are also available for: + // [_f32_x2], [_f64_x2], + // [_f16_x4], [_f32_x4] and [_f64_x4] + svfloat16x2_t svamin[_f16_x2](svfloat16x2_t zd, svfloat16x2_t zm) __arm_streaming; +``` + +### SME2 lookup table + +The intrinsics in this section are defined by the header file +[``](#arm_sme.h) when `__ARM_FEATURE_SME_LUTv2` is defined to 1. + +#### MOVT + +Move vector register to ZT0. 
+``` c + // Variants are also available for: + // [_s8], [_u16], [_s16], [_u32], [_s32], [_u64], [_s64] + // [_bf16], [_f16], [_f32], [_f64] + void svwrite_zt[_u8](uint64_t zt0, svuint8_t zt) __arm_streaming __arm_out("zt0"); + + // Variants are also available for: + // [_s8], [_u16], [_s16], [_u32], [_s32], [_u64], [_s64] + // [_bf16], [_f16], [_f32], [_f64] + void svwrite_lane_zt[_u8](uint64_t zt0, svuint8_t zt, uint64_t idx) __arm_streaming __arm_inout("zt0"); +``` + +#### LUTI4 + +Lookup table read with 4-bit indexes and 8-bit elements. +``` c + // Variants are also available for: _u8 + svint8x4_t svluti4_zt_s8_x4(uint64_t zt0, svuint8x2_t zn) __arm_streaming __arm_in("zt0"); +``` # M-profile Vector Extension (MVE) intrinsics diff --git a/neon_intrinsics/advsimd.md b/neon_intrinsics/advsimd.md index ffdd5952..ee89c92b 100644 --- a/neon_intrinsics/advsimd.md +++ b/neon_intrinsics/advsimd.md @@ -12,7 +12,7 @@ toc: true --- -[![All Contributors](https://img.shields.io/badge/all_contributors-34-orange.svg?style=flat-square)](#contributors-) +[![All Contributors](https://img.shields.io/badge/all_contributors-35-orange.svg?style=flat-square)](#contributors-) ![Continuous Integration](https://github.com/ARM-software/acle/actions/workflows/ci.yml/badge.svg) @@ -130,6 +130,7 @@ Thanks goes to these wonderful people ([emoji key](https://allcontributors.org/d Lucas Duarte Prates
Lucas Duarte Prates

💻 Andrew Carlotti
Andrew Carlotti

👀 Alexandros Lamprineas
Alexandros Lamprineas

💻 + Lukacma
Lukacma

📖 From 487cdbc05ce66d49418d32c1d0d77350d16f19da Mon Sep 17 00:00:00 2001 From: Alexandros Lamprineas Date: Thu, 12 Sep 2024 09:09:28 +0100 Subject: [PATCH 10/36] [FMV] Unify sha1, sha2. (#347) According to ArmARM: SHA2, bits [15:12] 0b0000 No SHA2 instructions implemented. FEAT_SHA256 implements the functionality identified by the value 0b0001. If the value of ID_AA64ISAR0_EL1.SHA1 is 0b0000, this field must have the value 0b0000. SHA1, bits [11:8] 0b0000 No SHA1 instructions implemented. FEAT_SHA1 implements the functionality identified by the value 0b0001. If the value of ID_AA64ISAR0_EL1.SHA2 is 0b0000, this field must have the value 0b0000. To my understanding this means you can't have one without the other. --- main/acle.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/main/acle.md b/main/acle.md index 17a30075..653a315f 100644 --- a/main/acle.md +++ b/main/acle.md @@ -402,6 +402,7 @@ Armv8.4-A [[ARMARMv84]](#ARMARMv84). Support is added for the Dot Product intrin a misplaced anchor. * Added `__FUNCTION_MULTI_VERSIONING_SUPPORT_LEVEL` to indicate the support level of the [Function Multi Versioning](#function-multi-versioning). +* Unified Function Multi Versioning features sha1, sha2. * Added [**Alpha**](#current-status-and-anticipated-changes) support for SME2.1 (FEAT_SME2p1). * Removed requirement to add preprocessor guards for header files. 
@@ -2714,8 +2715,7 @@ The following table lists the architectures feature mapping for AArch64 | 106 | `FEAT_SM3`, `FEAT_SM4` | sm4 | ```ID_AA64ISAR0_EL1.SM4 >= 0b0001``` | | 108 | `FEAT_RDM` | rdm, rdma | ```ID_AA64ISAR0_EL1.RDM >= 0b0001``` | | 110 | `FEAT_CRC32` | crc | ```ID_AA64ISAR0_EL1.CRC32 >= 0b0001``` | - | 120 | `FEAT_SHA1` | sha1 | ```ID_AA64ISAR0_EL1.SHA1 >= 0b0001``` | - | 130 | `FEAT_SHA256` | sha2 | ```ID_AA64ISAR0_EL1.SHA2 >= 0b0001``` | + | 130 | `FEAT_SHA1`,`FEAT_SHA256`| sha2 | ```ID_AA64ISAR0_EL1.SHA2 >= 0b0001``` | | 140 | `FEAT_SHA512`,`FEAT_SHA3`| sha3 | ```ID_AA64ISAR0_EL1.SHA3 >= 0b0001``` | | 150 | `FEAT_AES` | aes | ```ID_AA64ISAR0_EL1.AES >= 0b0001``` | | 160 | `FEAT_PMULL` | pmull | ```ID_AA64ISAR0_EL1.AES >= 0b0010``` | From 72e3673017f275572c3b2cb12e31c0e7602906cf Mon Sep 17 00:00:00 2001 From: Victor Campos Date: Thu, 12 Sep 2024 09:56:06 +0100 Subject: [PATCH 11/36] Update GitHub Action versions (#348) Action 'upload-artifact' version 2 has been deprecated, hence the need to update its version. To piggyback on this work, this patch updates all GitHub Action versions to their latest release. 
--- .github/workflows/ci.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d955239b..ca97c667 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -10,10 +10,10 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4.1.7 - name: check the correctness of the sources and generate the PDFs run: ./build_with_docker.sh - - uses: actions/upload-artifact@v2 + - uses: actions/upload-artifact@v4.4.0 with: name: pdfs path: pdfs @@ -21,14 +21,14 @@ jobs: build-github-pages: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4.1.7 - name: generate the GitHub Pages locally in order to check for errors run: ./tools/build-github-pages.sh build markdown-link-check: runs-on: ubuntu-latest steps: - - uses: actions/checkout@master + - uses: actions/checkout@4.1.7 - uses: gaurav-nelson/github-action-markdown-link-check@v1 with: config-file: '.github/workflows/markdown-link-check.json' @@ -37,7 +37,7 @@ jobs: runs-on: ubuntu-latest if: github.base_ref == 'main' || github.ref == 'refs/heads/main' steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v4.1.7 with: fetch-depth: 0 - name: Check correctness of draftversion fields From fb3e19d2025ddf94d9c5160a0dae70ba9c7bc183 Mon Sep 17 00:00:00 2001 From: Victor Campos Date: Thu, 12 Sep 2024 09:58:01 +0100 Subject: [PATCH 12/36] Fix typo in ci.yml workflow --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index ca97c667..a784ef39 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -28,7 +28,7 @@ jobs: markdown-link-check: runs-on: ubuntu-latest steps: - - uses: actions/checkout@4.1.7 + - uses: actions/checkout@v4.1.7 - uses: gaurav-nelson/github-action-markdown-link-check@v1 with: config-file: 
'.github/workflows/markdown-link-check.json' From 264f4cd1538eb863bdec9cf51eb62c8a33d0cce5 Mon Sep 17 00:00:00 2001 From: SpencerAbson Date: Thu, 12 Sep 2024 10:06:21 +0100 Subject: [PATCH 13/36] Fix range of immediate argument in vst2q_lane_f64 (#343) * Rectify range of immediate argument in vst2q_lane_f64 The range of 'lane' vstq_lane_f64 is documented as 0 <= lane <= 2. However, the 64-bit variant of ST2 specifies that this immediate has a range of 0 <= lane <= 1. - ST2: https://developer.arm.com/documentation/dui0801/l/A64-SIMD-Vector-Instructions/ST2--vector--single-structure---A64- --- neon_intrinsics/advsimd.md | 3 ++- neon_intrinsics/advsimd.template.md | 1 + tools/intrinsic_db/advsimd.csv | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/neon_intrinsics/advsimd.md b/neon_intrinsics/advsimd.md index ee89c92b..c8056afa 100644 --- a/neon_intrinsics/advsimd.md +++ b/neon_intrinsics/advsimd.md @@ -149,6 +149,7 @@ for more information about Arm’s trademarks. ### Changes for next release * Textual improvements (non-functional changes). +* Fixed the range of the ``lane`` immediate argument for ``vst2q_lane_f64``. -[![All Contributors](https://img.shields.io/badge/all_contributors-35-orange.svg?style=flat-square)](#contributors-) +[![All Contributors](https://img.shields.io/badge/all_contributors-36-orange.svg?style=flat-square)](#contributors-) ![Continuous Integration](https://github.com/ARM-software/acle/actions/workflows/ci.yml/badge.svg) @@ -132,6 +132,9 @@ Thanks goes to these wonderful people ([emoji key](https://allcontributors.org/d Alexandros Lamprineas
Alexandros Lamprineas

💻 Lukacma
Lukacma

📖 + + Robert Dazi
Robert Dazi

🖋 + From a89792669ed39409bcf0e685881637a1e65b1f93 Mon Sep 17 00:00:00 2001 From: Alexandros Lamprineas Date: Wed, 6 Nov 2024 09:42:21 +0000 Subject: [PATCH 27/36] [FMV][AArch64] Remove features which expose non exploitable runtime behavior. (#355) Feature `rpres` allows an increase in the precision of the single-precision floating-point reciprocal estimate and reciprocal square root estimate from an 8-bit mantissa to a 12-bit mantissa depending on the value FPCR.AH of the Floating-point Control Register. Similarly, `ebf16` allows existing floating-point instructions (BFDOT, BFMMLA, BFMOPA, BFMOPS, and BFVDOT) to change numeric behaviour depending on the value FPCR.EBF of the Floating-point Control Register. Feature `memtag3` allows Tag Check Faults to change behaviour depending on the value SCTLR_ELx.{TCF, TCF0} of the System Control Register. The runtime detection in FMV does not examine the content of control registers, therefore runtime dispatch cannot be based on that. One may argue there is value in altering the control register from a version, for example "msr dit, \#1" if the feature is available on hardware. The registers FPCR and SCTLR_ELx can be accessed in the absence of rpres, ebf16, and memtag3, making it hard to justify adding them to the compiler. --- main/acle.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/main/acle.md b/main/acle.md index 175c72d9..7dceef50 100644 --- a/main/acle.md +++ b/main/acle.md @@ -419,6 +419,7 @@ Armv8.4-A [[ARMARMv84]](#ARMARMv84). Support is added for the Dot Product intrin * Unified Function Multi Versioning features memtag and memtag2. * Unified Function Multi Versioning features aes and pmull. * Unified Function Multi Versioning features sve2-aes and sve2-pmull128. +* Removed Function Multi Versioning features ebf16, memtag3, and rpres. * Fixed range of operand `o0` (too small) in AArch64 system register designations. * Fixed SVE2.1 quadword gather load/scatter store intrinsics. 
@@ -2803,8 +2804,6 @@ The following table lists the architectures feature mapping for AArch64 | 260 | `FEAT_DGH` | dgh | ```ID_AA64ISAR1_EL1.DGH >= 0b0001``` | | 270 | `FEAT_I8MM` | i8mm | ```ID_AA64ISAR1_EL1.I8MM >= 0b0001``` | | 280 | `FEAT_BF16` | bf16 | ```ID_AA64ISAR1_EL1.BF16 >= 0b0001``` | - | 290 | `FEAT_EBF16` | ebf16 | ```ID_AA64ISAR1_EL1.BF16 >= 0b0010``` | - | 300 | `FEAT_RPRES` | rpres | ```ID_AA64ISAR2_EL1.RPRES >= 0b0001``` | | 310 | `FEAT_SVE` | sve | ```ID_AA64PFR0_EL1.SVE >= 0b0001``` | | 320 | `FEAT_BF16` | sve-bf16 | ```ID_AA64ZFR0_EL1.BF16 >= 0b0001``` | | 330 | `FEAT_EBF16` | sve-ebf16 | ```ID_AA64ZFR0_EL1.BF16 >= 0b0010``` | @@ -2818,7 +2817,6 @@ The following table lists the architectures feature mapping for AArch64 | 420 | `FEAT_SM3`, `FEAT_SVE_SM4` | sve2-sm4 | ```ID_AA64ZFR0_EL1.SM4 >= 0b0001``` | | 430 | `FEAT_SME` | sme | ```ID_AA64PFR1_EL1.SME >= 0b0001``` | | 440 | `FEAT_MTE`, `FEAT_MTE2` | memtag | ```ID_AA64PFR1_EL1.MTE >= 0b0010``` | - | 460 | `FEAT_MTE3` | memtag3 | ```ID_AA64PFR1_EL1.MTE >= 0b0011``` | | 470 | `FEAT_SB` | sb | ```ID_AA64ISAR1_EL1.SB >= 0b0001``` | | 480 | `FEAT_SPECRES` | predres | ```ID_AA64ISAR1_EL1.SPECRES >= 0b0001``` | | 490 | `FEAT_SSBS`, `FEAT_SSBS2`| ssbs | ```ID_AA64PFR1_EL1.SSBS >= 0b0010``` | From 36736eac822a58c1cff644bfb7f8d29e61c49eb5 Mon Sep 17 00:00:00 2001 From: Claudio Bantaloukas Date: Wed, 6 Nov 2024 16:57:56 +0000 Subject: [PATCH 28/36] Remove unnecessary argument from svcvtnb_mf8[_f32_x2]_fpm (#360) The FCVTNB instruction overwrites all bits of the destination register, thus passing an initial register argument is not required. --- main/acle.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/main/acle.md b/main/acle.md index 7dceef50..ea6caee1 100644 --- a/main/acle.md +++ b/main/acle.md @@ -422,6 +422,7 @@ Armv8.4-A [[ARMARMv84]](#ARMARMv84). Support is added for the Dot Product intrin * Removed Function Multi Versioning features ebf16, memtag3, and rpres. 
* Fixed range of operand `o0` (too small) in AArch64 system register designations. * Fixed SVE2.1 quadword gather load/scatter store intrinsics. +* Removed unnecessary Zd argument from `svcvtnb_mf8[_f32_x2]_fpm`. ### References @@ -13098,7 +13099,7 @@ floating-point. Single-precision convert, narrow, and interleave to 8-bit floating-point (top and bottom). ``` c svmfloat8_t svcvtnt_mf8[_f32_x2]_fpm(svmfloat8_t zd, svfloat32x2_t zn, fpm_t fpm); - svmfloat8_t svcvtnb_mf8[_f32_x2]_fpm(svmfloat8_t zd, svfloat32x2_t zn, fpm_t fpm); + svmfloat8_t svcvtnb_mf8[_f32_x2]_fpm(svfloat32x2_t zn, fpm_t fpm); ``` #### FDOT (4-way, vectors) From d6f218b0c6a355930952917e04dc18c4ad60387f Mon Sep 17 00:00:00 2001 From: Claudio Bantaloukas Date: Mon, 25 Nov 2024 12:06:47 +0000 Subject: [PATCH 29/36] Update url to target attribute documentation (#366) The link was pointing to an unmaintained documentation page that happened to be indexed by search engines in preference to the actual documentation. --- name: Update url to target attribute documentation about: Technical issues, document format problems, bugs in scripts or feature proposal. --- **Thank you for submitting a pull request!** If this PR is about a bugfix: Please use the bugfix label and make sure to go through the checklist below. If this PR is about a proposal: We are looking forward to evaluate your proposal, and if possible to make it part of the Arm C Language Extension (ACLE) specifications. We would like to encourage you reading through the [contribution guidelines](https://github.com/ARM-software/acle/blob/main/CONTRIBUTING.md), in particular the section on [submitting a proposal](https://github.com/ARM-software/acle/blob/main/CONTRIBUTING.md#proposals-for-new-content). Please use the proposal label. As for any pull request, please make sure to go through the below checklist. 
Checklist: (mark with ``X`` those which apply) * [ ] If an issue reporting the bug exists, I have mentioned it in the PR (do not bother creating the issue if all you want to do is fixing the bug yourself). * [ ] I have added/updated the `SPDX-FileCopyrightText` lines on top of any file I have edited. Format is `SPDX-FileCopyrightText: Copyright {year} {entity or name} <{contact information}>` (Please update existing copyright lines if applicable. You can specify year ranges with hyphen, as in `2017-2019`, and use commas to separate gaps, as in `2018-2020, 2022`). * [ ] I have updated the `Copyright` section of the sources of the specification I have edited (this will show up in the text rendered in the PDF and other output format supported). The format is the same described in the previous item. * [x] I have run the CI scripts (if applicable, as they might be tricky to set up on non-*nix machines). The sequence can be found in the [contribution guidelines](https://github.com/ARM-software/acle/blob/main/CONTRIBUTING.md#continuous-integration). Don't worry if you cannot run these scripts on your machine, your patch will be automatically checked in the Actions of the pull request. * [x] I have added an item that describes the changes I have introduced in this PR in the section **Changes for next release** of the section **Change Control**/**Document history** of the document. Create **Changes for next release** if it does not exist. Notice that changes that are not modifying the content and rendering of the specifications (both HTML and PDF) do not need to be listed. * [x] When modifying content and/or its rendering, I have checked the correctness of the result in the PDF output (please refer to the instructions on [how to build the PDFs locally](https://github.com/ARM-software/acle/blob/main/CONTRIBUTING.md#continuous-integration)). * [x] The variable `draftversion` is set to `true` in the YAML header of the sources of the specifications I have modified. 
* [ ] Please *DO NOT* add my GitHub profile to the list of contributors in the [README](https://github.com/ARM-software/acle/blob/main/README.md#contributors-) page of the project. --- main/acle.md | 3 ++- main/design_documents/function-multi-versioning.md | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/main/acle.md b/main/acle.md index ea6caee1..b038a5ee 100644 --- a/main/acle.md +++ b/main/acle.md @@ -423,6 +423,7 @@ Armv8.4-A [[ARMARMv84]](#ARMARMv84). Support is added for the Dot Product intrin * Fixed range of operand `o0` (too small) in AArch64 system register designations. * Fixed SVE2.1 quadword gather load/scatter store intrinsics. * Removed unnecessary Zd argument from `svcvtnb_mf8[_f32_x2]_fpm`. +* Fixed urls. ### References @@ -2584,7 +2585,7 @@ be found in [[BA]](#BA). This section describes ACLE features that use GNU-style attributes. The general rules for attribute syntax are described in the GCC -documentation . +documentation . Briefly, for this declaration: ``` c diff --git a/main/design_documents/function-multi-versioning.md b/main/design_documents/function-multi-versioning.md index 57283458..73bf08f5 100644 --- a/main/design_documents/function-multi-versioning.md +++ b/main/design_documents/function-multi-versioning.md @@ -25,7 +25,7 @@ derived from a function via FMV: 2. the derived function obey to the same calling convention of the original function. -Currently the `target` [attribute for aarch64](https://gcc.gnu.org/onlinedocs/gcc/extensions-to-the-c-language-family/declaring-attributes-of-functions/aarch64-function-attributes.html) +Currently the `target` [attribute for aarch64](https://gcc.gnu.org/onlinedocs/gcc/AArch64-Function-Attributes.html) is used for many purposes, some of which might overlap the functionality introduced by FMV. To avoid confusion, we named the attributes used by FMV with `target_version` and `target_clones`. 
From f6190ce920cc3e8937bc143bb4608a9f17480110 Mon Sep 17 00:00:00 2001 From: Kerry McLaughlin Date: Tue, 26 Nov 2024 09:37:51 +0000 Subject: [PATCH 30/36] Include SME attributes in the name mangling of types (#358) This change extends the name mangling of types to include the SME streaming and ZA interface. This will avoid naming conflicts which can currently arise such as in the following example: ``` void foo(void (*f)()) { f(); } void foo(void (*f)() __arm_streaming) { f(); } ``` Without this change, both functions 'foo' above will mangle to the same name, despite the function pointers being different. --- main/acle.md | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/main/acle.md b/main/acle.md index b038a5ee..1182f7bb 100644 --- a/main/acle.md +++ b/main/acle.md @@ -424,6 +424,7 @@ Armv8.4-A [[ARMARMv84]](#ARMARMv84). Support is added for the Dot Product intrin * Fixed SVE2.1 quadword gather load/scatter store intrinsics. * Removed unnecessary Zd argument from `svcvtnb_mf8[_f32_x2]_fpm`. * Fixed urls. +* Changed name mangling of function types to include SME attributes. ### References @@ -10094,6 +10095,65 @@ an [`__arm_streaming`](#arm_streaming) type. See [Changing streaming mode locally](#changing-streaming-mode-locally) for more information. +### C++ mangling of SME keywords + +SME keyword attributes which apply to function types must be included +in the name mangling of the type, if the mangling would normally include +the return type of the function. + +SME attributes are mangled in the same way as a template: + +``` c + template __SME_ATTRS; +``` + +with the arguments: + +``` c + __SME_ATTRS; +``` + +where: + +* normal_function_type is the function type without any SME attributes. + +* sme_state is an unsigned 64-bit integer representing the streaming and ZA + properties of the function's interface. 
+ +The bits are defined as follows: + +| **Bits** | **Value** | **Interface Type** | +| -------- | --------- | ------------------------------ | +| 0 | 0b1 | __arm_streaming | +| 1 | 0b1 | __arm_streaming_compatible | +| 2 | 0b1 | __arm_agnostic("sme_za_state") | +| 3-5 | 0b000 | No ZA state (default) | +| | 0b001 | __arm_in("za") | +| | 0b010 | __arm_out("za") | +| | 0b011 | __arm_inout("za") | +| | 0b100 | __arm_preserves("za") | +| 6-8 | 0b000 | No ZT0 state (default) | +| | 0b001 | __arm_in("zt0") | +| | 0b010 | __arm_out("zt0") | +| | 0b011 | __arm_inout("zt0") | +| | 0b100 | __arm_preserves("zt0") | + +Bits 9-63 are defined to be zero by this revision of the ACLE and are reserved +for future type attributes. + +For example: + +``` c + // Mangled as _Z1fP11__SME_ATTRSIFu10__SVInt8_tvELj1EE + void f(svint8_t (*fn)() __arm_streaming) { fn(); } + + // Mangled as _Z1fP11__SME_ATTRSIFu10__SVInt8_tvELj26EE + void f(svint8_t (*fn)() __arm_streaming_compatible __arm_inout("za")) { fn(); } + + // Mangled as _Z1fP11__SME_ATTRSIFu10__SVInt8_tvELj128EE + void f(svint8_t (*fn)() __arm_out("zt0")) { fn(); } +``` + ## SME types ### Predicate-as-counter From e9cb1e495995aa9eaadf08a8923ce2fc73fc315b Mon Sep 17 00:00:00 2001 From: SpencerAbson Date: Fri, 29 Nov 2024 16:51:38 +0000 Subject: [PATCH 31/36] Change __ARM_NEON_SVE_BRIDGE to refer to the availability of the header (#362) **Afterthought**: Another way of looking at this is that the user should not expect to be able to use intrinsics after specifying the relevant target features via anything other than the command line, it's unclear to me if this is the case. The ACLE suggests the use of the predefined `__ARM_NEON_SVE_BRIDGE` macro to gaurd the inclusion of `arm_neon_sve_bridge.h`. > defines intrinsics for moving data between Neon and SVE vector types; see [NEON-SVE Bridge](https://github.com/ARM-software/acle/blob/main/main/acle.md#neon-sve-bridge) for details. 
Before including the header, you should test the __ARM_NEON_SVE_BRIDGE macro. The current definition of this macro is >`__ARM_NEON_SVE_BRIDGE` is defined to 1 if [NEON-SVE Bridge](#neon-sve-bridge) intrinsics are available. This implies that the following macros are nonzero > - __ARM_NEON > - __ARM_NEON_FP > - __ARM_FEATURE_SVE The intrinsics described here are not preprocessor guarded (See [change for LLVM]( https://reviews.llvm.org/D132639)). We should expect to be able to use them in any function with the necessary features, whether they are supplied globally on the command line or via a `target` attribute. However, since we cannot make assumptions about the order in which the predefined feature macros are evaluated (see [relevant ACLE](https://github.com/ARM-software/acle/blob/main/main/acle.md#predefined-feature-macros-and-header-file)), we cannot use the `__ARM_NEON_SVE_BRIDGE` macro to guard the inclusion of `arm_neon_sve_bridge.h` **and** expect to use its builtins unless the required features are supplied globally on the command line. See an example of this issue (in LLVM vs. GCC) from @georges-arm - https://godbolt.org/z/6YPvqdjTv. The proposal of this PR is to change the meaning of `__ARM_NEON_SVE_BRIDGE` to refer to the availability of the `arm_neon_sve_bridge.h` header file only, such that it can be unconditionally defined in supporting compilers and its builtins can be safely used in the context of the example above. --- main/acle.md | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/main/acle.md b/main/acle.md index 1182f7bb..6589011c 100644 --- a/main/acle.md +++ b/main/acle.md @@ -241,7 +241,7 @@ Armv8.4-A [[ARMARMv84]](#ARMARMv84). Support is added for the Dot Product intrin specifications in [Cortex-M Security Extension (CMSE)](#cortex-m-security-extension-cmse). * Added specification for [NEON-SVE Bridge](#neon-sve-bridge) and - [NEON-SVE Bridge macros](#neon-sve-bridge-macros). 
+ [NEON-SVE Bridge macros](#neon-sve-bridge-macro). * Added feature detection macro for the memcpy family of memory operations (MOPS) at [memcpy family of memory operations standarization instructions - @@ -425,6 +425,9 @@ Armv8.4-A [[ARMARMv84]](#ARMARMv84). Support is added for the Dot Product intrin * Removed unnecessary Zd argument from `svcvtnb_mf8[_f32_x2]_fpm`. * Fixed urls. * Changed name mangling of function types to include SME attributes. +* Changed `__ARM_NEON_SVE_BRIDGE` to refer to the availability of the + [`arm_neon_sve_bridge.h`](#arm_neon_sve_bridge.h) header file, rather + than the [NEON-SVE bridge](#neon-sve-bridge) intrinsics. ### References @@ -1928,14 +1931,10 @@ are available. This implies that `__ARM_FEATURE_SVE` is nonzero. are available and if the associated [ACLE features] (#sme-language-extensions-and-intrinsics) are supported. -#### NEON-SVE Bridge macros +#### NEON-SVE Bridge macro -`__ARM_NEON_SVE_BRIDGE` is defined to 1 if [NEON-SVE Bridge](#neon-sve-bridge) -intrinsics are available. This implies that the following macros are nonzero: - -* `__ARM_NEON` -* `__ARM_NEON_FP` -* `__ARM_FEATURE_SVE` +`__ARM_NEON_SVE_BRIDGE` is defined to 1 if the [``](#arm_neon_sve_bridge.h) +header file is available. #### Scalable Matrix Extension (SME) @@ -2570,7 +2569,7 @@ be found in [[BA]](#BA). 
| [`__ARM_FP_FENV_ROUNDING`](#floating-point-model) | Rounding is configurable at runtime | 1 | | [`__ARM_NEON`](#advanced-simd-architecture-extension-neon) | Advanced SIMD (Neon) extension | 1 | | [`__ARM_NEON_FP`](#neon-floating-point) | Advanced SIMD (Neon) floating-point | 0x04 | -| [`__ARM_NEON_SVE_BRIDGE`](#neon-sve-bridge-macros) | Moving data between Neon and SVE data types | 1 | +| [`__ARM_NEON_SVE_BRIDGE`](#neon-sve-bridge-macro) | Availability of [`arm_neon_sve_bridge.h`](#arm_neon_sve_bridge.h) | 1 | | [`__ARM_PCS`](#procedure-call-standard) | Arm procedure call standard (32-bit-only) | 0x01 | | [`__ARM_PCS_AAPCS64`](#procedure-call-standard) | Arm PCS for AArch64. | 1 | | [`__ARM_PCS_VFP`](#procedure-call-standard) | Arm PCS hardware FP variant in use (32-bit-only) | 1 | From 6d6b40b6cf341b31a87fd04be724ebf04b496a1d Mon Sep 17 00:00:00 2001 From: "allcontributors[bot]" <46447321+allcontributors[bot]@users.noreply.github.com> Date: Fri, 29 Nov 2024 16:54:19 +0000 Subject: [PATCH 32/36] docs: add SpencerAbson as a contributor for content (#367) Adds @SpencerAbson as a contributor for content. 
This was requested by vhscampos [in this comment](https://github.com/ARM-software/acle/pull/362#issuecomment-2508148895) [skip ci] --------- Co-authored-by: allcontributors[bot] <46447321+allcontributors[bot]@users.noreply.github.com> --- .all-contributorsrc | 9 +++++++++ README.md | 3 ++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/.all-contributorsrc b/.all-contributorsrc index 657fbc50..560cd3db 100644 --- a/.all-contributorsrc +++ b/.all-contributorsrc @@ -342,6 +342,15 @@ "contributions": [ "content" ] + }, + { + "login": "SpencerAbson", + "name": "SpencerAbson", + "avatar_url": "https://avatars.githubusercontent.com/u/76910239?v=4", + "profile": "https://github.com/SpencerAbson", + "contributions": [ + "content" + ] } ], "contributorsPerLine": 7, diff --git a/README.md b/README.md index 9d95d9f3..a31c1ae7 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ -[![All Contributors](https://img.shields.io/badge/all_contributors-36-orange.svg?style=flat-square)](#contributors-) +[![All Contributors](https://img.shields.io/badge/all_contributors-37-orange.svg?style=flat-square)](#contributors-) ![Continuous Integration](https://github.com/ARM-software/acle/actions/workflows/ci.yml/badge.svg) @@ -134,6 +134,7 @@ Thanks goes to these wonderful people ([emoji key](https://allcontributors.org/d Robert Dazi
Robert Dazi

🖋 + SpencerAbson
SpencerAbson

🖋 From 33a0cb30f67291862497547662f7c62e6b52e93f Mon Sep 17 00:00:00 2001 From: Alexandros Lamprineas Date: Wed, 4 Dec 2024 09:04:14 +0000 Subject: [PATCH 33/36] [FMV][AArch64] Remove feature dgh since it can be used unconditionally. (#357) The DGH instruction belongs to the hint space. It executes as NOP if the corresponding feature is not present in hardware, so there's no need for runtime dispatch. --- main/acle.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main/acle.md b/main/acle.md index 6589011c..244067ca 100644 --- a/main/acle.md +++ b/main/acle.md @@ -420,6 +420,7 @@ Armv8.4-A [[ARMARMv84]](#ARMARMv84). Support is added for the Dot Product intrin * Unified Function Multi Versioning features aes and pmull. * Unified Function Multi Versioning features sve2-aes and sve2-pmull128. * Removed Function Multi Versioning features ebf16, memtag3, and rpres. +* Removed Function Multi Versioning feature dgh. * Fixed range of operand `o0` (too small) in AArch64 system register designations. * Fixed SVE2.1 quadword gather load/scatter store intrinsics. * Removed unnecessary Zd argument from `svcvtnb_mf8[_f32_x2]_fpm`. 
@@ -2803,7 +2804,6 @@ The following table lists the architectures feature mapping for AArch64 | 240 | `FEAT_LRCPC2` | rcpc2 | ```ID_AA64ISAR1_EL1.LRCPC >= 0b0010``` | | 241 | `FEAT_LRCPC3` | rcpc3 | ```ID_AA64ISAR1_EL1.LRCPC >= 0b0011``` | | 250 | `FEAT_FRINTTS` | frintts | ```ID_AA64ISAR1_EL1.FRINTTS >= 0b0001``` | - | 260 | `FEAT_DGH` | dgh | ```ID_AA64ISAR1_EL1.DGH >= 0b0001``` | | 270 | `FEAT_I8MM` | i8mm | ```ID_AA64ISAR1_EL1.I8MM >= 0b0001``` | | 280 | `FEAT_BF16` | bf16 | ```ID_AA64ISAR1_EL1.BF16 >= 0b0001``` | | 310 | `FEAT_SVE` | sve | ```ID_AA64PFR0_EL1.SVE >= 0b0001``` | From 11ce13e67e58c918fb0ce5b3b1c74dc1adf97388 Mon Sep 17 00:00:00 2001 From: Alexandros Lamprineas Date: Fri, 6 Dec 2024 09:49:42 +0000 Subject: [PATCH 34/36] [FMV] Remove features which can be expressed as a combination of other features (#353) All of sve-bf16, sve-ebf16, and sve-i8mm are obsolete. This is already reflected on the second column of the FMV table (we have bf16, ebf16, and i8mm with the same Architecture name). According to https://developer.arm.com/documentation/ddi0487/latest Arm Architecture Reference Manual for A-profile architecture: D23.2.72 ID_AA64ISAR1_EL1, AArch64 Instruction Set Attribute Register 1 ID_AA64ISAR1_EL1.I8MM, bits [55:52] > When Advanced SIMD and SVE are both implemented, this field must return > the same value as ID_AA64ZFR0_EL1.I8MM ID_AA64ISAR1_EL1.BF16, bits [47:44] > When FEAT_SVE or FEAT_SME is implemented, this field must return the > same value as ID_AA64ZFR0_EL1.BF16. So one could write target_version("sve+bf16") or sme+bf16 instead. There is a proposal to explicitly document FMV feature dependencies in ACLE, so that the user won't have to write long feature strings on the attributes like sve+simd+i8mm (sve+i8mm should be enough). 
--- main/acle.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/main/acle.md b/main/acle.md index 244067ca..232b8ba1 100644 --- a/main/acle.md +++ b/main/acle.md @@ -419,6 +419,7 @@ Armv8.4-A [[ARMARMv84]](#ARMARMv84). Support is added for the Dot Product intrin * Unified Function Multi Versioning features memtag and memtag2. * Unified Function Multi Versioning features aes and pmull. * Unified Function Multi Versioning features sve2-aes and sve2-pmull128. +* Removed Function Multi Versioning features sve-bf16, sve-ebf16, and sve-i8mm. * Removed Function Multi Versioning features ebf16, memtag3, and rpres. * Removed Function Multi Versioning feature dgh. * Fixed range of operand `o0` (too small) in AArch64 system register designations. @@ -2807,9 +2808,6 @@ The following table lists the architectures feature mapping for AArch64 | 270 | `FEAT_I8MM` | i8mm | ```ID_AA64ISAR1_EL1.I8MM >= 0b0001``` | | 280 | `FEAT_BF16` | bf16 | ```ID_AA64ISAR1_EL1.BF16 >= 0b0001``` | | 310 | `FEAT_SVE` | sve | ```ID_AA64PFR0_EL1.SVE >= 0b0001``` | - | 320 | `FEAT_BF16` | sve-bf16 | ```ID_AA64ZFR0_EL1.BF16 >= 0b0001``` | - | 330 | `FEAT_EBF16` | sve-ebf16 | ```ID_AA64ZFR0_EL1.BF16 >= 0b0010``` | - | 340 | `FEAT_I8MM` | sve-i8mm | ```ID_AA64ZFR0_EL1.I8MM >= 0b00001``` | | 350 | `FEAT_F32MM` | f32mm | ```ID_AA64ZFR0_EL1.F32MM >= 0b00001``` | | 360 | `FEAT_F64MM` | f64mm | ```ID_AA64ZFR0_EL1.F64MM >= 0b00001``` | | 370 | `FEAT_SVE2` | sve2 | ```ID_AA64ZFR0_EL1.SVEver >= 0b0001``` | From 73c35a3d26d929244910338ae88db778640a8a30 Mon Sep 17 00:00:00 2001 From: Alexandros Lamprineas Date: Thu, 12 Dec 2024 15:53:14 +0000 Subject: [PATCH 35/36] [FMV] Document feature dependencies and detect at selection. 
(#368) --- main/acle.md | 50 +++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 45 insertions(+), 5 deletions(-) diff --git a/main/acle.md b/main/acle.md index 232b8ba1..79ad91c6 100644 --- a/main/acle.md +++ b/main/acle.md @@ -422,6 +422,7 @@ Armv8.4-A [[ARMARMv84]](#ARMARMv84). Support is added for the Dot Product intrin * Removed Function Multi Versioning features sve-bf16, sve-ebf16, and sve-i8mm. * Removed Function Multi Versioning features ebf16, memtag3, and rpres. * Removed Function Multi Versioning feature dgh. +* Document Function Multi Versioning feature dependencies. * Fixed range of operand `o0` (too small) in AArch64 system register designations. * Fixed SVE2.1 quadword gather load/scatter store intrinsics. * Removed unnecessary Zd argument from `svcvtnb_mf8[_f32_x2]_fpm`. @@ -2675,8 +2676,6 @@ The following attributes trigger the multi version code generation: * The `default` version means the version of the function that would be generated without these attributes. * `name` is the dependent features from the tables below. - * If a feature depends on another feature as defined by the Architecture - Reference Manual then no need to explicitly state in the attribute[^fmv-note-names]. * The dependent features could be joined by the `+` sign. * None of these attributes will enable the corresponding ACLE feature(s) associated to the `name` expressed in the attribute. @@ -2686,9 +2685,6 @@ The following attributes trigger the multi version code generation: * FMV may be disabled in compile time by a compiler flag. In this case the `default` version shall be used. -[^fmv-note-names]: For example the `sve_bf16` feature depends on `sve` - but it is enough to say `target_version("sve_bf16")` in the code. 
- The attribute `__attribute__((target_version("name")))` expresses the following: @@ -2828,6 +2824,50 @@ The following table lists the architectures feature mapping for AArch64 | 580 | `FEAT_SME2` | sme2 | ```ID_AA64PFR1_EL1.SMEver >= 0b0001``` | | 650 | `FEAT_MOPS` | mops | ```ID_AA64ISAR2_EL1.MOPS >= 0b0001``` | +### Dependencies + +If a feature depends on another feature as defined by the table below then: + +* the depended-on feature *need not* be specified in the attribute, +* the depended-on feature *may* be specified in the attribute. + +These dependencies are taken into account transitively when selecting the +most appropriate version of a function (see section [Selection](#selection)). +The following table lists the feature dependencies for AArch64. + + | **Feature** | **Depends on** | + | ---------------- | ----------------- | + | flagm2 | flagm | + | simd | fp | + | dotprod | simd | + | sm4 | simd | + | rdm | simd | + | sha2 | simd | + | sha3 | sha2 | + | aes | simd | + | fp16 | fp | + | fp16fml | simd, fp16 | + | dpb2 | dpb | + | jscvt | fp | + | fcma | simd | + | rcpc2 | rcpc | + | rcpc3 | rcpc2 | + | frintts | fp | + | i8mm | simd | + | bf16 | simd | + | sve | fp16 | + | f32mm | sve | + | f64mm | sve | + | sve2 | sve | + | sve2-aes | sve2, aes | + | sve2-bitperm | sve2 | + | sve2-sha3 | sve2, sha3 | + | sve2-sm4 | sve2, sm4 | + | sme | fp16, bf16 | + | sme-f64f64 | sme | + | sme-i16i64 | sme | + | sme2 | sme | + ### Selection The following rules shall be followed by all implementations: From ff7467b9f1dae7e3cd38463b3377c5e27d31dd01 Mon Sep 17 00:00:00 2001 From: rsandifo-arm Date: Wed, 18 Dec 2024 15:51:05 +0000 Subject: [PATCH 36/36] Some tweaks to the SVE2p1 load and store intrinsics (#359) The pre-SVE2p1 gather and scatter intrinsics allow vector displacements (offsets or indices) to be either signed or unsigned. svld1q and svst1q instead required them to be unsigned. This patch adds signed versions too, for consistency. 
Also, the SVE2p1 stores were specified to take pointers to const, but they ought to be pointers to non-const instead. --- main/acle.md | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/main/acle.md b/main/acle.md index 79ad91c6..3e434b5c 100644 --- a/main/acle.md +++ b/main/acle.md @@ -431,6 +431,7 @@ Armv8.4-A [[ARMARMv84]](#ARMARMv84). Support is added for the Dot Product intrin * Changed `__ARM_NEON_SVE_BRIDGE` to refer to the availability of the [`arm_neon_sve_bridge.h`](#arm_neon_sve_bridge.h) header file, rather than the [NEON-SVE bridge](#neon-sve-bridge) intrinsics. +* Removed extraneous `const` from SVE2.1 store intrinsics. ### References @@ -9221,11 +9222,13 @@ Gather Load Quadword. // _mf8, _bf16, _f16, _f32, _f64 svint8_t svld1q_gather[_u64base]_s8(svbool_t pg, svuint64_t zn); svint8_t svld1q_gather[_u64base]_offset_s8(svbool_t pg, svuint64_t zn, int64_t offset); + svint8_t svld1q_gather_[s64]offset[_s8](svbool_t pg, const int8_t *base, svint64_t offset); svint8_t svld1q_gather_[u64]offset[_s8](svbool_t pg, const int8_t *base, svuint64_t offset); // Variants are also available for: // _u16, _u32, _s32, _u64, _s64 // _bf16, _f16, _f32, _f64 + svint16_t svld1q_gather_[s64]index[_s16](svbool_t pg, const int16_t *base, svint64_t index); svint16_t svld1q_gather_[u64]index[_s16](svbool_t pg, const int16_t *base, svuint64_t index); svint16_t svld1q_gather[_u64base]_index_s16(svbool_t pg, svuint64_t zn, int64_t index); ``` @@ -9295,14 +9298,14 @@ Contiguous store of single vector operand, truncating from quadword. 
``` c // Variants are also available for: // _u32, _s32 - void svst1wq[_f32](svbool_t, const float32_t *ptr, svfloat32_t data); - void svst1wq_vnum[_f32](svbool_t, const float32_t *ptr, int64_t vnum, svfloat32_t data); + void svst1wq[_f32](svbool_t, float32_t *ptr, svfloat32_t data); + void svst1wq_vnum[_f32](svbool_t, float32_t *ptr, int64_t vnum, svfloat32_t data); // Variants are also available for: // _u64, _s64 - void svst1dq[_f64](svbool_t, const float64_t *ptr, svfloat64_t data); - void svst1dq_vnum[_f64](svbool_t, const float64_t *ptr, int64_t vnum, svfloat64_t data); + void svst1dq[_f64](svbool_t, float64_t *ptr, svfloat64_t data); + void svst1dq_vnum[_f64](svbool_t, float64_t *ptr, int64_t vnum, svfloat64_t data); ``` #### ST1Q @@ -9315,12 +9318,14 @@ Scatter store quadwords. // _mf8, _bf16, _f16, _f32, _f64 void svst1q_scatter[_u64base][_s8](svbool_t pg, svuint64_t zn, svint8_t data); void svst1q_scatter[_u64base]_offset[_s8](svbool_t pg, svuint64_t zn, int64_t offset, svint8_t data); - void svst1q_scatter_[u64]offset[_s8](svbool_t pg, const uint8_t *base, svuint64_t offset, svint8_t data); + void svst1q_scatter_[s64]offset[_s8](svbool_t pg, uint8_t *base, svint64_t offset, svint8_t data); + void svst1q_scatter_[u64]offset[_s8](svbool_t pg, uint8_t *base, svuint64_t offset, svint8_t data); // Variants are also available for: // _u16, _u32, _s32, _u64, _s64 // _bf16, _f16, _f32, _f64 - void svst1q_scatter_[u64]index[_s16](svbool_t pg, const int16_t *base, svuint64_t index, svint16_t data); + void svst1q_scatter_[s64]index[_s16](svbool_t pg, int16_t *base, svint64_t index, svint16_t data); + void svst1q_scatter_[u64]index[_s16](svbool_t pg, int16_t *base, svuint64_t index, svint16_t data); void svst1q_scatter[_u64base]_index[_s16](svbool_t pg, svuint64_t zn, int64_t index, svint16_t data); ```