Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Extend SME2.1 intrinsics to mf8 #375

Merged
merged 3 commits into from
Jan 15, 2025
Merged
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
103 changes: 62 additions & 41 deletions main/acle.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ toc: true
---

<!--
SPDX-FileCopyrightText: Copyright 2011-2024 Arm Limited and/or its affiliates <[email protected]>
SPDX-FileCopyrightText: Copyright 2011-2025 Arm Limited and/or its affiliates <[email protected]>
SPDX-FileCopyrightText: Copyright 2022 Google LLC.
CC-BY-SA-4.0 AND Apache-Patent-License
See LICENSE.md file for details
Expand Down Expand Up @@ -435,6 +435,7 @@ Armv8.4-A [[ARMARMv84]](#ARMARMv84). Support is added for the Dot Product intrin
* Added [`__arm_agnostic`](#arm_agnostic) keyword attribute.
* Refined function versioning scope and signature rules to use the default
version scope and signature.
* Added mf8 variants of SME 2.1 intrinsics.

### References

Expand Down Expand Up @@ -12156,85 +12157,97 @@ Lookup table read with 2-bit and 4-bit indexes
Move multi-vectors to/from ZA

``` c
// Variants are also available for _za8_u8, _za8_mf8, _za16_s16, _za16_u16,
// _za16_f16, _za16_bf16, _za32_s32, _za32_u32, _za32_f32,
// Variants are also available for _za8_u8, _za8_mf8,
// _za16_s16, _za16_u16, _za16_f16, _za16_bf16,
// _za32_s32, _za32_u32, _za32_f32,
// _za64_s64, _za64_u64 and _za64_f64
svint8x2_t svread_hor_za8_s8_vg2(uint64_t tile, uint32_t slice)
__arm_streaming __arm_in("za");


// Variants are also available for _za8_u8, _za8_mf8, _za16_s16, _za16_u16,
// _za16_f16, _za16_bf16, _za32_s32, _za32_u32, _za32_f32,
// Variants are also available for _za8_u8, _za8_mf8,
// _za16_s16, _za16_u16, _za16_f16, _za16_bf16,
// _za32_s32, _za32_u32, _za32_f32,
// _za64_s64, _za64_u64 and _za64_f64
svint8x4_t svread_hor_za8_s8_vg4(uint64_t tile, uint32_t slice)
__arm_streaming __arm_in("za");


// Variants are also available for _za8_u8, _za8_mf8, _za16_s16, _za16_u16,
// _za16_f16, _za16_bf16, _za32_s32, _za32_u32, _za32_f32,
// Variants are also available for _za8_u8, _za8_mf8,
// _za16_s16, _za16_u16, _za16_f16, _za16_bf16,
// _za32_s32, _za32_u32, _za32_f32,
// _za64_s64, _za64_u64 and _za64_f64
svint8x2_t svread_ver_za8_s8_vg2(uint64_t tile, uint32_t slice)
__arm_streaming __arm_in("za");


// Variants are also available for _za8_u8, _za8_mf8, _za16_s16, _za16_u16,
// _za16_f16, _za16_bf16, _za32_s32, _za32_u32, _za32_f32,
// Variants are also available for _za8_u8, _za8_mf8,
// _za16_s16, _za16_u16, _za16_f16, _za16_bf16,
// _za32_s32, _za32_u32, _za32_f32,
// _za64_s64, _za64_u64 and _za64_f64
svint8x4_t svread_ver_za8_s8_vg4(uint64_t tile, uint32_t slice)
__arm_streaming __arm_in("za");


// Variants are also available for _za8_u8, _za8_mf8, _za16_s16, _za16_u16,
// _za16_f16, _za16_bf16, _za32_s32, _za32_u32, _za32_f32,
// Variants are also available for _za8_u8, _za8_mf8,
// _za16_s16, _za16_u16, _za16_f16, _za16_bf16,
// _za32_s32, _za32_u32, _za32_f32,
// _za64_s64, _za64_u64 and _za64_f64
svint8x2_t svread_za8_s8_vg1x2(uint32_t slice)
__arm_streaming __arm_in("za");


// Variants are also available for _za8_u8, _za8_mf8, _za16_s16, _za16_u16,
// _za16_f16, _za16_bf16, _za32_s32, _za32_u32, _za32_f32,
// Variants are also available for _za8_u8, _za8_mf8,
// _za16_s16, _za16_u16, _za16_f16, _za16_bf16,
// _za32_s32, _za32_u32, _za32_f32,
// _za64_s64, _za64_u64 and _za64_f64
svint8x4_t svread_za8_s8_vg1x4(uint32_t slice)
__arm_streaming __arm_in("za");


// Variants are also available for _za8[_u8], _za8[_mf8], _za16[_s16], _za16[_u16],
// _za16[_f16], _za16[_bf16], _za32[_s32], _za32[_u32], _za32[_f32],
// Variants are also available for _za8[_u8], _za8[_mf8],
// _za16[_s16], _za16[_u16], _za16[_f16], _za16[_bf16],
// _za32[_s32], _za32[_u32], _za32[_f32],
// _za64[_s64], _za64[_u64] and _za64[_f64]
void svwrite_hor_za8[_s8]_vg2(uint64_t tile, uint32_t slice, svint8x2_t zn)
__arm_streaming __arm_inout("za");


// Variants are also available for _za8[_u8], _za8[_mf8], _za16[_s16], _za16[_u16],
// _za16[_f16], _za16[_bf16], _za32[_s32], _za32[_u32], _za32[_f32],
// Variants are also available for _za8[_u8], _za8[_mf8],
// _za16[_s16], _za16[_u16], _za16[_f16], _za16[_bf16],
// _za32[_s32], _za32[_u32], _za32[_f32],
// _za64[_s64], _za64[_u64] and _za64[_f64]
void svwrite_hor_za8[_s8]_vg4(uint64_t tile, uint32_t slice, svint8x4_t zn)
__arm_streaming __arm_inout("za");


// Variants are also available for _za8[_u8], _za8[_mf8], _za16[_s16], _za16[_u16],
// _za16[_f16], _za16[_bf16], _za32[_s32], _za32[_u32], _za32[_f32],
// Variants are also available for _za8[_u8], _za8[_mf8],
// _za16[_s16], _za16[_u16], _za16[_f16], _za16[_bf16],
// _za32[_s32], _za32[_u32], _za32[_f32],
// _za64[_s64], _za64[_u64] and _za64[_f64]
void svwrite_ver_za8[_s8]_vg2(uint64_t tile, uint32_t slice, svint8x2_t zn)
__arm_streaming __arm_inout("za");


// Variants are also available for _za8[_u8], _za8[_mf8], _za16[_s16], _za16[_u16],
// _za16[_f16], _za16[_bf16], _za32[_s32], _za32[_u32], _za32[_f32],
// Variants are also available for _za8[_u8], _za8[_mf8],
// _za16[_s16], _za16[_u16], _za16[_f16], _za16[_bf16],
// _za32[_s32], _za32[_u32], _za32[_f32],
// _za64[_s64], _za64[_u64] and _za64[_f64]
void svwrite_ver_za8[_s8]_vg4(uint64_t tile, uint32_t slice, svint8x4_t zn)
__arm_streaming __arm_inout("za");


// Variants are also available for _za8[_u8], _za8[_mf8], _za16[_s16], _za16[_u16],
// _za16[_f16], _za16[_bf16], _za32[_s32], _za32[_u32], _za32[_f32],
// Variants are also available for _za8[_u8], _za8[_mf8],
// _za16[_s16], _za16[_u16], _za16[_f16], _za16[_bf16],
// _za32[_s32], _za32[_u32], _za32[_f32],
// _za64[_s64], _za64[_u64] and _za64[_f64]
void svwrite_za8[_s8]_vg1x2(uint32_t slice, svint8x2_t zn)
__arm_streaming __arm_inout("za");


// Variants are also available for _za8[_u8], za8[_mf8], _za16[_s16], _za16[_u16],
// _za16[_f16], _za16[_bf16], _za32[_s32], _za32[_u32], _za32[_f32],
// Variants are also available for _za8[_u8], _za8[_mf8],
// _za16[_s16], _za16[_u16], _za16[_f16], _za16[_bf16],
// _za32[_s32], _za32[_u32], _za32[_f32],
// _za64[_s64], _za64[_u64] and _za64[_f64]
void svwrite_za8[_s8]_vg1x4(uint32_t slice, svint8x4_t zn)
__arm_streaming __arm_inout("za");
Expand Down Expand Up @@ -12509,7 +12522,7 @@ The intrinsics in this section are defined by the header file
Move and zero ZA tile slice to vector register.

``` c
// And similarly for u8.
// And similarly for u8 and mf8.
svint8_t svreadz_hor_za8_s8(uint64_t tile, uint32_t slice)
__arm_streaming __arm_inout("za");

Expand All @@ -12525,11 +12538,12 @@ Move and zero ZA tile slice to vector register.
svint64_t svreadz_hor_za64_s64(uint64_t tile, uint32_t slice)
__arm_streaming __arm_inout("za");

// And similarly for s16, s32, s64, u8, u16, u32, u64, bf16, f16, f32, f64
// And similarly for s16, s32, s64, u8, u16, u32, u64,
// mf8, bf16, f16, f32, f64
svint8_t svreadz_hor_za128_s8(uint64_t tile, uint32_t slice)
__arm_streaming __arm_inout("za");

// And similarly for u8.
// And similarly for u8 and mf8.
svint8_t svreadz_ver_za8_s8(uint64_t tile, uint32_t slice)
__arm_streaming __arm_inout("za");

Expand All @@ -12545,7 +12559,8 @@ Move and zero ZA tile slice to vector register.
svint64_t svreadz_ver_za64_s64(uint64_t tile, uint32_t slice)
__arm_streaming __arm_inout("za");

// And similarly for s16, s32, s64, u8, u16, u32, u64, bf16, f16, f32, f64
// And similarly for s16, s32, s64, u8, u16, u32, u64,
// mf8, bf16, f16, f32, f64
svint8_t svreadz_ver_za128_s8(uint64_t tile, uint32_t slice)
__arm_streaming __arm_inout("za");
```
Expand All @@ -12555,29 +12570,33 @@ Move and zero ZA tile slice to vector register.
Move and zero multiple ZA tile slices to vector registers

``` c
// Variants are also available for _za8_u8, _za16_s16, _za16_u16,
// _za16_f16, _za16_bf16, _za32_s32, _za32_u32, _za32_f32,
// Variants are also available for _za8_u8, _za8_mf8,
// _za16_s16, _za16_u16, _za16_f16, _za16_bf16,
// _za32_s32, _za32_u32, _za32_f32,
// _za64_s64, _za64_u64 and _za64_f64
svint8x2_t svreadz_hor_za8_s8_vg2(uint64_t tile, uint32_t slice)
__arm_streaming __arm_inout("za");


// Variants are also available for _za8_u8, _za16_s16, _za16_u16,
// _za16_f16, _za16_bf16, _za32_s32, _za32_u32, _za32_f32,
// Variants are also available for _za8_u8, _za8_mf8,
// _za16_s16, _za16_u16, _za16_f16, _za16_bf16,
// _za32_s32, _za32_u32, _za32_f32,
// _za64_s64, _za64_u64 and _za64_f64
svint8x4_t svreadz_hor_za8_s8_vg4(uint64_t tile, uint32_t slice)
__arm_streaming __arm_inout("za");


// Variants are also available for _za8_u8, _za16_s16, _za16_u16,
// _za16_f16, _za16_bf16, _za32_s32, _za32_u32, _za32_f32,
// Variants are also available for _za8_u8, _za8_mf8,
// _za16_s16, _za16_u16, _za16_f16, _za16_bf16,
// _za32_s32, _za32_u32, _za32_f32,
// _za64_s64, _za64_u64 and _za64_f64
svint8x2_t svreadz_ver_za8_s8_vg2(uint64_t tile, uint32_t slice)
__arm_streaming __arm_inout("za");


// Variants are also available for _za8_u8, _za16_s16, _za16_u16,
// _za16_f16, _za16_bf16, _za32_s32, _za32_u32, _za32_f32,
// Variants are also available for _za8_u8, _za8_mf8,
// _za16_s16, _za16_u16, _za16_f16, _za16_bf16,
// _za32_s32, _za32_u32, _za32_f32,
// _za64_s64, _za64_u64 and _za64_f64
svint8x4_t svreadz_ver_za8_s8_vg4(uint64_t tile, uint32_t slice)
__arm_streaming __arm_inout("za");
Expand All @@ -12588,15 +12607,17 @@ Move and zero multiple ZA tile slices to vector registers
Move and zero multiple ZA single-vector groups to vector registers

``` c
// Variants are also available for _za8_u8, _za16_s16, _za16_u16,
// _za16_f16, _za16_bf16, _za32_s32, _za32_u32, _za32_f32,
// Variants are also available for _za8_u8, _za8_mf8,
// _za16_s16, _za16_u16, _za16_f16, _za16_bf16,
// _za32_s32, _za32_u32, _za32_f32,
// _za64_s64, _za64_u64 and _za64_f64
svint8x2_t svreadz_za8_s8_vg1x2(uint32_t slice)
__arm_streaming __arm_inout("za");


// Variants are also available for _za8_u8, _za16_s16, _za16_u16,
// _za16_f16, _za16_bf16, _za32_s32, _za32_u32, _za32_f32,
// Variants are also available for _za8_u8, _za8_mf8,
// _za16_s16, _za16_u16, _za16_f16, _za16_bf16,
// _za32_s32, _za32_u32, _za32_f32,
// _za64_s64, _za64_u64 and _za64_f64
svint8x4_t svreadz_za8_s8_vg1x4(uint32_t slice)
__arm_streaming __arm_inout("za");
Expand Down
Loading