Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 24 additions & 24 deletions src/integrated-matrix.adoc
Original file line number Diff line number Diff line change
Expand Up @@ -1323,8 +1323,8 @@ This permits hardware implementations (including outer-product engines and regis
| A, column-major | `vmttl.v` (transposing) | `vmtts.v`
| B, column-major | `vmtl.v` (order-preserving) | —
| B, row-major | `vmttl.v` (transposing) | —
| C, row-major | `vmtl.v` (if loaded from memory) | `vmts.v`
| C, column-major | `vmttl.v` (transposing) | `vmtts.v`
|===

==== Example: single-precision floating-point GEMM
Expand All @@ -1333,11 +1333,11 @@ The following pseudocode implements C ← C + A × B for single-precision
floating-point data, tiled over all three matrix dimensions.
The configuration uses SEW=32, LMUL=1, and λ=4, giving K_eff = 4 and
M_tile = VLEN/128.
A and C are stored row-major; B is stored column-major.

[source,c]
--
// C (M×N, col-major, stride ldc) += A (M×K, row-major, stride lda)
// C (M×N, row-major, stride ldc) += A (M×K, row-major, stride lda)
// × B (K×N, col-major, stride ldb)
// Pre-condition: vtype configured for SEW=32, LMUL=1, lambda=4
// Pre-condition: K is a multiple of K_eff (= lambda × LMUL = 4)
Expand Down Expand Up @@ -1372,9 +1372,9 @@ for (j = 0; j < N; j += N_tile) {
vfmmacc.vv v8, v0, v4; // C[i..][j..] += A_tile × B_tile
}

// Store C[i:i+M_tile][j:j+N_tile]: M_tile rows, N_tile cols, col-major.
// Store C[i:i+M_tile][j:j+N_tile]: M_tile rows, N_tile cols, row-major.
vsetvl(vl);
vmts.v v8, &C[j*ldc + i], ldc; // LD = ldc (C col stride = M_total)
vmts.v v8, &C[i*ldc + j], ldc; // LD = ldc (C row stride = N_total)
}
}
--
Expand Down Expand Up @@ -1842,11 +1842,11 @@ function mat_B_idx(k : int, j : int, K_eff : int,
LMUL : int, lambda : int, epr : int) -> int =
tile_reg_idx(j * K_eff + k, LMUL, lambda, epr)

// C[i, j] in vd: C is stored row-major, so element (i, j) has flat offset
// i * N + j, where the row stride N is the number of columns of C.
// (Using M, the number of rows, as the stride is only correct when M == N.)
// MUL_C = VLEN / (SEW * lambda^2) is the number of registers in the C group.
// The flat offset is passed to tile_reg_idx together with MUL_C, lambda and
// epr to obtain the element index within vd.
function mat_C_idx(i : int, j : int, N : int,
                   MUL_C : int, lambda : int, epr : int) -> int =
  tile_reg_idx(i * N + j, MUL_C, lambda, epr)

// fp_format_of(EEW : int, alt : bit) -> fp_fmt
// Decode (element width in bits, altfmt flag) to the concrete FP format as specified
Expand Down Expand Up @@ -2164,7 +2164,7 @@ function int_gemm(g : gemm_geom, signed_A : bool, signed_B : bool,
vs1 : regidx, vs2 : regidx, vd : regidx) -> unit = {
foreach (j from 0 to (g.N - 1)) {
foreach (i from 0 to (g.M - 1)) {
let c_flat : int = mat_C_idx(i, j, g.M, g.MUL_C, g.lambda, g.epr_C);
let c_flat : int = mat_C_idx(i, j, g.N, g.MUL_C, g.lambda, g.epr_C);
let c_bits : bits(g.EEW_C) = read_single_element(g.EEW_C, c_flat, vd);
var acc : int = signed(c_bits);
acc = acc + int_block_dot(i, j, 0, g.K_eff - 1, g,
Expand All @@ -2188,7 +2188,7 @@ function fp_gemm(g : gemm_geom,

foreach (j from 0 to (g.N - 1)) {
foreach (i from 0 to (g.M - 1)) {
let c_flat : int = mat_C_idx(i, j, g.M, g.MUL_C, g.lambda, g.epr_C);
let c_flat : int = mat_C_idx(i, j, g.N, g.MUL_C, g.lambda, g.epr_C);
var acc : bits(g.EEW_C) = read_single_element(g.EEW_C, c_flat, vd);

// The semantics for LMUL > 1 are defined as repeated application
Expand Down Expand Up @@ -2238,7 +2238,7 @@ function fp_scaled_gemm(g : gemm_geom,

foreach (j from 0 to (g.N - 1)) {
foreach (i from 0 to (g.M - 1)) {
let c_flat : int = mat_C_idx(i, j, g.M, g.MUL_C, g.lambda, g.epr_C);
let c_flat : int = mat_C_idx(i, j, g.N, g.MUL_C, g.lambda, g.epr_C);
var acc : bits(g.EEW_C) = read_single_element(g.EEW_C, c_flat, vd);
var nan_out : bool = false;

Expand Down Expand Up @@ -2316,7 +2316,7 @@ function int_scaled_gemm(g : gemm_geom,

foreach (j from 0 to (g.N - 1)) {
foreach (i from 0 to (g.M - 1)) {
let c_flat : int = mat_C_idx(i, j, g.M, g.MUL_C, g.lambda, g.epr_C);
let c_flat : int = mat_C_idx(i, j, g.N, g.MUL_C, g.lambda, g.epr_C);
var acc : bits(g.EEW_C) = read_single_element(g.EEW_C, c_flat, vd);
var nan_out : bool = false;

Expand Down Expand Up @@ -2371,7 +2371,7 @@ Computes the matrix-matrix product T = vs1 × vs2 and accumulates it into _vd_:
with a widening factor of 8 applied to the K dimension.
+
_vs1_ is interpreted as an M×K tile (row-major register layout), _vs2_ as a K×N tile
(column-major register layout), and _vd_ as an M×N accumulator tile (row-major register
layout).
_vs1_ and _vs2_ have elements of width SEW÷8; _vd_ has elements of width SEW.
+
Expand Down Expand Up @@ -2453,7 +2453,7 @@ Computes the floating-point matrix-matrix product T = vs1 × vs2 and accumulates
with a widening factor of 8 applied to the K dimension.
+
_vs1_ is interpreted as an M×K tile (row-major register layout), _vs2_ as a K×N tile
(column-major register layout), and _vd_ as an M×N accumulator tile (row-major register
layout).
_vs1_ and _vs2_ have elements of width SEW÷8; _vd_ has elements of width SEW.
+
Expand Down Expand Up @@ -2550,7 +2550,7 @@ Description::
Computes the floating-point matrix-matrix product T = vs1 × vs2 and accumulates it into _vd_: C ← C + T.
+
_vs1_ is interpreted as an M×K tile (row-major register layout), _vs2_ as a K×N tile
(column-major register layout), and _vd_ as an M×N accumulator tile (row-major register
layout). All three tiles have elements of width SEW.
+
K_effective = λ × LMUL.
Expand Down Expand Up @@ -2635,7 +2635,7 @@ Computes the floating-point matrix-matrix product T = vs1 × vs2 and accumulates
with a widening factor of 4 applied to the K dimension.
+
_vs1_ is interpreted as an M×K tile (row-major register layout), _vs2_ as a K×N tile
(column-major register layout), and _vd_ as an M×N accumulator tile (row-major register
layout).
_vs1_ and _vs2_ have elements of width SEW÷4; _vd_ has elements of width SEW.
+
Expand Down Expand Up @@ -2736,7 +2736,7 @@ Computes the floating-point matrix-matrix product T = vs1 × vs2 and accumulates
with a widening factor of 2 applied to the K dimension.
+
_vs1_ is interpreted as an M×K tile (row-major register layout), _vs2_ as a K×N tile
(column-major register layout), and _vd_ as an M×N accumulator tile (row-major register
layout).
_vs1_ and _vs2_ have elements of width SEW÷2; _vd_ has elements of width SEW.
+
Expand Down Expand Up @@ -2841,7 +2841,7 @@ scaled result into the floating-point accumulator _vd_: C ← C + scale_A × sca
The widening factor is 2 applied to the K dimension.
+
_vs1_ is interpreted as an M×K tile (row-major register layout), _vs2_ as a K×N tile
(column-major register layout), and _vd_ as an M×N FP accumulator tile (row-major
register layout).
_vs1_ and _vs2_ have integer elements of width SEW÷2; _vd_ has FP elements of width SEW.
+
Expand Down Expand Up @@ -2947,7 +2947,7 @@ scaled result into the floating-point accumulator _vd_: C ← C + scale_A × sca
The widening factor is 8 applied to the K dimension.
+
_vs1_ is interpreted as an M×K tile (row-major register layout), _vs2_ as a K×N tile
(column-major register layout), and _vd_ as an M×N FP accumulator tile (row-major
register layout).
_vs1_ and _vs2_ have integer elements of width SEW÷8; _vd_ has FP elements of width SEW.
+
Expand Down Expand Up @@ -3048,7 +3048,7 @@ scaled result into the floating-point accumulator _vd_: C ← C + scale_A × sca
The widening factor is 4 applied to the K dimension.
+
_vs1_ is interpreted as an M×K tile (row-major register layout), _vs2_ as a K×N tile
(column-major register layout), and _vd_ as an M×N FP accumulator tile (row-major
register layout).
_vs1_ and _vs2_ have integer elements of width SEW÷4; _vd_ has FP elements of width SEW.
+
Expand Down Expand Up @@ -3148,7 +3148,7 @@ Description::
Computes the matrix-matrix product T = vs1 × vs2 and accumulates it into _vd_: C ← C + T.
+
_vs1_ is interpreted as an M×K tile (row-major register layout), _vs2_ as a K×N tile
(column-major register layout), and _vd_ as an M×N accumulator tile (row-major register
layout). All three tiles have elements of width SEW.
+
K_effective = λ × LMUL.
Expand Down Expand Up @@ -3629,7 +3629,7 @@ Computes the matrix-matrix product T = vs1 × vs2 and accumulates it into _vd_:
with a widening factor of 4 applied to the K dimension.
+
_vs1_ is interpreted as an M×K tile (row-major register layout), _vs2_ as a K×N tile
(column-major register layout), and _vd_ as an M×N accumulator tile (row-major register
layout).
_vs1_ and _vs2_ have elements of width SEW÷4; _vd_ has elements of width SEW.
+
Expand Down Expand Up @@ -3705,7 +3705,7 @@ Computes the matrix-matrix product T = vs1 × vs2 and accumulates it into _vd_:
with a widening factor of 2 applied to the K dimension.
+
_vs1_ is interpreted as an M×K tile (row-major register layout), _vs2_ as a K×N tile
(column-major register layout), and _vd_ as an M×N accumulator tile (row-major register
layout).
_vs1_ and _vs2_ have elements of width SEW÷2; _vd_ has elements of width SEW.
+
Expand Down
Loading