diff --git a/src/integrated-matrix.adoc b/src/integrated-matrix.adoc index cd5409df0..64640dd0e 100644 --- a/src/integrated-matrix.adoc +++ b/src/integrated-matrix.adoc @@ -1323,8 +1323,8 @@ This permits hardware implementations (including outer-product engines and regis | A, column-major | `vmttl.v` (transposing) | `vmtts.v` | B, column-major | `vmtl.v` (order-preserving) | — | B, row-major | `vmttl.v` (transposing) | — -| C, column-major | `vmtl.v` (if loaded from memory) | `vmts.v` -| C, row-major | `vmttl.v` (transposing) | `vmtts.v` +| C, row-major | `vmtl.v` (if loaded from memory) | `vmts.v` +| C, column-major | `vmttl.v` (transposing) | `vmtts.v` |=== ==== Example: single-precision floating-point GEMM @@ -1333,11 +1333,11 @@ The following pseudocode implements C ← C + A × B for single-precision floating-point data, tiled over all three matrix dimensions. The configuration uses SEW=32, LMUL=1, and λ=4, giving K_eff = 4 and M_tile = VLEN/128. -A is stored row-major; B and C are stored column-major. +A and C are stored row-major; B is stored column-major. [source,c] -- -// C (M×N, col-major, stride ldc) += A (M×K, row-major, stride lda) +// C (M×N, row-major, stride ldc) += A (M×K, row-major, stride lda) // × B (K×N, col-major, stride ldb) // Pre-condition: vtype configured for SEW=32, LMUL=1, lambda=4 // Pre-condition: K is a multiple of K_eff (= lambda × LMUL = 4) @@ -1372,9 +1372,9 @@ for (j = 0; j < N; j += N_tile) { vfmmacc.vv v8, v0, v4; // C[i..][j..] += A_tile × B_tile } - // Store C[i:i+M_tile][j:j+N_tile]: M_tile rows, N_tile cols, col-major. + // Store C[i:i+M_tile][j:j+N_tile]: M_tile rows, N_tile cols, row-major. vsetvl(vl); - vmts.v v8, &C[j*ldc + i], ldc; // LD = ldc (C col stride = M_total) + vmts.v v8, &C[i*ldc + j], ldc; // LD = ldc (C row stride = N_total) } } -- @@ -1842,11 +1842,11 @@ function mat_B_idx(k : int, j : int, K_eff : int, LMUL : int, lambda : int, epr : int) -> int = tile_reg_idx(j * K_eff + k, LMUL, lambda, epr) -// C[i, j] in vd: C is stored row-major. +// C[i, j] in vd: C is stored row-major; the row stride is N (= columns of C). // MUL_C = VLEN / (SEW * lambda^2) is the number of registers in the C group. -function mat_C_idx(i : int, j : int, M : int, +function mat_C_idx(i : int, j : int, N : int, MUL_C : int, lambda : int, epr : int) -> int = - tile_reg_idx(i * M + j, MUL_C, lambda, epr) + tile_reg_idx(i * N + j, MUL_C, lambda, epr) // fp_format_of(EEW : int, alt : bit) -> fp_fmt // Decode (element width in bits, altfmt flag) to the concrete FP format as specified @@ -2164,7 +2164,7 @@ function int_gemm(g : gemm_geom, signed_A : bool, signed_B : bool, vs1 : regidx, vs2 : regidx, vd : regidx) -> unit = { foreach (j from 0 to (g.N - 1)) { foreach (i from 0 to (g.M - 1)) { - let c_flat : int = mat_C_idx(i, j, g.M, g.MUL_C, g.lambda, g.epr_C); + let c_flat : int = mat_C_idx(i, j, g.N, g.MUL_C, g.lambda, g.epr_C); let c_bits : bits(g.EEW_C) = read_single_element(g.EEW_C, c_flat, vd); var acc : int = signed(c_bits); acc = acc + int_block_dot(i, j, 0, g.K_eff - 1, g, @@ -2188,7 +2188,7 @@ function fp_gemm(g : gemm_geom, foreach (j from 0 to (g.N - 1)) { foreach (i from 0 to (g.M - 1)) { - let c_flat : int = mat_C_idx(i, j, g.M, g.MUL_C, g.lambda, g.epr_C); + let c_flat : int = mat_C_idx(i, j, g.N, g.MUL_C, g.lambda, g.epr_C); var acc : bits(g.EEW_C) = read_single_element(g.EEW_C, c_flat, vd); // The semantics for LMUL > 1 are defined as repeated application @@ -2238,7 +2238,7 @@ function fp_scaled_gemm(g : gemm_geom, foreach (j from 0 to (g.N - 1)) { foreach (i from 0 to (g.M - 1)) { - let c_flat : int = mat_C_idx(i, j, g.M, g.MUL_C, g.lambda, g.epr_C); + let c_flat : int = mat_C_idx(i, j, g.N, g.MUL_C, g.lambda, g.epr_C); var acc : bits(g.EEW_C) = read_single_element(g.EEW_C, c_flat, vd); var nan_out : bool = false; @@ -2316,7 +2316,7 @@ function int_scaled_gemm(g : gemm_geom, foreach (j from 0 to (g.N - 1)) { foreach (i from 0 to (g.M - 1)) { - let c_flat : int = mat_C_idx(i, j, g.M, g.MUL_C, g.lambda, g.epr_C); + let c_flat : int = mat_C_idx(i, j, g.N, g.MUL_C, g.lambda, g.epr_C); var acc : bits(g.EEW_C) = read_single_element(g.EEW_C, c_flat, vd); var nan_out : bool = false; @@ -2371,7 +2371,7 @@ Computes the matrix-matrix product T = vs1 × vs2 and accumulates it into _vd_: with a widening factor of 8 applied to the K dimension. + _vs1_ is interpreted as an M×K tile (row-major register layout), _vs2_ as a K×N tile -(column-major register layout), and _vd_ as an M×N accumulator tile (column-major register +(column-major register layout), and _vd_ as an M×N accumulator tile (row-major register layout). _vs1_ and _vs2_ have elements of width SEW÷8; _vd_ has elements of width SEW. + @@ -2453,7 +2453,7 @@ Computes the floating-point matrix-matrix product T = vs1 × vs2 and accumulates with a widening factor of 8 applied to the K dimension. + _vs1_ is interpreted as an M×K tile (row-major register layout), _vs2_ as a K×N tile -(column-major register layout), and _vd_ as an M×N accumulator tile (column-major register +(column-major register layout), and _vd_ as an M×N accumulator tile (row-major register layout). _vs1_ and _vs2_ have elements of width SEW÷8; _vd_ has elements of width SEW. + @@ -2550,7 +2550,7 @@ Description:: Computes the floating-point matrix-matrix product T = vs1 × vs2 and accumulates it into _vd_: C ← C + T. + _vs1_ is interpreted as an M×K tile (row-major register layout), _vs2_ as a K×N tile -(column-major register layout), and _vd_ as an M×N accumulator tile (column-major register +(column-major register layout), and _vd_ as an M×N accumulator tile (row-major register layout). All three tiles have elements of width SEW. + K_effective = λ × LMUL. @@ -2635,7 +2635,7 @@ Computes the floating-point matrix-matrix product T = vs1 × vs2 and accumulates with a widening factor of 4 applied to the K dimension. + _vs1_ is interpreted as an M×K tile (row-major register layout), _vs2_ as a K×N tile -(column-major register layout), and _vd_ as an M×N accumulator tile (column-major register +(column-major register layout), and _vd_ as an M×N accumulator tile (row-major register layout). _vs1_ and _vs2_ have elements of width SEW÷4; _vd_ has elements of width SEW. + @@ -2736,7 +2736,7 @@ Computes the floating-point matrix-matrix product T = vs1 × vs2 and accumulates with a widening factor of 2 applied to the K dimension. + _vs1_ is interpreted as an M×K tile (row-major register layout), _vs2_ as a K×N tile -(column-major register layout), and _vd_ as an M×N accumulator tile (column-major register +(column-major register layout), and _vd_ as an M×N accumulator tile (row-major register layout). _vs1_ and _vs2_ have elements of width SEW÷2; _vd_ has elements of width SEW. + @@ -2841,7 +2841,7 @@ scaled result into the floating-point accumulator _vd_: C ← C + scale_A × sca The widening factor is 2 applied to the K dimension. + _vs1_ is interpreted as an M×K tile (row-major register layout), _vs2_ as a K×N tile -(column-major register layout), and _vd_ as an M×N FP accumulator tile (column-major +(column-major register layout), and _vd_ as an M×N FP accumulator tile (row-major register layout). _vs1_ and _vs2_ have integer elements of width SEW÷2; _vd_ has FP elements of width SEW. + @@ -2947,7 +2947,7 @@ scaled result into the floating-point accumulator _vd_: C ← C + scale_A × sca The widening factor is 8 applied to the K dimension. + _vs1_ is interpreted as an M×K tile (row-major register layout), _vs2_ as a K×N tile -(column-major register layout), and _vd_ as an M×N FP accumulator tile (column-major +(column-major register layout), and _vd_ as an M×N FP accumulator tile (row-major register layout). _vs1_ and _vs2_ have integer elements of width SEW÷8; _vd_ has FP elements of width SEW. + @@ -3048,7 +3048,7 @@ scaled result into the floating-point accumulator _vd_: C ← C + scale_A × sca The widening factor is 4 applied to the K dimension. + _vs1_ is interpreted as an M×K tile (row-major register layout), _vs2_ as a K×N tile -(column-major register layout), and _vd_ as an M×N FP accumulator tile (column-major +(column-major register layout), and _vd_ as an M×N FP accumulator tile (row-major register layout). _vs1_ and _vs2_ have integer elements of width SEW÷4; _vd_ has FP elements of width SEW. + @@ -3148,7 +3148,7 @@ Description:: Computes the matrix-matrix product T = vs1 × vs2 and accumulates it into _vd_: C ← C + T. + _vs1_ is interpreted as an M×K tile (row-major register layout), _vs2_ as a K×N tile -(column-major register layout), and _vd_ as an M×N accumulator tile (column-major register +(column-major register layout), and _vd_ as an M×N accumulator tile (row-major register layout). All three tiles have elements of width SEW. + K_effective = λ × LMUL. @@ -3629,7 +3629,7 @@ Computes the matrix-matrix product T = vs1 × vs2 and accumulates it into _vd_: with a widening factor of 4 applied to the K dimension. + _vs1_ is interpreted as an M×K tile (row-major register layout), _vs2_ as a K×N tile -(column-major register layout), and _vd_ as an M×N accumulator tile (column-major register +(column-major register layout), and _vd_ as an M×N accumulator tile (row-major register layout). _vs1_ and _vs2_ have elements of width SEW÷4; _vd_ has elements of width SEW. + @@ -3705,7 +3705,7 @@ Computes the matrix-matrix product T = vs1 × vs2 and accumulates it into _vd_: with a widening factor of 2 applied to the K dimension. + _vs1_ is interpreted as an M×K tile (row-major register layout), _vs2_ as a K×N tile -(column-major register layout), and _vd_ as an M×N accumulator tile (column-major register +(column-major register layout), and _vd_ as an M×N accumulator tile (row-major register layout). _vs1_ and _vs2_ have elements of width SEW÷2; _vd_ has elements of width SEW. +