diff --git a/Makefile b/Makefile index a230762..fc4f05f 100644 --- a/Makefile +++ b/Makefile @@ -43,7 +43,7 @@ push-image-arm64: docker push fangruil/chisel-dev:arm64-${VER} docs: - pip3 install markdown-wavedrom mkdocs mkdocs-material python-markdown-math + pip3 install markdown-wavedrom mkdocs mkdocs-material python-markdown-math mkdocs-mermaid2-plugin mkdocs serve clean: diff --git a/README.md b/README.md index dcaa0d5..9efc23d 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,8 @@ [![Documentation Status](https://readthedocs.org/projects/chisel-opennpu/badge/?version=latest)](https://chisel-opennpu.readthedocs.io/en/latest/?badge=latest) +Docs: https://chisel-opennpu.readthedocs.io + This is a chisel workbench designed for someone who like docker containers and vscode dev container plugin. DEVELOP IN PROGRESS. COMMERCIAL USE IS NOT ALLOWED. diff --git a/docs/images/neural_core.png b/docs/images/neural_core.png new file mode 100644 index 0000000..5e3c40b Binary files /dev/null and b/docs/images/neural_core.png differ diff --git a/docs/implementations/NeuralCore.md b/docs/implementations/NeuralCore.md new file mode 100644 index 0000000..ee7adfc --- /dev/null +++ b/docs/implementations/NeuralCore.md @@ -0,0 +1,48 @@ +# Neural Core + +[Systolic Arrays](SystolicArray.md) are high-throughput, high-latency computing architectures. They can be very efficient if we control them with care. + +To support more general operations for linear algebra, we need to split the computing logic from the addressing and controlling logic. So the architecture should look like below: + +
+ +
+ +The overall architecture of the proposed Neural Core will look like a multi-layered 3D grid. If you look along the z-axis, you will find them forming a pipeline. + +```mermaid +graph LR + subgraph I[Scratch Pad Memory] + C[SPM inbound] + F[SPM outbound] + end + + G[DMA] + + subgraph Neural Core + A[CU] + B[i-MMU] + D[PE] + E[o-MMU] + end + + + B --> |addr| C + C --> |data| B + A --> |ctrl| B + A --> |i-base-addr| B + B --> |data| D + A --> |ctrl| D + D --> |data| E + A --> |ctrl| E + E --> |addr| F + E --> |data| F + A --> |o-base-addr| E + I <--> G +``` + +Above is the pipeline of a Neural Unit (NU), which is an element of the processing element pipeline in the Neural Core. They can be organized as systolic arrays or parallelized thread cores. + +This flexible architecture is managed by the MMU, where all the data flow is controlled. To reduce the number of running transistors, we fused the systolic design with a parallelism design. All $\mu$-CU and i-$\mu$MMU will have a stair-like scheduling characteristic. Though this design choice may lead to high latency, I think it is still quite efficient: It preserves high throughput with a fair amount of registers and arithmetic units. Of course you can have a multiplexed control set to manage this grid, but that will have more overhead. For example, you need a large piece of logic to implement the parallelism and another one to avoid bubbles in Neural Units. + +A Neural Processing Unit (NPU) can have multiple Neural Cores (NCore). Each Neural Core has a 2-dimensional grid of Neural Units (NU). Each Neural Unit has its own micro-CU ($\mu$-CU), micro-MMU for both input and output (i-$\mu$MMU/o-$\mu$MMU) and [processing element (PE)](ProcessingElement.md). Having large registers that hold the matrix is impossible. So the design follows other NPU designs, using a Scratch Pad Memory to store input and output data. Each $\mu$MMU is directly connected to SPM to obtain instant access to the data.
\ No newline at end of file diff --git a/docs/implementations/ProcessingElement.md b/docs/implementations/ProcessingElement.md index 2a4e277..cabc09a 100644 --- a/docs/implementations/ProcessingElement.md +++ b/docs/implementations/ProcessingElement.md @@ -1,16 +1,16 @@ # Processing Element ```ascii_art - ACCUM TOP + ACCUM IN_B \ | \ | \___v___ | Proc | - LEFT ---->| Elem |-----> RIGHT - ¯¯¯|¯¯¯\ - | \ - v \ - BOTTOM OUT (to TLB mapped memory) + IN_A ---->| Elem | + ¯¯¯|¯¯¯ + | + v + OUT (to TLB mapped memory) ``` Processing Element is the fundamental element in systolic array. This is a basic implementation of a 2D PE for 2D systolic array or DSP grid. @@ -22,11 +22,9 @@ PE component will only accumulate the result if the `ACCUM` is high. This is eff wavedrom ( { signal: [ { name: "clk", wave:"P......", period: 4 }, - { name: "top_in", wave: "x====xx", data:["top_1", "top_2", "top_3", "top_4"], period: 4}, - { name: "left_in", wave: "x====xx", data:["left_1", "left_2", "left_3", "left_4"], period: 4}, + { name: "in_a", wave: "x====xx", data:["a_1", "a_2", "a_3", "a_4"], period: 4}, + { name: "in_b", wave: "x====xx", data:["b_1", "b_2", "b_3", "b_4"], period: 4}, { name: "accu", wave: "1...01.", period: 4}, - { name: "right_out", wave: "xx====x", data:["top_1", "top_2", "top_3", "top_4"], period: 4}, - { name: "bottom_out", wave: "xx====x", data:["left_1", "left_2", "left_3", "left_4"], period: 4}, - { name: "out", wave: "xx====x", data:["prod1=top_1*left_1", "prod_1 + top_2 * left_2", "prod_3=top_3 * left_3", "prod_3 + top_4 * left_4"], period: 4}, + { name: "out", wave: "xx====x", data:["prod1=a_1*b_1", "prod_1 + a_2 * b_2", "prod_3=a_3 * b_3", "prod_3 + a_4 * b_4"], period: 4}, ] } ) diff --git a/docs/index.md b/docs/index.md index 2b04404..d711efe 100644 --- a/docs/index.md +++ b/docs/index.md @@ -4,14 +4,17 @@ This is an open-source neural processing unit implementation in Chisel3. 
Specifically, this NPU is targeted at to be integerated to a low-power and edge-oriented SoC systems. So all design choices are facing those demands. +You can check the source code on [GitHub](https://github.com/mpskex/chisel-npu). + For overall chip design, you may find [the FullChipDesign website](https://www.fullchipdesign.com/) pretty helpful there. -## Designs +## ISA Designs - [Instructions](designs/01.isa.md) - [Memory](designs/02.memory.md) - [Buses](designs/03.bus.md) ## Implementation Details -- [Processing Element (PE)](implementations/ProcessingElement.md) -- [Systolic Array (SA)](implementations/SystolicArray.md) \ No newline at end of file +- [Neural Core (NCore)](implementations/NeuralCore.md) + - [Processing Element (PE)](implementations/ProcessingElement.md) + - [Systolic Array (SA)](implementations/SystolicArray.md) \ No newline at end of file diff --git a/mkdocs.yml b/mkdocs.yml index 69dcd77..7627440 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -4,6 +4,9 @@ repo_name: Chisel NPU theme: name: material +plugins: + - search + - mermaid2 markdown_extensions: - admonition diff --git a/src/main/scala/ncore/cu/controlUnit.scala b/src/main/scala/ncore/cu/controlUnit.scala new file mode 100644 index 0000000..d71189f --- /dev/null +++ b/src/main/scala/ncore/cu/controlUnit.scala @@ -0,0 +1,28 @@ +// See README.md for license details +package ncore.cu + +import chisel3._ + +/** + * Control unit also uses systolic array to pass instructions + */ +class ControlUnit(val n: Int = 8, val ctrl_width: Int = 8) extends Module { + val io = IO(new Bundle { + val cbus_in = Input(UInt(ctrl_width.W)) + val cbus_out = Output(Vec(n * n, UInt(ctrl_width.W))) + }) + // Assign each element with diagnal control signal + val reg = RegInit(VecInit(Seq.fill(2*n-1)(0.U(ctrl_width.W)))) + + // 1D systolic array for control + reg(0) := io.cbus_in + for(i<- 1 until 2*n-1){ + reg(i) := reg(i-1) + } + // Boardcast to all elements in the array + for(i <- 0 until n){ + for(j <- 0 until 
n){ + io.cbus_out(n*i+j) := reg(i+j) + } + } +} \ No newline at end of file diff --git a/src/main/scala/ncore/neuralCore.scala b/src/main/scala/ncore/neuralCore.scala new file mode 100644 index 0000000..3ef0eb9 --- /dev/null +++ b/src/main/scala/ncore/neuralCore.scala @@ -0,0 +1,63 @@ +// See README.md for license details +package ncore + +import chisel3._ + +/** + * This is the neural core design + */ + class NeuralCore(val n: Int = 8, val nbits: Int = 8, val ctrl_width: Int = 8) extends Module { + val io = IO(new Bundle { + val vec_a = Input(Vec(n, UInt(nbits.W))) // vector `a` is the left input + val vec_b = Input(Vec(n, UInt(nbits.W))) // vector `b` is the top input + val ctrl = Input(UInt(ctrl_width.W)) + val out = Output(Vec(n * n, UInt((2 * nbits + 12).W))) + }) + + // Create n x n pe blocks + val pe_io = VecInit(Seq.fill(n * n) {Module(new pe.PE(nbits)).io}) + // Create 2d register for horizontal & vertical + val pe_reg_h = RegInit(VecInit(Seq.fill((n - 1) * n)(0.U(nbits.W)))) + val pe_reg_v = RegInit(VecInit(Seq.fill((n - 1) * n)(0.U(nbits.W)))) + + // we use systolic array to pipeline the instructions + // this will avoid bubble and inst complexity + // while simplifying design with higher efficiency + val ctrl_array = Module(new cu.ControlUnit(n, ctrl_width)) + ctrl_array.io.cbus_in := io.ctrl + + for (i <- 0 until n){ + for (j <- 0 until n) { + // ==== OUTPUT ==== + // pe array's output mapped to the matrix position + io.out(n * i + j) := pe_io(n * i + j).out + + // ==== INPUT ==== + // vertical + if (i==0) { + pe_io(j).in_b := io.vec_b(j) + } else { + pe_io(n * i + j).in_b := pe_reg_v(n * (i - 1) + j) + } + if (i < n - 1 && j < n) + pe_reg_v(n * i + j) := pe_io(n * i + j).in_b + + // horizontal + if (j==0) { + pe_io(n * i).in_a := io.vec_a(i) + } else { + pe_io(n * i + j).in_a := pe_reg_h((n - 1) * i + (j - 1)) + } + if (i < n && j < n - 1) + pe_reg_h((n - 1) * i + j) := pe_io(n * i + j).in_a + + // ==== CONTROL ==== + // Currently we only have one bit 
control + // which is `ACCUM` + // TODO: + // Add ALU control to pe elements + val ctrl = ctrl_array.io.cbus_out(n * i + j).asBools + pe_io(n * i + j).accum := ctrl(0) + } + } + } \ No newline at end of file diff --git a/src/main/scala/procElem/procElem.scala b/src/main/scala/ncore/pe/procElem.scala similarity index 56% rename from src/main/scala/procElem/procElem.scala rename to src/main/scala/ncore/pe/procElem.scala index 503e92d..ff1a662 100644 --- a/src/main/scala/procElem/procElem.scala +++ b/src/main/scala/ncore/pe/procElem.scala @@ -1,6 +1,6 @@ // See README.md for license details. -package procElem +package ncore.pe import chisel3._ @@ -12,10 +12,8 @@ class PE(val nbits: Int = 8) extends Module { val io = IO( new Bundle { val accum = Input(Bool()) - val top_in = Input(UInt(nbits.W)) - val left_in = Input(UInt(nbits.W)) - val bottom_out = Output(UInt((nbits).W)) - val right_out = Output(UInt((nbits).W)) + val in_a = Input(UInt(nbits.W)) + val in_b = Input(UInt(nbits.W)) // The register bandwith is optimized for large transformer // The lower bound of max cap matrix size is: // 2^12 x 2^12 = (4096 x 4096) @@ -23,19 +21,11 @@ class PE(val nbits: Int = 8) extends Module { }) val res = RegInit(0.U((nbits*2 + 12).W)) - val reg_h = RegInit(0.U(nbits.W)) - val reg_v = RegInit(0.U(nbits.W)) when (io.accum) { - res := res + (io.top_in * io.left_in) + res := res + (io.in_a * io.in_b) } .otherwise { - res := (io.top_in * io.left_in) + res := (io.in_a * io.in_b) } - - reg_v := io.top_in - reg_h := io.left_in - - io.bottom_out := reg_v - io.right_out := reg_h io.out := res } \ No newline at end of file diff --git a/src/main/scala/npu/npu.scala b/src/main/scala/npu/npu.scala index d31a0c8..2fba8a3 100644 --- a/src/main/scala/npu/npu.scala +++ b/src/main/scala/npu/npu.scala @@ -4,29 +4,25 @@ import chisel3._ import java.nio.file.{Paths, Files} import java.nio.charset.StandardCharsets import circt.stage.ChiselStage -import procElem.PE +import ncore.pe.PE class NPU extends 
Module { val nbits: Int = 8 val io = IO(new Bundle { - val top_in = Input(UInt(nbits.W)) - val left_in = Input(UInt(nbits.W)) + val in_a = Input(UInt(nbits.W)) + val in_b = Input(UInt(nbits.W)) val accum = Input(Bool()) - val bottom_out = Output(UInt((nbits*2).W)) - val right_out = Output(UInt((nbits*2).W)) val out = Output(UInt((nbits*2).W)) }) val pe = Module(new PE(8)) // get value when ready - pe.io.top_in := io.top_in - pe.io.left_in := io.left_in + pe.io.in_a := io.in_a + pe.io.in_b := io.in_b pe.io.accum := io.accum io.out := pe.io.out - io.bottom_out := pe.io.bottom_out - io.right_out := pe.io.right_out } object Main extends App { diff --git a/src/main/scala/systolicArray/systolicArray.scala b/src/main/scala/systolicArray/systolicArray.scala deleted file mode 100644 index 6656a21..0000000 --- a/src/main/scala/systolicArray/systolicArray.scala +++ /dev/null @@ -1,83 +0,0 @@ -// See README.md for license details -package systolicArray - -import chisel3._ -import procElem._ - -/** - * Control bus also uses systolic array to pass instructions - */ -class _ControlArray(val n: Int = 8, val ctrl_width: Int = 8) extends Module { - val io = IO(new Bundle { - val cbus_in = Input(UInt(ctrl_width.W)) - val cbus_out = Output(Vec(n * n, UInt(ctrl_width.W))) - }) - // Assign each element with diagnal control signal - val reg = RegInit(VecInit(Seq.fill(2*n-1)(0.U(ctrl_width.W)))) - - // 1D systolic array for control - reg(0) := io.cbus_in - for(i<- 1 until 2*n-1){ - reg(i) := reg(i-1) - } - // Boardcast to all elements in the array - for(i <- 0 until n){ - for(j <- 0 until n){ - io.cbus_out(n*i+j) := reg(i+j) - } - } -} - -/** - * This is the systolic array design - */ - class SystolicArray(val n: Int = 8, val nbits: Int = 8, val ctrl_width: Int = 8) extends Module { - val io = IO(new Bundle { - val vec_a = Input(Vec(n, UInt(nbits.W))) // vector `a` is the left input - val vec_b = Input(Vec(n, UInt(nbits.W))) // vector `b` is the top input - val ctrl = 
Input(UInt(ctrl_width.W)) - val out = Output(Vec(n * n, UInt((2 * nbits + 12).W))) - }) - - // Create n x n pe blocks - val pe_io = VecInit(Seq.fill(n * n) {Module(new PE(nbits)).io}) - - // we use systolic array to pipeline the instructions - // this will avoid bubble and inst complexity - // while simplifying design with higher efficiency - val ctrl_array = Module(new _ControlArray(n, ctrl_width)) - ctrl_array.io.cbus_in := io.ctrl - // for (i <- 0 until n * n) { - // pe_io(i).accum := io.ctrl(0) - // } - - for (i <- 0 until n){ - for (j <- 0 until n) { - // ==== OUTPUT ==== - // pe array's output mapped to the matrix position - io.out(n * i + j) := pe_io(n * i + j).out - - // ==== INPUT ==== - // vertical - if (i==0) { - pe_io(j).top_in := io.vec_b(j) - } else { - pe_io(n * i + j).top_in := pe_io(n * (i - 1) + j).bottom_out - } - // horizontal - if (j==0) { - pe_io(n * i).left_in := io.vec_a(i) - } else { - pe_io(n * i + j).left_in := pe_io(n * i + (j - 1)).right_out - } - - // ==== CONTROL ==== - // Currently we only have one bit control - // which is `ACCUM` - // TODO: - // Add ALU control to pe elements - val ctrl = ctrl_array.io.cbus_out(n * i + j).asBools - pe_io(n * i + j).accum := ctrl(0) - } - } - } \ No newline at end of file diff --git a/src/test/scala/systolicArray/SASpec.scala b/src/test/scala/ncore/CoreSpec.scala similarity index 65% rename from src/test/scala/systolicArray/SASpec.scala rename to src/test/scala/ncore/CoreSpec.scala index 7dc7a5d..0f80604 100644 --- a/src/test/scala/systolicArray/SASpec.scala +++ b/src/test/scala/ncore/CoreSpec.scala @@ -1,72 +1,26 @@ //// See README.md for license details. 
-package systolicArray +package ncore +import testUtil._ import scala.util.Random import chisel3._ import chiseltest._ import org.scalatest.flatspec.AnyFlatSpec import chisel3.experimental.BundleLiterals._ -class SASpec extends AnyFlatSpec with ChiselScalatestTester { +class CoreSpec extends AnyFlatSpec with ChiselScalatestTester { - def printMatrix(mat: Array[Int], n: Int): Unit = { - println("[") - for (i <- 0 until n) { - var _row = "" - for (j <- 0 until n) { - _row += mat(i * n + j).toString() + ", " - } - println("[" + _row + "],") - } - println("]") - } - - def printMatrixChisel(mat: chisel3.Vec[chisel3.UInt], n: Int): Unit = { - println("[") - for (i <- 0 until n) { - var _row = "" - for (j <- 0 until n) { - _row += mat(i * n + j).peekInt().toString() + ", " - } - println("[" + _row + "],") - } - println("]") - } - - "SA" should "control with a systolic array" in { - test(new _ControlArray(4)) { dut => - val _n = 4 - val rand = new Random - var history = new Array[Int](2 * _n - 1) - var prod = 0 - for (n <- 0 until 16) { - val _cbus_in = rand.between(0, 255) - history +:= _cbus_in - dut.io.cbus_in.poke(_cbus_in) - dut.clock.step() - history = history.slice(0, 2 * _n - 1) - println("Input tick @ " + n + ": " + _cbus_in) - for(i: Int <- 0 until _n){ - for(j:Int <- 0 until _n) { - dut.io.cbus_out(_n * i + j).expect(history(i + j)) - } - } - println("Control tick @ " + n + " : ") - this.printMatrixChisel(dut.io.cbus_out, _n) - } - } - } - - "SA" should "do a normal matrix multiplication" in { - test(new SystolicArray(4, 8)) { dut => + "NeuralCore" should "do a normal matrix multiplication" in { + test(new NeuralCore(4, 8)) { dut => + val print_helper = new testUtil.PrintHelper() val _n = 4 val rand = new Random val _mat_a = new Array[Int](_n * _n) val _mat_b = new Array[Int](_n * _n) val _expected = new Array[Int](_n * _n) var _res = new Array[Int](_n * _n) - + // random initialize the for (i <- 0 until _n * _n) { _mat_a(i) = rand.between(0, 255) @@ -84,11 
+38,11 @@ class SASpec extends AnyFlatSpec with ChiselScalatestTester { // print the expected results println("===== MAT A =====") - this.printMatrix(_mat_a, _n) + print_helper.printMatrix(_mat_a, _n) println("===== MAT B =====") - this.printMatrix(_mat_b, _n) + print_helper.printMatrix(_mat_b, _n) println("+++++ MAT C +++++") - this.printMatrix(_expected, _n) + print_helper.printMatrix(_expected, _n) // systolic arrays has latency of 3 * _n - 2 for (i_tick <- 0 until 3 * _n - 2) { @@ -146,7 +100,7 @@ class SASpec extends AnyFlatSpec with ChiselScalatestTester { } } println("+++++ MAT C from HW ++++") - this.printMatrix(_res, _n) + print_helper.printMatrix(_res, _n) } } } \ No newline at end of file diff --git a/src/test/scala/ncore/cu/CUSpec.scala b/src/test/scala/ncore/cu/CUSpec.scala new file mode 100644 index 0000000..03c02a7 --- /dev/null +++ b/src/test/scala/ncore/cu/CUSpec.scala @@ -0,0 +1,38 @@ +//// See README.md for license details. + +package ncore.cu + +import testUtil._ +import scala.util.Random +import chisel3._ +import chiseltest._ +import org.scalatest.flatspec.AnyFlatSpec +import chisel3.experimental.BundleLiterals._ + +class CUSpec extends AnyFlatSpec with ChiselScalatestTester { + + "CU" should "send control to 2D systolic array" in { + test(new ControlUnit(4)) { dut => + val print_helper = new testUtil.PrintHelper() + val _n = 4 + val rand = new Random + var history = new Array[Int](2 * _n - 1) + var prod = 0 + for (n <- 0 until 16) { + val _cbus_in = rand.between(0, 255) + history +:= _cbus_in + dut.io.cbus_in.poke(_cbus_in) + dut.clock.step() + history = history.slice(0, 2 * _n - 1) + println("Input tick @ " + n + ": " + _cbus_in) + for(i: Int <- 0 until _n){ + for(j:Int <- 0 until _n) { + dut.io.cbus_out(_n * i + j).expect(history(i + j)) + } + } + println("Control tick @ " + n + " : ") + print_helper.printMatrixChisel(dut.io.cbus_out, _n) + } + } + } +} \ No newline at end of file diff --git a/src/test/scala/procElem/PESpec.scala 
b/src/test/scala/ncore/pe/PESpec.scala similarity index 75% rename from src/test/scala/procElem/PESpec.scala rename to src/test/scala/ncore/pe/PESpec.scala index 9801962..bd9c3ae 100644 --- a/src/test/scala/procElem/PESpec.scala +++ b/src/test/scala/ncore/pe/PESpec.scala @@ -1,6 +1,6 @@ // See README.md for license details. -package procElem +package ncore.pe import scala.util.Random import chisel3._ @@ -18,12 +18,10 @@ class PESpec extends AnyFlatSpec with ChiselScalatestTester { for (n <- 0 until 128) { val _top_in_ = rand.between(0, 255) val _left_in_ = rand.between(0, 255) - dut.io.top_in.poke(_top_in_) - dut.io.left_in.poke(_left_in_) + dut.io.in_a.poke(_top_in_) + dut.io.in_b.poke(_left_in_) dut.io.accum.poke(true) dut.clock.step() - dut.io.bottom_out.expect(_top_in_) - dut.io.right_out.expect(_left_in_) prod = prod + _top_in_ * _left_in_ dut.io.out.expect(prod) println("Result tick @ " + n + ": " + dut.io.out.peekInt() + " with input top: " + _top_in_ + " and left: " + _left_in_) @@ -32,24 +30,20 @@ class PESpec extends AnyFlatSpec with ChiselScalatestTester { prod = 0 var _top_in_ = rand.between(1, 255) var _left_in_ = rand.between(1, 255) - dut.io.top_in.poke(_top_in_) - dut.io.left_in.poke(_left_in_) + dut.io.in_a.poke(_top_in_) + dut.io.in_b.poke(_left_in_) dut.io.accum.poke(false) dut.clock.step() - dut.io.bottom_out.expect(_top_in_) - dut.io.right_out.expect(_left_in_) prod = prod + _top_in_ * _left_in_ dut.io.out.expect(prod) println("Result tick @ new: " + dut.io.out.peekInt() + " with input top: " + _top_in_ + " and left: " + _left_in_) _top_in_ = rand.between(1, 255) _left_in_ = rand.between(1, 255) - dut.io.top_in.poke(_top_in_) - dut.io.left_in.poke(_left_in_) + dut.io.in_a.poke(_top_in_) + dut.io.in_b.poke(_left_in_) dut.io.accum.poke(true) dut.clock.step() - dut.io.bottom_out.expect(_top_in_) - dut.io.right_out.expect(_left_in_) prod = prod + _top_in_ * _left_in_ dut.io.out.expect(prod) println("Result tick @ new's next: " + 
dut.io.out.peekInt() + " with input top: " + _top_in_ + " and left: " + _left_in_) diff --git a/src/test/scala/utils/printHelper.scala b/src/test/scala/utils/printHelper.scala new file mode 100644 index 0000000..c520bda --- /dev/null +++ b/src/test/scala/utils/printHelper.scala @@ -0,0 +1,31 @@ + +package testUtil + +import chisel3._ +import chiseltest._ + +class PrintHelper(){ + def printMatrix(mat: Array[Int], n: Int): Unit = { + println("[") + for (i <- 0 until n) { + var _row = "" + for (j <- 0 until n) { + _row += mat(i * n + j).toString() + ", " + } + println("[" + _row + "],") + } + println("]") + } + + def printMatrixChisel(mat: chisel3.Vec[chisel3.UInt], n: Int): Unit = { + println("[") + for (i <- 0 until n) { + var _row = "" + for (j <- 0 until n) { + _row += mat(i * n + j).peekInt().toString() + ", " + } + println("[" + _row + "],") + } + println("]") + } +} \ No newline at end of file