diff --git a/Makefile b/Makefile
index a230762..fc4f05f 100644
--- a/Makefile
+++ b/Makefile
@@ -43,7 +43,7 @@ push-image-arm64:
docker push fangruil/chisel-dev:arm64-${VER}
docs:
- pip3 install markdown-wavedrom mkdocs mkdocs-material python-markdown-math
+ pip3 install markdown-wavedrom mkdocs mkdocs-material python-markdown-math mkdocs-mermaid2-plugin
mkdocs serve
clean:
diff --git a/README.md b/README.md
index dcaa0d5..9efc23d 100644
--- a/README.md
+++ b/README.md
@@ -2,6 +2,8 @@
[![Documentation Status](https://readthedocs.org/projects/chisel-opennpu/badge/?version=latest)](https://chisel-opennpu.readthedocs.io/en/latest/?badge=latest)
+Docs: https://chisel-opennpu.readthedocs.io
+
This is a chisel workbench designed for someone who like docker containers and vscode dev container plugin.
DEVELOP IN PROGRESS. COMMERCIAL USE IS NOT ALLOWED.
diff --git a/docs/images/neural_core.png b/docs/images/neural_core.png
new file mode 100644
index 0000000..5e3c40b
Binary files /dev/null and b/docs/images/neural_core.png differ
diff --git a/docs/implementations/NeuralCore.md b/docs/implementations/NeuralCore.md
new file mode 100644
index 0000000..ee7adfc
--- /dev/null
+++ b/docs/implementations/NeuralCore.md
@@ -0,0 +1,48 @@
+# Neural Core
+
+[Systolic Arrays](SystolicArray.md) are high-throughput, high-latency computing architectures. They can be very efficient if we control them with care.
+
+To support more general operations for linear algebra, we need to split the computing logic from the addressing and controlling logic. So the architecture should look like below:
+
+
+
![](../../images/neural_core.png)
+
+
+The overall architecture of the proposed Neural Core will look like a multi-layered 3D grid. If you look along the z-axis, you will find the layers forming a pipeline.
+
+```mermaid
+graph LR
+ subgraph I[Scratch Pad Memory]
+ C[SPM inbound]
+ F[SPM outbound]
+ end
+
+ G[DMA]
+
+ subgraph Neural Core
+ A[CU]
+ B[i-MMU]
+ D[PE]
+ E[o-MMU]
+ end
+
+
+ B --> |addr| C
+ C --> |data| B
+ A --> |ctrl| B
+ A --> |i-base-addr| B
+ B --> |data| D
+ A --> |ctrl| D
+ D --> |data| E
+ A --> |ctrl| E
+ E --> |addr| F
+ E --> |data| F
+ A --> |o-base-addr| E
+ I <--> G
+```
+
+Above is the pipeline of a Neural Unit (NU), which is an element of the processing element pipeline in the Neural Core. Neural Units can be organized as systolic arrays or as parallelized thread cores.
+
+This flexible architecture is managed by the MMU, where all the data flow is controlled. To reduce the number of running transistors, we fused the systolic design with a parallelism design. All $\mu$-CU and i-$\mu$MMU will have a stair-like scheduling characteristic. Though this design choice may lead to high latency, I think it is still quite efficient: it preserves high throughput with a fair amount of registers and arithmetic units. Of course you can have a multiplexed control set to manage this grid, but that will have more overhead. For example, you would need a large piece of logic to implement the parallelism and another one to avoid bubbles in the Neural Units.
+
+A Neural Processing Unit (NPU) can have multiple Neural Cores (NCore). Each Neural Core has a two-dimensional grid of Neural Units (NU). Each Neural Unit has its own micro-CU ($\mu$-CU), micro-MMUs for both input and output (i-$\mu$MMU/o-$\mu$MMU) and a [processing element (PE)](ProcessingElement.md). Having registers large enough to hold the whole matrix is impossible, so the design follows other NPU designs and uses a Scratch Pad Memory (SPM) to store input and output data. Each $\mu$MMU is directly connected to the SPM to obtain instant access to the data.
\ No newline at end of file
diff --git a/docs/implementations/ProcessingElement.md b/docs/implementations/ProcessingElement.md
index 2a4e277..cabc09a 100644
--- a/docs/implementations/ProcessingElement.md
+++ b/docs/implementations/ProcessingElement.md
@@ -1,16 +1,16 @@
# Processing Element
```ascii_art
- ACCUM TOP
+ ACCUM IN_B
\ |
\ |
\___v___
| Proc |
- LEFT ---->| Elem |-----> RIGHT
- ¯¯¯|¯¯¯\
- | \
- v \
- BOTTOM OUT (to TLB mapped memory)
+ IN_A ---->| Elem |
+ ¯¯¯|¯¯¯
+ |
+ v
+ OUT (to TLB mapped memory)
```
Processing Element is the fundamental element in systolic array. This is a basic implementation of a 2D PE for 2D systolic array or DSP grid.
@@ -22,11 +22,9 @@ PE component will only accumulate the result if the `ACCUM` is high. This is eff
wavedrom (
{ signal: [
{ name: "clk", wave:"P......", period: 4 },
- { name: "top_in", wave: "x====xx", data:["top_1", "top_2", "top_3", "top_4"], period: 4},
- { name: "left_in", wave: "x====xx", data:["left_1", "left_2", "left_3", "left_4"], period: 4},
+ { name: "in_a", wave: "x====xx", data:["a_1", "a_2", "a_3", "a_4"], period: 4},
+ { name: "in_b", wave: "x====xx", data:["b_1", "b_2", "b_3", "b_4"], period: 4},
{ name: "accu", wave: "1...01.", period: 4},
- { name: "right_out", wave: "xx====x", data:["top_1", "top_2", "top_3", "top_4"], period: 4},
- { name: "bottom_out", wave: "xx====x", data:["left_1", "left_2", "left_3", "left_4"], period: 4},
- { name: "out", wave: "xx====x", data:["prod1=top_1*left_1", "prod_1 + top_2 * left_2", "prod_3=top_3 * left_3", "prod_3 + top_4 * left_4"], period: 4},
+ { name: "out", wave: "xx====x", data:["prod1=a_1*b_1", "prod_1 + a_2 * b_2", "prod_3=a_3 * b_3", "prod_3 + a_4 * b_4"], period: 4},
] }
)
diff --git a/docs/index.md b/docs/index.md
index 2b04404..d711efe 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -4,14 +4,17 @@ This is an open-source neural processing unit implementation in Chisel3.
Specifically, this NPU is targeted at to be integerated to a low-power and edge-oriented SoC systems. So all design choices are facing those demands.
+You can check the source code on [GitHub](https://github.com/mpskex/chisel-npu).
+
For overall chip design, you may find [the FullChipDesign website](https://www.fullchipdesign.com/) pretty helpful there.
-## Designs
+## ISA Designs
- [Instructions](designs/01.isa.md)
- [Memory](designs/02.memory.md)
- [Buses](designs/03.bus.md)
## Implementation Details
-- [Processing Element (PE)](implementations/ProcessingElement.md)
-- [Systolic Array (SA)](implementations/SystolicArray.md)
\ No newline at end of file
+- [Neural Core (NCore)](implementations/NeuralCore.md)
+ - [Processing Element (PE)](implementations/ProcessingElement.md)
+ - [Systolic Array (SA)](implementations/SystolicArray.md)
\ No newline at end of file
diff --git a/mkdocs.yml b/mkdocs.yml
index 69dcd77..7627440 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -4,6 +4,9 @@ repo_name: Chisel NPU
theme:
name: material
+plugins:
+ - search
+ - mermaid2
markdown_extensions:
- admonition
diff --git a/src/main/scala/ncore/cu/controlUnit.scala b/src/main/scala/ncore/cu/controlUnit.scala
new file mode 100644
index 0000000..d71189f
--- /dev/null
+++ b/src/main/scala/ncore/cu/controlUnit.scala
@@ -0,0 +1,28 @@
+// See README.md for license details
+package ncore.cu
+
+import chisel3._
+
+/**
+ * Control unit also uses systolic array to pass instructions
+ */
+class ControlUnit(val n: Int = 8, val ctrl_width: Int = 8) extends Module {
+  val io = IO(new Bundle {
+    val cbus_in = Input(UInt(ctrl_width.W))
+    val cbus_out = Output(Vec(n * n, UInt(ctrl_width.W)))
+  })
+  // An n x n grid has 2n-1 anti-diagonals; each one gets its own stage, reset to 0
+  private val depth = 2 * n - 1
+  val reg = RegInit(VecInit(Seq.fill(depth)(0.U(ctrl_width.W))))
+
+  // 1D shift chain: stage 0 samples the control bus, stage k samples stage k-1
+  reg(0) := io.cbus_in
+  (1 until depth).foreach { stage => reg(stage) := reg(stage - 1) }
+  // Broadcast: grid position (row, col) is driven by the stage of diagonal row + col
+  for {
+    row <- 0 until n
+    col <- 0 until n
+  } {
+    io.cbus_out(n * row + col) := reg(row + col)
+  }
+}
\ No newline at end of file
diff --git a/src/main/scala/ncore/neuralCore.scala b/src/main/scala/ncore/neuralCore.scala
new file mode 100644
index 0000000..3ef0eb9
--- /dev/null
+++ b/src/main/scala/ncore/neuralCore.scala
@@ -0,0 +1,63 @@
+// See README.md for license details
+package ncore
+
+import chisel3._
+
+/**
+ * This is the neural core design
+ */
+ class NeuralCore(val n: Int = 8, val nbits: Int = 8, val ctrl_width: Int = 8) extends Module {
+    val io = IO(new Bundle {
+        val vec_a = Input(Vec(n, UInt(nbits.W)))  // left-side input: lane i feeds row i
+        val vec_b = Input(Vec(n, UInt(nbits.W)))  // top-side input: lane j feeds column j
+        val ctrl = Input(UInt(ctrl_width.W))
+        val out = Output(Vec(n * n, UInt((2 * nbits + 12).W)))
+    })
+
+    // The n x n PE grid, stored flat in row-major order: PE (row, col) sits at n * row + col
+    val pe_io = VecInit(Seq.fill(n * n) {Module(new pe.PE(nbits)).io})
+    // Hand-off registers between neighbouring PEs, every one reset to 0:
+    //   pe_reg_h - (n - 1) registers per row, passing the `a` operand one column rightwards
+    //   pe_reg_v - n registers per row gap, passing the `b` operand one row downwards
+    val pe_reg_h = RegInit(VecInit(Seq.fill((n - 1) * n)(0.U(nbits.W))))
+    val pe_reg_v = RegInit(VecInit(Seq.fill((n - 1) * n)(0.U(nbits.W))))
+
+    // Control words are not wired to the PEs directly; they pass through a
+    // ControlUnit, which exposes one control output per grid position
+    val ctrl_array = Module(new cu.ControlUnit(n, ctrl_width))
+    ctrl_array.io.cbus_in := io.ctrl
+
+    for (row <- 0 until n; col <- 0 until n) {
+        // Flat index of PE (row, col); also its slot in io.out and cbus_out
+        val idx = n * row + col
+
+        // ==== OUTPUT ====
+        // every PE result is exposed at its matrix position
+        io.out(idx) := pe_io(idx).out
+
+        // ==== INPUT ====
+        // vertical: the first row reads the port, any other row reads the
+        // register written by the PE directly above it
+        pe_io(idx).in_b := (if (row == 0) io.vec_b(col) else pe_reg_v(n * (row - 1) + col))
+        // the last row has nothing below it, so it does not forward
+        if (row < n - 1) {
+            pe_reg_v(idx) := pe_io(idx).in_b
+        }
+
+        // horizontal: the first column reads the port, any other column reads
+        // the register written by the PE directly to its left
+        val h_row = (n - 1) * row  // offset of this row's registers in pe_reg_h
+        pe_io(idx).in_a := (if (col == 0) io.vec_a(row) else pe_reg_h(h_row + col - 1))
+        // the last column has nothing to its right, so it does not forward
+        if (col < n - 1) {
+            pe_reg_h(h_row + col) := pe_io(idx).in_a
+        }
+
+        // ==== CONTROL ====
+        // Only bit 0 of the control word is used for now: `ACCUM`
+        // TODO:
+        //  Add ALU control to pe elements
+        val ctrl = ctrl_array.io.cbus_out(idx).asBools
+        pe_io(idx).accum := ctrl(0)
+    }
+ }
\ No newline at end of file
diff --git a/src/main/scala/procElem/procElem.scala b/src/main/scala/ncore/pe/procElem.scala
similarity index 56%
rename from src/main/scala/procElem/procElem.scala
rename to src/main/scala/ncore/pe/procElem.scala
index 503e92d..ff1a662 100644
--- a/src/main/scala/procElem/procElem.scala
+++ b/src/main/scala/ncore/pe/procElem.scala
@@ -1,6 +1,6 @@
// See README.md for license details.
-package procElem
+package ncore.pe
import chisel3._
@@ -12,10 +12,8 @@ class PE(val nbits: Int = 8) extends Module {
val io = IO(
new Bundle {
val accum = Input(Bool())
- val top_in = Input(UInt(nbits.W))
- val left_in = Input(UInt(nbits.W))
- val bottom_out = Output(UInt((nbits).W))
- val right_out = Output(UInt((nbits).W))
+ val in_a = Input(UInt(nbits.W))
+ val in_b = Input(UInt(nbits.W))
// The register bandwith is optimized for large transformer
// The lower bound of max cap matrix size is:
// 2^12 x 2^12 = (4096 x 4096)
@@ -23,19 +21,11 @@ class PE(val nbits: Int = 8) extends Module {
})
val res = RegInit(0.U((nbits*2 + 12).W))
- val reg_h = RegInit(0.U(nbits.W))
- val reg_v = RegInit(0.U(nbits.W))
when (io.accum) {
- res := res + (io.top_in * io.left_in)
+ res := res + (io.in_a * io.in_b)
} .otherwise {
- res := (io.top_in * io.left_in)
+ res := (io.in_a * io.in_b)
}
-
- reg_v := io.top_in
- reg_h := io.left_in
-
- io.bottom_out := reg_v
- io.right_out := reg_h
io.out := res
}
\ No newline at end of file
diff --git a/src/main/scala/npu/npu.scala b/src/main/scala/npu/npu.scala
index d31a0c8..2fba8a3 100644
--- a/src/main/scala/npu/npu.scala
+++ b/src/main/scala/npu/npu.scala
@@ -4,29 +4,25 @@ import chisel3._
import java.nio.file.{Paths, Files}
import java.nio.charset.StandardCharsets
import circt.stage.ChiselStage
-import procElem.PE
+import ncore.pe.PE
class NPU extends Module {
val nbits: Int = 8
val io = IO(new Bundle {
- val top_in = Input(UInt(nbits.W))
- val left_in = Input(UInt(nbits.W))
+ val in_a = Input(UInt(nbits.W))
+ val in_b = Input(UInt(nbits.W))
val accum = Input(Bool())
- val bottom_out = Output(UInt((nbits*2).W))
- val right_out = Output(UInt((nbits*2).W))
val out = Output(UInt((nbits*2).W))
})
val pe = Module(new PE(8))
// get value when ready
- pe.io.top_in := io.top_in
- pe.io.left_in := io.left_in
+ pe.io.in_a := io.in_a
+ pe.io.in_b := io.in_b
pe.io.accum := io.accum
io.out := pe.io.out
- io.bottom_out := pe.io.bottom_out
- io.right_out := pe.io.right_out
}
object Main extends App {
diff --git a/src/main/scala/systolicArray/systolicArray.scala b/src/main/scala/systolicArray/systolicArray.scala
deleted file mode 100644
index 6656a21..0000000
--- a/src/main/scala/systolicArray/systolicArray.scala
+++ /dev/null
@@ -1,83 +0,0 @@
-// See README.md for license details
-package systolicArray
-
-import chisel3._
-import procElem._
-
-/**
- * Control bus also uses systolic array to pass instructions
- */
-class _ControlArray(val n: Int = 8, val ctrl_width: Int = 8) extends Module {
- val io = IO(new Bundle {
- val cbus_in = Input(UInt(ctrl_width.W))
- val cbus_out = Output(Vec(n * n, UInt(ctrl_width.W)))
- })
- // Assign each element with diagnal control signal
- val reg = RegInit(VecInit(Seq.fill(2*n-1)(0.U(ctrl_width.W))))
-
- // 1D systolic array for control
- reg(0) := io.cbus_in
- for(i<- 1 until 2*n-1){
- reg(i) := reg(i-1)
- }
- // Boardcast to all elements in the array
- for(i <- 0 until n){
- for(j <- 0 until n){
- io.cbus_out(n*i+j) := reg(i+j)
- }
- }
-}
-
-/**
- * This is the systolic array design
- */
- class SystolicArray(val n: Int = 8, val nbits: Int = 8, val ctrl_width: Int = 8) extends Module {
- val io = IO(new Bundle {
- val vec_a = Input(Vec(n, UInt(nbits.W))) // vector `a` is the left input
- val vec_b = Input(Vec(n, UInt(nbits.W))) // vector `b` is the top input
- val ctrl = Input(UInt(ctrl_width.W))
- val out = Output(Vec(n * n, UInt((2 * nbits + 12).W)))
- })
-
- // Create n x n pe blocks
- val pe_io = VecInit(Seq.fill(n * n) {Module(new PE(nbits)).io})
-
- // we use systolic array to pipeline the instructions
- // this will avoid bubble and inst complexity
- // while simplifying design with higher efficiency
- val ctrl_array = Module(new _ControlArray(n, ctrl_width))
- ctrl_array.io.cbus_in := io.ctrl
- // for (i <- 0 until n * n) {
- // pe_io(i).accum := io.ctrl(0)
- // }
-
- for (i <- 0 until n){
- for (j <- 0 until n) {
- // ==== OUTPUT ====
- // pe array's output mapped to the matrix position
- io.out(n * i + j) := pe_io(n * i + j).out
-
- // ==== INPUT ====
- // vertical
- if (i==0) {
- pe_io(j).top_in := io.vec_b(j)
- } else {
- pe_io(n * i + j).top_in := pe_io(n * (i - 1) + j).bottom_out
- }
- // horizontal
- if (j==0) {
- pe_io(n * i).left_in := io.vec_a(i)
- } else {
- pe_io(n * i + j).left_in := pe_io(n * i + (j - 1)).right_out
- }
-
- // ==== CONTROL ====
- // Currently we only have one bit control
- // which is `ACCUM`
- // TODO:
- // Add ALU control to pe elements
- val ctrl = ctrl_array.io.cbus_out(n * i + j).asBools
- pe_io(n * i + j).accum := ctrl(0)
- }
- }
- }
\ No newline at end of file
diff --git a/src/test/scala/systolicArray/SASpec.scala b/src/test/scala/ncore/CoreSpec.scala
similarity index 65%
rename from src/test/scala/systolicArray/SASpec.scala
rename to src/test/scala/ncore/CoreSpec.scala
index 7dc7a5d..0f80604 100644
--- a/src/test/scala/systolicArray/SASpec.scala
+++ b/src/test/scala/ncore/CoreSpec.scala
@@ -1,72 +1,26 @@
//// See README.md for license details.
-package systolicArray
+package ncore
+import testUtil._
import scala.util.Random
import chisel3._
import chiseltest._
import org.scalatest.flatspec.AnyFlatSpec
import chisel3.experimental.BundleLiterals._
-class SASpec extends AnyFlatSpec with ChiselScalatestTester {
+class CoreSpec extends AnyFlatSpec with ChiselScalatestTester {
- def printMatrix(mat: Array[Int], n: Int): Unit = {
- println("[")
- for (i <- 0 until n) {
- var _row = ""
- for (j <- 0 until n) {
- _row += mat(i * n + j).toString() + ", "
- }
- println("[" + _row + "],")
- }
- println("]")
- }
-
- def printMatrixChisel(mat: chisel3.Vec[chisel3.UInt], n: Int): Unit = {
- println("[")
- for (i <- 0 until n) {
- var _row = ""
- for (j <- 0 until n) {
- _row += mat(i * n + j).peekInt().toString() + ", "
- }
- println("[" + _row + "],")
- }
- println("]")
- }
-
- "SA" should "control with a systolic array" in {
- test(new _ControlArray(4)) { dut =>
- val _n = 4
- val rand = new Random
- var history = new Array[Int](2 * _n - 1)
- var prod = 0
- for (n <- 0 until 16) {
- val _cbus_in = rand.between(0, 255)
- history +:= _cbus_in
- dut.io.cbus_in.poke(_cbus_in)
- dut.clock.step()
- history = history.slice(0, 2 * _n - 1)
- println("Input tick @ " + n + ": " + _cbus_in)
- for(i: Int <- 0 until _n){
- for(j:Int <- 0 until _n) {
- dut.io.cbus_out(_n * i + j).expect(history(i + j))
- }
- }
- println("Control tick @ " + n + " : ")
- this.printMatrixChisel(dut.io.cbus_out, _n)
- }
- }
- }
-
- "SA" should "do a normal matrix multiplication" in {
- test(new SystolicArray(4, 8)) { dut =>
+ "NeuralCore" should "do a normal matrix multiplication" in {
+ test(new NeuralCore(4, 8)) { dut =>
+ val print_helper = new testUtil.PrintHelper()
val _n = 4
val rand = new Random
val _mat_a = new Array[Int](_n * _n)
val _mat_b = new Array[Int](_n * _n)
val _expected = new Array[Int](_n * _n)
var _res = new Array[Int](_n * _n)
-
+
// random initialize the
for (i <- 0 until _n * _n) {
_mat_a(i) = rand.between(0, 255)
@@ -84,11 +38,11 @@ class SASpec extends AnyFlatSpec with ChiselScalatestTester {
// print the expected results
println("===== MAT A =====")
- this.printMatrix(_mat_a, _n)
+ print_helper.printMatrix(_mat_a, _n)
println("===== MAT B =====")
- this.printMatrix(_mat_b, _n)
+ print_helper.printMatrix(_mat_b, _n)
println("+++++ MAT C +++++")
- this.printMatrix(_expected, _n)
+ print_helper.printMatrix(_expected, _n)
// systolic arrays has latency of 3 * _n - 2
for (i_tick <- 0 until 3 * _n - 2) {
@@ -146,7 +100,7 @@ class SASpec extends AnyFlatSpec with ChiselScalatestTester {
}
}
println("+++++ MAT C from HW ++++")
- this.printMatrix(_res, _n)
+ print_helper.printMatrix(_res, _n)
}
}
}
\ No newline at end of file
diff --git a/src/test/scala/ncore/cu/CUSpec.scala b/src/test/scala/ncore/cu/CUSpec.scala
new file mode 100644
index 0000000..03c02a7
--- /dev/null
+++ b/src/test/scala/ncore/cu/CUSpec.scala
@@ -0,0 +1,38 @@
+//// See README.md for license details.
+
+package ncore.cu
+
+import testUtil._
+import scala.util.Random
+import chisel3._
+import chiseltest._
+import org.scalatest.flatspec.AnyFlatSpec
+import chisel3.experimental.BundleLiterals._
+
+class CUSpec extends AnyFlatSpec with ChiselScalatestTester {
+
+  "CU" should "send control to 2D systolic array" in {
+    test(new ControlUnit(4)) { dut =>
+      val print_helper = new testUtil.PrintHelper()
+      val _n = 4
+      val rand = new Random
+      // Software model of the shift chain: newest control word first, 2n-1 entries, initially 0
+      var history = new Array[Int](2 * _n - 1)
+      for (n <- 0 until 16) {
+        // Upper bound is exclusive: 256 lets the all-ones 8-bit word (255) be driven too
+        val _cbus_in = rand.between(0, 256)
+        history +:= _cbus_in
+        dut.io.cbus_in.poke(_cbus_in)
+        dut.clock.step()
+        history = history.slice(0, 2 * _n - 1)
+        println("Input tick @ " + n + ": " + _cbus_in)
+        // Output (i, j) must show the word poked i + j ticks before the newest one
+        for (i <- 0 until _n; j <- 0 until _n) {
+          dut.io.cbus_out(_n * i + j).expect(history(i + j))
+        }
+        println("Control tick @ " + n + " : ")
+        print_helper.printMatrixChisel(dut.io.cbus_out, _n)
+      }
+    }
+  }
+}
\ No newline at end of file
diff --git a/src/test/scala/procElem/PESpec.scala b/src/test/scala/ncore/pe/PESpec.scala
similarity index 75%
rename from src/test/scala/procElem/PESpec.scala
rename to src/test/scala/ncore/pe/PESpec.scala
index 9801962..bd9c3ae 100644
--- a/src/test/scala/procElem/PESpec.scala
+++ b/src/test/scala/ncore/pe/PESpec.scala
@@ -1,6 +1,6 @@
// See README.md for license details.
-package procElem
+package ncore.pe
import scala.util.Random
import chisel3._
@@ -18,12 +18,10 @@ class PESpec extends AnyFlatSpec with ChiselScalatestTester {
for (n <- 0 until 128) {
val _top_in_ = rand.between(0, 255)
val _left_in_ = rand.between(0, 255)
- dut.io.top_in.poke(_top_in_)
- dut.io.left_in.poke(_left_in_)
+ dut.io.in_a.poke(_top_in_)
+ dut.io.in_b.poke(_left_in_)
dut.io.accum.poke(true)
dut.clock.step()
- dut.io.bottom_out.expect(_top_in_)
- dut.io.right_out.expect(_left_in_)
prod = prod + _top_in_ * _left_in_
dut.io.out.expect(prod)
println("Result tick @ " + n + ": " + dut.io.out.peekInt() + " with input top: " + _top_in_ + " and left: " + _left_in_)
@@ -32,24 +30,20 @@ class PESpec extends AnyFlatSpec with ChiselScalatestTester {
prod = 0
var _top_in_ = rand.between(1, 255)
var _left_in_ = rand.between(1, 255)
- dut.io.top_in.poke(_top_in_)
- dut.io.left_in.poke(_left_in_)
+ dut.io.in_a.poke(_top_in_)
+ dut.io.in_b.poke(_left_in_)
dut.io.accum.poke(false)
dut.clock.step()
- dut.io.bottom_out.expect(_top_in_)
- dut.io.right_out.expect(_left_in_)
prod = prod + _top_in_ * _left_in_
dut.io.out.expect(prod)
println("Result tick @ new: " + dut.io.out.peekInt() + " with input top: " + _top_in_ + " and left: " + _left_in_)
_top_in_ = rand.between(1, 255)
_left_in_ = rand.between(1, 255)
- dut.io.top_in.poke(_top_in_)
- dut.io.left_in.poke(_left_in_)
+ dut.io.in_a.poke(_top_in_)
+ dut.io.in_b.poke(_left_in_)
dut.io.accum.poke(true)
dut.clock.step()
- dut.io.bottom_out.expect(_top_in_)
- dut.io.right_out.expect(_left_in_)
prod = prod + _top_in_ * _left_in_
dut.io.out.expect(prod)
println("Result tick @ new's next: " + dut.io.out.peekInt() + " with input top: " + _top_in_ + " and left: " + _left_in_)
diff --git a/src/test/scala/utils/printHelper.scala b/src/test/scala/utils/printHelper.scala
new file mode 100644
index 0000000..c520bda
--- /dev/null
+++ b/src/test/scala/utils/printHelper.scala
@@ -0,0 +1,31 @@
+
+package testUtil
+
+import chisel3._
+import chiseltest._
+
+class PrintHelper(){
+  // Shared printer for a row-major n x n matrix. Output layout:
+  //   a "[" line, one "[v, v, ..., ]," line per row, then a "]" line.
+  // `cell` turns a flat index into the text of that element.
+  private def render(n: Int)(cell: Int => String): Unit = {
+    println("[")
+    (0 until n).foreach { r =>
+      val line = (0 until n).map(c => cell(r * n + c) + ", ").mkString
+      println("[" + line + "],")
+    }
+    println("]")
+  }
+
+  // Prints the first n * n entries of a plain integer array as a matrix
+  def printMatrix(mat: Array[Int], n: Int): Unit = {
+    render(n)(k => mat(k).toString())
+  }
+
+  // Prints the first n * n entries of a Chisel Vec as a matrix;
+  // every element is read with peekInt() just before its row is printed
+  def printMatrixChisel(mat: chisel3.Vec[chisel3.UInt], n: Int): Unit = {
+    render(n)(k => mat(k).peekInt().toString())
+  }
+
+}
\ No newline at end of file