diff --git a/Makefile b/Makefile
index a230762..fc4f05f 100644
--- a/Makefile
+++ b/Makefile
@@ -43,7 +43,7 @@ push-image-arm64:
 	docker push fangruil/chisel-dev:arm64-${VER}
 
 docs:
-	pip3 install markdown-wavedrom mkdocs mkdocs-material python-markdown-math
+	pip3 install markdown-wavedrom mkdocs mkdocs-material python-markdown-math mkdocs-mermaid2-plugin
 	mkdocs serve
 
 clean:
diff --git a/README.md b/README.md
index dcaa0d5..9efc23d 100644
--- a/README.md
+++ b/README.md
@@ -2,6 +2,8 @@
 
 [![Documentation Status](https://readthedocs.org/projects/chisel-opennpu/badge/?version=latest)](https://chisel-opennpu.readthedocs.io/en/latest/?badge=latest)
 
+Docs: https://chisel-opennpu.readthedocs.io
+
 This is a chisel workbench designed for someone who like docker containers and vscode dev container plugin.
 
 DEVELOP IN PROGRESS. COMMERCIAL USE IS NOT ALLOWED.
diff --git a/docs/images/neural_core.png b/docs/images/neural_core.png
new file mode 100644
index 0000000..5e3c40b
Binary files /dev/null and b/docs/images/neural_core.png differ
diff --git a/docs/implementations/NeuralCore.md b/docs/implementations/NeuralCore.md
new file mode 100644
index 0000000..ee7adfc
--- /dev/null
+++ b/docs/implementations/NeuralCore.md
@@ -0,0 +1,48 @@
+# Neural Core
+
+[Systolic Arrays](SystolicArray.md) are high-throughput, high-latency computing architectures. They can be very efficient if we control them with care.
+
+To support more general operations for linear algebra, we need to split the computing logic from the addressing and controlling logic. So the architecture should look like below:
+
+<div style="text-align: center">
+<img src="../../images/neural_core.png" width=80%/>
+</div>
+
+The overall architecture of the proposed Neural Core will look like a multi-layered 3D grid. If you look along the z-axis, you will find the layers forming a pipeline.
+
+```mermaid
+graph LR
+    subgraph I[Scratch Pad Memory]
+        C[SPM inbound]
+        F[SPM outbound]
+    end
+    
+    G[DMA]
+
+    subgraph Neural Core
+        A[CU]
+        B[i-MMU]
+        D[PE]
+        E[o-MMU]
+    end
+
+
+    B --> |addr| C
+    C --> |data| B
+    A --> |ctrl| B
+    A --> |i-base-addr| B
+    B --> |data| D
+    A --> |ctrl| D
+    D --> |data| E
+    A --> |ctrl| E
+    E --> |addr| F
+    E --> |data| F
+    A --> |o-base-addr| E
+    I <--> G
+```
+
+Above is the pipeline of a Neural Unit (NU), which is an element of the processing element pipeline in the Neural Core. Neural Units can be organized as systolic arrays or parallelized thread cores.
+
+This flexible architecture is managed by the MMU, where all the data flow is controlled. To reduce the number of running transistors, we fused the systolic design with a parallelism design. All $\mu$-CUs and i-$\mu$MMUs will have a stair-like scheduling characteristic. Though this design choice may lead to high latency, I think it is still quite efficient: it preserves high throughput with a fair amount of registers and arithmetic units. Of course you can have a multiplexed control set to manage this grid, but that will have more overhead. For example, you need a large piece of logic to implement the parallelism and another one to avoid bubbles in Neural Units.
+
+A Neural Processing Unit (NPU) can have multiple Neural Cores (NCore). Each Neural Core has a 2-dimensional grid of Neural Units (NU). Each Neural Unit has its own micro-CU ($\mu$-CU), micro-MMUs for both input and output (i-$\mu$MMU/o-$\mu$MMU) and a [processing element (PE)](ProcessingElement.md). Having large registers that hold the matrix is impossible. So the design follows other NPU designs, using a Scratch Pad Memory (SPM) to store input and output data. Each $\mu$MMU is directly connected to the SPM to obtain instant access to the data.
\ No newline at end of file
diff --git a/docs/implementations/ProcessingElement.md b/docs/implementations/ProcessingElement.md
index 2a4e277..cabc09a 100644
--- a/docs/implementations/ProcessingElement.md
+++ b/docs/implementations/ProcessingElement.md
@@ -1,16 +1,16 @@
 # Processing Element
 
 ```ascii_art
-        ACCUM   TOP
+        ACCUM   IN_B
            \     |
             \    |
              \___v___
              | Proc  |
-   LEFT ---->| Elem  |-----> RIGHT
-              ¯¯¯|¯¯¯\
-                 |    \
-                 v     \
-               BOTTOM   OUT (to TLB mapped memory)
+   IN_A ---->| Elem  |
+              ¯¯¯|¯¯¯
+                 |
+                 v
+                OUT (to TLB mapped memory)
 ```
 
 Processing Element is the fundamental element in systolic array. This is a basic implementation of a 2D PE for 2D systolic array or DSP grid.
@@ -22,11 +22,9 @@ PE component will only accumulate the result if the `ACCUM` is high. This is eff
 wavedrom (
     { signal: [
       { name: "clk", wave:"P......", period: 4 },
-      { name: "top_in", wave: "x====xx", data:["top_1", "top_2", "top_3", "top_4"], period: 4},
-      { name: "left_in", wave: "x====xx", data:["left_1", "left_2", "left_3", "left_4"], period: 4},
+      { name: "in_a", wave: "x====xx", data:["a_1", "a_2", "a_3", "a_4"], period: 4},
+      { name: "in_b", wave: "x====xx", data:["b_1", "b_2", "b_3", "b_4"], period: 4},
       { name: "accu", wave: "1...01.", period: 4},
-      { name: "right_out", wave: "xx====x", data:["top_1", "top_2", "top_3", "top_4"], period: 4},
-      { name: "bottom_out", wave: "xx====x", data:["left_1", "left_2", "left_3", "left_4"], period: 4},
-      { name: "out", wave: "xx====x", data:["prod1=top_1*left_1", "prod_1 + top_2 * left_2", "prod_3=top_3 * left_3", "prod_3 + top_4 * left_4"], period: 4},
+      { name: "out", wave: "xx====x", data:["prod_1=a_1*b_1", "prod_1 + a_2 * b_2", "prod_3=a_3 * b_3", "prod_3 + a_4 * b_4"], period: 4},
       ] }
 )
diff --git a/docs/index.md b/docs/index.md
index 2b04404..d711efe 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -4,14 +4,17 @@ This is an open-source neural processing unit implementation in Chisel3.
 
 Specifically, this NPU is targeted at to be integerated to a low-power and edge-oriented SoC systems. So all design choices are facing those demands.
 
+You can check the source code on [GitHub](https://github.com/mpskex/chisel-npu).
+
 For overall chip design, you may find [the FullChipDesign website](https://www.fullchipdesign.com/) pretty helpful there.
 
-## Designs
+## ISA Designs
 - [Instructions](designs/01.isa.md)
 - [Memory](designs/02.memory.md)
 - [Buses](designs/03.bus.md)
 
 ## Implementation Details
 
-- [Processing Element (PE)](implementations/ProcessingElement.md)
-- [Systolic Array (SA)](implementations/SystolicArray.md)
\ No newline at end of file
+- [Neural Core (NCore)](implementations/NeuralCore.md)
+  - [Processing Element (PE)](implementations/ProcessingElement.md)
+  - [Systolic Array (SA)](implementations/SystolicArray.md)
\ No newline at end of file
diff --git a/mkdocs.yml b/mkdocs.yml
index 69dcd77..7627440 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -4,6 +4,9 @@ repo_name: Chisel NPU
 theme:
   name: material
 
+plugins:
+    - search
+    - mermaid2
 
 markdown_extensions:
   - admonition
diff --git a/src/main/scala/ncore/cu/controlUnit.scala b/src/main/scala/ncore/cu/controlUnit.scala
new file mode 100644
index 0000000..d71189f
--- /dev/null
+++ b/src/main/scala/ncore/cu/controlUnit.scala
@@ -0,0 +1,28 @@
+// See README.md for license details
+package ncore.cu
+
+import chisel3._
+
+/** Control unit: shifts the control word through a 1D chain of 2*n-1 registers
+ *  and drives output (i, j) of the n x n grid from stage i + j of that chain.
+ */
+class ControlUnit(val n: Int = 8, val ctrl_width: Int = 8) extends Module {
+    val io = IO(new Bundle {
+        val cbus_in     = Input(UInt(ctrl_width.W))  // control word entering the chain
+        val cbus_out    = Output(Vec(n * n, UInt(ctrl_width.W)))  // row-major: index n*i+j is grid position (i, j)
+    })
+    // Register chain, reset to 0; stage k feeds every output with i + j == k (one anti-diagonal)
+    val reg = RegInit(VecInit(Seq.fill(2*n-1)(0.U(ctrl_width.W))))
+
+    // 1D systolic array for control: each stage takes the previous stage's value every cycle
+    reg(0) := io.cbus_in
+    for(i<- 1 until 2*n-1){
+        reg(i) := reg(i-1)
+    }
+    // Broadcast to all elements in the array
+    for(i <- 0 until n){
+        for(j <- 0 until n){
+            io.cbus_out(n*i+j) := reg(i+j)  // largest tap is (n-1)+(n-1) = 2*n-2, the last stage
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/main/scala/ncore/neuralCore.scala b/src/main/scala/ncore/neuralCore.scala
new file mode 100644
index 0000000..3ef0eb9
--- /dev/null
+++ b/src/main/scala/ncore/neuralCore.scala
@@ -0,0 +1,63 @@
+// See README.md for license details
+package ncore
+
+import chisel3._
+
+/** Neural core: an n x n grid of PEs fed by vec_a / vec_b through register
+ *  chains, with the per-PE control word delivered by a cu.ControlUnit.
+ */
+ class NeuralCore(val n: Int = 8, val nbits: Int = 8, val ctrl_width: Int = 8) extends Module {
+    val io = IO(new Bundle {
+        val vec_a   = Input(Vec(n, UInt(nbits.W)))  // vector `a` is the left input
+        val vec_b   = Input(Vec(n, UInt(nbits.W)))  // vector `b` is the top input
+        val ctrl    = Input(UInt(ctrl_width.W))  // control word; only bit 0 (ACCUM) is consumed below
+        val out     = Output(Vec(n * n, UInt((2 * nbits + 12).W)))  // row-major PE results, index n*i+j
+    })
+
+    // Create n x n pe blocks
+    val pe_io = VecInit(Seq.fill(n * n) {Module(new pe.PE(nbits)).io})
+    // Registers between neighbouring PEs: horizontal is n rows x (n-1), vertical is (n-1) rows x n
+    val pe_reg_h = RegInit(VecInit(Seq.fill((n - 1) * n)(0.U(nbits.W))))
+    val pe_reg_v = RegInit(VecInit(Seq.fill((n - 1) * n)(0.U(nbits.W))))
+
+    // we use systolic array to pipeline the instructions
+    // this will avoid bubbles and instruction complexity
+    // while simplifying design with higher efficiency
+    val ctrl_array = Module(new cu.ControlUnit(n, ctrl_width))
+    ctrl_array.io.cbus_in := io.ctrl
+
+    for (i <- 0 until n){
+        for (j <- 0 until n) {
+            // ==== OUTPUT ====
+            // pe array's output mapped to the matrix position
+            io.out(n * i + j) := pe_io(n * i + j).out
+
+            // ==== INPUT ====
+            // vertical: row 0 takes vec_b directly, other rows take the registered in_b of the PE above
+            if (i==0) {
+                pe_io(j).in_b := io.vec_b(j)
+            } else {
+                pe_io(n * i + j).in_b := pe_reg_v(n * (i - 1) + j)
+            }
+            if (i < n - 1 && j < n)  // no register below the last row; `j < n` always holds in this loop
+                pe_reg_v(n * i + j) := pe_io(n * i + j).in_b
+
+            // horizontal: column 0 takes vec_a directly, other columns take the registered in_a of the PE to the left
+            if (j==0) {
+                pe_io(n * i).in_a := io.vec_a(i)
+            } else {
+                pe_io(n * i + j).in_a := pe_reg_h((n - 1) * i + (j - 1))
+            }
+            if (i < n && j < n - 1)  // no register right of the last column; `i < n` always holds in this loop
+                pe_reg_h((n - 1) * i + j) := pe_io(n * i + j).in_a
+
+            // ==== CONTROL ====
+            // Currently we only have one bit control
+            // which is `ACCUM`
+            // TODO:
+            // Add ALU control to pe elements
+            val ctrl = ctrl_array.io.cbus_out(n * i + j).asBools
+            pe_io(n * i + j).accum := ctrl(0)
+        }
+    }
+ }
\ No newline at end of file
diff --git a/src/main/scala/procElem/procElem.scala b/src/main/scala/ncore/pe/procElem.scala
similarity index 56%
rename from src/main/scala/procElem/procElem.scala
rename to src/main/scala/ncore/pe/procElem.scala
index 503e92d..ff1a662 100644
--- a/src/main/scala/procElem/procElem.scala
+++ b/src/main/scala/ncore/pe/procElem.scala
@@ -1,6 +1,6 @@
 // See README.md for license details.
 
-package procElem
+package ncore.pe
 
 import chisel3._
 
@@ -12,10 +12,8 @@ class PE(val nbits: Int = 8) extends Module {
   val io = IO(
     new Bundle {
       val accum       = Input(Bool())
-      val top_in      = Input(UInt(nbits.W))
-      val left_in     = Input(UInt(nbits.W))
-      val bottom_out  = Output(UInt((nbits).W))
-      val right_out   = Output(UInt((nbits).W))
+      val in_a      = Input(UInt(nbits.W))
+      val in_b     = Input(UInt(nbits.W))
       //  The register bandwith is optimized for large transformer 
       //  The lower bound of max cap matrix size is:
       //    2^12 x 2^12 = (4096 x 4096)
@@ -23,19 +21,11 @@ class PE(val nbits: Int = 8) extends Module {
   })
 
   val res = RegInit(0.U((nbits*2 + 12).W))
-  val reg_h = RegInit(0.U(nbits.W))
-  val reg_v = RegInit(0.U(nbits.W))
 
   when (io.accum) {
-    res := res + (io.top_in * io.left_in)
+    res := res + (io.in_a * io.in_b)
   } .otherwise {
-    res := (io.top_in * io.left_in)
+    res := (io.in_a * io.in_b)
   }
-
-  reg_v := io.top_in
-  reg_h := io.left_in
-
-  io.bottom_out := reg_v
-  io.right_out := reg_h
   io.out := res
 }
\ No newline at end of file
diff --git a/src/main/scala/npu/npu.scala b/src/main/scala/npu/npu.scala
index d31a0c8..2fba8a3 100644
--- a/src/main/scala/npu/npu.scala
+++ b/src/main/scala/npu/npu.scala
@@ -4,29 +4,25 @@ import chisel3._
 import java.nio.file.{Paths, Files}
 import java.nio.charset.StandardCharsets
 import circt.stage.ChiselStage
-import procElem.PE
+import ncore.pe.PE
 
 class NPU extends Module {
 
   val nbits: Int = 8
   val io = IO(new Bundle {
-    val top_in      = Input(UInt(nbits.W))
-    val left_in     = Input(UInt(nbits.W))
+    val in_a        = Input(UInt(nbits.W))
+    val in_b        = Input(UInt(nbits.W))
     val accum       = Input(Bool())
-    val bottom_out  = Output(UInt((nbits*2).W))
-    val right_out   = Output(UInt((nbits*2).W))
     val out         = Output(UInt((nbits*2).W))
   })  
 
   val pe = Module(new PE(8))
   
   // get value when ready
-  pe.io.top_in := io.top_in
-  pe.io.left_in := io.left_in
+  pe.io.in_a := io.in_a
+  pe.io.in_b := io.in_b
   pe.io.accum := io.accum
   io.out := pe.io.out
-  io.bottom_out := pe.io.bottom_out
-  io.right_out := pe.io.right_out
 }
 
 object Main extends App {
diff --git a/src/main/scala/systolicArray/systolicArray.scala b/src/main/scala/systolicArray/systolicArray.scala
deleted file mode 100644
index 6656a21..0000000
--- a/src/main/scala/systolicArray/systolicArray.scala
+++ /dev/null
@@ -1,83 +0,0 @@
-// See README.md for license details
-package systolicArray
-
-import chisel3._
-import procElem._
-
-/**
- * Control bus also uses systolic array to pass instructions
- */
-class _ControlArray(val n: Int = 8, val ctrl_width: Int = 8) extends Module {
-    val io = IO(new Bundle {
-        val cbus_in     = Input(UInt(ctrl_width.W))
-        val cbus_out    = Output(Vec(n * n, UInt(ctrl_width.W)))
-    })
-    // Assign each element with diagnal control signal
-    val reg = RegInit(VecInit(Seq.fill(2*n-1)(0.U(ctrl_width.W))))
-
-    // 1D systolic array for control
-    reg(0) := io.cbus_in
-    for(i<- 1 until 2*n-1){
-        reg(i) := reg(i-1)
-    }
-    // Boardcast to all elements in the array
-    for(i <- 0 until n){
-        for(j <- 0 until n){
-            io.cbus_out(n*i+j) := reg(i+j)
-        }
-    }
-}
-
-/**
- * This is the systolic array design
- */
- class SystolicArray(val n: Int = 8, val nbits: Int = 8, val ctrl_width: Int = 8) extends Module {
-    val io = IO(new Bundle {
-        val vec_a   = Input(Vec(n, UInt(nbits.W)))  // vector `a` is the left input
-        val vec_b   = Input(Vec(n, UInt(nbits.W)))  // vector `b` is the top input
-        val ctrl    = Input(UInt(ctrl_width.W))
-        val out     = Output(Vec(n * n, UInt((2 * nbits + 12).W)))
-    })
-
-    // Create n x n pe blocks
-    val pe_io = VecInit(Seq.fill(n * n) {Module(new PE(nbits)).io})
-
-    // we use systolic array to pipeline the instructions
-    // this will avoid bubble and inst complexity 
-    // while simplifying design with higher efficiency
-    val ctrl_array = Module(new _ControlArray(n, ctrl_width))
-    ctrl_array.io.cbus_in := io.ctrl
-    // for (i <- 0 until n * n) {
-    //     pe_io(i).accum := io.ctrl(0)
-    // }
-
-    for (i <- 0 until n){
-        for (j <- 0 until n) {
-            // ==== OUTPUT ====
-            // pe array's output mapped to the matrix position
-            io.out(n * i + j) := pe_io(n * i + j).out
-
-            // ==== INPUT ====
-            // vertical
-            if (i==0) {
-                pe_io(j).top_in := io.vec_b(j)
-            } else {
-                pe_io(n * i + j).top_in := pe_io(n * (i - 1) + j).bottom_out
-            }
-            // horizontal
-            if (j==0) {
-                pe_io(n * i).left_in := io.vec_a(i)
-            } else {
-                pe_io(n * i + j).left_in := pe_io(n * i + (j - 1)).right_out
-            }
-
-            // ==== CONTROL ====
-            // Currently we only have one bit control
-            // which is `ACCUM`
-            // TODO:
-            // Add ALU control to pe elements
-            val ctrl = ctrl_array.io.cbus_out(n * i + j).asBools
-            pe_io(n * i + j).accum := ctrl(0)
-        }
-    }
- }
\ No newline at end of file
diff --git a/src/test/scala/systolicArray/SASpec.scala b/src/test/scala/ncore/CoreSpec.scala
similarity index 65%
rename from src/test/scala/systolicArray/SASpec.scala
rename to src/test/scala/ncore/CoreSpec.scala
index 7dc7a5d..0f80604 100644
--- a/src/test/scala/systolicArray/SASpec.scala
+++ b/src/test/scala/ncore/CoreSpec.scala
@@ -1,72 +1,26 @@
 //// See README.md for license details.
 
-package systolicArray
+package ncore
 
+import testUtil._
 import scala.util.Random
 import chisel3._
 import chiseltest._
 import org.scalatest.flatspec.AnyFlatSpec
 import chisel3.experimental.BundleLiterals._
 
-class SASpec extends AnyFlatSpec with ChiselScalatestTester {
+class CoreSpec extends AnyFlatSpec with ChiselScalatestTester {
 
-    def printMatrix(mat: Array[Int], n: Int): Unit = {
-        println("[")
-        for (i <- 0 until n) {
-            var _row = ""
-            for (j <- 0 until n) {
-                _row += mat(i * n + j).toString() + ", "
-            }
-            println("[" + _row + "],")
-        }
-        println("]")
-    }
-
-    def printMatrixChisel(mat: chisel3.Vec[chisel3.UInt], n: Int): Unit = {
-        println("[")
-        for (i <- 0 until n) {
-            var _row = ""
-            for (j <- 0 until n) {
-                _row += mat(i * n + j).peekInt().toString() + ", "
-            }
-            println("[" + _row + "],")
-        }
-        println("]")
-    }
-
-    "SA" should "control with a systolic array" in {
-        test(new _ControlArray(4)) { dut =>
-            val _n = 4
-            val rand = new Random
-            var history = new Array[Int](2 * _n - 1)
-            var prod = 0
-            for (n <- 0 until 16) {
-                val _cbus_in = rand.between(0, 255)
-                history +:= _cbus_in
-                dut.io.cbus_in.poke(_cbus_in)
-                dut.clock.step()
-                history = history.slice(0, 2 * _n - 1)
-                println("Input tick @ " + n + ": " + _cbus_in)
-                for(i: Int <- 0 until _n){
-                    for(j:Int <- 0 until _n) {
-                        dut.io.cbus_out(_n * i + j).expect(history(i + j))
-                    }
-                }
-                println("Control tick @ " + n + " : ")
-                this.printMatrixChisel(dut.io.cbus_out, _n)
-            }
-        }
-    }
-
-    "SA" should "do a normal matrix multiplication" in {
-        test(new SystolicArray(4, 8)) { dut =>
+    "NeuralCore" should "do a normal matrix multiplication" in {
+        test(new NeuralCore(4, 8)) { dut =>
+            val print_helper = new testUtil.PrintHelper()
             val _n = 4
             val rand = new Random
             val _mat_a = new Array[Int](_n * _n)
             val _mat_b = new Array[Int](_n * _n)
             val _expected = new Array[Int](_n * _n)
             var _res = new Array[Int](_n * _n)
-            
+
             // random initialize the
             for (i <- 0 until _n * _n) {
                 _mat_a(i) = rand.between(0, 255)
@@ -84,11 +38,11 @@ class SASpec extends AnyFlatSpec with ChiselScalatestTester {
 
             // print the expected results
             println("===== MAT A =====")
-            this.printMatrix(_mat_a, _n)
+            print_helper.printMatrix(_mat_a, _n)
             println("===== MAT B =====")
-            this.printMatrix(_mat_b, _n)
+            print_helper.printMatrix(_mat_b, _n)
             println("+++++ MAT C +++++")
-            this.printMatrix(_expected, _n)
+            print_helper.printMatrix(_expected, _n)
 
             // systolic arrays has latency of 3 * _n - 2
             for (i_tick <- 0 until 3 * _n - 2) {
@@ -146,7 +100,7 @@ class SASpec extends AnyFlatSpec with ChiselScalatestTester {
                 }
             }
             println("+++++ MAT C from HW ++++")
-            this.printMatrix(_res, _n)
+            print_helper.printMatrix(_res, _n)
         }
     }   
 }
\ No newline at end of file
diff --git a/src/test/scala/ncore/cu/CUSpec.scala b/src/test/scala/ncore/cu/CUSpec.scala
new file mode 100644
index 0000000..03c02a7
--- /dev/null
+++ b/src/test/scala/ncore/cu/CUSpec.scala
@@ -0,0 +1,38 @@
+//// See README.md for license details.
+
+package ncore.cu
+
+import testUtil._
+import scala.util.Random
+import chisel3._
+import chiseltest._
+import org.scalatest.flatspec.AnyFlatSpec
+import chisel3.experimental.BundleLiterals._
+
+class CUSpec extends AnyFlatSpec with ChiselScalatestTester {
+
+    "CU" should "send control to 2D systolic array" in {
+        test(new ControlUnit(4)) { dut =>
+            val print_helper = new testUtil.PrintHelper()
+            val _n = 4  // must match the size passed to ControlUnit above
+            val rand = new Random
+            var history = new Array[Int](2 * _n - 1)  // expected chain contents, newest first (starts as all zeros)
+            var prod = 0  // NOTE(review): unused in this test
+            for (n <- 0 until 16) {
+                val _cbus_in = rand.between(0, 255)  // NOTE(review): upper bound is exclusive, so 255 is never generated - confirm intended
+                history +:= _cbus_in  // prepend the value being poked this iteration
+                dut.io.cbus_in.poke(_cbus_in)
+                dut.clock.step()
+                history = history.slice(0, 2 * _n - 1)  // keep only as many entries as the DUT has stages
+                println("Input tick @ " + n + ": " + _cbus_in)
+                for(i: Int <- 0 until _n){
+                    for(j:Int <- 0 until _n) {
+                        dut.io.cbus_out(_n * i + j).expect(history(i + j))  // output (i, j) shows the value poked i + j iterations earlier
+                    }
+                }
+                println("Control tick @ " + n + " : ")
+                print_helper.printMatrixChisel(dut.io.cbus_out, _n)
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/test/scala/procElem/PESpec.scala b/src/test/scala/ncore/pe/PESpec.scala
similarity index 75%
rename from src/test/scala/procElem/PESpec.scala
rename to src/test/scala/ncore/pe/PESpec.scala
index 9801962..bd9c3ae 100644
--- a/src/test/scala/procElem/PESpec.scala
+++ b/src/test/scala/ncore/pe/PESpec.scala
@@ -1,6 +1,6 @@
 // See README.md for license details.
 
-package procElem
+package ncore.pe
 
 import scala.util.Random
 import chisel3._
@@ -18,12 +18,10 @@ class PESpec extends AnyFlatSpec with ChiselScalatestTester {
       for (n <- 0 until 128) {
         val _top_in_ = rand.between(0, 255)
         val _left_in_ = rand.between(0, 255)
-        dut.io.top_in.poke(_top_in_)
-        dut.io.left_in.poke(_left_in_)
+        dut.io.in_a.poke(_top_in_)
+        dut.io.in_b.poke(_left_in_)
         dut.io.accum.poke(true)
         dut.clock.step()
-        dut.io.bottom_out.expect(_top_in_)
-        dut.io.right_out.expect(_left_in_)
         prod = prod + _top_in_ * _left_in_
         dut.io.out.expect(prod)
         println("Result tick @ " + n + ": " + dut.io.out.peekInt() + " with input top: " + _top_in_ + " and left: " + _left_in_)
@@ -32,24 +30,20 @@ class PESpec extends AnyFlatSpec with ChiselScalatestTester {
       prod = 0
       var _top_in_ = rand.between(1, 255)
       var _left_in_ = rand.between(1, 255)
-      dut.io.top_in.poke(_top_in_)
-      dut.io.left_in.poke(_left_in_)
+      dut.io.in_a.poke(_top_in_)
+      dut.io.in_b.poke(_left_in_)
       dut.io.accum.poke(false)
       dut.clock.step()
-      dut.io.bottom_out.expect(_top_in_)
-      dut.io.right_out.expect(_left_in_)
       prod = prod + _top_in_ * _left_in_
       dut.io.out.expect(prod)
       println("Result tick @ new: " + dut.io.out.peekInt() + " with input top: " + _top_in_ + " and left: " + _left_in_)
 
       _top_in_ = rand.between(1, 255)
       _left_in_ = rand.between(1, 255)
-      dut.io.top_in.poke(_top_in_)
-      dut.io.left_in.poke(_left_in_)
+      dut.io.in_a.poke(_top_in_)
+      dut.io.in_b.poke(_left_in_)
       dut.io.accum.poke(true)
       dut.clock.step()
-      dut.io.bottom_out.expect(_top_in_)
-      dut.io.right_out.expect(_left_in_)
       prod = prod + _top_in_ * _left_in_
       dut.io.out.expect(prod)
       println("Result tick @ new's next: " + dut.io.out.peekInt() + " with input top: " + _top_in_ + " and left: " + _left_in_)
diff --git a/src/test/scala/utils/printHelper.scala b/src/test/scala/utils/printHelper.scala
new file mode 100644
index 0000000..c520bda
--- /dev/null
+++ b/src/test/scala/utils/printHelper.scala
@@ -0,0 +1,31 @@
+
+package testUtil
+
+import chisel3._
+import chiseltest._
+
+class PrintHelper(){  // stateless matrix-printing helpers used by the test specs
+    def printMatrix(mat: Array[Int], n: Int): Unit = {  // prints a row-major n x n Int matrix, one bracketed row per line
+        println("[")
+        for (i <- 0 until n) {
+            var _row = ""
+            for (j <- 0 until n) {
+                _row += mat(i * n + j).toString() + ", "  // every element, including the last, is followed by ", "
+            }
+            println("[" + _row + "],")
+        }
+        println("]")
+    }
+
+    def printMatrixChisel(mat: chisel3.Vec[chisel3.UInt], n: Int): Unit = {  // same layout, reading each element with peekInt()
+        println("[")
+        for (i <- 0 until n) {
+            var _row = ""
+            for (j <- 0 until n) {
+                _row += mat(i * n + j).peekInt().toString() + ", "  // peekInt() is the chiseltest read of the signal's current value
+            }
+            println("[" + _row + "],")
+        }
+        println("]")
+    }
+}
\ No newline at end of file