From ef8b40d8f193de5b647da143b1e9e75d7c68c178 Mon Sep 17 00:00:00 2001
From: "Fangrui.Liu" <fangrui.liu@ubc.ca>
Date: Sun, 17 Mar 2024 23:34:11 +0800
Subject: [PATCH 1/5] add offset generator

---
 src/main/scala/isa/backend/memMicroCode.scala | 21 ++++++++
 src/main/scala/isa/instSetArch.scala          | 10 +++-
 src/main/scala/isa/memMicroCode.scala         | 44 -----------------
 src/main/scala/ncore/mmu/memMngUnit.scala     | 48 +++++++++++++++++++
 src/main/scala/ncore/tcm/tightCpldMem.scala   | 46 ++++++++----------
 src/test/scala/ncore/mmu/MMUSpec.scala        | 37 ++++++++++++++
 src/test/scala/utils/printHelper.scala        |  8 ++++
 7 files changed, 142 insertions(+), 72 deletions(-)
 create mode 100644 src/main/scala/isa/backend/memMicroCode.scala
 delete mode 100644 src/main/scala/isa/memMicroCode.scala
 create mode 100644 src/main/scala/ncore/mmu/memMngUnit.scala
 create mode 100644 src/test/scala/ncore/mmu/MMUSpec.scala

diff --git a/src/main/scala/isa/backend/memMicroCode.scala b/src/main/scala/isa/backend/memMicroCode.scala
new file mode 100644
index 0000000..2f243f4
--- /dev/null
+++ b/src/main/scala/isa/backend/memMicroCode.scala
@@ -0,0 +1,21 @@
+// See README.md for license details.
+
+package isa.backend
+import chisel3._
+import chisel3.util._
+
+object MemLayout extends ChiselEnum {
+    val bit8    = Value(0x0.U)
+    val bit16   = Value(0x1.U)
+    val bit32   = Value(0x2.U)
+}
+
+object MemChannel extends ChiselEnum {
+    val ch0     = Value(0x0.U)
+    // 16/32 bits will have no ch1
+    val ch1     = Value(0x1.U)
+    // 32 bits will have no ch2
+    val ch2     = Value(0x2.U)
+    // 16/32 bits will have no ch3
+    val ch3     = Value(0x3.U)
+}
diff --git a/src/main/scala/isa/instSetArch.scala b/src/main/scala/isa/instSetArch.scala
index f4b5359..84fd512 100644
--- a/src/main/scala/isa/instSetArch.scala
+++ b/src/main/scala/isa/instSetArch.scala
@@ -8,4 +8,12 @@ object NeuralISA extends ChiselEnum {
     val st = Value(0x2.U(4.W))
     val mma = Value(0x3.U(4.W))
     val ip = Value (0x4.U(4.W))
-}
\ No newline at end of file
+}
+
+object DType extends ChiselEnum {
+    val uint    = Value(0x0.U)
+    val int     = Value(0x1.U)
+    val fp      = Value(0x2.U)
+    // no bfp32c0
+    val bfp     = Value(0x3.U)
+}
diff --git a/src/main/scala/isa/memMicroCode.scala b/src/main/scala/isa/memMicroCode.scala
deleted file mode 100644
index a47e1e2..0000000
--- a/src/main/scala/isa/memMicroCode.scala
+++ /dev/null
@@ -1,44 +0,0 @@
-// See README.md for license details.
-
-package isa
-import chisel3._
-import chisel3.util._
-
-
-object OffsetPattern extends ChiselEnum {
-    val not_def = Value(0x0.U)
-    val sca_0d  = Value(0x1.U)
-    val vec_1d  = Value(0x2.U)
-    val mat_2d  = Value(0x3.U)
-}
-
-object AddressMode extends ChiselEnum {
-    val immd        = Value(0x0.U)
-    val addr        = Value(0x1.U)
-    val addr_immd   = Value(0x2.U)
-}
-
-
-object MemLayout extends ChiselEnum {
-    val bit8    = Value(0x0.U)
-    val bit16   = Value(0x1.U)
-    val bit32   = Value(0x2.U)
-}
-
-object DType extends ChiselEnum {
-    val uint    = Value(0x0.U)
-    val int     = Value(0x1.U)
-    val fp      = Value(0x2.U)
-    // no bfp32c0
-    val bfp     = Value(0x3.U)
-}
-
-object MemChannel extends ChiselEnum {
-    val ch0     = Value(0x1.U)
-    // 16/32 bits will have no ch1
-    val ch1     = Value(0x2.U)
-    // 32 bits will have no ch2
-    val ch2     = Value(0x4.U)
-    // 16/32 bits will have no ch3
-    val ch3     = Value(0x8.U)
-}
\ No newline at end of file
diff --git a/src/main/scala/ncore/mmu/memMngUnit.scala b/src/main/scala/ncore/mmu/memMngUnit.scala
new file mode 100644
index 0000000..62f0149
--- /dev/null
+++ b/src/main/scala/ncore/mmu/memMngUnit.scala
@@ -0,0 +1,48 @@
+// See README.md for license details.
+
+package ncore.mmu
+
+import chisel3._
+import chisel3.util._
+import isa.backend._
+import ncore._
+
+class MMUBundle extends Bundle {
+    val mem_ch  = MemChannel()
+    val mem_lo  = MemLayout()
+}
+
+class OffsetGenerator(val n: Int = 8) extends Module {
+    val io = IO(new Bundle {
+        val inc     = Input(Vec(n, Bool()))
+        val out     = Output(Vec(n, UInt(log2Ceil(n * n).W)))
+    })
+    val init_value = Seq.tabulate(n)(i => (n * i).U(log2Ceil(n * n).W))
+    val regs = RegInit(VecInit(init_value))
+
+    for (i <- 0 until n){
+        when (io.inc(i)) {
+            regs(i) := (regs(i) + 1.U) % (n * n).U
+        }.otherwise {
+            regs(i) := init_value(i)
+        }
+        io.out(i) := regs(i)
+    }
+}
+
+/**
+ * This is the neural core design
+ */
+class MemoryManageUnit(
+    val n: Int = 8, val nbits: Int = 8, val addr_width: Int = 24
+    ) extends Module {
+    val io = IO(new Bundle {
+        val base_addr   = Input(Vec(n, UInt(24.W)))
+        val ctrl        = Input(Vec(n * n, new MMUBundle()))
+        val out_a       = Output(Vec(n * n, UInt(32.W)))
+        val out_b       = Output(Vec(n * n, UInt(32.W)))
+    })
+
+    val offsetgen_a = new OffsetGenerator(n)
+    val offsetgen_b = new OffsetGenerator(n)
+}
\ No newline at end of file
diff --git a/src/main/scala/ncore/tcm/tightCpldMem.scala b/src/main/scala/ncore/tcm/tightCpldMem.scala
index f8fea8a..c0b8338 100644
--- a/src/main/scala/ncore/tcm/tightCpldMem.scala
+++ b/src/main/scala/ncore/tcm/tightCpldMem.scala
@@ -4,7 +4,6 @@ package ncore.tcm
 
 import chisel3._
 import chisel3.util._
-import isa._
 
 class TCMCell(val nbits: Int = 8) extends Module {
     val io = IO(
@@ -25,16 +24,14 @@ class TCMCell(val nbits: Int = 8) extends Module {
 
 class TCMBlock(val n: Int = 8, 
                val size: Int = 4096,
-               val r_addr_width: Int = 12,
-               val w_addr_width: Int = 12,
                val nbits: Int = 8
 ) extends Module {
     val io = IO(
         new Bundle {
             val d_in    = Input(Vec(n * n, UInt(nbits.W)))
             val d_out   = Output(Vec(n * n, UInt(nbits.W)))
-            val r_addr  = Input(Vec(n * n, UInt(r_addr_width.W)))
-            val w_addr  = Input(Vec(n * n, UInt(w_addr_width.W)))
+            val r_addr  = Input(Vec(n * n, UInt(log2Ceil(size).W)))
+            val w_addr  = Input(Vec(n * n, UInt(log2Ceil(size).W)))
             val en_wr   = Input(Bool())
         }
     )
@@ -60,33 +57,28 @@ class TCMBlock(val n: Int = 8,
 
 
 class DetachableTCM(
-    val n: Int = 8, 
+    val n: Int = 8,
+    val nblocks: Int = 4,
     val size: Int = 4096,
-    val r_addr_width: Int = 12,
-    val w_addr_width: Int = 12,
-    val mlayout_width: Int = 6,
 ) extends Module {
     val io = IO(new Bundle {
-        val d_in    = Input(Vec(n * n, UInt(32.W)))
-        val d_out   = Output(Vec(n * n, UInt(32.W)))
-        // read address will have channel selection for last 2 bits
-        val r_addr  = Input(Vec(n * n, UInt((r_addr_width + 2).W)))
-        // write address will have channel selection for last 2 bits
-        val w_addr  = Input(Vec(n * n, UInt((w_addr_width + 2).W)))
-        val mem_ch  = Input(MemChannel())
-        val mem_lo  = Input(MemLayout())
-        val en_wr   = Input(Bool())
+        val d_in        = Input(Vec(n * n, Vec(nblocks, UInt(8.W))))
+        val d_out       = Output(Vec(n * n, Vec(nblocks, UInt(8.W))))
+        val r_addr      = Input(Vec(n * n, UInt(log2Ceil(size).W)))
+        val w_addr      = Input(Vec(n * n, UInt(log2Ceil(size).W)))
+        val en_wr       = Input(Bool())
     })
 
-    switch (io.mem_lo) {
-        is (MemLayout.bit8) {
-            
-        }
-        is (MemLayout.bit16) {
-
-        }
-        is (MemLayout.bit32) {
-
+    val tcm_blocks_io  = VecInit(Seq.fill(nblocks) {
+        Module(new TCMBlock(n, size, 8)).io})
+    
+    for (i <- 0 until nblocks) {
+        tcm_blocks_io(i).en_wr := io.en_wr
+        for (j <- 0 until n) {
+            tcm_blocks_io(i).r_addr(j) := io.r_addr(j)
+            tcm_blocks_io(i).w_addr(j) := io.w_addr(j)
+            tcm_blocks_io(i).d_in(j) := io.d_in(j)(i)
+            io.d_out(j)(i) := tcm_blocks_io(i).d_out(j)
         }
     }
     
diff --git a/src/test/scala/ncore/mmu/MMUSpec.scala b/src/test/scala/ncore/mmu/MMUSpec.scala
new file mode 100644
index 0000000..b04e323
--- /dev/null
+++ b/src/test/scala/ncore/mmu/MMUSpec.scala
@@ -0,0 +1,37 @@
+//// See README.md for license details.
+
+package ncore.mmu
+
+import testUtil._
+import scala.util.Random
+import chisel3._
+import chiseltest._
+import org.scalatest.flatspec.AnyFlatSpec
+import chisel3.experimental.BundleLiterals._
+
+class MMUSpec extends AnyFlatSpec with ChiselScalatestTester {
+
+    "OffsetGenerator" should "provide correct offset" in {
+        test(new OffsetGenerator(4)) { dut =>
+            val print_helper = new testUtil.PrintHelper()
+            val _n = dut.n
+            val _array = List(List(false, false, false, false),
+                              List(true, false, false, false),  
+                              List(true, true, false, false), 
+                              List(true, true, true, false), 
+                              List(false, true, true, true), 
+                              List(true, false, true, true), 
+                              List(false, true, false, true), 
+                              List(false, false, true, false), 
+                              List(false, false, false, true), 
+                              )
+            for (i <- 0 until 16) {
+                for (j <- 0 until _n){
+                    dut.io.inc(j).poke(_array(i%_array.length)(j))
+                }
+                dut.clock.step()
+                print_helper.printVectorChisel(dut.io.out, _n)
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/test/scala/utils/printHelper.scala b/src/test/scala/utils/printHelper.scala
index c520bda..cf71ece 100644
--- a/src/test/scala/utils/printHelper.scala
+++ b/src/test/scala/utils/printHelper.scala
@@ -28,4 +28,12 @@ class PrintHelper(){
         }
         println("]")
     }
+
+    def printVectorChisel(vec: chisel3.Vec[chisel3.UInt], n: Int): Unit = {
+        var _row = ""
+        for (i <- 0 until n) {
+            _row += vec(i).peekInt().toString() + ", "
+        }
+        println("[" + _row + "]")
+    }
 }
\ No newline at end of file

From 2fd67a906c82ddd69a4a61603b409ca3263c63b7 Mon Sep 17 00:00:00 2001
From: "Fangrui.Liu" <fangrui.liu@ubc.ca>
Date: Sun, 17 Mar 2024 23:49:20 +0800
Subject: [PATCH 2/5] fix expect

---
 src/test/scala/ncore/mmu/MMUSpec.scala | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/src/test/scala/ncore/mmu/MMUSpec.scala b/src/test/scala/ncore/mmu/MMUSpec.scala
index b04e323..537328f 100644
--- a/src/test/scala/ncore/mmu/MMUSpec.scala
+++ b/src/test/scala/ncore/mmu/MMUSpec.scala
@@ -23,13 +23,26 @@ class MMUSpec extends AnyFlatSpec with ChiselScalatestTester {
                               List(true, false, true, true), 
                               List(false, true, false, true), 
                               List(false, false, true, false), 
-                              List(false, false, false, true), 
+                              List(false, false, false, true),
                               )
-            for (i <- 0 until 16) {
-                for (j <- 0 until _n){
+            val _expected = List(List(0, 4, 8, 12),
+                                List(1, 4, 8, 12),
+                                List(2, 5, 8, 12),
+                                List(3, 6, 9, 12),
+                                List(0, 7, 10, 13),
+                                List(1, 4, 11, 14),
+                                List(0, 5, 8, 15),
+                                List(0, 4, 9, 12),
+                                List(0, 4, 8, 13),
+            )
+            for (i <- 0 until _array.length) {
+                for (j <- 0 until _n) {
                     dut.io.inc(j).poke(_array(i%_array.length)(j))
                 }
                 dut.clock.step()
+                for (j <- 0 until _n) {
+                    dut.io.out(j).expect(_expected(i)(j))
+                }
                 print_helper.printVectorChisel(dut.io.out, _n)
             }
         }

From 0adaeca83205705c90c63d3ab735d81792d45ba1 Mon Sep 17 00:00:00 2001
From: "Fangrui.Liu" <fangrui.liu@ubc.ca>
Date: Sun, 17 Mar 2024 23:59:43 +0800
Subject: [PATCH 3/5] update ci

---
 .github/workflows/actions.yml | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/actions.yml b/.github/workflows/actions.yml
index e1865e1..2fb383c 100644
--- a/.github/workflows/actions.yml
+++ b/.github/workflows/actions.yml
@@ -5,8 +5,9 @@ on:
       - main
       - releases/**
   pull_request:
-    types:
-      - opened
+    branches:
+      - main
+      - 'releases/**'
 
 jobs:
   Lint:

From 9b12a92acc5894e958166b7fd3a206b3aa3e5dbb Mon Sep 17 00:00:00 2001
From: "Fangrui.Liu" <fangrui.liu@ubc.ca>
Date: Tue, 26 Mar 2024 23:49:38 +0800
Subject: [PATCH 4/5] multi channel input

---
 src/main/scala/ncore/mmu/memMngUnit.scala   | 76 ++++++++++++++++++++-
 src/main/scala/ncore/tcm/tightCpldMem.scala | 20 ++++--
 src/test/scala/ncore/tcm/TCMSpec.scala      | 60 +++++++++++++---
 3 files changed, 139 insertions(+), 17 deletions(-)

diff --git a/src/main/scala/ncore/mmu/memMngUnit.scala b/src/main/scala/ncore/mmu/memMngUnit.scala
index 62f0149..cfad5f3 100644
--- a/src/main/scala/ncore/mmu/memMngUnit.scala
+++ b/src/main/scala/ncore/mmu/memMngUnit.scala
@@ -6,6 +6,7 @@ import chisel3._
 import chisel3.util._
 import isa.backend._
 import ncore._
+import ncore.tcm._
 
 class MMUBundle extends Bundle {
     val mem_ch  = MemChannel()
@@ -30,14 +31,55 @@ class OffsetGenerator(val n: Int = 8) extends Module {
     }
 }
 
+
+class MemoryControlArray(val n: Int = 8) extends Module {
+    val io = IO(new Bundle {
+        val ctrl_in_a       = Input(Bool())
+        val ctrl_in_b       = Input(Bool())
+        val offset_inc_in   = Input(Bool())
+        val ctrl_out_a      = Output(Vec(n, Bool()))
+        val ctrl_out_b      = Output(Vec(n, Bool()))
+        val offset_inc_out  = Output(Vec((n-1) * (n-1), Bool()))
+    })
+    // Assign each element with diagnal control signal
+    val reg_inc = RegInit(VecInit(Seq.fill(2*n - 3)(0.B)))
+    val reg_a = RegInit(VecInit(Seq.fill(n)(0.B)))
+    val reg_b = RegInit(VecInit(Seq.fill(n)(0.B)))
+
+    reg_a(0) := io.ctrl_in_a(0)
+    reg_b(0) := io.ctrl_in_b(0)
+    for (i <- 1 until n - 1) {
+        reg_a(i) := reg_a(i-1)
+        reg_b(i) := reg_b(i-1)
+    }
+    
+    for (i <- 0 until n) {
+        io.ctrl_out_a(i) := reg_a(i)
+        io.ctrl_out_b(i) := reg_b(i)
+    }
+
+    reg_inc(0) := io.offset_inc_in
+    for (i <- 0 until 2 * n - 3) {
+        reg_inc(i) := reg_inc(i - 1)
+    }
+    for (i <- 0 until n - 1) {
+        for (j <- 0 until n - 1) {
+            io.offset_inc_out(n * i + j) := reg_inc(i + j)
+        }
+    }
+}
+
 /**
  * This is the neural core design
  */
 class MemoryManageUnit(
-    val n: Int = 8, val nbits: Int = 8, val addr_width: Int = 24
+    val n: Int = 8, 
+    val nbits: Int = 8, 
+    val word_size: Int = 4, 
+    val size: Int = 4096
     ) extends Module {
     val io = IO(new Bundle {
-        val base_addr   = Input(Vec(n, UInt(24.W)))
+        val base_addr   = Input(UInt(log2Ceil(size).W))
         val ctrl        = Input(Vec(n * n, new MMUBundle()))
         val out_a       = Output(Vec(n * n, UInt(32.W)))
         val out_b       = Output(Vec(n * n, UInt(32.W)))
@@ -45,4 +87,34 @@ class MemoryManageUnit(
 
     val offsetgen_a = new OffsetGenerator(n)
     val offsetgen_b = new OffsetGenerator(n)
+
+    val mem = new DetachableTCM(n, word_size, size, 2)
+
+    // Create 2d register for horizontal & vertical
+    val reg_h = RegInit(VecInit(Seq.fill((n - 1) * n)(0.U(nbits.W))))
+    val reg_v = RegInit(VecInit(Seq.fill((n - 1) * n)(0.U(nbits.W))))
+
+    for (i <- 0 until n){
+        for (j <- 0 until n) {
+            // ==== INPUT ====
+            // vertical
+            if (i==0) {
+                mem.io.r_addr(0)(j) := io.base_addr + offsetgen_b.io.out(j)
+            } else {
+                mem.io.r_addr(0)(n * i + j) := reg_v(n * (i - 1) + j)
+            }
+            if (i < n - 1 && j < n)
+                reg_v(n * i + j) := mem.io.r_addr(0)(n * i + j)
+
+            // horizontal
+            if (j==0) {
+                mem.io.r_addr(1)(n * i) := io.base_addr + offsetgen_a.io.out(i)
+            } else {
+                mem.io.r_addr(1)(n * i + j) := reg_h((n - 1) * i + (j - 1))
+            }
+            if (i < n && j < n - 1)
+                reg_h((n - 1) * i + j) := mem.io.r_addr(1)(n * i + j)
+        }
+    }
+
 }
\ No newline at end of file
diff --git a/src/main/scala/ncore/tcm/tightCpldMem.scala b/src/main/scala/ncore/tcm/tightCpldMem.scala
index c0b8338..b036255 100644
--- a/src/main/scala/ncore/tcm/tightCpldMem.scala
+++ b/src/main/scala/ncore/tcm/tightCpldMem.scala
@@ -24,13 +24,14 @@ class TCMCell(val nbits: Int = 8) extends Module {
 
 class TCMBlock(val n: Int = 8, 
                val size: Int = 4096,
+               val rd_ch_num: Int = 2,
                val nbits: Int = 8
 ) extends Module {
     val io = IO(
         new Bundle {
             val d_in    = Input(Vec(n * n, UInt(nbits.W)))
-            val d_out   = Output(Vec(n * n, UInt(nbits.W)))
-            val r_addr  = Input(Vec(n * n, UInt(log2Ceil(size).W)))
+            val d_out   = Output(Vec(rd_ch_num, Vec(n * n, UInt(nbits.W))))
+            val r_addr  = Input(Vec(rd_ch_num, Vec(n * n, UInt(log2Ceil(size).W))))
             val w_addr  = Input(Vec(n * n, UInt(log2Ceil(size).W)))
             val en_wr   = Input(Bool())
         }
@@ -47,7 +48,9 @@ class TCMBlock(val n: Int = 8,
     //TODO: add read & write conflict check
 
     for (i <- 0 until n * n) {
-        io.d_out(i) := cells_io(io.r_addr(i)).d_out
+        for (k <- 0 until rd_ch_num) {
+            io.d_out(k)(i) := cells_io(io.r_addr(k)(i)).d_out
+        }
         when (io.en_wr) {
             cells_io(io.w_addr(i)).en_wr := io.en_wr
             cells_io(io.w_addr(i)).d_in := io.d_in(i)
@@ -60,22 +63,25 @@ class DetachableTCM(
     val n: Int = 8,
     val nblocks: Int = 4,
     val size: Int = 4096,
+    val rd_ch_num: Int = 2,
 ) extends Module {
     val io = IO(new Bundle {
         val d_in        = Input(Vec(n * n, Vec(nblocks, UInt(8.W))))
-        val d_out       = Output(Vec(n * n, Vec(nblocks, UInt(8.W))))
-        val r_addr      = Input(Vec(n * n, UInt(log2Ceil(size).W)))
+        val d_out       = Output(Vec(rd_ch_num, Vec(n * n, Vec(nblocks, UInt(8.W)))))
+        val r_addr      = Input(Vec(rd_ch_num, Vec(n * n, UInt(log2Ceil(size).W))))
         val w_addr      = Input(Vec(n * n, UInt(log2Ceil(size).W)))
         val en_wr       = Input(Bool())
     })
 
     val tcm_blocks_io  = VecInit(Seq.fill(nblocks) {
-        Module(new TCMBlock(n, size, 8)).io})
+        Module(new TCMBlock(n, size, rd_ch_num, 8)).io})
     
     for (i <- 0 until nblocks) {
         tcm_blocks_io(i).en_wr := io.en_wr
         for (j <- 0 until n) {
-            tcm_blocks_io(i).r_addr(j) := io.r_addr(j)
+            for (k <- 0 until rd_ch_num) {
+                tcm_blocks_io(i).r_addr(k)(j) := io.r_addr(k)(j)
+            }
             tcm_blocks_io(i).w_addr(j) := io.w_addr(j)
             tcm_blocks_io(i).d_in(j) := io.d_in(j)(i)
             io.d_out(j)(i) := tcm_blocks_io(i).d_out(j)
diff --git a/src/test/scala/ncore/tcm/TCMSpec.scala b/src/test/scala/ncore/tcm/TCMSpec.scala
index 01bc2b5..6477973 100644
--- a/src/test/scala/ncore/tcm/TCMSpec.scala
+++ b/src/test/scala/ncore/tcm/TCMSpec.scala
@@ -30,7 +30,7 @@ class TCMSpec extends AnyFlatSpec with ChiselScalatestTester {
   }
 
   "TCM Block" should "write on signal and read anytime" in {
-    test(new TCMBlock(3, 192)) { dut =>
+    test(new TCMBlock(3, 192, 1)) { dut =>
       val _n = dut.n
       val _cells = dut.size
       val rand = new Random
@@ -46,20 +46,20 @@ class TCMSpec extends AnyFlatSpec with ChiselScalatestTester {
         dut.io.en_wr.poke(true)
         dut.clock.step()
         for (i <- 0 until _n * _n) {
-          dut.io.r_addr(i).poke(_in_addr(i))
+          dut.io.r_addr(0)(i).poke(_in_addr(i))
         }
         for (i <- 0 until _n * _n){
-          dut.io.d_out(i).expect(_in_data(i))
+          dut.io.d_out(0)(i).expect(_in_data(i))
         }
         println("Result tick @ " + _i + ": ")
         print_helper.printMatrix(_in_data, _n)
-        print_helper.printMatrixChisel(dut.io.d_out, _n)
+        print_helper.printMatrixChisel(dut.io.d_out(0), _n)
       }
     }
   }
 
   "TCM Block" should "read anytime" in {
-    test(new TCMBlock(2, 64)) { dut =>
+    test(new TCMBlock(2, 64, 1)) { dut =>
       val _n = dut.n
       val _cells = dut.size
       val rand = new Random
@@ -81,16 +81,60 @@ class TCMSpec extends AnyFlatSpec with ChiselScalatestTester {
         val _r_addr = rand.shuffle((0 until _cells).toList).take(_n * _n)
         val _expected = new Array[Int](_n * _n)
         for (i <- 0 until _n * _n) {
-          dut.io.r_addr(i).poke(_r_addr(i))
+          dut.io.r_addr(0)(i).poke(_r_addr(i))
         }
         for (i <- 0 until _n * _n) {
           _expected(i) = _data(_r_addr(i))
         }
         println("Result tick @ " + _i + ": ")
         print_helper.printMatrix(_expected, _n)
-        print_helper.printMatrixChisel(dut.io.d_out, _n)
+        print_helper.printMatrixChisel(dut.io.d_out(0), _n)
         for (i <- 0 until _n * _n){
-          dut.io.d_out(i).expect(_data(_r_addr(i)))
+          dut.io.d_out(0)(i).expect(_data(_r_addr(i)))
+        }
+      }
+    }
+  }
+
+  "TCM Block" should "read anytime on different channels" in {
+    test(new TCMBlock(2, 64, 2)) { dut =>
+      val _n = dut.n
+      val _cells = dut.size
+      val _rd_ch_num = dut.rd_ch_num
+      val rand = new Random
+      val print_helper = new testUtil.PrintHelper()
+      val _data = new Array[Int](_cells)
+      for (_i <- 0 until 10) {
+        val _in_data = new Array[Int](_rd_ch_num * _n * _n)
+        val _in_addr = rand.shuffle((0 until _cells).toList).take(_rd_ch_num * _n * _n)
+        for (k <- 0 until _rd_ch_num){
+          for (i <- 0 until _n * _n) {
+            val _ind = k * _n * _n + i
+            _in_data(_ind) = rand.between(0, 255)
+            dut.io.d_in(i).poke(_in_data(_ind))
+            dut.io.w_addr(i).poke(_in_addr(_ind))
+            _data(_in_addr(_ind)) = _in_data(_ind)
+          }
+          dut.io.en_wr.poke(true)
+          dut.clock.step()
+        }
+      }
+      for(_i <- 0 until 10){
+        val _r_addr = rand.shuffle((0 until _cells).toList).take(_rd_ch_num * _n * _n)
+        val _expected = new Array[Int](_rd_ch_num * _n * _n)
+        for (k <- 0 until _rd_ch_num){
+          for (i <- 0 until _n * _n) {
+            val _ind = k * _n * _n + i
+            dut.io.r_addr(k)(i).poke(_r_addr(_ind))
+            _expected(_ind) = _data(_r_addr(_ind))
+          }
+        }
+        println("Result tick @ " + _i + ": ")
+        for (k <- 0 until _rd_ch_num){
+          for (i <- 0 until _n * _n){
+            val _ind = k * _n * _n + i
+            dut.io.d_out(k)(i).expect(_data(_r_addr(_ind)))
+          }
         }
       }
     }

From e3c9217011e55a4c3647756060799887b36ce623 Mon Sep 17 00:00:00 2001
From: "Fangrui.Liu" <fangrui.liu@ubc.ca>
Date: Sat, 6 Apr 2024 16:22:48 +0800
Subject: [PATCH 5/5] clean code

---
 .../scala/isa/backend/controlMicroCode.scala  |   9 ++
 src/main/scala/isa/backend/memMicroCode.scala |   7 +
 src/main/scala/ncore/cu/controlUnit.scala     |  27 +++-
 src/main/scala/ncore/mmu/memMngUnit.scala     | 120 ------------------
 src/main/scala/ncore/neuralCore.scala         |  48 ++-----
 src/main/scala/ncore/sa/systolicArray.scala   |  46 +++++++
 src/main/scala/npu/npu.scala                  |  36 ------
 src/main/scala/{ncore => }/pe/procElem.scala  |   7 +-
 .../tightCpldMem.scala => sram/SRAM.scala}    |  24 ++--
 src/main/scala/top/top.scala                  |  18 +++
 src/main/scala/vcore/mmu/memMngUnit.scala     |  84 ++++++++++++
 src/test/scala/ncore/CoreSpec.scala           |   8 +-
 src/test/scala/ncore/cu/CUSpec.scala          |   8 +-
 src/test/scala/ncore/mmu/MMUSpec.scala        |  92 +++++++-------
 src/test/scala/{ncore => }/pe/PESpec.scala    |   8 +-
 .../tcm/TCMSpec.scala => sram/SRAMSpec.scala} |  20 +--
 16 files changed, 287 insertions(+), 275 deletions(-)
 create mode 100644 src/main/scala/isa/backend/controlMicroCode.scala
 delete mode 100644 src/main/scala/ncore/mmu/memMngUnit.scala
 create mode 100644 src/main/scala/ncore/sa/systolicArray.scala
 delete mode 100644 src/main/scala/npu/npu.scala
 rename src/main/scala/{ncore => }/pe/procElem.scala (86%)
 rename src/main/scala/{ncore/tcm/tightCpldMem.scala => sram/SRAM.scala} (78%)
 create mode 100644 src/main/scala/top/top.scala
 create mode 100644 src/main/scala/vcore/mmu/memMngUnit.scala
 rename src/test/scala/{ncore => }/pe/PESpec.scala (93%)
 rename src/test/scala/{ncore/tcm/TCMSpec.scala => sram/SRAMSpec.scala} (90%)

diff --git a/src/main/scala/isa/backend/controlMicroCode.scala b/src/main/scala/isa/backend/controlMicroCode.scala
new file mode 100644
index 0000000..d517852
--- /dev/null
+++ b/src/main/scala/isa/backend/controlMicroCode.scala
@@ -0,0 +1,9 @@
+// See README.md for license details.
+
+package isa.backend
+import chisel3._
+import chisel3.util._
+
+class NCoreCUBundle (val size: Int = 4096) extends Bundle {
+    val accum = Bool()
+}
\ No newline at end of file
diff --git a/src/main/scala/isa/backend/memMicroCode.scala b/src/main/scala/isa/backend/memMicroCode.scala
index 2f243f4..f1a1a9f 100644
--- a/src/main/scala/isa/backend/memMicroCode.scala
+++ b/src/main/scala/isa/backend/memMicroCode.scala
@@ -19,3 +19,10 @@ object MemChannel extends ChiselEnum {
     // 16/32 bits will have no ch3
     val ch3     = Value(0x3.U)
 }
+
+class MMUCtrlBundle (val n: Int = 8, val size: Int = 4096) extends Bundle {
+    val offset_keep     = Bool()
+    val h_only          = Bool()
+    val in_addr         = Vec(n * n, UInt(log2Ceil(size).W))
+    val out_addr        = Vec(n * n, UInt(log2Ceil(size).W))
+}
\ No newline at end of file
diff --git a/src/main/scala/ncore/cu/controlUnit.scala b/src/main/scala/ncore/cu/controlUnit.scala
index d71189f..f5d8fe4 100644
--- a/src/main/scala/ncore/cu/controlUnit.scala
+++ b/src/main/scala/ncore/cu/controlUnit.scala
@@ -2,11 +2,12 @@
 package ncore.cu
 
 import chisel3._
+import isa.backend._
 
 /**
  * Control unit also uses systolic array to pass instructions
  */
-class ControlUnit(val n: Int = 8, val ctrl_width: Int = 8) extends Module {
+class ControlUnitforTest(val n: Int = 8, val ctrl_width: Int = 8) extends Module {
     val io = IO(new Bundle {
         val cbus_in     = Input(UInt(ctrl_width.W))
         val cbus_out    = Output(Vec(n * n, UInt(ctrl_width.W)))
@@ -14,6 +15,30 @@ class ControlUnit(val n: Int = 8, val ctrl_width: Int = 8) extends Module {
     // Assign each element with diagnal control signal
     val reg = RegInit(VecInit(Seq.fill(2*n-1)(0.U(ctrl_width.W))))
 
+    // 1D systolic array for control
+    reg(0) := io.cbus_in
+    for(i<- 1 until 2*n-1){
+        reg(i) := reg(i-1)
+    }
+    // Broadcast to all elements in the array
+    for(i <- 0 until n){
+        for(j <- 0 until n){
+            io.cbus_out(n*i+j) := reg(i+j)
+        }
+    }
+}
+
+/**
+ * Control unit also uses systolic array to pass instructions
+ */
+class ControlUnit(val n: Int = 8, val sram_size: Int = 4096) extends Module {
+    val io = IO(new Bundle {
+        val cbus_in         = Input(new NCoreCUBundle(sram_size))
+        val cbus_out        = Output(Vec(n * n, new NCoreCUBundle(sram_size)))
+    })
+    // Assign each element with diagonal control signal
+    val reg = RegInit(VecInit(Seq.fill(2*n-1)(0.U.asTypeOf(new NCoreCUBundle(sram_size)))))
+
     // 1D systolic array for control
     reg(0) := io.cbus_in
     for(i<- 1 until 2*n-1){
diff --git a/src/main/scala/ncore/mmu/memMngUnit.scala b/src/main/scala/ncore/mmu/memMngUnit.scala
deleted file mode 100644
index cfad5f3..0000000
--- a/src/main/scala/ncore/mmu/memMngUnit.scala
+++ /dev/null
@@ -1,120 +0,0 @@
-// See README.md for license details.
-
-package ncore.mmu
-
-import chisel3._
-import chisel3.util._
-import isa.backend._
-import ncore._
-import ncore.tcm._
-
-class MMUBundle extends Bundle {
-    val mem_ch  = MemChannel()
-    val mem_lo  = MemLayout()
-}
-
-class OffsetGenerator(val n: Int = 8) extends Module {
-    val io = IO(new Bundle {
-        val inc     = Input(Vec(n, Bool()))
-        val out     = Output(Vec(n, UInt(log2Ceil(n * n).W)))
-    })
-    val init_value = Seq.tabulate(n)(i => (n * i).U(log2Ceil(n * n).W))
-    val regs = RegInit(VecInit(init_value))
-
-    for (i <- 0 until n){
-        when (io.inc(i)) {
-            regs(i) := (regs(i) + 1.U) % (n * n).U
-        }.otherwise {
-            regs(i) := init_value(i)
-        }
-        io.out(i) := regs(i)
-    }
-}
-
-
-class MemoryControlArray(val n: Int = 8) extends Module {
-    val io = IO(new Bundle {
-        val ctrl_in_a       = Input(Bool())
-        val ctrl_in_b       = Input(Bool())
-        val offset_inc_in   = Input(Bool())
-        val ctrl_out_a      = Output(Vec(n, Bool()))
-        val ctrl_out_b      = Output(Vec(n, Bool()))
-        val offset_inc_out  = Output(Vec((n-1) * (n-1), Bool()))
-    })
-    // Assign each element with diagnal control signal
-    val reg_inc = RegInit(VecInit(Seq.fill(2*n - 3)(0.B)))
-    val reg_a = RegInit(VecInit(Seq.fill(n)(0.B)))
-    val reg_b = RegInit(VecInit(Seq.fill(n)(0.B)))
-
-    reg_a(0) := io.ctrl_in_a(0)
-    reg_b(0) := io.ctrl_in_b(0)
-    for (i <- 1 until n - 1) {
-        reg_a(i) := reg_a(i-1)
-        reg_b(i) := reg_b(i-1)
-    }
-    
-    for (i <- 0 until n) {
-        io.ctrl_out_a(i) := reg_a(i)
-        io.ctrl_out_b(i) := reg_b(i)
-    }
-
-    reg_inc(0) := io.offset_inc_in
-    for (i <- 0 until 2 * n - 3) {
-        reg_inc(i) := reg_inc(i - 1)
-    }
-    for (i <- 0 until n - 1) {
-        for (j <- 0 until n - 1) {
-            io.offset_inc_out(n * i + j) := reg_inc(i + j)
-        }
-    }
-}
-
-/**
- * This is the neural core design
- */
-class MemoryManageUnit(
-    val n: Int = 8, 
-    val nbits: Int = 8, 
-    val word_size: Int = 4, 
-    val size: Int = 4096
-    ) extends Module {
-    val io = IO(new Bundle {
-        val base_addr   = Input(UInt(log2Ceil(size).W))
-        val ctrl        = Input(Vec(n * n, new MMUBundle()))
-        val out_a       = Output(Vec(n * n, UInt(32.W)))
-        val out_b       = Output(Vec(n * n, UInt(32.W)))
-    })
-
-    val offsetgen_a = new OffsetGenerator(n)
-    val offsetgen_b = new OffsetGenerator(n)
-
-    val mem = new DetachableTCM(n, word_size, size, 2)
-
-    // Create 2d register for horizontal & vertical
-    val reg_h = RegInit(VecInit(Seq.fill((n - 1) * n)(0.U(nbits.W))))
-    val reg_v = RegInit(VecInit(Seq.fill((n - 1) * n)(0.U(nbits.W))))
-
-    for (i <- 0 until n){
-        for (j <- 0 until n) {
-            // ==== INPUT ====
-            // vertical
-            if (i==0) {
-                mem.io.r_addr(0)(j) := io.base_addr + offsetgen_b.io.out(j)
-            } else {
-                mem.io.r_addr(0)(n * i + j) := reg_v(n * (i - 1) + j)
-            }
-            if (i < n - 1 && j < n)
-                reg_v(n * i + j) := mem.io.r_addr(0)(n * i + j)
-
-            // horizontal
-            if (j==0) {
-                mem.io.r_addr(1)(n * i) := io.base_addr + offsetgen_a.io.out(i)
-            } else {
-                mem.io.r_addr(1)(n * i + j) := reg_h((n - 1) * i + (j - 1))
-            }
-            if (i < n && j < n - 1)
-                reg_h((n - 1) * i + j) := mem.io.r_addr(1)(n * i + j)
-        }
-    }
-
-}
\ No newline at end of file
diff --git a/src/main/scala/ncore/neuralCore.scala b/src/main/scala/ncore/neuralCore.scala
index 7ec0ddb..381a3a4 100644
--- a/src/main/scala/ncore/neuralCore.scala
+++ b/src/main/scala/ncore/neuralCore.scala
@@ -1,63 +1,41 @@
 // See README.md for license details
 package ncore
+import isa.backend._
+import pe._
 
 import chisel3._
 
+
 /**
  * This is the neural core design
  */
- class NeuralCoreforTest(val n: Int = 8, val nbits: Int = 8, val ctrl_width: Int = 8) extends Module {
+ class NeuralCore(val n: Int = 8, val nbits: Int = 8, val sram_size: Int = 4096) extends Module {
     val io = IO(new Bundle {
         val vec_a   = Input(Vec(n, UInt(nbits.W)))  // vector `a` is the left input
         val vec_b   = Input(Vec(n, UInt(nbits.W)))  // vector `b` is the top input
-        val ctrl    = Input(UInt(ctrl_width.W))
+        val ctrl    = Input(new NCoreCUBundle())
         val out     = Output(Vec(n * n, UInt((2 * nbits + 12).W)))
     })
 
     // Create n x n pe blocks
     val pe_io = VecInit(Seq.fill(n * n) {Module(new pe.PE(nbits)).io})
-    // Create 2d register for horizontal & vertical
-    val pe_reg_h = RegInit(VecInit(Seq.fill((n - 1) * n)(0.U(nbits.W))))
-    val pe_reg_v = RegInit(VecInit(Seq.fill((n - 1) * n)(0.U(nbits.W))))
 
     // we use systolic array to pipeline the instructions
     // this will avoid bubble and inst complexity 
     // while simplifying design with higher efficiency
-    val ctrl_array = Module(new cu.ControlUnit(n, ctrl_width))
+    val ctrl_array = Module(new cu.ControlUnit(n, sram_size))
     ctrl_array.io.cbus_in := io.ctrl
 
+    // Operand network: expands vec_a / vec_b into one (in_a, in_b) pair per PE, wired below
+    val sarray = Module(new sa.SystolicArray2D(n, nbits))
+    sarray.io.vec_a := io.vec_a
+    sarray.io.vec_b := io.vec_b
     for (i <- 0 until n){
         for (j <- 0 until n) {
-            // ==== OUTPUT ====
-            // pe array's output mapped to the matrix position
+            pe_io(n * i + j).in_a := sarray.io.out_a(n * i + j)
+            pe_io(n * i + j).in_b := sarray.io.out_b(n * i + j)
+            pe_io(n * i + j).ctrl := ctrl_array.io.cbus_out(n * i + j)
             io.out(n * i + j) := pe_io(n * i + j).out
-
-            // ==== INPUT ====
-            // vertical
-            if (i==0) {
-                pe_io(j).in_b := io.vec_b(j)
-            } else {
-                pe_io(n * i + j).in_b := pe_reg_v(n * (i - 1) + j)
-            }
-            if (i < n - 1 && j < n)
-                pe_reg_v(n * i + j) := pe_io(n * i + j).in_b
-
-            // horizontal
-            if (j==0) {
-                pe_io(n * i).in_a := io.vec_a(i)
-            } else {
-                pe_io(n * i + j).in_a := pe_reg_h((n - 1) * i + (j - 1))
-            }
-            if (i < n && j < n - 1)
-                pe_reg_h((n - 1) * i + j) := pe_io(n * i + j).in_a
-
-            // ==== CONTROL ====
-            // Currently we only have one bit control
-            // which is `ACCUM`
-            // TODO:
-            // Add ALU control to pe elements
-            val ctrl = ctrl_array.io.cbus_out(n * i + j).asBools
-            pe_io(n * i + j).accum := ctrl(0)
         }
     }
  }
\ No newline at end of file
diff --git a/src/main/scala/ncore/sa/systolicArray.scala b/src/main/scala/ncore/sa/systolicArray.scala
new file mode 100644
index 0000000..cd1213d
--- /dev/null
+++ b/src/main/scala/ncore/sa/systolicArray.scala
@@ -0,0 +1,46 @@
+// See README.md for license details
+package ncore.sa
+
+import chisel3._
+
+
+/**
+ * 2D systolic operand-forwarding array (n x n cells, row-major index `n * i + j`).
+ *
+ * `vec_b` enters at row 0 and is delayed by one register per row going down;
+ * `vec_a` enters at column 0 and is delayed by one register per column going right.
+ */
+class SystolicArray2D(val n: Int = 8, val nbits: Int = 8) extends Module {
+    val io = IO(new Bundle {
+        val vec_a       = Input(Vec(n, UInt(nbits.W)))  // vector `a` is the left input
+        val vec_b       = Input(Vec(n, UInt(nbits.W)))  // vector `b` is the top input
+        val out_a       = Output(Vec(n * n, UInt(nbits.W)))
+        val out_b       = Output(Vec(n * n, UInt(nbits.W)))
+    })
+
+    // Pipeline registers, all reset to 0:
+    // reg_h holds n - 1 entries per row (n rows); reg_v holds n entries per row (n - 1 rows).
+    val reg_h = RegInit(VecInit(Seq.fill((n - 1) * n)(0.U(nbits.W))))
+    val reg_v = RegInit(VecInit(Seq.fill((n - 1) * n)(0.U(nbits.W))))
+
+    for (i <- 0 until n; j <- 0 until n) {
+        val cell = n * i + j
+        // vertical: the top row taps the input, other rows tap the register above
+        if (i == 0) {
+            io.out_b(cell) := io.vec_b(j)
+        } else {
+            io.out_b(cell) := reg_v(n * (i - 1) + j)
+        }
+        // every row except the last feeds the register below it
+        if (i < n - 1) reg_v(cell) := io.out_b(cell)
+
+        // horizontal: the left column taps the input, others tap the register to the left
+        if (j == 0) {
+            io.out_a(cell) := io.vec_a(i)
+        } else {
+            io.out_a(cell) := reg_h((n - 1) * i + (j - 1))
+        }
+        // every column except the last feeds the register to its right
+        if (j < n - 1) reg_h((n - 1) * i + j) := io.out_a(cell)
+    }
+}
\ No newline at end of file
diff --git a/src/main/scala/npu/npu.scala b/src/main/scala/npu/npu.scala
deleted file mode 100644
index 2fba8a3..0000000
--- a/src/main/scala/npu/npu.scala
+++ /dev/null
@@ -1,36 +0,0 @@
-package npu
-
-import chisel3._
-import java.nio.file.{Paths, Files}
-import java.nio.charset.StandardCharsets
-import circt.stage.ChiselStage
-import ncore.pe.PE
-
-class NPU extends Module {
-
-  val nbits: Int = 8
-  val io = IO(new Bundle {
-    val in_a        = Input(UInt(nbits.W))
-    val in_b        = Input(UInt(nbits.W))
-    val accum       = Input(Bool())
-    val out         = Output(UInt((nbits*2).W))
-  })  
-
-  val pe = Module(new PE(8))
-  
-  // get value when ready
-  pe.io.in_a := io.in_a
-  pe.io.in_b := io.in_b
-  pe.io.accum := io.accum
-  io.out := pe.io.out
-}
-
-object Main extends App {
-  // These lines generate the Verilog output
-
-  val hdl = ChiselStage.emitSystemVerilog(
-    new NPU(),
-    firtoolOpts = Array("-disable-all-randomization", "-strip-debug-info")
-  )
-  Files.write(Paths.get("npu.v"), hdl.getBytes(StandardCharsets.UTF_8))
-}
diff --git a/src/main/scala/ncore/pe/procElem.scala b/src/main/scala/pe/procElem.scala
similarity index 86%
rename from src/main/scala/ncore/pe/procElem.scala
rename to src/main/scala/pe/procElem.scala
index bf88bcc..98e909d 100644
--- a/src/main/scala/ncore/pe/procElem.scala
+++ b/src/main/scala/pe/procElem.scala
@@ -1,8 +1,9 @@
 // See README.md for license details.
 
-package ncore.pe
+package pe
 
 import chisel3._
+import isa.backend._
 
 /**
   * processing element unit in npu design. 
@@ -11,7 +12,7 @@ import chisel3._
 class PE(val nbits: Int = 8) extends Module {
   val io = IO(
     new Bundle {
-      val accum       = Input(Bool())
+      val ctrl        = Input(new NCoreCUBundle())
       val in_a        = Input(UInt(nbits.W))
       val in_b        = Input(UInt(nbits.W))
       //  The register bandwith is optimized for large transformer 
@@ -22,7 +23,7 @@ class PE(val nbits: Int = 8) extends Module {
 
   val res = RegInit(0.U((nbits*2 + 12).W))
 
-  when (io.accum) {
+  when (io.ctrl.accum) {
     res := res + (io.in_a * io.in_b)
   } .otherwise {
     res := (io.in_a * io.in_b)
diff --git a/src/main/scala/ncore/tcm/tightCpldMem.scala b/src/main/scala/sram/SRAM.scala
similarity index 78%
rename from src/main/scala/ncore/tcm/tightCpldMem.scala
rename to src/main/scala/sram/SRAM.scala
index b036255..530c681 100644
--- a/src/main/scala/ncore/tcm/tightCpldMem.scala
+++ b/src/main/scala/sram/SRAM.scala
@@ -1,11 +1,11 @@
 // See README.md for license details.
 
-package ncore.tcm
+package sram
 
 import chisel3._
 import chisel3.util._
 
-class TCMCell(val nbits: Int = 8) extends Module {
+class SRAMCell(val nbits: Int = 8) extends Module {
     val io = IO(
         new Bundle {
             val d_in    = Input(UInt(nbits.W))
@@ -22,7 +22,7 @@ class TCMCell(val nbits: Int = 8) extends Module {
     }
 }
 
-class TCMBlock(val n: Int = 8, 
+class SRAMBlock(val n: Int = 8, 
                val size: Int = 4096,
                val rd_ch_num: Int = 2,
                val nbits: Int = 8
@@ -36,7 +36,7 @@ class TCMBlock(val n: Int = 8,
             val en_wr   = Input(Bool())
         }
     )
-    val cells_io = VecInit(Seq.fill(size) {Module(new TCMCell(nbits)).io})
+    val cells_io = VecInit(Seq.fill(size) {Module(new SRAMCell(nbits)).io})
 
     for (i <- 0 until size) {
         cells_io(i).en_wr := false.B.asTypeOf(cells_io(i).en_wr)
@@ -59,7 +59,7 @@ class TCMBlock(val n: Int = 8,
 }
 
 
-class DetachableTCM(
+class SRAM(
     val n: Int = 8,
     val nblocks: Int = 4,
     val size: Int = 4096,
@@ -73,18 +73,18 @@ class DetachableTCM(
         val en_wr       = Input(Bool())
     })
 
-    val tcm_blocks_io  = VecInit(Seq.fill(nblocks) {
-        Module(new TCMBlock(n, size, rd_ch_num, 8)).io})
+    val sram_blocks_io  = VecInit(Seq.fill(nblocks) {
+        Module(new SRAMBlock(n, size, rd_ch_num, 8)).io})
     
     for (i <- 0 until nblocks) {
-        tcm_blocks_io(i).en_wr := io.en_wr
+        sram_blocks_io(i).en_wr := io.en_wr
         for (j <- 0 until n) {
             for (k <- 0 until rd_ch_num) {
-                tcm_blocks_io(i).r_addr(k)(j) := io.r_addr(k)(j)
+                sram_blocks_io(i).r_addr(k)(j) := io.r_addr(k)(j)
             }
-            tcm_blocks_io(i).w_addr(j) := io.w_addr(j)
-            tcm_blocks_io(i).d_in(j) := io.d_in(j)(i)
-            io.d_out(j)(i) := tcm_blocks_io(i).d_out(j)
+            sram_blocks_io(i).w_addr(j) := io.w_addr(j)
+            sram_blocks_io(i).d_in(j) := io.d_in(j)(i)
+            io.d_out(j)(i) := sram_blocks_io(i).d_out(j)
         }
     }
     
diff --git a/src/main/scala/top/top.scala b/src/main/scala/top/top.scala
new file mode 100644
index 0000000..40f93e7
--- /dev/null
+++ b/src/main/scala/top/top.scala
@@ -0,0 +1,18 @@
+package top
+
+import chisel3._
+import java.nio.file.{Paths, Files}
+import java.nio.charset.StandardCharsets
+import circt.stage.ChiselStage
+import ncore._
+
+
+/** Elaborates the default-parameter NeuralCore and writes the emitted SystemVerilog to `top.v`. */
+object Main {
+  def main(args: Array[String]): Unit = {
+    val hdl = ChiselStage.emitSystemVerilog(
+      new NeuralCore(),
+      firtoolOpts = Array("-disable-all-randomization", "-strip-debug-info"))
+    Files.write(Paths.get("top.v"), hdl.getBytes(StandardCharsets.UTF_8))
+  }
+}
diff --git a/src/main/scala/vcore/mmu/memMngUnit.scala b/src/main/scala/vcore/mmu/memMngUnit.scala
new file mode 100644
index 0000000..a53d2ae
--- /dev/null
+++ b/src/main/scala/vcore/mmu/memMngUnit.scala
@@ -0,0 +1,84 @@
+// // See README.md for license details.
+
+// package ncore.mmu
+
+// import chisel3._
+// import chisel3.util._
+// import isa.backend._
+// import ncore._
+
+
+// class OffsetGenerator(val n: Int = 8) extends Module {
+//     val io = IO(new Bundle {
+//         val keep    = Input(Vec(n, Bool()))
+//         val out     = Output(Vec(n, UInt(log2Ceil(n * n).W)))
+//     })
+//     val init_value = Seq.tabulate(n)(i => (n * i).U(log2Ceil(n * n).W))
+//     val regs = RegInit(VecInit(init_value))
+
+//     for (i <- 0 until n){
+//         when (io.keep(i)) {
+//             regs(i) := init_value(i)
+//         }.otherwise {
+//             regs(i) := (regs(i) + 1.U) % (n * n).U
+//         }
+//         io.out(i) := regs(i)
+//     }
+// }
+
+// /**
+//  * Memory management unit: produces per-cell operand (op_a, op_b) and result (res) addresses
+//  */
+// class MemoryManageUnit(
+//     val n: Int = 8, 
+//     val size: Int = 4096
+//     ) extends Module {
+//     val io = IO(new Bundle {
+//         val ctrl            = Input(Vec(n * n, new MMUCtrlBundle(size)))
+//         val op_a            = Output(Vec(n * n, UInt(log2Ceil(size).W)))
+//         val op_b            = Output(Vec(n * n, UInt(log2Ceil(size).W)))
+//         val res             = Output(Vec(n * n, UInt(log2Ceil(size).W)))
+//     })
+
+//     val offsetgen_a = new OffsetGenerator(n)
+//     val offsetgen_b = new OffsetGenerator(n)
+
+//     // Create 2d register for horizontal & vertical
+//     val reg_h = RegInit(VecInit(Seq.fill((n - 1) * n)(0.U(log2Ceil(size).W))))
+//     val reg_r = RegInit(VecInit(Seq.fill((n - 1) * n)(0.U(log2Ceil(size).W))))
+//     val reg_v = RegInit(VecInit(Seq.fill((n - 1) * n)(0.U(log2Ceil(size).W))))
+
+//     for (i <- 0 until n){
+//         for (j <- 0 until n) {
+//             offsetgen_a.io.keep(i) := io.ctrl(n * i).offset_keep
+//             offsetgen_b.io.keep(i) := io.ctrl(j).offset_keep
+//             // ==== INPUT ====
+//             // vertical
+//             if (i==0) {
+//                 when (io.ctrl(n * i + j).h_only) {
+//                     io.op_b(j) := io.ctrl(n * i + j).in_addr + offsetgen_b.io.out(j)
+//                 } .otherwise {
+//                     io.op_b(j) := io.ctrl(n * i + j).in_addr + offsetgen_b.io.out(j)
+//                 }
+//             } else {
+//                 io.op_b(n * i + j) := reg_v(n * (i - 1) + j)
+//             }
+//             if (i < n - 1 && j < n)
+//                 reg_v(n * i + j) := io.op_b(n * i + j)
+
+//             // horizontal & result
+//             if (j==0) {
+//                 io.op_a(n * i) := io.ctrl(n * i + j).in_addr + offsetgen_a.io.out(i)
+//                 io.res(n * i) := io.ctrl(n * i + j).out_addr + offsetgen_a.io.out(i)
+//             } else {
+//                 io.op_a(n * i + j) := reg_h((n - 1) * i + (j - 1))
+//                 io.res(n * i + j) := reg_r((n - 1) * i + (j - 1))
+//             }
+//             if (i < n && j < n - 1) {
+//                 reg_h((n - 1) * i + j) := io.op_a(n * i + j)
+//                 reg_r((n - 1) * i + j) := io.res(n * i + j)
+//             }
+//         }
+//     }
+
+// }
\ No newline at end of file
diff --git a/src/test/scala/ncore/CoreSpec.scala b/src/test/scala/ncore/CoreSpec.scala
index c87341c..432a25f 100644
--- a/src/test/scala/ncore/CoreSpec.scala
+++ b/src/test/scala/ncore/CoreSpec.scala
@@ -11,8 +11,8 @@ import chisel3.experimental.BundleLiterals._
 
 class CoreSpec extends AnyFlatSpec with ChiselScalatestTester {
 
-    "NeuralCoreforTest" should "do a normal matrix multiplication" in {
-        test(new NeuralCoreforTest(4, 8)) { dut =>
+    "NeuralCore" should "do a normal matrix multiplication" in {
+        test(new NeuralCore(4, 8)) { dut =>
             val print_helper = new testUtil.PrintHelper()
             val _n = dut.n
             val rand = new Random
@@ -78,9 +78,9 @@ class CoreSpec extends AnyFlatSpec with ChiselScalatestTester {
                 // The rest of the control signal will hand over
                 // to a dedicated systolic-ish control bus
                 if (i_tick < _n && i_tick >= 0)
-                    dut.io.ctrl.poke(0x1)
+                    dut.io.ctrl.accum.poke(0x1)
                 else
-                    dut.io.ctrl.poke(0x0)
+                    dut.io.ctrl.accum.poke(0x0)
 
                 // ideally, the array will give _n (diagnal) results per tick
                 dut.clock.step()
diff --git a/src/test/scala/ncore/cu/CUSpec.scala b/src/test/scala/ncore/cu/CUSpec.scala
index b0863d4..920c846 100644
--- a/src/test/scala/ncore/cu/CUSpec.scala
+++ b/src/test/scala/ncore/cu/CUSpec.scala
@@ -19,19 +19,19 @@ class CUSpec extends AnyFlatSpec with ChiselScalatestTester {
             var history = new Array[Int](2 * _n - 1)
             var prod = 0
             for (n <- 0 until 16) {
-                val _cbus_in = rand.between(0, 255)
+                val _cbus_in = rand.between(0, 2)
                 history +:= _cbus_in
-                dut.io.cbus_in.poke(_cbus_in)
+                dut.io.cbus_in.accum.poke(_cbus_in)
                 dut.clock.step()
                 history = history.slice(0, 2 * _n - 1)
                 println("Input tick @ " + n + ": " + _cbus_in)
                 for(i: Int <- 0 until _n){
                     for(j:Int <- 0 until _n) {
-                        dut.io.cbus_out(_n * i + j).expect(history(i + j))
+                        dut.io.cbus_out(_n * i + j).accum.expect(history(i + j))
                     }
                 }
                 println("Control tick @ " + n + " : ")
-                print_helper.printMatrixChisel(dut.io.cbus_out, _n)
+                // print_helper.printMatrixChisel(dut.io.cbus_out, _n)
             }
         }
     }
diff --git a/src/test/scala/ncore/mmu/MMUSpec.scala b/src/test/scala/ncore/mmu/MMUSpec.scala
index 537328f..4c71a10 100644
--- a/src/test/scala/ncore/mmu/MMUSpec.scala
+++ b/src/test/scala/ncore/mmu/MMUSpec.scala
@@ -1,50 +1,50 @@
-//// See README.md for license details.
+// //// See README.md for license details.
 
-package ncore.mmu
+// package ncore.mmu
 
-import testUtil._
-import scala.util.Random
-import chisel3._
-import chiseltest._
-import org.scalatest.flatspec.AnyFlatSpec
-import chisel3.experimental.BundleLiterals._
+// import testUtil._
+// import scala.util.Random
+// import chisel3._
+// import chiseltest._
+// import org.scalatest.flatspec.AnyFlatSpec
+// import chisel3.experimental.BundleLiterals._
 
-class MMUSpec extends AnyFlatSpec with ChiselScalatestTester {
+// class MMUSpec extends AnyFlatSpec with ChiselScalatestTester {
 
-    "OffsetGenerator" should "provide correct offset" in {
-        test(new OffsetGenerator(4)) { dut =>
-            val print_helper = new testUtil.PrintHelper()
-            val _n = dut.n
-            val _array = List(List(false, false, false, false),
-                              List(true, false, false, false),  
-                              List(true, true, false, false), 
-                              List(true, true, true, false), 
-                              List(false, true, true, true), 
-                              List(true, false, true, true), 
-                              List(false, true, false, true), 
-                              List(false, false, true, false), 
-                              List(false, false, false, true),
-                              )
-            val _expected = List(List(0, 4, 8, 12),
-                                List(1, 4, 8, 12),
-                                List(2, 5, 8, 12),
-                                List(3, 6, 9, 12),
-                                List(0, 7, 10, 13),
-                                List(1, 4, 11, 14),
-                                List(0, 5, 8, 15),
-                                List(0, 4, 9, 12),
-                                List(0, 4, 8, 13),
-            )
-            for (i <- 0 until _array.length) {
-                for (j <- 0 until _n) {
-                    dut.io.inc(j).poke(_array(i%_array.length)(j))
-                }
-                dut.clock.step()
-                for (j <- 0 until _n) {
-                    dut.io.out(j).expect(_expected(i)(j))
-                }
-                print_helper.printVectorChisel(dut.io.out, _n)
-            }
-        }
-    }
-}
\ No newline at end of file
+//     "OffsetGenerator" should "provide correct offset" in {
+//         test(new OffsetGenerator(4)) { dut =>
+//             val print_helper = new testUtil.PrintHelper()
+//             val _n = dut.n
+//             val _array = List(List(false, false, false, false),
+//                               List(true, false, false, false),  
+//                               List(true, true, false, false), 
+//                               List(true, true, true, false), 
+//                               List(false, true, true, true), 
+//                               List(true, false, true, true), 
+//                               List(false, true, false, true), 
+//                               List(false, false, true, false), 
+//                               List(false, false, false, true),
+//                               )
+//             val _expected = List(List(0, 4, 8, 12),
+//                                 List(1, 4, 8, 12),
+//                                 List(2, 5, 8, 12),
+//                                 List(3, 6, 9, 12),
+//                                 List(0, 7, 10, 13),
+//                                 List(1, 4, 11, 14),
+//                                 List(0, 5, 8, 15),
+//                                 List(0, 4, 9, 12),
+//                                 List(0, 4, 8, 13),
+//             )
+//             for (i <- 0 until _array.length) {
+//                 for (j <- 0 until _n) {
+//                     dut.io.keep(j).poke(!_array(i%_array.length)(j))
+//                 }
+//                 dut.clock.step()
+//                 for (j <- 0 until _n) {
+//                     dut.io.out(j).expect(_expected(i)(j))
+//                 }
+//                 print_helper.printVectorChisel(dut.io.out, _n)
+//             }
+//         }
+//     }
+// }
\ No newline at end of file
diff --git a/src/test/scala/ncore/pe/PESpec.scala b/src/test/scala/pe/PESpec.scala
similarity index 93%
rename from src/test/scala/ncore/pe/PESpec.scala
rename to src/test/scala/pe/PESpec.scala
index b08d114..eba2038 100644
--- a/src/test/scala/ncore/pe/PESpec.scala
+++ b/src/test/scala/pe/PESpec.scala
@@ -1,6 +1,6 @@
 // See README.md for license details.
 
-package ncore.pe
+package pe
 
 import scala.util.Random
 import chisel3._
@@ -20,7 +20,7 @@ class PESpec extends AnyFlatSpec with ChiselScalatestTester {
         val _left_in_ = rand.between(0, 255)
         dut.io.in_a.poke(_top_in_)
         dut.io.in_b.poke(_left_in_)
-        dut.io.accum.poke(true)
+        dut.io.ctrl.accum.poke(true)
         dut.clock.step()
         prod = prod + _top_in_ * _left_in_
         dut.io.out.expect(prod)
@@ -32,7 +32,7 @@ class PESpec extends AnyFlatSpec with ChiselScalatestTester {
       var _left_in_ = rand.between(1, 255)
       dut.io.in_a.poke(_top_in_)
       dut.io.in_b.poke(_left_in_)
-      dut.io.accum.poke(false)
+      dut.io.ctrl.accum.poke(false)
       dut.clock.step()
       prod = prod + _top_in_ * _left_in_
       dut.io.out.expect(prod)
@@ -42,7 +42,7 @@ class PESpec extends AnyFlatSpec with ChiselScalatestTester {
       _left_in_ = rand.between(1, 255)
       dut.io.in_a.poke(_top_in_)
       dut.io.in_b.poke(_left_in_)
-      dut.io.accum.poke(true)
+      dut.io.ctrl.accum.poke(true)
       dut.clock.step()
       prod = prod + _top_in_ * _left_in_
       dut.io.out.expect(prod)
diff --git a/src/test/scala/ncore/tcm/TCMSpec.scala b/src/test/scala/sram/SRAMSpec.scala
similarity index 90%
rename from src/test/scala/ncore/tcm/TCMSpec.scala
rename to src/test/scala/sram/SRAMSpec.scala
index 6477973..2d8464b 100644
--- a/src/test/scala/ncore/tcm/TCMSpec.scala
+++ b/src/test/scala/sram/SRAMSpec.scala
@@ -1,6 +1,6 @@
 // See README.md for license details.
 
-package ncore.tcm
+package sram
 
 import scala.util.Random
 import chisel3._
@@ -10,10 +10,10 @@ import org.scalatest.flatspec.AnyFlatSpec
 import chisel3.experimental.BundleLiterals._
 
 
-class TCMSpec extends AnyFlatSpec with ChiselScalatestTester {
+class SRAMSpec extends AnyFlatSpec with ChiselScalatestTester {
 
-  "TCM Cells" should "write on signal" in {
-    test(new TCMCell(8)) { dut =>
+  "SRAM Cells" should "write on signal" in {
+    test(new SRAMCell(8)) { dut =>
       val rand = new Random
       var _prev = 0
       for (i <- 0 until 10) {
@@ -29,8 +29,8 @@ class TCMSpec extends AnyFlatSpec with ChiselScalatestTester {
     }
   }
 
-  "TCM Block" should "write on signal and read anytime" in {
-    test(new TCMBlock(3, 192, 1)) { dut =>
+  "SRAM Block" should "write on signal and read anytime" in {
+    test(new SRAMBlock(3, 192, 1)) { dut =>
       val _n = dut.n
       val _cells = dut.size
       val rand = new Random
@@ -58,8 +58,8 @@ class TCMSpec extends AnyFlatSpec with ChiselScalatestTester {
     }
   }
 
-  "TCM Block" should "read anytime" in {
-    test(new TCMBlock(2, 64, 1)) { dut =>
+  "SRAM Block" should "read anytime" in {
+    test(new SRAMBlock(2, 64, 1)) { dut =>
       val _n = dut.n
       val _cells = dut.size
       val rand = new Random
@@ -96,8 +96,8 @@ class TCMSpec extends AnyFlatSpec with ChiselScalatestTester {
     }
   }
 
-  "TCM Block" should "read anytime on different channels" in {
-    test(new TCMBlock(2, 64, 2)) { dut =>
+  "SRAM Block" should "read anytime on different channels" in {
+    test(new SRAMBlock(2, 64, 2)) { dut =>
       val _n = dut.n
       val _cells = dut.size
       val _rd_ch_num = dut.rd_ch_num