diff --git a/.github/workflows/actions.yml b/.github/workflows/actions.yml
index e1865e1..2fb383c 100644
--- a/.github/workflows/actions.yml
+++ b/.github/workflows/actions.yml
@@ -5,8 +5,9 @@ on:
       - main
       - releases/**
   pull_request:
-    types:
-      - opened
+    branches:
+      - main
+      - 'releases/**'
 
 jobs:
   Lint:
diff --git a/src/main/scala/isa/backend/controlMicroCode.scala b/src/main/scala/isa/backend/controlMicroCode.scala
new file mode 100644
index 0000000..d517852
--- /dev/null
+++ b/src/main/scala/isa/backend/controlMicroCode.scala
@@ -0,0 +1,9 @@
+// See README.md for license details.
+
+package isa.backend
+import chisel3._
+import chisel3.util._
+
+class NCoreCUBundle (val size: Int = 4096) extends Bundle { // control word carried by the control unit to every PE; `size` is accepted but no field here uses it
+    val accum = Bool() // when set, the PE adds the new product to its stored result instead of overwriting it
+}
\ No newline at end of file
diff --git a/src/main/scala/isa/backend/memMicroCode.scala b/src/main/scala/isa/backend/memMicroCode.scala
new file mode 100644
index 0000000..f1a1a9f
--- /dev/null
+++ b/src/main/scala/isa/backend/memMicroCode.scala
@@ -0,0 +1,28 @@
+// See README.md for license details.
+
+package isa.backend
+import chisel3._
+import chisel3.util._
+
+object MemLayout extends ChiselEnum { // memory layout selector with explicit encodings 0..2; presumably the element width in bits - TODO confirm
+    val bit8    = Value(0x0.U)
+    val bit16   = Value(0x1.U)
+    val bit32   = Value(0x2.U)
+}
+
+object MemChannel extends ChiselEnum { // channel selector encoded as a plain index 0..3 (not a one-hot mask)
+    val ch0     = Value(0x0.U)
+    // 16/32 bits will have no ch1
+    val ch1     = Value(0x1.U)
+    // 32 bits will have no ch2
+    val ch2     = Value(0x2.U)
+    // 16/32 bits will have no ch3
+    val ch3     = Value(0x3.U)
+}
+
+class MMUCtrlBundle (val n: Int = 8, val size: Int = 4096) extends Bundle { // control fields for the memory-management unit
+    val offset_keep     = Bool() // NOTE(review): appears to hold the offset generator at its initial value - confirm against the MMU
+    val h_only          = Bool() // NOTE(review): meaning is not evident from this file - TODO document
+    val in_addr         = Vec(n * n, UInt(log2Ceil(size).W)) // n * n input addresses, each log2Ceil(size) bits wide
+    val out_addr        = Vec(n * n, UInt(log2Ceil(size).W)) // n * n output addresses, same width as in_addr
+}
\ No newline at end of file
diff --git a/src/main/scala/isa/instSetArch.scala b/src/main/scala/isa/instSetArch.scala
index f4b5359..84fd512 100644
--- a/src/main/scala/isa/instSetArch.scala
+++ b/src/main/scala/isa/instSetArch.scala
@@ -8,4 +8,12 @@ object NeuralISA extends ChiselEnum {
     val st = Value(0x2.U(4.W))
     val mma = Value(0x3.U(4.W))
     val ip = Value (0x4.U(4.W))
-}
\ No newline at end of file
+}
+
+object DType extends ChiselEnum { // data-type selector with explicit encodings 0..3
+    val uint    = Value(0x0.U) // presumably unsigned integer - TODO confirm
+    val int     = Value(0x1.U) // presumably signed integer - TODO confirm
+    val fp      = Value(0x2.U) // presumably floating point - TODO confirm
+    // no bfp32c0
+    val bfp     = Value(0x3.U) // presumably block floating point - TODO confirm
+}
diff --git a/src/main/scala/isa/memMicroCode.scala b/src/main/scala/isa/memMicroCode.scala
deleted file mode 100644
index a47e1e2..0000000
--- a/src/main/scala/isa/memMicroCode.scala
+++ /dev/null
@@ -1,44 +0,0 @@
-// See README.md for license details.
-
-package isa
-import chisel3._
-import chisel3.util._
-
-
-object OffsetPattern extends ChiselEnum {
-    val not_def = Value(0x0.U)
-    val sca_0d  = Value(0x1.U)
-    val vec_1d  = Value(0x2.U)
-    val mat_2d  = Value(0x3.U)
-}
-
-object AddressMode extends ChiselEnum {
-    val immd        = Value(0x0.U)
-    val addr        = Value(0x1.U)
-    val addr_immd   = Value(0x2.U)
-}
-
-
-object MemLayout extends ChiselEnum {
-    val bit8    = Value(0x0.U)
-    val bit16   = Value(0x1.U)
-    val bit32   = Value(0x2.U)
-}
-
-object DType extends ChiselEnum {
-    val uint    = Value(0x0.U)
-    val int     = Value(0x1.U)
-    val fp      = Value(0x2.U)
-    // no bfp32c0
-    val bfp     = Value(0x3.U)
-}
-
-object MemChannel extends ChiselEnum {
-    val ch0     = Value(0x1.U)
-    // 16/32 bits will have no ch1
-    val ch1     = Value(0x2.U)
-    // 32 bits will have no ch2
-    val ch2     = Value(0x4.U)
-    // 16/32 bits will have no ch3
-    val ch3     = Value(0x8.U)
-}
\ No newline at end of file
diff --git a/src/main/scala/ncore/cu/controlUnit.scala b/src/main/scala/ncore/cu/controlUnit.scala
index d71189f..f5d8fe4 100644
--- a/src/main/scala/ncore/cu/controlUnit.scala
+++ b/src/main/scala/ncore/cu/controlUnit.scala
@@ -2,11 +2,12 @@
 package ncore.cu
 
 import chisel3._
+import isa.backend._
 
 /**
  * Control unit also uses systolic array to pass instructions
  */
-class ControlUnit(val n: Int = 8, val ctrl_width: Int = 8) extends Module {
+class ControlUnitforTest(val n: Int = 8, val ctrl_width: Int = 8) extends Module {
     val io = IO(new Bundle {
         val cbus_in     = Input(UInt(ctrl_width.W))
         val cbus_out    = Output(Vec(n * n, UInt(ctrl_width.W)))
@@ -14,6 +15,30 @@ class ControlUnit(val n: Int = 8, val ctrl_width: Int = 8) extends Module {
     // Assign each element with diagnal control signal
     val reg = RegInit(VecInit(Seq.fill(2*n-1)(0.U(ctrl_width.W))))
 
+    // 1D systolic array for control
+    reg(0) := io.cbus_in
+    for(i<- 1 until 2*n-1){
+        reg(i) := reg(i-1)
+    }
+    // Broadcast to all elements in the array
+    for(i <- 0 until n){
+        for(j <- 0 until n){
+            io.cbus_out(n*i+j) := reg(i+j)
+        }
+    }
+}
+
+/**
+ * Control unit also uses systolic array to pass instructions
+ */
+class ControlUnit(val n: Int = 8, val sram_size: Int = 4096) extends Module {
+    val io = IO(new Bundle {
+        val cbus_in         = Input(new NCoreCUBundle(sram_size))
+        val cbus_out        = Output(Vec(n * n, new NCoreCUBundle(sram_size)))
+    })
+    // Assign each element with diagonal control signal
+    val reg = RegInit(VecInit(Seq.fill(2*n-1)(0.U.asTypeOf(new NCoreCUBundle(sram_size)))))
+
     // 1D systolic array for control
     reg(0) := io.cbus_in
     for(i<- 1 until 2*n-1){
diff --git a/src/main/scala/ncore/neuralCore.scala b/src/main/scala/ncore/neuralCore.scala
index 7ec0ddb..381a3a4 100644
--- a/src/main/scala/ncore/neuralCore.scala
+++ b/src/main/scala/ncore/neuralCore.scala
@@ -1,63 +1,41 @@
 // See README.md for license details
 package ncore
+import isa.backend._
+import pe._
 
 import chisel3._
 
+
 /**
  * This is the neural core design
  */
- class NeuralCoreforTest(val n: Int = 8, val nbits: Int = 8, val ctrl_width: Int = 8) extends Module {
+ class NeuralCore(val n: Int = 8, val nbits: Int = 8, val sram_size: Int = 4096) extends Module {
     val io = IO(new Bundle {
         val vec_a   = Input(Vec(n, UInt(nbits.W)))  // vector `a` is the left input
         val vec_b   = Input(Vec(n, UInt(nbits.W)))  // vector `b` is the top input
-        val ctrl    = Input(UInt(ctrl_width.W))
+        val ctrl    = Input(new NCoreCUBundle(sram_size))  // same bundle parameter as the ControlUnit this port drives
         val out     = Output(Vec(n * n, UInt((2 * nbits + 12).W)))
     })
 
     // Create n x n pe blocks
     val pe_io = VecInit(Seq.fill(n * n) {Module(new pe.PE(nbits)).io})
-    // Create 2d register for horizontal & vertical
-    val pe_reg_h = RegInit(VecInit(Seq.fill((n - 1) * n)(0.U(nbits.W))))
-    val pe_reg_v = RegInit(VecInit(Seq.fill((n - 1) * n)(0.U(nbits.W))))
 
     // we use systolic array to pipeline the instructions
     // this will avoid bubble and inst complexity 
     // while simplifying design with higher efficiency
-    val ctrl_array = Module(new cu.ControlUnit(n, ctrl_width))
+    val ctrl_array = Module(new cu.ControlUnit(n, sram_size))
     ctrl_array.io.cbus_in := io.ctrl
 
+    val sarray = Module(new sa.SystolicArray2D(n, nbits))
+    sarray.io.vec_a := io.vec_a
+    sarray.io.vec_b := io.vec_b
+
     for (i <- 0 until n){
         for (j <- 0 until n) {
-            // ==== OUTPUT ====
-            // pe array's output mapped to the matrix position
+            pe_io(n * i + j).in_a := sarray.io.out_a(n * i + j)
+            pe_io(n * i + j).in_b := sarray.io.out_b(n * i + j)
+            pe_io(n * i + j).ctrl := ctrl_array.io.cbus_out(n * i + j)
             io.out(n * i + j) := pe_io(n * i + j).out
-
-            // ==== INPUT ====
-            // vertical
-            if (i==0) {
-                pe_io(j).in_b := io.vec_b(j)
-            } else {
-                pe_io(n * i + j).in_b := pe_reg_v(n * (i - 1) + j)
-            }
-            if (i < n - 1 && j < n)
-                pe_reg_v(n * i + j) := pe_io(n * i + j).in_b
-
-            // horizontal
-            if (j==0) {
-                pe_io(n * i).in_a := io.vec_a(i)
-            } else {
-                pe_io(n * i + j).in_a := pe_reg_h((n - 1) * i + (j - 1))
-            }
-            if (i < n && j < n - 1)
-                pe_reg_h((n - 1) * i + j) := pe_io(n * i + j).in_a
-
-            // ==== CONTROL ====
-            // Currently we only have one bit control
-            // which is `ACCUM`
-            // TODO:
-            // Add ALU control to pe elements
-            val ctrl = ctrl_array.io.cbus_out(n * i + j).asBools
-            pe_io(n * i + j).accum := ctrl(0)
         }
     }
  }
\ No newline at end of file
diff --git a/src/main/scala/ncore/sa/systolicArray.scala b/src/main/scala/ncore/sa/systolicArray.scala
new file mode 100644
index 0000000..cd1213d
--- /dev/null
+++ b/src/main/scala/ncore/sa/systolicArray.scala
@@ -0,0 +1,46 @@
+// See README.md for license details
+package ncore.sa
+
+import chisel3._
+
+
+/**
+ * 2D systolic operand network: forwards `vec_a` left-to-right and `vec_b`
+ * top-to-bottom, inserting one register between neighbouring grid positions.
+ * Outputs are flattened row-major: position (row, col) is index n * row + col.
+ * Row 0 of out_b and column 0 of out_a are combinational copies of the inputs.
+ */
+ class SystolicArray2D(val n: Int = 8, val nbits: Int = 8) extends Module {
+    val io = IO(new Bundle {
+        val vec_a       = Input(Vec(n, UInt(nbits.W)))  // operand `a`, fed into column 0 of each row
+        val vec_b       = Input(Vec(n, UInt(nbits.W)))  // operand `b`, fed into row 0 of each column
+        val out_a       = Output(Vec(n * n, UInt(nbits.W)))
+        val out_b       = Output(Vec(n * n, UInt(nbits.W)))
+    })
+
+    // One pipeline register per hop between neighbours, reset to zero
+    val reg_h = RegInit(VecInit(Seq.fill((n - 1) * n)(0.U(nbits.W))))
+    val reg_v = RegInit(VecInit(Seq.fill((n - 1) * n)(0.U(nbits.W))))
+
+    // ==== vertical path (operand b) ====
+    for (col <- 0 until n) {
+        io.out_b(col) := io.vec_b(col)
+    }
+    for (row <- 1 until n; col <- 0 until n) {
+        io.out_b(n * row + col) := reg_v(n * (row - 1) + col)
+    }
+    for (row <- 0 until n - 1; col <- 0 until n) {
+        reg_v(n * row + col) := io.out_b(n * row + col)
+    }
+
+    // ==== horizontal path (operand a) ====
+    for (row <- 0 until n) {
+        io.out_a(n * row) := io.vec_a(row)
+    }
+    for (row <- 0 until n; col <- 1 until n) {
+        io.out_a(n * row + col) := reg_h((n - 1) * row + (col - 1))
+    }
+    for (row <- 0 until n; col <- 0 until n - 1) {
+        reg_h((n - 1) * row + col) := io.out_a(n * row + col)
+    }
+ }
\ No newline at end of file
diff --git a/src/main/scala/ncore/tcm/tightCpldMem.scala b/src/main/scala/ncore/tcm/tightCpldMem.scala
deleted file mode 100644
index f8fea8a..0000000
--- a/src/main/scala/ncore/tcm/tightCpldMem.scala
+++ /dev/null
@@ -1,93 +0,0 @@
-// See README.md for license details.
-
-package ncore.tcm
-
-import chisel3._
-import chisel3.util._
-import isa._
-
-class TCMCell(val nbits: Int = 8) extends Module {
-    val io = IO(
-        new Bundle {
-            val d_in    = Input(UInt(nbits.W))
-            val d_out   = Output(UInt(nbits.W))
-            val en_wr   = Input(Bool())
-        }
-    )
-
-    val reg = RegInit(0.U(nbits.W))
-    io.d_out := reg
-
-    when (io.en_wr) {
-        reg := io.d_in
-    }
-}
-
-class TCMBlock(val n: Int = 8, 
-               val size: Int = 4096,
-               val r_addr_width: Int = 12,
-               val w_addr_width: Int = 12,
-               val nbits: Int = 8
-) extends Module {
-    val io = IO(
-        new Bundle {
-            val d_in    = Input(Vec(n * n, UInt(nbits.W)))
-            val d_out   = Output(Vec(n * n, UInt(nbits.W)))
-            val r_addr  = Input(Vec(n * n, UInt(r_addr_width.W)))
-            val w_addr  = Input(Vec(n * n, UInt(w_addr_width.W)))
-            val en_wr   = Input(Bool())
-        }
-    )
-    val cells_io = VecInit(Seq.fill(size) {Module(new TCMCell(nbits)).io})
-
-    for (i <- 0 until size) {
-        cells_io(i).en_wr := false.B.asTypeOf(cells_io(i).en_wr)
-        // Need to initialize all wires just in case of not selected.
-        cells_io(i).d_in := 0.U.asTypeOf(cells_io(i).d_in)
-    }
-
-    //TODO: add range check
-    //TODO: add read & write conflict check
-
-    for (i <- 0 until n * n) {
-        io.d_out(i) := cells_io(io.r_addr(i)).d_out
-        when (io.en_wr) {
-            cells_io(io.w_addr(i)).en_wr := io.en_wr
-            cells_io(io.w_addr(i)).d_in := io.d_in(i)
-        }
-    }
-}
-
-
-class DetachableTCM(
-    val n: Int = 8, 
-    val size: Int = 4096,
-    val r_addr_width: Int = 12,
-    val w_addr_width: Int = 12,
-    val mlayout_width: Int = 6,
-) extends Module {
-    val io = IO(new Bundle {
-        val d_in    = Input(Vec(n * n, UInt(32.W)))
-        val d_out   = Output(Vec(n * n, UInt(32.W)))
-        // read address will have channel selection for last 2 bits
-        val r_addr  = Input(Vec(n * n, UInt((r_addr_width + 2).W)))
-        // write address will have channel selection for last 2 bits
-        val w_addr  = Input(Vec(n * n, UInt((w_addr_width + 2).W)))
-        val mem_ch  = Input(MemChannel())
-        val mem_lo  = Input(MemLayout())
-        val en_wr   = Input(Bool())
-    })
-
-    switch (io.mem_lo) {
-        is (MemLayout.bit8) {
-            
-        }
-        is (MemLayout.bit16) {
-
-        }
-        is (MemLayout.bit32) {
-
-        }
-    }
-    
-}
\ No newline at end of file
diff --git a/src/main/scala/npu/npu.scala b/src/main/scala/npu/npu.scala
deleted file mode 100644
index 2fba8a3..0000000
--- a/src/main/scala/npu/npu.scala
+++ /dev/null
@@ -1,36 +0,0 @@
-package npu
-
-import chisel3._
-import java.nio.file.{Paths, Files}
-import java.nio.charset.StandardCharsets
-import circt.stage.ChiselStage
-import ncore.pe.PE
-
-class NPU extends Module {
-
-  val nbits: Int = 8
-  val io = IO(new Bundle {
-    val in_a        = Input(UInt(nbits.W))
-    val in_b        = Input(UInt(nbits.W))
-    val accum       = Input(Bool())
-    val out         = Output(UInt((nbits*2).W))
-  })  
-
-  val pe = Module(new PE(8))
-  
-  // get value when ready
-  pe.io.in_a := io.in_a
-  pe.io.in_b := io.in_b
-  pe.io.accum := io.accum
-  io.out := pe.io.out
-}
-
-object Main extends App {
-  // These lines generate the Verilog output
-
-  val hdl = ChiselStage.emitSystemVerilog(
-    new NPU(),
-    firtoolOpts = Array("-disable-all-randomization", "-strip-debug-info")
-  )
-  Files.write(Paths.get("npu.v"), hdl.getBytes(StandardCharsets.UTF_8))
-}
diff --git a/src/main/scala/ncore/pe/procElem.scala b/src/main/scala/pe/procElem.scala
similarity index 86%
rename from src/main/scala/ncore/pe/procElem.scala
rename to src/main/scala/pe/procElem.scala
index bf88bcc..98e909d 100644
--- a/src/main/scala/ncore/pe/procElem.scala
+++ b/src/main/scala/pe/procElem.scala
@@ -1,8 +1,9 @@
 // See README.md for license details.
 
-package ncore.pe
+package pe
 
 import chisel3._
+import isa.backend._
 
 /**
   * processing element unit in npu design. 
@@ -11,7 +12,7 @@ import chisel3._
 class PE(val nbits: Int = 8) extends Module {
   val io = IO(
     new Bundle {
-      val accum       = Input(Bool())
+      val ctrl        = Input(new NCoreCUBundle())
       val in_a        = Input(UInt(nbits.W))
       val in_b        = Input(UInt(nbits.W))
       //  The register bandwith is optimized for large transformer 
@@ -22,7 +23,7 @@ class PE(val nbits: Int = 8) extends Module {
 
   val res = RegInit(0.U((nbits*2 + 12).W))
 
-  when (io.accum) {
+  when (io.ctrl.accum) {
     res := res + (io.in_a * io.in_b)
   } .otherwise {
     res := (io.in_a * io.in_b)
diff --git a/src/main/scala/sram/SRAM.scala b/src/main/scala/sram/SRAM.scala
new file mode 100644
index 0000000..530c681
--- /dev/null
+++ b/src/main/scala/sram/SRAM.scala
@@ -0,0 +1,91 @@
+// See README.md for license details.
+
+package sram
+
+import chisel3._
+import chisel3.util._
+
+// Single storage word: an nbits-wide register with a synchronous write enable.
+class SRAMCell(val nbits: Int = 8) extends Module {
+    val io = IO(new Bundle {
+        val d_in    = Input(UInt(nbits.W))   // value stored when en_wr is high
+        val d_out   = Output(UInt(nbits.W))  // current register contents
+        val en_wr   = Input(Bool())
+    })
+
+    // Resets to zero
+    val reg = RegInit(0.U(nbits.W))
+
+    // Keep the old value unless a write is requested
+    reg := Mux(io.en_wr, io.d_in, reg)
+
+    io.d_out := reg
+}
+
+class SRAMBlock(val n: Int = 8, // register-based memory block: n * n write lanes, rd_ch_num read channels of n * n lanes each
+               val size: Int = 4096, // number of SRAMCell words; addresses are log2Ceil(size) bits wide
+               val rd_ch_num: Int = 2, // number of independent read channels
+               val nbits: Int = 8 // width of each stored word
+) extends Module {
+    val io = IO(
+        new Bundle {
+            val d_in    = Input(Vec(n * n, UInt(nbits.W)))
+            val d_out   = Output(Vec(rd_ch_num, Vec(n * n, UInt(nbits.W))))
+            val r_addr  = Input(Vec(rd_ch_num, Vec(n * n, UInt(log2Ceil(size).W))))
+            val w_addr  = Input(Vec(n * n, UInt(log2Ceil(size).W)))
+            val en_wr   = Input(Bool())
+        }
+    )
+    val cells_io = VecInit(Seq.fill(size) {Module(new SRAMCell(nbits)).io}) // one SRAMCell per address, indexable by a hardware address
+
+    for (i <- 0 until size) {
+        cells_io(i).en_wr := false.B.asTypeOf(cells_io(i).en_wr)
+        // Defaults for every cell; the write loop below overrides them for the addressed cells only.
+        cells_io(i).d_in := 0.U.asTypeOf(cells_io(i).d_in)
+    }
+
+    //TODO: add range check
+    //TODO: add read & write conflict check
+
+    for (i <- 0 until n * n) {
+        for (k <- 0 until rd_ch_num) {
+            io.d_out(k)(i) := cells_io(io.r_addr(k)(i)).d_out // reads have no enable: the output follows the addressed cell
+        }
+        when (io.en_wr) {
+            cells_io(io.w_addr(i)).en_wr := io.en_wr // must stay after the default assignments above (last connection wins)
+            cells_io(io.w_addr(i)).d_in := io.d_in(i)
+        }
+    }
+}
+
+
+class SRAM( // nblocks byte-wide SRAMBlocks sharing addresses and write enable; block i holds byte i of every lane
+    val n: Int = 8,
+    val nblocks: Int = 4,
+    val size: Int = 4096,
+    val rd_ch_num: Int = 2,
+) extends Module {
+    val io = IO(new Bundle {
+        val d_in        = Input(Vec(n * n, Vec(nblocks, UInt(8.W))))
+        val d_out       = Output(Vec(rd_ch_num, Vec(n * n, Vec(nblocks, UInt(8.W)))))
+        val r_addr      = Input(Vec(rd_ch_num, Vec(n * n, UInt(log2Ceil(size).W))))
+        val w_addr      = Input(Vec(n * n, UInt(log2Ceil(size).W)))
+        val en_wr       = Input(Bool())
+    })
+
+    val sram_blocks_io  = VecInit(Seq.fill(nblocks) {
+        Module(new SRAMBlock(n, size, rd_ch_num, 8)).io})
+    // All vectors carry n * n lanes, so every lane (not just the first n) must be wired.
+    for (i <- 0 until nblocks) {
+        sram_blocks_io(i).en_wr := io.en_wr
+        for (j <- 0 until n * n) {
+            for (k <- 0 until rd_ch_num) {
+                sram_blocks_io(i).r_addr(k)(j) := io.r_addr(k)(j)
+                // d_out is indexed (read channel)(lane)(block)
+                io.d_out(k)(j)(i) := sram_blocks_io(i).d_out(k)(j)
+            }
+            sram_blocks_io(i).w_addr(j) := io.w_addr(j)
+            sram_blocks_io(i).d_in(j) := io.d_in(j)(i)
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/main/scala/top/top.scala b/src/main/scala/top/top.scala
new file mode 100644
index 0000000..40f93e7
--- /dev/null
+++ b/src/main/scala/top/top.scala
@@ -0,0 +1,18 @@
+package top
+
+import chisel3._
+import java.nio.file.{Paths, Files}
+import java.nio.charset.StandardCharsets
+import circt.stage.ChiselStage
+import ncore._
+
+
+object Main extends App {
+  // Elaborate NeuralCore with its default parameters, emit SystemVerilog through
+  // ChiselStage with the firtool options below, and write the text to top.v (UTF-8).
+  val hdl = ChiselStage.emitSystemVerilog(
+    new NeuralCore(),
+    firtoolOpts = Array("-disable-all-randomization", "-strip-debug-info")
+  )
+  Files.write(Paths.get("top.v"), hdl.getBytes(StandardCharsets.UTF_8))
+}
diff --git a/src/main/scala/vcore/mmu/memMngUnit.scala b/src/main/scala/vcore/mmu/memMngUnit.scala
new file mode 100644
index 0000000..a53d2ae
--- /dev/null
+++ b/src/main/scala/vcore/mmu/memMngUnit.scala
@@ -0,0 +1,84 @@
+// // See README.md for license details.
+
+// package ncore.mmu
+
+// import chisel3._
+// import chisel3.util._
+// import isa.backend._
+// import ncore._
+
+
+// class OffsetGenerator(val n: Int = 8) extends Module {
+//     val io = IO(new Bundle {
+//         val keep    = Input(Vec(n, Bool()))
+//         val out     = Output(Vec(n, UInt(log2Ceil(n * n).W)))
+//     })
+//     val init_value = Seq.tabulate(n)(i => (n * i).U(log2Ceil(n * n).W))
+//     val regs = RegInit(VecInit(init_value))
+
+//     for (i <- 0 until n){
+//         when (io.keep(i)) {
+//             regs(i) := init_value(i)
+//         }.otherwise {
+//             regs(i) := (regs(i) + 1.U) % (n * n).U
+//         }
+//         io.out(i) := regs(i)
+//     }
+// }
+
+// /**
+//  * This is the neural core design
+//  */
+// class MemoryManageUnit(
+//     val n: Int = 8, 
+//     val size: Int = 4096
+//     ) extends Module {
+//     val io = IO(new Bundle {
+//         val ctrl            = Input(Vec(n * n, new MMUCtrlBundle(size)))
+//         val op_a            = Output(Vec(n * n, UInt(log2Ceil(size).W)))
+//         val op_b            = Output(Vec(n * n, UInt(log2Ceil(size).W)))
+//         val res             = Output(Vec(n * n, UInt(log2Ceil(size).W)))
+//     })
+
+//     val offsetgen_a = new OffsetGenerator(n)
+//     val offsetgen_b = new OffsetGenerator(n)
+
+//     // Create 2d register for horizontal & vertical
+//     val reg_h = RegInit(VecInit(Seq.fill((n - 1) * n)(0.U(log2Ceil(size).W))))
+//     val reg_r = RegInit(VecInit(Seq.fill((n - 1) * n)(0.U(log2Ceil(size).W))))
+//     val reg_v = RegInit(VecInit(Seq.fill((n - 1) * n)(0.U(log2Ceil(size).W))))
+
+//     for (i <- 0 until n){
+//         for (j <- 0 until n) {
+//             offsetgen_a.io.keep(i) := io.ctrl(n * i).offset_keep
+//             offsetgen_b.io.keep(i) := io.ctrl(j).offset_keep
+//             // ==== INPUT ====
+//             // vertical
+//             if (i==0) {
+//                 when (io.ctrl(n * i + j).h_only) {
+//                     io.op_b(j) := io.ctrl(n * i + j).in_addr + offsetgen_b.io.out(j)
+//                 } .otherwise {
+//                     io.op_b(j) := io.ctrl(n * i + j).in_addr + offsetgen_b.io.out(j)
+//                 }
+//             } else {
+//                 io.op_b(0)(n * i + j) := reg_v(n * (i - 1) + j)
+//             }
+//             if (i < n - 1 && j < n)
+//                 reg_v(n * i + j) := io.op_b(n * i + j)
+
+//             // horizontal & result
+//             if (j==0) {
+//                 io.op_a(n * i) := io.ctrl(n * i + j).in_addr + offsetgen_a.io.out(i)
+//                 io.res(n * i) := io.ctrl(n * i + j).out_addr + offsetgen_a.io.out(i)
+//             } else {
+//                 io.op_a(n * i + j) := reg_h((n - 1) * i + (j - 1))
+//                 io.res(n * i + j) := reg_r((n - 1) * i + (j - 1))
+//             }
+//             if (i < n && j < n - 1) {
+//                 reg_h((n - 1) * i + j) := io.op_a(n * i + j)
+//                 reg_r((n - 1) * i + j) := io.res(n * i + j)
+//             }
+//         }
+//     }
+
+// }
\ No newline at end of file
diff --git a/src/test/scala/ncore/CoreSpec.scala b/src/test/scala/ncore/CoreSpec.scala
index c87341c..432a25f 100644
--- a/src/test/scala/ncore/CoreSpec.scala
+++ b/src/test/scala/ncore/CoreSpec.scala
@@ -11,8 +11,8 @@ import chisel3.experimental.BundleLiterals._
 
 class CoreSpec extends AnyFlatSpec with ChiselScalatestTester {
 
-    "NeuralCoreforTest" should "do a normal matrix multiplication" in {
-        test(new NeuralCoreforTest(4, 8)) { dut =>
+    "NeuralCore" should "do a normal matrix multiplication" in {
+        test(new NeuralCore(4, 8)) { dut =>
             val print_helper = new testUtil.PrintHelper()
             val _n = dut.n
             val rand = new Random
@@ -78,9 +78,9 @@ class CoreSpec extends AnyFlatSpec with ChiselScalatestTester {
                 // The rest of the control signal will hand over
                 // to a dedicated systolic-ish control bus
                 if (i_tick < _n && i_tick >= 0)
-                    dut.io.ctrl.poke(0x1)
+                    dut.io.ctrl.accum.poke(0x1)
                 else
-                    dut.io.ctrl.poke(0x0)
+                    dut.io.ctrl.accum.poke(0x0)
 
                 // ideally, the array will give _n (diagnal) results per tick
                 dut.clock.step()
diff --git a/src/test/scala/ncore/cu/CUSpec.scala b/src/test/scala/ncore/cu/CUSpec.scala
index b0863d4..920c846 100644
--- a/src/test/scala/ncore/cu/CUSpec.scala
+++ b/src/test/scala/ncore/cu/CUSpec.scala
@@ -19,19 +19,19 @@ class CUSpec extends AnyFlatSpec with ChiselScalatestTester {
             var history = new Array[Int](2 * _n - 1)
             var prod = 0
             for (n <- 0 until 16) {
-                val _cbus_in = rand.between(0, 255)
+                val _cbus_in = rand.between(0, 2)
                 history +:= _cbus_in
-                dut.io.cbus_in.poke(_cbus_in)
+                dut.io.cbus_in.accum.poke(_cbus_in)
                 dut.clock.step()
                 history = history.slice(0, 2 * _n - 1)
                 println("Input tick @ " + n + ": " + _cbus_in)
                 for(i: Int <- 0 until _n){
                     for(j:Int <- 0 until _n) {
-                        dut.io.cbus_out(_n * i + j).expect(history(i + j))
+                        dut.io.cbus_out(_n * i + j).accum.expect(history(i + j))
                     }
                 }
                 println("Control tick @ " + n + " : ")
-                print_helper.printMatrixChisel(dut.io.cbus_out, _n)
+                // print_helper.printMatrixChisel(dut.io.cbus_out, _n)
             }
         }
     }
diff --git a/src/test/scala/ncore/mmu/MMUSpec.scala b/src/test/scala/ncore/mmu/MMUSpec.scala
new file mode 100644
index 0000000..4c71a10
--- /dev/null
+++ b/src/test/scala/ncore/mmu/MMUSpec.scala
@@ -0,0 +1,50 @@
+// //// See README.md for license details.
+
+// package ncore.mmu
+
+// import testUtil._
+// import scala.util.Random
+// import chisel3._
+// import chiseltest._
+// import org.scalatest.flatspec.AnyFlatSpec
+// import chisel3.experimental.BundleLiterals._
+
+// class MMUSpec extends AnyFlatSpec with ChiselScalatestTester {
+
+//     "OffsetGenerator" should "provide correct offset" in {
+//         test(new OffsetGenerator(4)) { dut =>
+//             val print_helper = new testUtil.PrintHelper()
+//             val _n = dut.n
+//             val _array = List(List(false, false, false, false),
+//                               List(true, false, false, false),  
+//                               List(true, true, false, false), 
+//                               List(true, true, true, false), 
+//                               List(false, true, true, true), 
+//                               List(true, false, true, true), 
+//                               List(false, true, false, true), 
+//                               List(false, false, true, false), 
+//                               List(false, false, false, true),
+//                               )
+//             val _expected = List(List(0, 4, 8, 12),
+//                                 List(1, 4, 8, 12),
+//                                 List(2, 5, 8, 12),
+//                                 List(3, 6, 9, 12),
+//                                 List(0, 7, 10, 13),
+//                                 List(1, 4, 11, 14),
+//                                 List(0, 5, 8, 15),
+//                                 List(0, 4, 9, 12),
+//                                 List(0, 4, 8, 13),
+//             )
+//             for (i <- 0 until _array.length) {
+//                 for (j <- 0 until _n) {
+//                     dut.io.keep(j).poke(!_array(i%_array.length)(j))
+//                 }
+//                 dut.clock.step()
+//                 for (j <- 0 until _n) {
+//                     dut.io.out(j).expect(_expected(i)(j))
+//                 }
+//                 print_helper.printVectorChisel(dut.io.out, _n)
+//             }
+//         }
+//     }
+// }
\ No newline at end of file
diff --git a/src/test/scala/ncore/pe/PESpec.scala b/src/test/scala/pe/PESpec.scala
similarity index 93%
rename from src/test/scala/ncore/pe/PESpec.scala
rename to src/test/scala/pe/PESpec.scala
index b08d114..eba2038 100644
--- a/src/test/scala/ncore/pe/PESpec.scala
+++ b/src/test/scala/pe/PESpec.scala
@@ -1,6 +1,6 @@
 // See README.md for license details.
 
-package ncore.pe
+package pe
 
 import scala.util.Random
 import chisel3._
@@ -20,7 +20,7 @@ class PESpec extends AnyFlatSpec with ChiselScalatestTester {
         val _left_in_ = rand.between(0, 255)
         dut.io.in_a.poke(_top_in_)
         dut.io.in_b.poke(_left_in_)
-        dut.io.accum.poke(true)
+        dut.io.ctrl.accum.poke(true)
         dut.clock.step()
         prod = prod + _top_in_ * _left_in_
         dut.io.out.expect(prod)
@@ -32,7 +32,7 @@ class PESpec extends AnyFlatSpec with ChiselScalatestTester {
       var _left_in_ = rand.between(1, 255)
       dut.io.in_a.poke(_top_in_)
       dut.io.in_b.poke(_left_in_)
-      dut.io.accum.poke(false)
+      dut.io.ctrl.accum.poke(false)
       dut.clock.step()
       prod = prod + _top_in_ * _left_in_
       dut.io.out.expect(prod)
@@ -42,7 +42,7 @@ class PESpec extends AnyFlatSpec with ChiselScalatestTester {
       _left_in_ = rand.between(1, 255)
       dut.io.in_a.poke(_top_in_)
       dut.io.in_b.poke(_left_in_)
-      dut.io.accum.poke(true)
+      dut.io.ctrl.accum.poke(true)
       dut.clock.step()
       prod = prod + _top_in_ * _left_in_
       dut.io.out.expect(prod)
diff --git a/src/test/scala/ncore/tcm/TCMSpec.scala b/src/test/scala/sram/SRAMSpec.scala
similarity index 51%
rename from src/test/scala/ncore/tcm/TCMSpec.scala
rename to src/test/scala/sram/SRAMSpec.scala
index 01bc2b5..2d8464b 100644
--- a/src/test/scala/ncore/tcm/TCMSpec.scala
+++ b/src/test/scala/sram/SRAMSpec.scala
@@ -1,6 +1,6 @@
 // See README.md for license details.
 
-package ncore.tcm
+package sram
 
 import scala.util.Random
 import chisel3._
@@ -10,10 +10,10 @@ import org.scalatest.flatspec.AnyFlatSpec
 import chisel3.experimental.BundleLiterals._
 
 
-class TCMSpec extends AnyFlatSpec with ChiselScalatestTester {
+class SRAMSpec extends AnyFlatSpec with ChiselScalatestTester {
 
-  "TCM Cells" should "write on signal" in {
-    test(new TCMCell(8)) { dut =>
+  "SRAM Cells" should "write on signal" in {
+    test(new SRAMCell(8)) { dut =>
       val rand = new Random
       var _prev = 0
       for (i <- 0 until 10) {
@@ -29,8 +29,8 @@ class TCMSpec extends AnyFlatSpec with ChiselScalatestTester {
     }
   }
 
-  "TCM Block" should "write on signal and read anytime" in {
-    test(new TCMBlock(3, 192)) { dut =>
+  "SRAM Block" should "write on signal and read anytime" in {
+    test(new SRAMBlock(3, 192, 1)) { dut =>
       val _n = dut.n
       val _cells = dut.size
       val rand = new Random
@@ -46,20 +46,20 @@ class TCMSpec extends AnyFlatSpec with ChiselScalatestTester {
         dut.io.en_wr.poke(true)
         dut.clock.step()
         for (i <- 0 until _n * _n) {
-          dut.io.r_addr(i).poke(_in_addr(i))
+          dut.io.r_addr(0)(i).poke(_in_addr(i))
         }
         for (i <- 0 until _n * _n){
-          dut.io.d_out(i).expect(_in_data(i))
+          dut.io.d_out(0)(i).expect(_in_data(i))
         }
         println("Result tick @ " + _i + ": ")
         print_helper.printMatrix(_in_data, _n)
-        print_helper.printMatrixChisel(dut.io.d_out, _n)
+        print_helper.printMatrixChisel(dut.io.d_out(0), _n)
       }
     }
   }
 
-  "TCM Block" should "read anytime" in {
-    test(new TCMBlock(2, 64)) { dut =>
+  "SRAM Block" should "read anytime" in {
+    test(new SRAMBlock(2, 64, 1)) { dut =>
       val _n = dut.n
       val _cells = dut.size
       val rand = new Random
@@ -81,16 +81,60 @@ class TCMSpec extends AnyFlatSpec with ChiselScalatestTester {
         val _r_addr = rand.shuffle((0 until _cells).toList).take(_n * _n)
         val _expected = new Array[Int](_n * _n)
         for (i <- 0 until _n * _n) {
-          dut.io.r_addr(i).poke(_r_addr(i))
+          dut.io.r_addr(0)(i).poke(_r_addr(i))
         }
         for (i <- 0 until _n * _n) {
           _expected(i) = _data(_r_addr(i))
         }
         println("Result tick @ " + _i + ": ")
         print_helper.printMatrix(_expected, _n)
-        print_helper.printMatrixChisel(dut.io.d_out, _n)
+        print_helper.printMatrixChisel(dut.io.d_out(0), _n)
         for (i <- 0 until _n * _n){
-          dut.io.d_out(i).expect(_data(_r_addr(i)))
+          dut.io.d_out(0)(i).expect(_data(_r_addr(i)))
+        }
+      }
+    }
+  }
+
+  "SRAM Block" should "read anytime on different channels" in {
+    test(new SRAMBlock(2, 64, 2)) { dut =>
+      val _n = dut.n
+      val _cells = dut.size
+      val _rd_ch_num = dut.rd_ch_num
+      val rand = new Random
+      val print_helper = new testUtil.PrintHelper()
+      val _data = new Array[Int](_cells)
+      for (_i <- 0 until 10) {
+        val _in_data = new Array[Int](_rd_ch_num * _n * _n)
+        val _in_addr = rand.shuffle((0 until _cells).toList).take(_rd_ch_num * _n * _n)
+        for (k <- 0 until _rd_ch_num){
+          for (i <- 0 until _n * _n) {
+            val _ind = k * _n * _n + i
+            _in_data(_ind) = rand.between(0, 255)
+            dut.io.d_in(i).poke(_in_data(_ind))
+            dut.io.w_addr(i).poke(_in_addr(_ind))
+            _data(_in_addr(_ind)) = _in_data(_ind)
+          }
+          dut.io.en_wr.poke(true)
+          dut.clock.step()
+        }
+      }
+      for(_i <- 0 until 10){
+        val _r_addr = rand.shuffle((0 until _cells).toList).take(_rd_ch_num * _n * _n)
+        val _expected = new Array[Int](_rd_ch_num * _n * _n)
+        for (k <- 0 until _rd_ch_num){
+          for (i <- 0 until _n * _n) {
+            val _ind = k * _n * _n + i
+            dut.io.r_addr(k)(i).poke(_r_addr(_ind))
+            _expected(_ind) = _data(_r_addr(_ind))
+          }
+        }
+        println("Result tick @ " + _i + ": ")
+        for (k <- 0 until _rd_ch_num){
+          for (i <- 0 until _n * _n){
+            val _ind = k * _n * _n + i
+            dut.io.d_out(k)(i).expect(_data(_r_addr(_ind)))
+          }
         }
       }
     }
diff --git a/src/test/scala/utils/printHelper.scala b/src/test/scala/utils/printHelper.scala
index c520bda..cf71ece 100644
--- a/src/test/scala/utils/printHelper.scala
+++ b/src/test/scala/utils/printHelper.scala
@@ -28,4 +28,12 @@ class PrintHelper(){
         }
         println("]")
     }
+
+    def printVectorChisel(vec: chisel3.Vec[chisel3.UInt], n: Int): Unit = {
+        // Peek the first `n` elements in index order; each value is followed by ", ",
+        // so the printed line keeps a trailing separator before the closing bracket.
+        val cells = (0 until n).map(idx => vec(idx).peekInt().toString() + ", ")
+        println(cells.mkString("[", "", "]"))
+        // (output format is unchanged, e.g. "[1, 2, 3, ]")
+    }
 }
\ No newline at end of file