
refactored with new docs
mpskex committed Feb 17, 2024
1 parent 922a1ae commit fe41b86
Showing 16 changed files with 257 additions and 192 deletions.
2 changes: 1 addition & 1 deletion Makefile
@@ -43,7 +43,7 @@ push-image-arm64:
docker push fangruil/chisel-dev:arm64-${VER}

docs:
-	pip3 install markdown-wavedrom mkdocs mkdocs-material python-markdown-math
+	pip3 install markdown-wavedrom mkdocs mkdocs-material python-markdown-math mkdocs-mermaid2-plugin
mkdocs serve

clean:
2 changes: 2 additions & 0 deletions README.md
@@ -2,6 +2,8 @@

[![Documentation Status](https://readthedocs.org/projects/chisel-opennpu/badge/?version=latest)](https://chisel-opennpu.readthedocs.io/en/latest/?badge=latest)

+Docs: https://chisel-opennpu.readthedocs.io

This is a Chisel workbench designed for people who like Docker containers and the VS Code dev container plugin.

DEVELOP IN PROGRESS. COMMERCIAL USE IS NOT ALLOWED.
Binary file added docs/images/neural_core.png
48 changes: 48 additions & 0 deletions docs/implementations/NeuralCore.md
@@ -0,0 +1,48 @@
# Neural Core

[Systolic Arrays](SystolicArray.md) are high-throughput, high-latency computing architectures. They can be very efficient if we control them with care.

To support more general linear-algebra operations, we need to split the computing logic from the addressing and control logic. So the architecture should look like this:

<div style="text-align: center">
<img src="../../images/neural_core.png" width=80%/>
</div>

The overall architecture of the proposed Neural Core looks like a multi-layered 3D grid. If you look along the z-axis, you will find the layers forming a pipeline.

```mermaid
graph LR
subgraph I[Scratch Pad Memory]
C[SPM inbound]
F[SPM outbound]
end
G[DMA]
subgraph Neural Core
A[CU]
B[i-MMU]
D[PE]
E[o-MMU]
end
B --> |addr| C
C --> |data| B
A --> |ctrl| B
A --> |i-base-addr| B
B --> |data| D
A --> |ctrl| D
D --> |data| E
A --> |ctrl| E
E --> |addr| F
E --> |data| F
A --> |o-base-addr| E
I <--> G
```

Above is the pipeline of a Neural Unit (NU), which is an element of the processing-element pipeline in the Neural Core. NUs can be organized as systolic arrays or as parallelized thread cores.

This flexible architecture is managed by the MMU, which controls all data flow. To reduce the number of active transistors, we fused the systolic design with a parallel design. All $\mu$-CUs and i-$\mu$MMUs have a stair-like scheduling characteristic. Though this design choice may lead to high latency, I think it is still quite efficient: it preserves high throughput with a fair amount of registers and arithmetic units. Of course you could have a multiplexed control set to manage this grid, but that would have more overhead. For example, you would need a large piece of logic to implement the parallelism and another one to avoid bubbles in the Neural Units.
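As a rough illustration (a Python software sketch, not part of the Chisel sources), the stair-like schedule means PE $(i,j)$ is activated $i+j$ cycles after the top-left PE, so an $n \times n$ grid is swept by $2n-1$ diagonal waves:

```python
def stair_schedule(n):
    """Return, for each cycle, the list of PEs activated in that wave.

    PE (i, j) fires at cycle i + j, forming diagonal 'stairs' across the grid.
    """
    waves = [[] for _ in range(2 * n - 1)]
    for i in range(n):
        for j in range(n):
            waves[i + j].append((i, j))
    return waves

waves = stair_schedule(4)
# The first wave holds only the corner PE; the widest wave is the main diagonal.
```

The $2n-1$ figure is why the high latency is bounded: it grows linearly with the grid edge, not with its area.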

A Neural Processing Unit (NPU) can have multiple Neural Cores (NCore). Each Neural Core has a two-dimensional grid of Neural Units (NU). Each Neural Unit has its own micro-CU ($\mu$-CU), micro-MMUs for input and output (i-$\mu$MMU/o-$\mu$MMU), and a [processing element (PE)](ProcessingElement.md). Registers large enough to hold a whole matrix are impossible, so the design follows other NPU designs and uses a Scratch Pad Memory (SPM) to store input and output data. Each $\mu$MMU is directly connected to the SPM to obtain instant access to the data.
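The $\mu$MMU's role can be pictured with a small Python sketch (illustrative only; the stride parameter and tile shape are assumptions, not the actual interface): given the base address handed down by the CU (`i-base-addr` in the diagram), it walks a tile of the SPM in row-major order.

```python
def tile_addresses(base, rows, cols, stride):
    """Hypothetical i-uMMU address stream: row-major walk over an SPM tile.

    base:   tile base address supplied by the CU (i-base-addr)
    stride: SPM row pitch in words (an assumed parameter, for illustration)
    """
    for i in range(rows):
        for j in range(cols):
            yield base + i * stride + j

addrs = list(tile_addresses(base=0x100, rows=2, cols=3, stride=8))
```

Each generated address is driven on the `addr` edge to the SPM inbound port, and the returned `data` is forwarded to the PE.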
20 changes: 9 additions & 11 deletions docs/implementations/ProcessingElement.md
@@ -1,16 +1,16 @@
# Processing Element

```ascii_art
-  ACCUM     TOP
+  ACCUM     IN_B
        \     |
         \    |
          \___v___
         |  Proc  |
- LEFT ---->|  Elem  |-----> RIGHT
-          ¯¯¯|¯¯¯\
-             |    \
-             v     \
-         BOTTOM    OUT (to TLB mapped memory)
+ IN_A ---->|  Elem  |
+          ¯¯¯|¯¯¯
+             |
+             v
+            OUT (to TLB mapped memory)
```

The Processing Element is the fundamental element of a systolic array. This is a basic implementation of a 2D PE for a 2D systolic array or DSP grid.
@@ -22,11 +22,9 @@ PE component will only accumulate the result if the `ACCUM` is high. This is eff
wavedrom (
{ signal: [
{ name: "clk", wave:"P......", period: 4 },
-    { name: "top_in", wave: "x====xx", data:["top_1", "top_2", "top_3", "top_4"], period: 4},
-    { name: "left_in", wave: "x====xx", data:["left_1", "left_2", "left_3", "left_4"], period: 4},
+    { name: "in_a", wave: "x====xx", data:["a_1", "a_2", "a_3", "a_4"], period: 4},
+    { name: "in_b", wave: "x====xx", data:["b_1", "b_2", "b_3", "b_4"], period: 4},
{ name: "accu", wave: "1...01.", period: 4},
-    { name: "right_out", wave: "xx====x", data:["top_1", "top_2", "top_3", "top_4"], period: 4},
-    { name: "bottom_out", wave: "xx====x", data:["left_1", "left_2", "left_3", "left_4"], period: 4},
-    { name: "out", wave: "xx====x", data:["prod1=top_1*left_1", "prod_1 + top_2 * left_2", "prod_3=top_3 * left_3", "prod_3 + top_4 * left_4"], period: 4},
+    { name: "out", wave: "xx====x", data:["prod_1=a_1*b_1", "prod_1 + a_2 * b_2", "prod_3=a_3 * b_3", "prod_3 + a_4 * b_4"], period: 4},
] }
)
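The accumulate behaviour shown in the waveform can be mimicked with a short Python model (a software sketch, not the Chisel source): the result register accumulates the product while `ACCUM` is high and reloads from a fresh product when it is low.

```python
def pe_model(a_stream, b_stream, accum_stream):
    """Software model of the PE datapath: res <= res + a*b when accum, else a*b."""
    res = 0
    outs = []
    for a, b, acc in zip(a_stream, b_stream, accum_stream):
        res = res + a * b if acc else a * b
        outs.append(res)  # registered result, one value per cycle
    return outs

# accum drops on the third beat, restarting the accumulation, as in the waveform
outs = pe_model([1, 2, 3, 4], [5, 6, 7, 8], [1, 1, 0, 1])
```

With these example operands the output sequence is prod_1, prod_1 + a_2*b_2, prod_3, prod_3 + a_4*b_4, matching the `out` row of the wavedrom diagram.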
9 changes: 6 additions & 3 deletions docs/index.md
@@ -4,14 +4,17 @@ This is an open-source neural processing unit implementation in Chisel3.

Specifically, this NPU is targeted at integration into low-power, edge-oriented SoC systems, so all design choices follow from those demands.

You can check the source code on [GitHub](https://github.com/mpskex/chisel-npu).

For overall chip design, you may find [the FullChipDesign website](https://www.fullchipdesign.com/) pretty helpful.

-## Designs
+## ISA Designs
- [Instructions](designs/01.isa.md)
- [Memory](designs/02.memory.md)
- [Buses](designs/03.bus.md)

## Implementation Details

-- [Processing Element (PE)](implementations/ProcessingElement.md)
-- [Systolic Array (SA)](implementations/SystolicArray.md)
+- [Neural Core (NCore)](implementations/NeuralCore.md)
+- [Processing Element (PE)](implementations/ProcessingElement.md)
+- [Systolic Array (SA)](implementations/SystolicArray.md)
3 changes: 3 additions & 0 deletions mkdocs.yml
@@ -4,6 +4,9 @@ repo_name: Chisel NPU
theme:
name: material

+plugins:
+  - search
+  - mermaid2

markdown_extensions:
- admonition
28 changes: 28 additions & 0 deletions src/main/scala/ncore/cu/controlUnit.scala
@@ -0,0 +1,28 @@
// See README.md for license details
package ncore.cu

import chisel3._

/**
* Control unit also uses systolic array to pass instructions
*/
class ControlUnit(val n: Int = 8, val ctrl_width: Int = 8) extends Module {
val io = IO(new Bundle {
val cbus_in = Input(UInt(ctrl_width.W))
val cbus_out = Output(Vec(n * n, UInt(ctrl_width.W)))
})
    // Assign each element its diagonal control signal
val reg = RegInit(VecInit(Seq.fill(2*n-1)(0.U(ctrl_width.W))))

// 1D systolic array for control
reg(0) := io.cbus_in
for(i<- 1 until 2*n-1){
reg(i) := reg(i-1)
}
    // Broadcast to all elements in the array
for(i <- 0 until n){
for(j <- 0 until n){
io.cbus_out(n*i+j) := reg(i+j)
}
}
}
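The timing of this 1-D control pipeline can be checked against a quick Python model (illustrative only): because `reg(0) := io.cbus_in` is itself registered, a word injected at cycle `t` reaches stage `k` at cycle `t + k + 1`, and PE `(i, j)` taps stage `i + j`.

```python
def control_unit_model(ctrl_stream, n):
    """Model of ControlUnit: a 2n-1 deep shift register; PE (i,j) reads stage i+j."""
    regs = [0] * (2 * n - 1)
    history = []
    for word in ctrl_stream:
        regs = [word] + regs[:-1]  # reg(0) := cbus_in; reg(k) := reg(k-1)
        # snapshot of what each PE (i, j) sees this cycle
        history.append([[regs[i + j] for j in range(n)] for i in range(n)])
    return history

hist = control_unit_model([9, 0, 0, 0], n=2)
# The word lands on PE (0,0) first, then sweeps diagonally across the grid.
```

This reproduces the stair-like schedule: the same control word reaches each anti-diagonal of PEs one cycle apart.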
63 changes: 63 additions & 0 deletions src/main/scala/ncore/neuralCore.scala
@@ -0,0 +1,63 @@
// See README.md for license details
package ncore

import chisel3._

/**
* This is the neural core design
*/
class NeuralCore(val n: Int = 8, val nbits: Int = 8, val ctrl_width: Int = 8) extends Module {
val io = IO(new Bundle {
val vec_a = Input(Vec(n, UInt(nbits.W))) // vector `a` is the left input
val vec_b = Input(Vec(n, UInt(nbits.W))) // vector `b` is the top input
val ctrl = Input(UInt(ctrl_width.W))
val out = Output(Vec(n * n, UInt((2 * nbits + 12).W)))
})

// Create n x n pe blocks
val pe_io = VecInit(Seq.fill(n * n) {Module(new pe.PE(nbits)).io})
// Create 2d register for horizontal & vertical
val pe_reg_h = RegInit(VecInit(Seq.fill((n - 1) * n)(0.U(nbits.W))))
val pe_reg_v = RegInit(VecInit(Seq.fill((n - 1) * n)(0.U(nbits.W))))

    // We use a systolic array to pipeline the instructions.
    // This avoids bubbles and instruction complexity
    // while keeping the design simple and efficient.
val ctrl_array = Module(new cu.ControlUnit(n, ctrl_width))
ctrl_array.io.cbus_in := io.ctrl

for (i <- 0 until n){
for (j <- 0 until n) {
// ==== OUTPUT ====
// pe array's output mapped to the matrix position
io.out(n * i + j) := pe_io(n * i + j).out

// ==== INPUT ====
// vertical
if (i==0) {
pe_io(j).in_b := io.vec_b(j)
} else {
pe_io(n * i + j).in_b := pe_reg_v(n * (i - 1) + j)
}
            if (i < n - 1)
pe_reg_v(n * i + j) := pe_io(n * i + j).in_b

// horizontal
if (j==0) {
pe_io(n * i).in_a := io.vec_a(i)
} else {
pe_io(n * i + j).in_a := pe_reg_h((n - 1) * i + (j - 1))
}
            if (j < n - 1)
pe_reg_h((n - 1) * i + j) := pe_io(n * i + j).in_a

// ==== CONTROL ====
// Currently we only have one bit control
// which is `ACCUM`
// TODO:
// Add ALU control to pe elements
val ctrl = ctrl_array.io.cbus_out(n * i + j).asBools
pe_io(n * i + j).accum := ctrl(0)
}
}
}
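The skewed dataflow carried by `pe_reg_h` and `pe_reg_v` can be summarised in a behavioural Python model (a timing sketch, not cycle-accurate Chisel): PE `(i, j)` consumes `A[i][k]` and `B[k][j]` at cycle `t = i + j + k`, so an n×n multiply drains after `3n - 2` cycles.

```python
def systolic_matmul(A, B):
    """Output-stationary systolic model: PE (i,j) accumulates A[i][k]*B[k][j]
    at cycle t = i + j + k, mirroring the skewed feed through the edge registers."""
    n = len(A)
    C = [[0] * n for _ in range(n)]
    for t in range(3 * n - 2):          # total drain time of the array
        for i in range(n):
            for j in range(n):
                k = t - i - j           # operand index arriving at PE (i,j) this cycle
                if 0 <= k < n:
                    C[i][j] += A[i][k] * B[k][j]
    return C

A = [[1, 2], [3, 4]]
B = [[5, 6], [7, 8]]
# agrees with a direct matrix multiply
```

The `k = t - i - j` alignment is exactly what the one-stage horizontal and vertical registers enforce in hardware: both operands of a product arrive at a PE on the same cycle.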
@@ -1,6 +1,6 @@
// See README.md for license details.

-package procElem
+package ncore.pe

import chisel3._

@@ -12,30 +12,20 @@ class PE(val nbits: Int = 8) extends Module {
val io = IO(
new Bundle {
val accum = Input(Bool())
-        val top_in = Input(UInt(nbits.W))
-        val left_in = Input(UInt(nbits.W))
-        val bottom_out = Output(UInt((nbits).W))
-        val right_out = Output(UInt((nbits).W))
+        val in_a = Input(UInt(nbits.W))
+        val in_b = Input(UInt(nbits.W))
        // The register bandwidth is optimized for large transformers
// The lower bound of max cap matrix size is:
// 2^12 x 2^12 = (4096 x 4096)
val out = Output(UInt((nbits * 2 + 12).W))
})

val res = RegInit(0.U((nbits*2 + 12).W))
-    val reg_h = RegInit(0.U(nbits.W))
-    val reg_v = RegInit(0.U(nbits.W))

when (io.accum) {
-        res := res + (io.top_in * io.left_in)
+        res := res + (io.in_a * io.in_b)
} .otherwise {
-        res := (io.top_in * io.left_in)
+        res := (io.in_a * io.in_b)
}

-    reg_v := io.top_in
-    reg_h := io.left_in
-
-    io.bottom_out := reg_v
-    io.right_out := reg_h
io.out := res
}
14 changes: 5 additions & 9 deletions src/main/scala/npu/npu.scala
@@ -4,29 +4,25 @@ import chisel3._
import java.nio.file.{Paths, Files}
import java.nio.charset.StandardCharsets
import circt.stage.ChiselStage
-import procElem.PE
+import ncore.pe.PE

class NPU extends Module {

val nbits: Int = 8
val io = IO(new Bundle {
-        val top_in = Input(UInt(nbits.W))
-        val left_in = Input(UInt(nbits.W))
+        val in_a = Input(UInt(nbits.W))
+        val in_b = Input(UInt(nbits.W))
val accum = Input(Bool())
-        val bottom_out = Output(UInt((nbits*2).W))
-        val right_out = Output(UInt((nbits*2).W))
val out = Output(UInt((nbits*2).W))
})

val pe = Module(new PE(8))

// get value when ready
-    pe.io.top_in := io.top_in
-    pe.io.left_in := io.left_in
+    pe.io.in_a := io.in_a
+    pe.io.in_b := io.in_b
pe.io.accum := io.accum
io.out := pe.io.out
-    io.bottom_out := pe.io.bottom_out
-    io.right_out := pe.io.right_out
}

object Main extends App {
83 changes: 0 additions & 83 deletions src/main/scala/systolicArray/systolicArray.scala

This file was deleted.

