csr.bus: redesign Multiplexer shadow registers.
Before this commit, csr.Multiplexer had separate shadows for every
element in its memory map. The same shadow was shared for read and
write accesses to an element; a combined read/write transaction was
impossible despite being allowed by the CSR interface.

After this commit, csr.Multiplexer has separate shadows for read and
write accesses, but both shadows are shared by every element using
them. For multiplexers with many elements, this approach also results
in significant resource savings.
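
A minimal usage sketch of the new interface (not part of this commit; the bus geometry, register
widths, and addresses below are arbitrary, and the `Element` construction reflects the csr.bus API
at the time of this change):

from amaranth_soc.csr.bus import Element, Multiplexer

# A 16-deep, 8-bit wide CSR bus. shadow_overlaps=0 gives every register exclusive
# shadow chunks, at the cost of a larger shadow.
mux    = Multiplexer(addr_width=4, data_width=8, shadow_overlaps=0)
config = Element(24, "rw")   # occupies addresses 0x0-0x2
status = Element(12, "r")    # occupies addresses 0x8-0x9
mux.add(config, addr=0x0)
mux.add(status, addr=0x8)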
jfng committed Jul 24, 2023
1 parent d2ca157 commit d778b3d
Showing 2 changed files with 333 additions and 143 deletions.
233 changes: 198 additions & 35 deletions amaranth_soc/csr/bus.py
@@ -1,3 +1,4 @@
from collections import defaultdict
import enum
from amaranth import *
from amaranth.utils import log2_int
@@ -171,10 +172,87 @@ def memory_map(self, memory_map):


class Multiplexer(Elaboratable):
class _Shadow:
class Chunk:
def __init__(self, shadow, offset, elements):
self.name = f"{shadow.name}__{offset}"
self.data = Signal(shadow.shape, name=f"{self.name}__data")
self.r_en = Signal(name=f"{self.name}__r_en")
self.w_en = Signal(name=f"{self.name}__w_en")
self._elements = tuple(elements)

def elements(self):
yield from self._elements

def __init__(self, width, overlaps, *, name):
assert isinstance(name, str)
assert overlaps is None or isinstance(overlaps, int) and overlaps >= 0
self.name = name
self.shape = Shape.cast(unsigned(width))
self.overlaps = overlaps
self._ranges = set()
self._blocks = 1
self._chunks = None

def add(self, elem_range):
assert isinstance(elem_range, range)
self._ranges.add(elem_range)

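# Map a bus address to its chunk offset within the shadow: keep only the low address
# bits spanning the element's power-of-two rounded range, the current number of blocks,
# and one extra bit if the element is misaligned.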
def decode(self, addr, elem_range):
assert isinstance(addr, int) and isinstance(elem_range, range)
range_bits = log2_int(elem_range.stop - elem_range.start, need_pow2=False)
block_bits = log2_int(self._blocks)
misaligned = bool(elem_range.start % (2 ** range_bits))
mask = 2 ** (range_bits + block_bits + misaligned) - 1
return addr & mask

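# Inverse of decode(): reconstruct the bus address at which `elem_range` maps onto the
# given chunk offset, by combining the element's masked start address with the offset.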
def encode(self, offset, elem_range):
assert isinstance(offset, int) and isinstance(elem_range, range)
range_bits = log2_int(elem_range.stop - elem_range.start, need_pow2=False)
block_bits = log2_int(self._blocks)
misaligned = bool(elem_range.start % (2 ** range_bits))
mask = 2 ** (range_bits + block_bits + misaligned) - 1
return elem_range.start & ~mask | offset

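# Partition every added element range into shadow chunks. If a chunk ends up shared by
# more element ranges than `overlaps` permits, double the number of blocks (which widens
# the decode mask) and retry until chunk usage is balanced.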
def prepare(self):
if self.overlaps is None:
self.overlaps = len(self._ranges)

elements = defaultdict(list)
balanced = True

for elem_range in self._ranges:
for chunk_addr in elem_range:
chunk_offset = self.decode(chunk_addr, elem_range)
if len(elements[chunk_offset]) > self.overlaps:
balanced = False
break
elements[chunk_offset].append(elem_range)

if balanced:
self._ranges = frozenset(self._ranges)
self._chunks = dict()
for chunk_offset, chunk_elements in elements.items():
chunk = Multiplexer._Shadow.Chunk(self, chunk_offset, chunk_elements)
self._chunks[chunk_offset] = chunk
else:
self._blocks *= 2
self.prepare()

def chunks(self):
for chunk_offset, chunk in self._chunks.items():
yield chunk_offset, chunk

"""CSR register multiplexer.
An address-based multiplexer for CSR registers implementing atomic updates.
This implementation assumes the following from the CSR bus:
* an initiator must have exclusive ownership over the multiplexer for the full duration of
a register transaction;
* an initiator must access a register in ascending order of addresses, but it may abort a
transaction after any bus cycle.
Latency
-------
@@ -204,6 +282,59 @@ class Multiplexer(Elaboratable):
the register. This allows determining write latency solely from the number of addresses the
register occupies in the CPU address space, and the width of the CSR bus.
Buffering
---------
CSR registers use separate shadows to buffer read and write accesses, which can happen in the
same transaction. To save resources, both shadows are shared by every CSR register using them.
By default, the contents of a shadow are split into `2 ** ceil(log2(w/n))` chunks of *n* bits,
where *n* is the CSR bus data width and *w* is the width of the widest register.
For example, let's consider the read shadow of a multiplexer with a 4-bit wide bus:
.. code-block::
+----------+-----------------+
|  chunks  | 0 1 2 3 4 5 6 7 |
+----------+-----------------+
|          | A B C D E F . . | <- a 24-bit register at address 0x0, containing 0xFEDCBA
| r_shadow +-----------------+
|          | . . . . A B C . | <- a 12-bit register at address 0xC, containing 0xCBA
+----------+-----------------+
While this configuration minimizes the number of flip-flops in a shadow, having many registers
share the same chunks can create a deeply nested mux tree. This may impact timing closure or
logic usage, depending on the platform toolchain and technology.
The number of chunks in a shadow also depends on the following:
* If a CSR register isn't naturally aligned (i.e. its address isn't a multiple of its
size), it will use twice as many shadow chunks.
* If more registers share the same chunk than allowed by the `shadow_overlaps`
parameter, the shadow size is doubled (as many times as needed) in order to balance
chunk usage.
To illustrate the chunk balancing process, let's revisit the above example in the case where
`shadow_overlaps` is set to `0` (i.e. a chunk is exclusive to a single register):
.. code-block::
+----------+-----------------+-----------------+
|  blocks  |        0        |        1        |
+----------+-----------------+-----------------+
|  chunks  | 0 1 2 3 4 5 6 7 | 8 9 A B C D E F |
+----------+-----------------+-----------------+
|          | A B C D E F . . | . . . . . . . . | <- a 24-bit register at address 0x0
| r_shadow +-----------------+-----------------+
|          | . . . . . . . . | . . . . A B C . | <- a 12-bit register at address 0xC
+----------+-----------------+-----------------+
A use case for setting `shadow_overlaps` to `0` would be to relax the assumption of exclusive
ownership of the multiplexer to exclusive ownership of a set of registers (i.e. transactions
may be interleaved by multiple initiators, as long as they target different registers).
Note that unused shadow chunks are not instantiated in the RTL. The addressing scheme described
above is the same for the write shadow.
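As a small sketch of the sizing arithmetic above (reusing the 4-bit bus and 24-bit widest
register from the previous examples; the variable names are illustrative only):
.. code-block::
from math import ceil, log2
bus_width = 4                                    # n: CSR bus data width
widest    = 24                                   # w: width of the widest register
chunks    = 2 ** ceil(log2(widest / bus_width))  # == 8 chunks by default
In the `shadow_overlaps = 0` example above, the shadow grows to 2 blocks of 8 chunks, so that
the two registers no longer share any chunk.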
Parameters
----------
addr_width : int
@@ -214,16 +345,21 @@ class Multiplexer(Elaboratable):
Register alignment. See :class:`..memory.MemoryMap`.
name : str
Window name. Optional.
shadow_overlaps : int
Maximum number of CSR registers that can share a single chunk of a shadow register.
Optional. If `None`, any number of CSR registers can share a shadow chunk.
Attributes
----------
bus : :class:`Interface`
CSR bus providing access to registers.
"""
def __init__(self, *, addr_width, data_width, alignment=0, name=None):
def __init__(self, *, addr_width, data_width, alignment=0, name=None, shadow_overlaps=None):
self._map = MemoryMap(addr_width=addr_width, data_width=data_width, alignment=alignment,
name=name)
self._bus = None
self._r_shadow = Multiplexer._Shadow(data_width, shadow_overlaps, name="r_shadow")
self._w_shadow = Multiplexer._Shadow(data_width, shadow_overlaps, name="w_shadow")

@property
def bus(self):
@@ -258,50 +394,77 @@ def add(self, element, *, addr=None, alignment=None, extend=False):
def elaborate(self, platform):
m = Module()

# Instead of a straightforward multiplexer for reads, use a per-element address comparator,
# AND the shadow register chunk with the comparator output, and OR all of those together.
# If the toolchain doesn't already synthesize multiplexer trees this way, this trick can
# save a significant amount of logic, since e.g. one 4-LUT can pack one 2-MUX, but two
# 2-AND or 2-OR gates.
r_data_fanin = 0

for elem, _, (elem_start, elem_end) in self._map.resources():
shadow = Signal(elem.width, name="{}__shadow".format(elem.name))
elem_range = range(elem_start, elem_end)
if elem.access.readable():
shadow_en = Signal(elem_end - elem_start, name="{}__shadow_en".format(elem.name))
m.d.sync += shadow_en.eq(0)
self._r_shadow.add(elem_range)
if elem.access.writable():
m.d.comb += elem.w_data.eq(shadow)
m.d.sync += elem.w_stb.eq(0)
self._w_shadow.add(elem_range)

self._r_shadow.prepare()
self._w_shadow.prepare()

# Instead of a straightforward multiplexer for reads, use an address comparator for each
# shadow register chunk, AND the comparator output with the chunk contents, and OR all of
# those together. If the toolchain doesn't already synthesize multiplexer trees this way,
# this trick can save a significant amount of logic, since e.g. one 4-LUT can pack one
# 2-MUX, but two 2-AND or 2-OR gates.
r_data_fanin = 0

for chunk_offset, r_chunk in self._r_shadow.chunks():
# Use the same trick to select which element is read into a shadow register chunk.
r_chunk_w_en_fanin = 0
r_chunk_data_fanin = 0

m.d.sync += r_chunk.r_en.eq(0)

# Enumerate every address used by the register explicitly, rather than using
# arithmetic comparisons, since some toolchains (e.g. Yosys) are too eager to infer
# carry chains for comparisons, even with a constant. (Register sizes don't have
# to be powers of 2.)
with m.Switch(self.bus.addr):
for chunk_offset, chunk_addr in enumerate(range(elem_start, elem_end)):
shadow_slice = shadow.word_select(chunk_offset, self.bus.data_width)
for elem_range in r_chunk.elements():
chunk_addr = self._r_shadow.encode(chunk_offset, elem_range)
elem = self._map.decode_address(elem_range.start)
elem_offset = chunk_addr - elem_range.start
elem_slice = elem.r_data.word_select(elem_offset, self.bus.data_width)

with m.Case(chunk_addr):
if elem.access.readable():
r_data_fanin |= Mux(shadow_en[chunk_offset], shadow_slice, 0)
if chunk_addr == elem_start:
m.d.comb += elem.r_stb.eq(self.bus.r_stb)
with m.If(self.bus.r_stb):
m.d.sync += shadow.eq(elem.r_data)
# Delay by 1 cycle, allowing reads to be pipelined.
m.d.sync += shadow_en.eq(self.bus.r_stb << chunk_offset)

if elem.access.writable():
if chunk_addr == elem_end - 1:
# Delay by 1 cycle, avoiding combinatorial paths through
# the CSR bus and into CSR registers.
m.d.sync += elem.w_stb.eq(self.bus.w_stb)
with m.If(self.bus.w_stb):
m.d.sync += shadow_slice.eq(self.bus.w_data)
if chunk_addr == elem_range.start:
m.d.comb += elem.r_stb.eq(self.bus.r_stb)
# Delay by 1 cycle, allowing reads to be pipelined.
m.d.sync += r_chunk.r_en.eq(self.bus.r_stb)

r_chunk_w_en_fanin |= elem.r_stb
r_chunk_data_fanin |= Mux(elem.r_stb, elem_slice, 0)

m.d.comb += r_chunk.w_en.eq(r_chunk_w_en_fanin)
with m.If(r_chunk.w_en):
m.d.sync += r_chunk.data.eq(r_chunk_data_fanin)

r_data_fanin |= Mux(r_chunk.r_en, r_chunk.data, 0)

m.d.comb += self.bus.r_data.eq(r_data_fanin)

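# The write shadow is filled from the bus one chunk per cycle. Each chunk combinationally
# drives the w_data slices of the elements mapped onto it, so a register sees its complete
# value when its w_stb is asserted one cycle after its last chunk is written.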
for chunk_offset, w_chunk in self._w_shadow.chunks():
with m.Switch(self.bus.addr):
for elem_range in w_chunk.elements():
chunk_addr = self._w_shadow.encode(chunk_offset, elem_range)
elem = self._map.decode_address(elem_range.start)
elem_offset = chunk_addr - elem_range.start
elem_slice = elem.w_data.word_select(elem_offset, self.bus.data_width)

if chunk_addr == elem_range.stop - 1:
m.d.sync += elem.w_stb.eq(0)

with m.Case(chunk_addr):
if chunk_addr == elem_range.stop - 1:
# Delay by 1 cycle, avoiding combinatorial paths through
# the CSR bus and into CSR registers.
m.d.sync += elem.w_stb.eq(self.bus.w_stb)
m.d.comb += w_chunk.w_en.eq(self.bus.w_stb)

m.d.comb += elem_slice.eq(w_chunk.data)

with m.If(w_chunk.w_en):
m.d.sync += w_chunk.data.eq(self.bus.w_data)

return m

