csr.bus: redesign Multiplexer shadow registers.
Before this commit, csr.Multiplexer had separate shadows for every
element in its memory map. The same shadow was shared for read and
write accesses to an element; a combined read/write transaction was
impossible despite being allowed by the CSR interface.

After this commit, csr.Multiplexer has separate shadows for read and
write accesses, but both shadows are shared by every element using
them. For multiplexers with many elements, this approach also results
in significant resource savings.
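
A minimal usage sketch of the new interface (not part of this commit; the bus geometry, register
widths, and addresses below are arbitrary, and the `Element` construction reflects the csr.bus API
at the time of this change):

from amaranth_soc.csr.bus import Element, Multiplexer

# A 16-deep, 8-bit wide CSR bus. shadow_overlaps=0 gives every register exclusive
# shadow chunks, at the cost of a larger shadow.
mux    = Multiplexer(addr_width=4, data_width=8, shadow_overlaps=0)
config = Element(24, "rw")   # occupies addresses 0x0-0x2
status = Element(12, "r")    # occupies addresses 0x8-0x9
mux.add(config, addr=0x0)
mux.add(status, addr=0x8)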
jfng committed Jul 24, 2023
1 parent d2ca157 commit d778b3d
Showing 2 changed files with 333 additions and 143 deletions.
233 changes: 198 additions & 35 deletions amaranth_soc/csr/bus.py
@@ -1,3 +1,4 @@
from collections import defaultdict
import enum
from amaranth import *
from amaranth.utils import log2_int
@@ -171,10 +172,87 @@ def memory_map(self, memory_map):


class Multiplexer(Elaboratable):
class _Shadow:
class Chunk:
def __init__(self, shadow, offset, elements):
self.name = f"{shadow.name}__{offset}"
self.data = Signal(shadow.shape, name=f"{self.name}__data")
self.r_en = Signal(name=f"{self.name}__r_en")
self.w_en = Signal(name=f"{self.name}__w_en")
self._elements = tuple(elements)

def elements(self):
yield from self._elements

def __init__(self, width, overlaps, *, name):
assert isinstance(name, str)
assert overlaps is None or isinstance(overlaps, int) and overlaps >= 0
self.name = name
self.shape = Shape.cast(unsigned(width))
self.overlaps = overlaps
self._ranges = set()
self._blocks = 1
self._chunks = None

def add(self, elem_range):
assert isinstance(elem_range, range)
self._ranges.add(elem_range)

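# Map a bus address to its chunk offset within the shadow: keep only the low address
# bits spanning the element's power-of-two rounded range, the current number of blocks,
# and one extra bit if the element is misaligned.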
def decode(self, addr, elem_range):
assert isinstance(addr, int) and isinstance(elem_range, range)
range_bits = log2_int(elem_range.stop - elem_range.start, need_pow2=False)
block_bits = log2_int(self._blocks)
misaligned = bool(elem_range.start % (2 ** range_bits))
mask = 2 ** (range_bits + block_bits + misaligned) - 1
return addr & mask

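# Inverse of decode(): reconstruct the bus address at which `elem_range` maps onto the
# given chunk offset, by combining the element's masked start address with the offset.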
def encode(self, offset, elem_range):
assert isinstance(offset, int) and isinstance(elem_range, range)
range_bits = log2_int(elem_range.stop - elem_range.start, need_pow2=False)
block_bits = log2_int(self._blocks)
misaligned = bool(elem_range.start % (2 ** range_bits))
mask = 2 ** (range_bits + block_bits + misaligned) - 1
return elem_range.start & ~mask | offset

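# Partition every added element range into shadow chunks. If a chunk ends up shared by
# more element ranges than `overlaps` permits, double the number of blocks (which widens
# the decode mask) and retry until chunk usage is balanced.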
def prepare(self):
if self.overlaps is None:
self.overlaps = len(self._ranges)

elements = defaultdict(list)
balanced = True

for elem_range in self._ranges:
for chunk_addr in elem_range:
chunk_offset = self.decode(chunk_addr, elem_range)
if len(elements[chunk_offset]) > self.overlaps:
balanced = False
break
elements[chunk_offset].append(elem_range)

if balanced:
self._ranges = frozenset(self._ranges)
self._chunks = dict()
for chunk_offset, chunk_elements in elements.items():
chunk = Multiplexer._Shadow.Chunk(self, chunk_offset, chunk_elements)
self._chunks[chunk_offset] = chunk
else:
self._blocks *= 2
self.prepare()

def chunks(self):
for chunk_offset, chunk in self._chunks.items():
yield chunk_offset, chunk

"""CSR register multiplexer.
An address-based multiplexer for CSR registers implementing atomic updates.
This implementation assumes the following from the CSR bus:
* an initiator must have exclusive ownership over the multiplexer for the full duration of
a register transaction;
* an initiator must access a register in ascending order of addresses, but it may abort a
transaction after any bus cycle.
Latency
-------
@@ -204,6 +282,59 @@ class Multiplexer(Elaboratable):
the register. This allows determining write latency solely from the number of addresses the
register occupies in the CPU address space, and the width of the CSR bus.
Buffering
---------
CSR registers use separate shadows to buffer read and write accesses, which can happen in the
same transaction. To save resources, both shadows are shared by every CSR register using them.
By default, the contents of a shadow are split into `2 ** ceil(log2(w/n))` chunks of *n* bits,
where *n* is the CSR bus data width and *w* is the width of the widest register.
For example, let's consider the read shadow of a multiplexer with a 4-bit wide bus:
.. code-block::
+----------+-----------------+
|  chunks  | 0 1 2 3 4 5 6 7 |
+----------+-----------------+
|          | A B C D E F . . | <- a 24-bit register at address 0x0, containing 0xFEDCBA
| r_shadow +-----------------+
|          | . . . . A B C . | <- a 12-bit register at address 0xC, containing 0xCBA
+----------+-----------------+
While this configuration minimizes the number of flip-flops in a shadow, having many registers
share the same chunks can create a deeply nested mux tree. This may impact timing closure or
logic usage, depending on the platform toolchain and technology.
The number of chunks in a shadow also depends on the following:
* If a CSR register isn't naturally aligned (i.e. its address isn't a multiple of its
size), it will use twice as many shadow chunks.
* If more registers share the same chunk than allowed by the `shadow_overlaps`
parameter, the shadow size is doubled (as many times as needed) in order to balance
chunk usage.
To illustrate the chunk balancing process, let's revisit the above example in the case where
`shadow_overlaps` is set to `0` (i.e. a chunk is exclusive to a single register):
.. code-block::
+----------+-----------------+-----------------+
|  blocks  |        0        |        1        |
+----------+-----------------+-----------------+
|  chunks  | 0 1 2 3 4 5 6 7 | 8 9 A B C D E F |
+----------+-----------------+-----------------+
|          | A B C D E F . . | . . . . . . . . | <- a 24-bit register at address 0x0
| r_shadow +-----------------+-----------------+
|          | . . . . . . . . | . . . . A B C . | <- a 12-bit register at address 0xC
+----------+-----------------+-----------------+
A use case for setting `shadow_overlaps` to `0` would be to relax the assumption of exclusive
ownership of the multiplexer to exclusive ownership of a set of registers (i.e. transactions
may be interleaved by multiple initiators, as long as they target different registers).
Note that unused shadow chunks are not instantiated in the RTL. The addressing scheme described
above is the same for the write shadow.
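As a small sketch of the sizing arithmetic above (reusing the 4-bit bus and 24-bit widest
register from the previous examples; the variable names are illustrative only):
.. code-block::
from math import ceil, log2
bus_width = 4                                    # n: CSR bus data width
widest    = 24                                   # w: width of the widest register
chunks    = 2 ** ceil(log2(widest / bus_width))  # == 8 chunks by default
In the `shadow_overlaps = 0` example above, the shadow grows to 2 blocks of 8 chunks, so that
the two registers no longer share any chunk.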
Parameters
----------
addr_width : int
@@ -214,16 +345,21 @@ class Multiplexer(Elaboratable):
Register alignment. See :class:`..memory.MemoryMap`.
name : str
Window name. Optional.
shadow_overlaps : int
Maximum number of CSR registers that can share a single chunk of a shadow register.
Optional. If `None`, any number of CSR registers can share a shadow chunk.
Attributes
----------
bus : :class:`Interface`
CSR bus providing access to registers.
"""
def __init__(self, *, addr_width, data_width, alignment=0, name=None):
def __init__(self, *, addr_width, data_width, alignment=0, name=None, shadow_overlaps=None):
self._map = MemoryMap(addr_width=addr_width, data_width=data_width, alignment=alignment,
name=name)
self._bus = None
self._r_shadow = Multiplexer._Shadow(data_width, shadow_overlaps, name="r_shadow")
self._w_shadow = Multiplexer._Shadow(data_width, shadow_overlaps, name="w_shadow")

@property
def bus(self):
@@ -258,50 +394,77 @@ def add(self, element, *, addr=None, alignment=None, extend=False):
def elaborate(self, platform):
m = Module()

# Instead of a straightforward multiplexer for reads, use a per-element address comparator,
# AND the shadow register chunk with the comparator output, and OR all of those together.
# If the toolchain doesn't already synthesize multiplexer trees this way, this trick can
# save a significant amount of logic, since e.g. one 4-LUT can pack one 2-MUX, but two
# 2-AND or 2-OR gates.
r_data_fanin = 0

for elem, _, (elem_start, elem_end) in self._map.resources():
shadow = Signal(elem.width, name="{}__shadow".format(elem.name))
elem_range = range(elem_start, elem_end)
if elem.access.readable():
shadow_en = Signal(elem_end - elem_start, name="{}__shadow_en".format(elem.name))
m.d.sync += shadow_en.eq(0)
self._r_shadow.add(elem_range)
if elem.access.writable():
m.d.comb += elem.w_data.eq(shadow)
m.d.sync += elem.w_stb.eq(0)
self._w_shadow.add(elem_range)

self._r_shadow.prepare()
self._w_shadow.prepare()

# Instead of a straightforward multiplexer for reads, use an address comparator for each
# shadow register chunk, AND the comparator output with the chunk contents, and OR all of
# those together. If the toolchain doesn't already synthesize multiplexer trees this way,
# this trick can save a significant amount of logic, since e.g. one 4-LUT can pack one
# 2-MUX, but two 2-AND or 2-OR gates.
r_data_fanin = 0

for chunk_offset, r_chunk in self._r_shadow.chunks():
# Use the same trick to select which element is read into a shadow register chunk.
r_chunk_w_en_fanin = 0
r_chunk_data_fanin = 0

m.d.sync += r_chunk.r_en.eq(0)

# Enumerate every address used by the register explicitly, rather than using
# arithmetic comparisons, since some toolchains (e.g. Yosys) are too eager to infer
# carry chains for comparisons, even with a constant. (Register sizes don't have
# to be powers of 2.)
with m.Switch(self.bus.addr):
for chunk_offset, chunk_addr in enumerate(range(elem_start, elem_end)):
shadow_slice = shadow.word_select(chunk_offset, self.bus.data_width)
for elem_range in r_chunk.elements():
chunk_addr = self._r_shadow.encode(chunk_offset, elem_range)
elem = self._map.decode_address(elem_range.start)
elem_offset = chunk_addr - elem_range.start
elem_slice = elem.r_data.word_select(elem_offset, self.bus.data_width)

with m.Case(chunk_addr):
if elem.access.readable():
r_data_fanin |= Mux(shadow_en[chunk_offset], shadow_slice, 0)
if chunk_addr == elem_start:
m.d.comb += elem.r_stb.eq(self.bus.r_stb)
with m.If(self.bus.r_stb):
m.d.sync += shadow.eq(elem.r_data)
# Delay by 1 cycle, allowing reads to be pipelined.
m.d.sync += shadow_en.eq(self.bus.r_stb << chunk_offset)

if elem.access.writable():
if chunk_addr == elem_end - 1:
# Delay by 1 cycle, avoiding combinatorial paths through
# the CSR bus and into CSR registers.
m.d.sync += elem.w_stb.eq(self.bus.w_stb)
with m.If(self.bus.w_stb):
m.d.sync += shadow_slice.eq(self.bus.w_data)
if chunk_addr == elem_range.start:
m.d.comb += elem.r_stb.eq(self.bus.r_stb)
# Delay by 1 cycle, allowing reads to be pipelined.
m.d.sync += r_chunk.r_en.eq(self.bus.r_stb)

r_chunk_w_en_fanin |= elem.r_stb
r_chunk_data_fanin |= Mux(elem.r_stb, elem_slice, 0)

m.d.comb += r_chunk.w_en.eq(r_chunk_w_en_fanin)
with m.If(r_chunk.w_en):
m.d.sync += r_chunk.data.eq(r_chunk_data_fanin)

r_data_fanin |= Mux(r_chunk.r_en, r_chunk.data, 0)

m.d.comb += self.bus.r_data.eq(r_data_fanin)

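# The write shadow is filled from the bus one chunk per cycle. Each chunk combinationally
# drives the w_data slices of the elements mapped onto it, so a register sees its complete
# value when its w_stb is asserted one cycle after its last chunk is written.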
for chunk_offset, w_chunk in self._w_shadow.chunks():
with m.Switch(self.bus.addr):
for elem_range in w_chunk.elements():
chunk_addr = self._w_shadow.encode(chunk_offset, elem_range)
elem = self._map.decode_address(elem_range.start)
elem_offset = chunk_addr - elem_range.start
elem_slice = elem.w_data.word_select(elem_offset, self.bus.data_width)

if chunk_addr == elem_range.stop - 1:
m.d.sync += elem.w_stb.eq(0)

with m.Case(chunk_addr):
if chunk_addr == elem_range.stop - 1:
# Delay by 1 cycle, avoiding combinatorial paths through
# the CSR bus and into CSR registers.
m.d.sync += elem.w_stb.eq(self.bus.w_stb)
m.d.comb += w_chunk.w_en.eq(self.bus.w_stb)

m.d.comb += elem_slice.eq(w_chunk.data)

with m.If(w_chunk.w_en):
m.d.sync += w_chunk.data.eq(self.bus.w_data)

return m

