From 50bf4e7ab2c78928d51ab09b434c8d1ddfdc331c Mon Sep 17 00:00:00 2001 From: Eduardo Bart Date: Fri, 5 Apr 2024 23:26:36 -0300 Subject: [PATCH] feat: optimize instruction fetch and decoding with big jump tables --- .github/workflows/build.yml | 2 + Makefile | 9 +- src/.gitignore | 1 + src/Makefile | 44 +- src/device-state-access.h | 4 +- src/i-state-access.h | 5 +- src/interpret.cpp | 2399 ++++++++++++++++------------ src/interpret.h | 2 +- src/machine-state.h | 3 +- src/machine.cpp | 31 +- src/record-state-access.h | 46 +- src/record-step-state-access.h | 45 +- src/replay-state-access.h | 95 +- src/replay-step-state-access.h | 99 +- src/riscv-constants.h | 171 -- src/send-cmio-response.cpp | 8 +- src/send-cmio-response.h | 8 +- src/soft-float.h | 32 +- src/state-access.h | 11 - src/translate-virtual-address.h | 6 +- tools/gen-interpret-jump-table.lua | 602 +++++++ uarch/uarch-machine-state-access.h | 10 +- uarch/uarch-run.cpp | 6 +- 23 files changed, 2179 insertions(+), 1460 deletions(-) create mode 100755 tools/gen-interpret-jump-table.lua diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 252b475a0..6cff82ebe 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -103,6 +103,7 @@ jobs: uarch-pristine-ram.c uarch-pristine-hash.c machine-c-version.h + interpret-jump-table.h cartesi-machine-v${{ env.MACHINE_EMULATOR_VERSION }}_amd64.deb cartesi-machine-v${{ env.MACHINE_EMULATOR_VERSION }}_arm64.deb @@ -726,6 +727,7 @@ jobs: if: ${{ startsWith(github.ref, 'refs/tags/v') }} run: | mv artifacts/machine-c-version.h src + mv artifacts/interpret-jump-table.h src mv artifacts/uarch-pristine-ram.c uarch mv artifacts/uarch-pristine-hash.c uarch make create-generated-files-patch diff --git a/Makefile b/Makefile index 1d692987a..9640ae3eb 100644 --- a/Makefile +++ b/Makefile @@ -154,7 +154,7 @@ export CXX=g++ endif -GENERATED_FILES= uarch/uarch-pristine-hash.c uarch/uarch-pristine-ram.c src/machine-c-version.h +GENERATED_FILES= 
uarch/uarch-pristine-hash.c uarch/uarch-pristine-ram.c src/machine-c-version.h src/interpret-jump-table.h ADD_GENERATED_FILES_DIFF= add-generated-files.diff all: source-default @@ -244,12 +244,15 @@ lint-% check-format-% format-% check-format-lua-% check-lua-% format-lua-%: source-default: @eval $$($(MAKE) -s --no-print-directory env); $(MAKE) -C $(SRCDIR) -uarch: $(SRCDIR)/machine-c-version.h +uarch: $(SRCDIR)/machine-c-version.h $(SRCDIR)/interpret-jump-table.h @eval $$($(MAKE) -s --no-print-directory env); $(MAKE) -C uarch $(SRCDIR)/machine-c-version.h: @eval $$($(MAKE) -s --no-print-directory env); $(MAKE) -C $(SRCDIR) machine-c-version.h +$(SRCDIR)/interpret-jump-table.h: + @eval $$($(MAKE) -s --no-print-directory env); $(MAKE) -C $(SRCDIR) interpret-jump-table.h + build-emulator-builder-image: docker build $(DOCKER_PLATFORM) --build-arg DEBUG=$(debug) --build-arg COVERAGE=$(coverage) --build-arg SANITIZE=$(sanitize) --target builder -t cartesi/machine-emulator:builder -f Dockerfile . @@ -282,6 +285,7 @@ copy: docker create --name uarch-ram-bin $(DOCKER_PLATFORM) $(DEBIAN_IMG) docker cp uarch-ram-bin:/usr/src/emulator/$(DEB_FILENAME) . docker cp uarch-ram-bin:/usr/src/emulator/src/machine-c-version.h . + docker cp uarch-ram-bin:/usr/src/emulator/src/interpret-jump-table.h . docker cp uarch-ram-bin:/usr/src/emulator/uarch/uarch-ram.bin . docker cp uarch-ram-bin:/usr/src/emulator/uarch/uarch-pristine-ram.c . docker cp uarch-ram-bin:/usr/src/emulator/uarch/uarch-pristine-hash.c . 
@@ -399,4 +403,3 @@ $(ADD_GENERATED_FILES_DIFF): $(GENERATED_FILES) .PHONY: help all submodules doc clean distclean src luacartesi hash uarch \ create-generated-files-patch $(SUBDIRS) $(SUBCLEAN) - diff --git a/src/.gitignore b/src/.gitignore index ecdc63b96..b59133762 100644 --- a/src/.gitignore +++ b/src/.gitignore @@ -8,3 +8,4 @@ compile_flags.txt coverage* jsonrpc-discover.cpp machine-c-version.h +interpret-jump-table.h diff --git a/src/Makefile b/src/Makefile index 5f66b62d7..3450a747b 100644 --- a/src/Makefile +++ b/src/Makefile @@ -214,18 +214,27 @@ SHA3_CFLAGS=-O3 # Optimization flags for the interpreter ifneq (,$(filter yes,$(relwithdebinfo) $(release))) -ifneq (,$(filter gcc,$(CC))) -# The following flag helps GCC to eliminate more redundant computations in the interpret loop, -# saving some host instructions and improving performance. -# This flag is usually enabled by default at -O3, -# but we don't use -O3 because it enables some other flags that are not worth for the interpreter. 
-INTERPRET_CXXFLAGS+=-fgcse-after-reload -fpredictive-commoning -fsplit-paths -ftree-partial-pre +ifneq (,$(findstring gcc,$(CC))) +# The following improves computed goto dispatch as stated in GCC manual +INTERPRET_CXXFLAGS+=-fno-gcse +# The following remove extra jumps in the computed goto dispatch +INTERPRET_CXXFLAGS+=-fno-crossjumping +# The following remove extra NOPs before jumping back to the interpret hot loop +INTERPRET_CXXFLAGS+=-fno-align-loops +# The interpreter dispatch loop performs better as a big inlined function +INTERPRET_CXXFLAGS+=-finline-limit=1024 +# The interpreter hot loop is big and puts pressure on register allocation, this improves register use +INTERPRET_CXXFLAGS+=-frename-registers -fweb +# The interpreter instruction dispatch is big, the following reduces its size minimizing CPU cache pressure +INTERPRET_CXXFLAGS+=-freorder-blocks-algorithm=simple +# Some distributions enables stack protector by default, make sure it's disabled +INTERPRET_CXXFLAGS+=-fno-stack-protector endif -# Disable jump tables, because it degrades the instruction decoding performance in the interpret loop, -# since it generates a memory indirection that has a high cost in opcode switches. 
-INTERPRET_CXXFLAGS+=-fno-jump-tables endif +# Make testing new optimization options easier +INTERPRET_CXXFLAGS+=$(MYINTERPRET_CXXFLAGS) + # Link time optimizations ifeq ($(lto),yes) OPTFLAGS+=-flto=auto @@ -262,7 +271,7 @@ PGO_WORKLOAD=\ whetstone 25000 LINTER_IGNORE_SOURCES= -LINTER_IGNORE_HEADERS= +LINTER_IGNORE_HEADERS=interpret-jump-table.h LINTER_SOURCES=$(filter-out $(LINTER_IGNORE_SOURCES),$(strip $(wildcard *.cpp) $(wildcard *.c))) LINTER_HEADERS=$(filter-out $(LINTER_IGNORE_HEADERS),$(strip $(wildcard *.hpp) $(wildcard *.h))) @@ -273,7 +282,7 @@ CLANG_FORMAT=clang-format CLANG_FORMAT_UARCH_FILES:=$(wildcard ../uarch/*.cpp) CLANG_FORMAT_UARCH_FILES:=$(filter-out %uarch-printf%,$(strip $(CLANG_FORMAT_UARCH_FILES))) CLANG_FORMAT_FILES:=$(wildcard *.cpp) $(wildcard *.c) $(wildcard *.h) $(wildcard *.hpp) $(CLANG_FORMAT_UARCH_FILES) -CLANG_FORMAT_IGNORE_FILES:= +CLANG_FORMAT_IGNORE_FILES:=interpret-jump-table.h CLANG_FORMAT_FILES:=$(strip $(CLANG_FORMAT_FILES)) CLANG_FORMAT_FILES:=$(filter-out $(CLANG_FORMAT_IGNORE_FILES),$(strip $(CLANG_FORMAT_FILES))) @@ -542,12 +551,12 @@ jsonrpc-discover.cpp: jsonrpc-discover.json echo '} // namespace cartesi' >> jsonrpc-discover.cpp %.clang-tidy: %.cpp machine-c-version.h - @$(CLANG_TIDY) --header-filter='$(CLANG_TIDY_HEADER_FILTER)' $(CLANG_TIDY_FLAGS) $< -- $(CXXFLAGS) $(LUA_INC) 2>/dev/null + @$(CLANG_TIDY) --header-filter='$(CLANG_TIDY_HEADER_FILTER)' $(CLANG_TIDY_FLAGS) $< -- $(CXXFLAGS) $(LUA_INC) -DCLANG_TIDY_LINT 2>/dev/null @$(CXX) $(CXXFLAGS) $(LUA_INC) $< -MM -MT $@ -MF $@.d > /dev/null 2>&1 @touch $@ %.clang-tidy: %.c - @$(CLANG_TIDY) --header-filter='$(CLANG_TIDY_HEADER_FILTER)' $(CLANG_TIDY_FLAGS) $< -- $(CFLAGS) 2>/dev/null + @$(CLANG_TIDY) --header-filter='$(CLANG_TIDY_HEADER_FILTER)' $(CLANG_TIDY_FLAGS) $< -- $(CFLAGS) -DCLANG_TIDY_LINT 2>/dev/null @$(CC) $(CFLAGS) $< -MM -MT $@ -MF $@.d > /dev/null 2>&1 @touch $@ @@ -560,7 +569,10 @@ uarch-pristine-ram.o: $(UARCH_PRISTINE_RAM_C) uarch-pristine-hash.o: 
$(UARCH_PRISTINE_HASH_C) $(CC) $(CFLAGS) -c -o $@ $< -interpret.o: interpret.cpp machine-c-version.h +interpret-jump-table.h: ../tools/gen-interpret-jump-table.lua + $< > $@ + +interpret.o: interpret.cpp interpret-jump-table.h machine-c-version.h $(CXX) $(CXXFLAGS) $(INTERPRET_CXXFLAGS) -c -o $@ $< %.o: %.cpp machine-c-version.h @@ -571,7 +583,7 @@ interpret.o: interpret.cpp machine-c-version.h ../uarch/uarch-pristine-ram.c ../uarch/uarch-pristine-hash.c: generate-uarch-pristine -generate-uarch-pristine: +generate-uarch-pristine: machine-c-version.h interpret-jump-table.h ifeq (,$(wildcard ../uarch/uarch-pristine-hash.c)) @if [ "$(DEV_ENV_HAS_TOOLCHAIN)" = "yes" ]; then \ $(MAKE) -C .. uarch; \ @@ -583,7 +595,7 @@ endif clean: clean-auto-generated clean-coverage clean-profile clean-tidy clean-libcartesi clean-executables clean-auto-generated: - @rm -f jsonrpc-discover.cpp machine-c-version.h + @rm -f jsonrpc-discover.cpp machine-c-version.h interpret-jump-table.h clean-tidy: @rm -f *.clang-tidy diff --git a/src/device-state-access.h b/src/device-state-access.h index 51d5633a9..ef471818d 100644 --- a/src/device-state-access.h +++ b/src/device-state-access.h @@ -36,7 +36,7 @@ namespace cartesi { template class device_state_access : public i_device_state_access { public: - explicit device_state_access(STATE_ACCESS &a, uint64_t mcycle) : m_a(a), m_mcycle(mcycle) { + explicit device_state_access(STATE_ACCESS a, uint64_t mcycle) : m_a(a), m_mcycle(mcycle) { static_assert(is_an_i_state_access::value, "not an i_state_access"); } @@ -52,7 +52,7 @@ class device_state_access : public i_device_state_access { ~device_state_access() override = default; private: - STATE_ACCESS &m_a; // NOLINT(cppcoreguidelines-avoid-const-or-ref-data-members) + STATE_ACCESS m_a; // NOLINT(cppcoreguidelines-avoid-const-or-ref-data-members) uint64_t m_mcycle; void do_set_mip(uint64_t mask) override { diff --git a/src/i-state-access.h b/src/i-state-access.h index d692b32c2..d4b126ea6 100644 --- 
a/src/i-state-access.h +++ b/src/i-state-access.h @@ -24,6 +24,7 @@ #include #include +#include "compiler-defines.h" #include "meta.h" #include "shadow-tlb.h" @@ -773,7 +774,7 @@ class i_state_access { // CRTP } /// \brief Invalidates all TLB entries of all types. - void flush_all_tlb() { + NO_INLINE void flush_all_tlb() { derived().template flush_tlb_type(); derived().template flush_tlb_type(); derived().template flush_tlb_type(); @@ -781,7 +782,7 @@ class i_state_access { // CRTP /// \brief Invalidates TLB entries for a specific virtual address. /// \param vaddr Target virtual address. - void flush_tlb_vaddr(uint64_t vaddr) { + NO_INLINE void flush_tlb_vaddr(uint64_t vaddr) { return derived().do_flush_tlb_vaddr(vaddr); } diff --git a/src/interpret.cpp b/src/interpret.cpp index 572d317e3..35769e9fd 100644 --- a/src/interpret.cpp +++ b/src/interpret.cpp @@ -113,6 +113,11 @@ namespace cartesi { +enum class rd_kind { + x0, // rd = 0 + xN, // rd is a positive natural number (1, 2, 3 ... 31) +}; + #ifdef DUMP_REGS static const std::array reg_name{"zero", "ra", "sp", "gp", "tp", "t0", "t1", "t2", "s0", "s1", "a0", "a1", "a2", "a3", "a4", "a5", "a6", "a7", "s2", "s3", "s4", "s5", "s6", "s7", "s8", "s9", "s10", "s11", @@ -296,6 +301,12 @@ static void dump_regs(const STATE &s) { } #endif +/// \brief Checks if a instruction is uncompressed. +/// \param insn Instruction. +static FORCE_INLINE bool insn_is_uncompressed(uint32_t insn) { + return (insn & 3) == 3; +} + /// \brief Checks if CSR is read-only. /// \param csraddr Address of CSR in file. /// \returns true if read-only, false otherwise. @@ -318,7 +329,7 @@ static inline uint32_t csr_priv(CSR_address csr) { /// \param new_prv New privilege level. /// \details This function is outlined to minimize host CPU code cache pressure. 
template -static NO_INLINE void set_priv(STATE_ACCESS &a, int new_prv) { +static FORCE_INLINE void set_priv(STATE_ACCESS a, int new_prv) { INC_COUNTER(a.get_statistics(), priv_level[new_prv]); a.write_iflags_PRV(new_prv); // Invalidate all TLB entries @@ -339,7 +350,13 @@ static NO_INLINE void set_priv(STATE_ACCESS &a, int new_prv) { /// \returns The new program counter, pointing to the raised exception trap handler. /// \details This function is outlined to minimize host CPU code cache pressure. template -static NO_INLINE uint64_t raise_exception(STATE_ACCESS &a, uint64_t pc, uint64_t cause, uint64_t tval) { +static NO_INLINE uint64_t raise_exception(STATE_ACCESS a, uint64_t pc, uint64_t cause, uint64_t tval) { + if (cause == MCAUSE_ILLEGAL_INSN && !insn_is_uncompressed(static_cast(tval))) { + // Discard high bits of compressed instructions, + // this is not performed in the instruction hot loop as an optimization. + tval = static_cast(tval); + } + #if defined(DUMP_EXCEPTIONS) || defined(DUMP_MMU_EXCEPTIONS) || defined(DUMP_INTERRUPTS) || \ defined(DUMP_ILLEGAL_INSN_EXCEPTIONS) { @@ -442,7 +459,7 @@ static NO_INLINE uint64_t raise_exception(STATE_ACCESS &a, uint64_t pc, uint64_t /// \param a Machine state accessor object. /// \returns The mask. template -static inline uint32_t get_pending_irq_mask(STATE_ACCESS &a) { +static inline uint32_t get_pending_irq_mask(STATE_ACCESS a) { const uint64_t mip = a.read_mip(); const uint64_t mie = a.read_mie(); @@ -522,7 +539,7 @@ static inline uint32_t get_highest_priority_irq_num(uint32_t v) { /// \param a Machine state accessor object. /// \param pc Machine current program counter. 
template -static inline uint64_t raise_interrupt_if_any(STATE_ACCESS &a, uint64_t pc) { +static inline uint64_t raise_interrupt_if_any(STATE_ACCESS a, uint64_t pc) { const uint32_t mask = get_pending_irq_mask(a); if (unlikely(mask != 0)) { const uint64_t irq_num = get_highest_priority_irq_num(mask); @@ -535,7 +552,7 @@ static inline uint64_t raise_interrupt_if_any(STATE_ACCESS &a, uint64_t pc) { /// \param a Machine state accessor object. /// \param mcycle Machine current cycle. template -static inline void set_rtc_interrupt(STATE_ACCESS &a, uint64_t mcycle) { +static inline void set_rtc_interrupt(STATE_ACCESS a, uint64_t mcycle) { const uint64_t timecmp_cycle = rtc_time_to_cycle(a.read_clint_mtimecmp()); if (timecmp_cycle <= mcycle && timecmp_cycle != 0) { const uint64_t mip = a.read_mip(); @@ -543,10 +560,10 @@ static inline void set_rtc_interrupt(STATE_ACCESS &a, uint64_t mcycle) { } } -/// \brief Obtains the funct3 and opcode fields an instruction. +/// \brief Obtains the id fields an instruction. /// \param insn Instruction. -static inline uint32_t insn_get_funct3_00000_opcode(uint32_t insn) { - return insn & 0b111000001111111; +static FORCE_INLINE uint32_t insn_get_id(uint32_t insn) { + return insn & 0b1111'11111'1111111; } /// \brief Obtains the funct3 and trailing 0 bits from an instruction. @@ -668,24 +685,6 @@ static inline uint32_t insn_get_rs3(uint32_t insn) { return (insn >> 27); } -/// \brief Obtains the compressed instruction funct3 and opcode fields an instruction. -/// \param insn Instruction. -static inline uint32_t insn_get_c_funct3(uint32_t insn) { - return insn & 0b1110000000000011; -} - -/// \brief Obtains the compressed instruction funct6, funct2 and opcode fields an instruction. -/// \param insn Instruction. -static inline uint32_t insn_get_CA_funct6_funct2(uint32_t insn) { - return insn & 0b1111110001100011; -} - -/// \brief Obtains the compressed instruction funct2 and opcode fields an instruction. -/// \param insn Instruction. 
-static inline uint32_t insn_get_CB_funct2(uint32_t insn) { - return insn & 0b1110110000000011; -} - /// \brief Obtains the RD field from a compressed instructions that uses the CIW /// or CL format and RS2 field from CS or CA. /// \param insn Instruction. @@ -709,19 +708,29 @@ static inline uint32_t insn_get_CR_CSS_rs2(uint32_t insn) { /// \param insn Instruction. /// \details This function is forced to be inline because GCC may not always inline it. static FORCE_INLINE int32_t insn_get_C_J_imm(uint32_t insn) { - auto imm = static_cast(((insn >> (12 - 11)) & 0x800) | ((insn >> (11 - 4)) & 0x10) | - ((insn >> (9 - 8)) & 0x300) | ((insn << (10 - 8)) & 0x400) | ((insn >> (7 - 6)) & 0x40) | - ((insn << (7 - 6)) & 0x80) | ((insn >> (3 - 1)) & 0xe) | ((insn << (5 - 2)) & 0x20)); - return (imm << 20) >> 20; + return static_cast( + (static_cast(static_cast(insn << 19) >> 20) & ~0b11111111111) | // imm[11] + ((insn >> (11 - 4)) & 0b10000) | // imm[4] + ((insn >> (9 - 8)) & 0b1100000000) | // imm[9:8] + ((insn << (10 - 8)) & 0b10000000000) | // imm[10] + ((insn >> (7 - 6)) & 0b1000000) | // imm[6] + ((insn << (7 - 6)) & 0b10000000) | // imm[7] + ((insn >> (3 - 1)) & 0b1110) | // imm[3:1] + ((insn << (5 - 2)) & 0b100000) // imm[5] + ); } /// \brief Obtains the immediate value from a C_BEQZ and C_BNEZ instruction. /// \param insn Instruction. /// \details This function is forced to be inline because GCC may not always inline it. 
static FORCE_INLINE int32_t insn_get_C_BEQZ_BNEZ_imm(uint32_t insn) { - auto imm = static_cast(((insn >> (12 - 8)) & 0x100) | ((insn >> (10 - 3)) & 0x18) | - ((insn << (6 - 5)) & 0xc0) | ((insn >> (3 - 1)) & 0x6) | ((insn << (5 - 2)) & 0x20)); - return (imm << 23) >> 23; + return static_cast( + (static_cast(static_cast(insn << 19) >> 23) & ~0b11111111) | // imm[8] + ((insn >> 7) & 0b11000) | // imm[4:3] + ((insn << 1) & 0b11000000) | // imm[7:6] + ((insn >> 2) & 0b110) | // imm[2:1] + ((insn << 3) & 0b100000) // imm[5] + ); } /// \brief Obtains the immediate value from a CL/CS-type instruction. @@ -742,7 +751,9 @@ static FORCE_INLINE uint32_t insn_get_CI_CB_imm(uint32_t insn) { /// \param insn Instruction. /// \details This function is forced to be inline because GCC may not always inline it. static FORCE_INLINE int32_t insn_get_CI_CB_imm_se(uint32_t insn) { - return static_cast(insn_get_CI_CB_imm(insn) << 26) >> 26; + return static_cast((static_cast(static_cast(insn << 19) >> 26) & ~0b11111) | // imm[5] + ((insn >> 2) & 0b11111) // imm[4:0] + ); } /// \brief Obtains the immediate value from a C.LW and C.SW instructions. @@ -764,17 +775,23 @@ static FORCE_INLINE uint32_t insn_get_CIW_imm(uint32_t insn) { /// \param insn Instruction. /// \details This function is forced to be inline because GCC may not always inline it. static FORCE_INLINE int32_t insn_get_C_ADDI16SP_imm(uint32_t insn) { - auto imm = static_cast(((insn >> (12 - 9)) & 0x200) | ((insn >> (6 - 4)) & 0x10) | - ((insn << (6 - 5)) & 0x40) | ((insn << (7 - 3)) & 0x180) | ((insn << (5 - 2)) & 0x20)); - return (imm << 22) >> 22; + return static_cast( + (static_cast(static_cast(insn << 19) >> 22) & ~0b111111111) | // imm[9] + ((insn >> 2) & 0b10000) | // imm[4] + ((insn << 1) & 0b1000000) | // imm[6] + ((insn << 4) & 0b110000000) | // imm[8:7] + ((insn << 3) & 0b100000) // imm[5] + ); } /// \brief Obtains the immediate value from a C.LUI instruction. /// \param insn Instruction. 
/// \details This function is forced to be inline because GCC may not always inline it. static FORCE_INLINE int32_t insn_get_C_LUI_imm(uint32_t insn) { - auto imm = static_cast(((insn << (17 - 12)) & 0x20000) | ((insn << (12 - 2)) & 0x1F000)); - return (imm << 14) >> 14; + return static_cast( + (static_cast(static_cast(insn << 19) >> 14) & ~0b11111111111111111) | // imm[17] + ((insn << 10) & 0b11111000000000000) // imm[16:12] + ); } /// \brief Obtains the immediate value from a C.FLDSP and C.LDSP instructions. @@ -821,7 +838,7 @@ static FORCE_INLINE int32_t insn_get_C_SWSP_imm(uint32_t insn) { /// is outlined, and taking PC by reference would cause the compiler to store it in a stack variable /// instead of always storing it in register (this is an optimization). template -static NO_INLINE std::pair read_virtual_memory_slow(STATE_ACCESS &a, uint64_t pc, uint64_t mcycle, +static NO_INLINE std::pair read_virtual_memory_slow(STATE_ACCESS a, uint64_t pc, uint64_t mcycle, uint64_t vaddr, T *pval) { using U = std::make_unsigned_t; // No support for misaligned accesses: They are handled by a trap in BBL @@ -870,13 +887,15 @@ static NO_INLINE std::pair read_virtual_memory_slow(STATE_ACCESS /// \param pval Pointer to word receiving value. /// \returns True if succeeded, false otherwise. 
template -static FORCE_INLINE bool read_virtual_memory(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint64_t vaddr, T *pval) { +static FORCE_INLINE bool read_virtual_memory(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint64_t vaddr, T *pval) { // Try hitting the TLB if (unlikely(!(a.template read_memory_word_via_tlb(vaddr, pval)))) { // Outline the slow path into a function call to minimize host CPU code cache pressure INC_COUNTER(a.get_statistics(), tlb_rmiss); + T val = 0; // Don't pass pval reference directly so the compiler can store it in a register auto [status, new_pc] = - read_virtual_memory_slow(a, pc, mcycle, vaddr, pval); + read_virtual_memory_slow(a, pc, mcycle, vaddr, &val); + *pval = val; pc = new_pc; return status; } @@ -899,7 +918,7 @@ static FORCE_INLINE bool read_virtual_memory(STATE_ACCESS &a, uint64_t &pc, uint /// is outlined, and taking PC by reference would cause the compiler to store it in a stack variable /// instead of always storing it in register (this is an optimization). template -static NO_INLINE std::pair write_virtual_memory_slow(STATE_ACCESS &a, uint64_t pc, +static NO_INLINE std::pair write_virtual_memory_slow(STATE_ACCESS a, uint64_t pc, uint64_t mcycle, uint64_t vaddr, uint64_t val64) { using U = std::make_unsigned_t; // No support for misaligned accesses: They are handled by a trap in BBL @@ -944,7 +963,7 @@ static NO_INLINE std::pair write_virtual_memory_slow(S /// \param val64 Value to write. /// \returns True if succeeded, false if exception raised. 
template -static FORCE_INLINE execute_status write_virtual_memory(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint64_t vaddr, +static FORCE_INLINE execute_status write_virtual_memory(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint64_t vaddr, uint64_t val64) { // Try hitting the TLB if (unlikely((!a.template write_memory_word_via_tlb(vaddr, static_cast(val64))))) { @@ -959,7 +978,7 @@ static FORCE_INLINE execute_status write_virtual_memory(STATE_ACCESS &a, uint64_ } template -static void dump_insn([[maybe_unused]] STATE_ACCESS &a, [[maybe_unused]] uint64_t pc, [[maybe_unused]] uint32_t insn, +static void dump_insn([[maybe_unused]] STATE_ACCESS a, [[maybe_unused]] uint64_t pc, [[maybe_unused]] uint32_t insn, [[maybe_unused]] const char *name) { #ifdef DUMP_HIST a.get_naked_state().insn_hist[name]++; @@ -993,7 +1012,7 @@ static void dump_insn([[maybe_unused]] STATE_ACCESS &a, [[maybe_unused]] uint64_ /// \details This function is tail-called whenever the caller decoded enough of the instruction to identify it as /// illegal. template -static FORCE_INLINE execute_status raise_illegal_insn_exception(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status raise_illegal_insn_exception(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { pc = raise_exception(a, pc, MCAUSE_ILLEGAL_INSN, insn); return execute_status::failure; } @@ -1005,7 +1024,7 @@ static FORCE_INLINE execute_status raise_illegal_insn_exception(STATE_ACCESS &a, /// \return execute_status::failure /// \details This function is tail-called whenever the caller identified that the next value of pc is misaligned. 
template -static FORCE_INLINE execute_status raise_misaligned_fetch_exception(STATE_ACCESS &a, uint64_t &pc, uint64_t new_pc) { +static FORCE_INLINE execute_status raise_misaligned_fetch_exception(STATE_ACCESS a, uint64_t &pc, uint64_t new_pc) { pc = raise_exception(a, pc, MCAUSE_INSN_ADDRESS_MISALIGNED, new_pc); return execute_status::failure; } @@ -1017,7 +1036,7 @@ static FORCE_INLINE execute_status raise_misaligned_fetch_exception(STATE_ACCESS /// \return execute_status::failure /// \details This function is tail-called whenever the caller identified a raised exception. template -static FORCE_INLINE execute_status advance_to_raised_exception(STATE_ACCESS & /*a*/, uint64_t & /*pc*/) { +static FORCE_INLINE execute_status advance_to_raised_exception(STATE_ACCESS /*a*/, uint64_t & /*pc*/) { return execute_status::failure; } @@ -1030,7 +1049,7 @@ static FORCE_INLINE execute_status advance_to_raised_exception(STATE_ACCESS & /* /// \return status /// \details This function is tail-called whenever the caller wants move to the next instruction. template -static FORCE_INLINE execute_status advance_to_next_insn(STATE_ACCESS & /*a*/, uint64_t &pc, +static FORCE_INLINE execute_status advance_to_next_insn(STATE_ACCESS /*a*/, uint64_t &pc, execute_status status = execute_status::success) { pc += static_cast(size); return status; @@ -1043,7 +1062,7 @@ static FORCE_INLINE execute_status advance_to_next_insn(STATE_ACCESS & /*a*/, ui /// \return execute_status::success /// \details This function is tail-called whenever the caller wants to jump. template -static FORCE_INLINE execute_status execute_jump(STATE_ACCESS & /*a*/, uint64_t &pc, uint64_t new_pc) { +static FORCE_INLINE execute_status execute_jump(STATE_ACCESS /*a*/, uint64_t &pc, uint64_t new_pc) { pc = new_pc; return execute_status::success; } @@ -1054,7 +1073,7 @@ static FORCE_INLINE execute_status execute_jump(STATE_ACCESS & /*a*/, uint64_t & /// \param pc Interpreter loop program counter (will be overwritten). 
/// \param insn Instruction. template -static FORCE_INLINE execute_status execute_LR(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +static FORCE_INLINE execute_status execute_LR(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { const uint64_t vaddr = a.read_x(insn_get_rs1(insn)); T val = 0; if (unlikely(!read_virtual_memory(a, pc, mcycle, vaddr, &val))) { @@ -1075,7 +1094,7 @@ static FORCE_INLINE execute_status execute_LR(STATE_ACCESS &a, uint64_t &pc, uin /// \param pc Interpreter loop program counter (will be overwritten). /// \param insn Instruction. template -static FORCE_INLINE execute_status execute_SC(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +static FORCE_INLINE execute_status execute_SC(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { uint64_t val = 0; const uint64_t vaddr = a.read_x(insn_get_rs1(insn)); execute_status status = execute_status::success; @@ -1098,7 +1117,7 @@ static FORCE_INLINE execute_status execute_SC(STATE_ACCESS &a, uint64_t &pc, uin /// \brief Implementation of the LR.W instruction. template -static FORCE_INLINE execute_status execute_LR_W(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +static FORCE_INLINE execute_status execute_LR_W(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { if (unlikely((insn & 0b00000001111100000000000000000000) != 0)) { return raise_illegal_insn_exception(a, pc, insn); } @@ -1108,13 +1127,13 @@ static FORCE_INLINE execute_status execute_LR_W(STATE_ACCESS &a, uint64_t &pc, u /// \brief Implementation of the SC.W instruction. 
template -static FORCE_INLINE execute_status execute_SC_W(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +static FORCE_INLINE execute_status execute_SC_W(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { dump_insn(a, pc, insn, "sc.w"); return execute_SC(a, pc, mcycle, insn); } template -static FORCE_INLINE execute_status execute_AMO(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn, +static FORCE_INLINE execute_status execute_AMO(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn, const F &f) { const uint64_t vaddr = a.read_x(insn_get_rs1(insn)); T valm = 0; @@ -1139,14 +1158,14 @@ static FORCE_INLINE execute_status execute_AMO(STATE_ACCESS &a, uint64_t &pc, ui /// \brief Implementation of the AMOSWAP.W instruction. template -static FORCE_INLINE execute_status execute_AMOSWAP_W(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +static FORCE_INLINE execute_status execute_AMOSWAP_W(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { dump_insn(a, pc, insn, "amoswap.w"); return execute_AMO(a, pc, mcycle, insn, [](int32_t /*valm*/, int32_t valr) -> int32_t { return valr; }); } /// \brief Implementation of the AMOADD.W instruction. 
template -static FORCE_INLINE execute_status execute_AMOADD_W(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +static FORCE_INLINE execute_status execute_AMOADD_W(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { dump_insn(a, pc, insn, "amoadd.w"); return execute_AMO(a, pc, mcycle, insn, [](int32_t valm, int32_t valr) -> int32_t { int32_t val = 0; @@ -1156,28 +1175,28 @@ static FORCE_INLINE execute_status execute_AMOADD_W(STATE_ACCESS &a, uint64_t &p } template -static FORCE_INLINE execute_status execute_AMOXOR_W(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +static FORCE_INLINE execute_status execute_AMOXOR_W(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { dump_insn(a, pc, insn, "amoxor.w"); return execute_AMO(a, pc, mcycle, insn, [](int32_t valm, int32_t valr) -> int32_t { return valm ^ valr; }); } /// \brief Implementation of the AMOAND.W instruction. template -static FORCE_INLINE execute_status execute_AMOAND_W(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +static FORCE_INLINE execute_status execute_AMOAND_W(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { dump_insn(a, pc, insn, "amoand.w"); return execute_AMO(a, pc, mcycle, insn, [](int32_t valm, int32_t valr) -> int32_t { return valm & valr; }); } /// \brief Implementation of the AMOOR.W instruction. template -static FORCE_INLINE execute_status execute_AMOOR_W(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +static FORCE_INLINE execute_status execute_AMOOR_W(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { dump_insn(a, pc, insn, "amoor.w"); return execute_AMO(a, pc, mcycle, insn, [](int32_t valm, int32_t valr) -> int32_t { return valm | valr; }); } /// \brief Implementation of the AMOMIN.W instruction. 
template -static FORCE_INLINE execute_status execute_AMOMIN_W(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +static FORCE_INLINE execute_status execute_AMOMIN_W(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { dump_insn(a, pc, insn, "amomin.w"); return execute_AMO(a, pc, mcycle, insn, [](int32_t valm, int32_t valr) -> int32_t { if (valm < valr) { @@ -1189,7 +1208,7 @@ static FORCE_INLINE execute_status execute_AMOMIN_W(STATE_ACCESS &a, uint64_t &p /// \brief Implementation of the AMOMAX.W instruction. template -static FORCE_INLINE execute_status execute_AMOMAX_W(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +static FORCE_INLINE execute_status execute_AMOMAX_W(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { dump_insn(a, pc, insn, "amomax.w"); return execute_AMO(a, pc, mcycle, insn, [](int32_t valm, int32_t valr) -> int32_t { if (valm > valr) { @@ -1201,7 +1220,7 @@ static FORCE_INLINE execute_status execute_AMOMAX_W(STATE_ACCESS &a, uint64_t &p /// \brief Implementation of the AMOMINU.W instruction. template -static FORCE_INLINE execute_status execute_AMOMINU_W(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +static FORCE_INLINE execute_status execute_AMOMINU_W(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { dump_insn(a, pc, insn, "amominu.w"); return execute_AMO(a, pc, mcycle, insn, [](int32_t valm, int32_t valr) -> int32_t { if (static_cast(valm) < static_cast(valr)) { @@ -1213,7 +1232,7 @@ static FORCE_INLINE execute_status execute_AMOMINU_W(STATE_ACCESS &a, uint64_t & /// \brief Implementation of the AMOMAXU.W instruction. 
template -static FORCE_INLINE execute_status execute_AMOMAXU_W(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +static FORCE_INLINE execute_status execute_AMOMAXU_W(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { dump_insn(a, pc, insn, "amomaxu.w"); return execute_AMO(a, pc, mcycle, insn, [](int32_t valm, int32_t valr) -> int32_t { if (static_cast(valm) > static_cast(valr)) { @@ -1225,7 +1244,7 @@ static FORCE_INLINE execute_status execute_AMOMAXU_W(STATE_ACCESS &a, uint64_t & /// \brief Implementation of the LR.D instruction. template -static FORCE_INLINE execute_status execute_LR_D(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +static FORCE_INLINE execute_status execute_LR_D(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { if (unlikely((insn & 0b00000001111100000000000000000000) != 0)) { return raise_illegal_insn_exception(a, pc, insn); } @@ -1235,21 +1254,21 @@ static FORCE_INLINE execute_status execute_LR_D(STATE_ACCESS &a, uint64_t &pc, u /// \brief Implementation of the SC.D instruction. template -static FORCE_INLINE execute_status execute_SC_D(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +static FORCE_INLINE execute_status execute_SC_D(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { dump_insn(a, pc, insn, "sc.d"); return execute_SC(a, pc, mcycle, insn); } /// \brief Implementation of the AMOSWAP.D instruction. template -static FORCE_INLINE execute_status execute_AMOSWAP_D(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +static FORCE_INLINE execute_status execute_AMOSWAP_D(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { dump_insn(a, pc, insn, "amoswap.d"); return execute_AMO(a, pc, mcycle, insn, [](int64_t /*valm*/, int64_t valr) -> int64_t { return valr; }); } /// \brief Implementation of the AMOADD.D instruction. 
template -static FORCE_INLINE execute_status execute_AMOADD_D(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +static FORCE_INLINE execute_status execute_AMOADD_D(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { dump_insn(a, pc, insn, "amoadd.d"); return execute_AMO(a, pc, mcycle, insn, [](int64_t valm, int64_t valr) -> int64_t { int64_t val = 0; @@ -1259,28 +1278,28 @@ static FORCE_INLINE execute_status execute_AMOADD_D(STATE_ACCESS &a, uint64_t &p } template -static FORCE_INLINE execute_status execute_AMOXOR_D(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +static FORCE_INLINE execute_status execute_AMOXOR_D(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { dump_insn(a, pc, insn, "amoxor.d"); return execute_AMO(a, pc, mcycle, insn, [](int64_t valm, int64_t valr) -> int64_t { return valm ^ valr; }); } /// \brief Implementation of the AMOAND.D instruction. template -static FORCE_INLINE execute_status execute_AMOAND_D(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +static FORCE_INLINE execute_status execute_AMOAND_D(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { dump_insn(a, pc, insn, "amoand.d"); return execute_AMO(a, pc, mcycle, insn, [](int64_t valm, int64_t valr) -> int64_t { return valm & valr; }); } /// \brief Implementation of the AMOOR.D instruction. template -static FORCE_INLINE execute_status execute_AMOOR_D(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +static FORCE_INLINE execute_status execute_AMOOR_D(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { dump_insn(a, pc, insn, "amoor.d"); return execute_AMO(a, pc, mcycle, insn, [](int64_t valm, int64_t valr) -> int64_t { return valm | valr; }); } /// \brief Implementation of the AMOMIN.D instruction. 
template -static FORCE_INLINE execute_status execute_AMOMIN_D(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +static FORCE_INLINE execute_status execute_AMOMIN_D(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { dump_insn(a, pc, insn, "amomin.d"); return execute_AMO(a, pc, mcycle, insn, [](int64_t valm, int64_t valr) -> int64_t { if (valm < valr) { @@ -1292,7 +1311,7 @@ static FORCE_INLINE execute_status execute_AMOMIN_D(STATE_ACCESS &a, uint64_t &p /// \brief Implementation of the AMOMAX.D instruction. template -static FORCE_INLINE execute_status execute_AMOMAX_D(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +static FORCE_INLINE execute_status execute_AMOMAX_D(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { dump_insn(a, pc, insn, "amomax.d"); return execute_AMO(a, pc, mcycle, insn, [](int64_t valm, int64_t valr) -> int64_t { if (valm > valr) { @@ -1304,7 +1323,7 @@ static FORCE_INLINE execute_status execute_AMOMAX_D(STATE_ACCESS &a, uint64_t &p /// \brief Implementation of the AMOMINU.D instruction. template -static FORCE_INLINE execute_status execute_AMOMINU_D(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +static FORCE_INLINE execute_status execute_AMOMINU_D(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { dump_insn(a, pc, insn, "amominu.d"); return execute_AMO(a, pc, mcycle, insn, [](uint64_t valm, uint64_t valr) -> uint64_t { if (valm < valr) { @@ -1316,7 +1335,7 @@ static FORCE_INLINE execute_status execute_AMOMINU_D(STATE_ACCESS &a, uint64_t & /// \brief Implementation of the AMOMAXU.D instruction. 
template -static FORCE_INLINE execute_status execute_AMOMAXU_D(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +static FORCE_INLINE execute_status execute_AMOMAXU_D(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { dump_insn(a, pc, insn, "amomaxu.d"); return execute_AMO(a, pc, mcycle, insn, [](uint64_t valm, uint64_t valr) -> uint64_t { if (valm > valr) { @@ -1327,9 +1346,12 @@ static FORCE_INLINE execute_status execute_AMOMAXU_D(STATE_ACCESS &a, uint64_t & } /// \brief Implementation of the ADDW instruction. -template -static FORCE_INLINE execute_status execute_ADDW(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_ADDW(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "addw"); + if constexpr (rd_kind == rd_kind::x0) { + return advance_to_next_insn(a, pc); + } return execute_arithmetic(a, pc, insn, [](uint64_t rs1, uint64_t rs2) -> uint64_t { // Discard upper 32 bits auto rs1w = static_cast(rs1); @@ -1341,9 +1363,12 @@ static FORCE_INLINE execute_status execute_ADDW(STATE_ACCESS &a, uint64_t &pc, u } /// \brief Implementation of the SUBW instruction. -template -static FORCE_INLINE execute_status execute_SUBW(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_SUBW(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "subw"); + if constexpr (rd_kind == rd_kind::x0) { + return advance_to_next_insn(a, pc); + } return execute_arithmetic(a, pc, insn, [](uint64_t rs1, uint64_t rs2) -> uint64_t { // Convert 64-bit to 32-bit auto rs1w = static_cast(rs1); @@ -1355,12 +1380,15 @@ static FORCE_INLINE execute_status execute_SUBW(STATE_ACCESS &a, uint64_t &pc, u } /// \brief Implementation of the SLLW instruction. 
-template -static FORCE_INLINE execute_status execute_SLLW(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_SLLW(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { if (unlikely((insn & 0b11111110000000000111000001111111) != 0b00000000000000000001000000111011)) { return raise_illegal_insn_exception(a, pc, insn); } dump_insn(a, pc, insn, "sllw"); + if constexpr (rd_kind == rd_kind::x0) { + return advance_to_next_insn(a, pc); + } return execute_arithmetic(a, pc, insn, [](uint64_t rs1, uint64_t rs2) -> uint64_t { const auto rs1w = static_cast(static_cast(rs1) << (rs2 & 31)); return static_cast(rs1w); @@ -1368,9 +1396,12 @@ static FORCE_INLINE execute_status execute_SLLW(STATE_ACCESS &a, uint64_t &pc, u } /// \brief Implementation of the SRLW instruction. -template -static FORCE_INLINE execute_status execute_SRLW(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_SRLW(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "srlw"); + if constexpr (rd_kind == rd_kind::x0) { + return advance_to_next_insn(a, pc); + } return execute_arithmetic(a, pc, insn, [](uint64_t rs1, uint64_t rs2) -> uint64_t { auto rs1w = static_cast(static_cast(rs1) >> (rs2 & 31)); return static_cast(rs1w); @@ -1378,9 +1409,12 @@ static FORCE_INLINE execute_status execute_SRLW(STATE_ACCESS &a, uint64_t &pc, u } /// \brief Implementation of the SRAW instruction. 
-template -static FORCE_INLINE execute_status execute_SRAW(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_SRAW(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "sraw"); + if constexpr (rd_kind == rd_kind::x0) { + return advance_to_next_insn(a, pc); + } return execute_arithmetic(a, pc, insn, [](uint64_t rs1, uint64_t rs2) -> uint64_t { const int32_t rs1w = static_cast(rs1) >> (rs2 & 31); return static_cast(rs1w); @@ -1388,9 +1422,12 @@ static FORCE_INLINE execute_status execute_SRAW(STATE_ACCESS &a, uint64_t &pc, u } /// \brief Implementation of the MULW instruction. -template -static FORCE_INLINE execute_status execute_MULW(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_MULW(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "mulw"); + if constexpr (rd_kind == rd_kind::x0) { + return advance_to_next_insn(a, pc); + } return execute_arithmetic(a, pc, insn, [](uint64_t rs1, uint64_t rs2) -> uint64_t { auto rs1w = static_cast(rs1); auto rs2w = static_cast(rs2); @@ -1401,12 +1438,15 @@ static FORCE_INLINE execute_status execute_MULW(STATE_ACCESS &a, uint64_t &pc, u } /// \brief Implementation of the DIVW instruction. 
-template -static FORCE_INLINE execute_status execute_DIVW(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_DIVW(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { if (unlikely((insn & 0b11111110000000000111000001111111) != 0b00000010000000000100000000111011)) { return raise_illegal_insn_exception(a, pc, insn); } dump_insn(a, pc, insn, "divw"); + if constexpr (rd_kind == rd_kind::x0) { + return advance_to_next_insn(a, pc); + } return execute_arithmetic(a, pc, insn, [](uint64_t rs1, uint64_t rs2) -> uint64_t { auto rs1w = static_cast(rs1); auto rs2w = static_cast(rs2); @@ -1421,9 +1461,12 @@ static FORCE_INLINE execute_status execute_DIVW(STATE_ACCESS &a, uint64_t &pc, u } /// \brief Implementation of the DIVUW instruction. -template -static FORCE_INLINE execute_status execute_DIVUW(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_DIVUW(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "divuw"); + if constexpr (rd_kind == rd_kind::x0) { + return advance_to_next_insn(a, pc); + } return execute_arithmetic(a, pc, insn, [](uint64_t rs1, uint64_t rs2) -> uint64_t { auto rs1w = static_cast(rs1); auto rs2w = static_cast(rs2); @@ -1435,12 +1478,15 @@ static FORCE_INLINE execute_status execute_DIVUW(STATE_ACCESS &a, uint64_t &pc, } /// \brief Implementation of the REMW instruction. 
-template -static FORCE_INLINE execute_status execute_REMW(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_REMW(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { if (unlikely((insn & 0b11111110000000000111000001111111) != 0b00000010000000000110000000111011)) { return raise_illegal_insn_exception(a, pc, insn); } dump_insn(a, pc, insn, "remw"); + if constexpr (rd_kind == rd_kind::x0) { + return advance_to_next_insn(a, pc); + } return execute_arithmetic(a, pc, insn, [](uint64_t rs1, uint64_t rs2) -> uint64_t { auto rs1w = static_cast(rs1); auto rs2w = static_cast(rs2); @@ -1455,12 +1501,15 @@ static FORCE_INLINE execute_status execute_REMW(STATE_ACCESS &a, uint64_t &pc, u } /// \brief Implementation of the REMUW instruction. -template -static FORCE_INLINE execute_status execute_REMUW(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_REMUW(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { if (unlikely((insn & 0b11111110000000000111000001111111) != 0b00000010000000000111000000111011)) { return raise_illegal_insn_exception(a, pc, insn); } dump_insn(a, pc, insn, "remuw"); + if constexpr (rd_kind == rd_kind::x0) { + return advance_to_next_insn(a, pc); + } return execute_arithmetic(a, pc, insn, [](uint64_t rs1, uint64_t rs2) -> uint64_t { auto rs1w = static_cast(rs1); auto rs2w = static_cast(rs2); @@ -1482,7 +1531,7 @@ static inline uint64_t read_csr_success(uint64_t val, bool *status) { } template -static inline bool rdcounteren(STATE_ACCESS &a, uint64_t mask) { +static inline bool rdcounteren(STATE_ACCESS a, uint64_t mask) { uint64_t counteren = MCOUNTEREN_R_MASK; auto priv = a.read_iflags_PRV(); if (priv <= PRV_S) { @@ -1495,7 +1544,7 @@ static inline bool rdcounteren(STATE_ACCESS &a, uint64_t mask) { } template -static inline uint64_t read_csr_cycle(STATE_ACCESS &a, uint64_t mcycle, bool *status) { +static inline uint64_t read_csr_cycle(STATE_ACCESS a, uint64_t mcycle, 
bool *status) { if (rdcounteren(a, MCOUNTEREN_CY_MASK)) { return read_csr_success(mcycle, status); } @@ -1503,7 +1552,7 @@ static inline uint64_t read_csr_cycle(STATE_ACCESS &a, uint64_t mcycle, bool *st } template -static inline uint64_t read_csr_instret(STATE_ACCESS &a, uint64_t mcycle, bool *status) { +static inline uint64_t read_csr_instret(STATE_ACCESS a, uint64_t mcycle, bool *status) { if (unlikely(!rdcounteren(a, MCOUNTEREN_IR_MASK))) { return read_csr_fail(status); } @@ -1513,7 +1562,7 @@ static inline uint64_t read_csr_instret(STATE_ACCESS &a, uint64_t mcycle, bool * } template -static inline uint64_t read_csr_time(STATE_ACCESS &a, uint64_t mcycle, bool *status) { +static inline uint64_t read_csr_time(STATE_ACCESS a, uint64_t mcycle, bool *status) { if (unlikely(!rdcounteren(a, MCOUNTEREN_TM_MASK))) { return read_csr_fail(status); } @@ -1522,54 +1571,54 @@ static inline uint64_t read_csr_time(STATE_ACCESS &a, uint64_t mcycle, bool *sta } template -static inline uint64_t read_csr_sstatus(STATE_ACCESS &a, bool *status) { +static inline uint64_t read_csr_sstatus(STATE_ACCESS a, bool *status) { return read_csr_success(a.read_mstatus() & SSTATUS_R_MASK, status); } template -static inline uint64_t read_csr_senvcfg(STATE_ACCESS &a, bool *status) { +static inline uint64_t read_csr_senvcfg(STATE_ACCESS a, bool *status) { return read_csr_success(a.read_senvcfg() & SENVCFG_R_MASK, status); } template -static inline uint64_t read_csr_sie(STATE_ACCESS &a, bool *status) { +static inline uint64_t read_csr_sie(STATE_ACCESS a, bool *status) { const uint64_t mie = a.read_mie(); const uint64_t mideleg = a.read_mideleg(); return read_csr_success(mie & mideleg, status); } template -static inline uint64_t read_csr_stvec(STATE_ACCESS &a, bool *status) { +static inline uint64_t read_csr_stvec(STATE_ACCESS a, bool *status) { return read_csr_success(a.read_stvec(), status); } template -static inline uint64_t read_csr_scounteren(STATE_ACCESS &a, bool *status) { +static inline 
uint64_t read_csr_scounteren(STATE_ACCESS a, bool *status) { return read_csr_success(a.read_scounteren(), status); } template -static inline uint64_t read_csr_sscratch(STATE_ACCESS &a, bool *status) { +static inline uint64_t read_csr_sscratch(STATE_ACCESS a, bool *status) { return read_csr_success(a.read_sscratch(), status); } template -static inline uint64_t read_csr_sepc(STATE_ACCESS &a, bool *status) { +static inline uint64_t read_csr_sepc(STATE_ACCESS a, bool *status) { return read_csr_success(a.read_sepc(), status); } template -static inline uint64_t read_csr_scause(STATE_ACCESS &a, bool *status) { +static inline uint64_t read_csr_scause(STATE_ACCESS a, bool *status) { return read_csr_success(a.read_scause(), status); } template -static inline uint64_t read_csr_stval(STATE_ACCESS &a, bool *status) { +static inline uint64_t read_csr_stval(STATE_ACCESS a, bool *status) { return read_csr_success(a.read_stval(), status); } template -static inline uint64_t read_csr_sip(STATE_ACCESS &a, bool *status) { +static inline uint64_t read_csr_sip(STATE_ACCESS a, bool *status) { // Ensure values are are loaded in order: do not nest with operator const uint64_t mip = a.read_mip(); const uint64_t mideleg = a.read_mideleg(); @@ -1577,7 +1626,7 @@ static inline uint64_t read_csr_sip(STATE_ACCESS &a, bool *status) { } template -static inline uint64_t read_csr_satp(STATE_ACCESS &a, bool *status) { +static inline uint64_t read_csr_satp(STATE_ACCESS a, bool *status) { const uint64_t mstatus = a.read_mstatus(); auto priv = a.read_iflags_PRV(); // When TVM=1, attempts to read or write the satp CSR @@ -1589,67 +1638,67 @@ static inline uint64_t read_csr_satp(STATE_ACCESS &a, bool *status) { } template -static inline uint64_t read_csr_mstatus(STATE_ACCESS &a, bool *status) { +static inline uint64_t read_csr_mstatus(STATE_ACCESS a, bool *status) { return read_csr_success(a.read_mstatus() & MSTATUS_R_MASK, status); } template -static inline uint64_t read_csr_menvcfg(STATE_ACCESS &a, bool 
*status) { +static inline uint64_t read_csr_menvcfg(STATE_ACCESS a, bool *status) { return read_csr_success(a.read_menvcfg() & MENVCFG_R_MASK, status); } template -static inline uint64_t read_csr_misa(STATE_ACCESS &a, bool *status) { +static inline uint64_t read_csr_misa(STATE_ACCESS a, bool *status) { return read_csr_success(a.read_misa(), status); } template -static inline uint64_t read_csr_medeleg(STATE_ACCESS &a, bool *status) { +static inline uint64_t read_csr_medeleg(STATE_ACCESS a, bool *status) { return read_csr_success(a.read_medeleg(), status); } template -static inline uint64_t read_csr_mideleg(STATE_ACCESS &a, bool *status) { +static inline uint64_t read_csr_mideleg(STATE_ACCESS a, bool *status) { return read_csr_success(a.read_mideleg(), status); } template -static inline uint64_t read_csr_mie(STATE_ACCESS &a, bool *status) { +static inline uint64_t read_csr_mie(STATE_ACCESS a, bool *status) { return read_csr_success(a.read_mie(), status); } template -static inline uint64_t read_csr_mtvec(STATE_ACCESS &a, bool *status) { +static inline uint64_t read_csr_mtvec(STATE_ACCESS a, bool *status) { return read_csr_success(a.read_mtvec(), status); } template -static inline uint64_t read_csr_mcounteren(STATE_ACCESS &a, bool *status) { +static inline uint64_t read_csr_mcounteren(STATE_ACCESS a, bool *status) { return read_csr_success(a.read_mcounteren(), status); } template -static inline uint64_t read_csr_mscratch(STATE_ACCESS &a, bool *status) { +static inline uint64_t read_csr_mscratch(STATE_ACCESS a, bool *status) { return read_csr_success(a.read_mscratch(), status); } template -static inline uint64_t read_csr_mepc(STATE_ACCESS &a, bool *status) { +static inline uint64_t read_csr_mepc(STATE_ACCESS a, bool *status) { return read_csr_success(a.read_mepc(), status); } template -static inline uint64_t read_csr_mcause(STATE_ACCESS &a, bool *status) { +static inline uint64_t read_csr_mcause(STATE_ACCESS a, bool *status) { return read_csr_success(a.read_mcause(), 
status); } template -static inline uint64_t read_csr_mtval(STATE_ACCESS &a, bool *status) { +static inline uint64_t read_csr_mtval(STATE_ACCESS a, bool *status) { return read_csr_success(a.read_mtval(), status); } template -static inline uint64_t read_csr_mip(STATE_ACCESS &a, bool *status) { +static inline uint64_t read_csr_mip(STATE_ACCESS a, bool *status) { return read_csr_success(a.read_mip(), status); } @@ -1658,29 +1707,29 @@ static inline uint64_t read_csr_mcycle(uint64_t mcycle, bool *status) { } template -static inline uint64_t read_csr_minstret(STATE_ACCESS &a, uint64_t mcycle, bool *status) { +static inline uint64_t read_csr_minstret(STATE_ACCESS a, uint64_t mcycle, bool *status) { const uint64_t icycleinstret = a.read_icycleinstret(); const uint64_t minstret = mcycle - icycleinstret; return read_csr_success(minstret, status); } template -static inline uint64_t read_csr_mvendorid(STATE_ACCESS &a, bool *status) { +static inline uint64_t read_csr_mvendorid(STATE_ACCESS a, bool *status) { return read_csr_success(a.read_mvendorid(), status); } template -static inline uint64_t read_csr_marchid(STATE_ACCESS &a, bool *status) { +static inline uint64_t read_csr_marchid(STATE_ACCESS a, bool *status) { return read_csr_success(a.read_marchid(), status); } template -static inline uint64_t read_csr_mimpid(STATE_ACCESS &a, bool *status) { +static inline uint64_t read_csr_mimpid(STATE_ACCESS a, bool *status) { return read_csr_success(a.read_mimpid(), status); } template -static inline uint64_t read_csr_fflags(STATE_ACCESS &a, bool *status) { +static inline uint64_t read_csr_fflags(STATE_ACCESS a, bool *status) { // If FS is OFF, attempts to read or write the float state will cause an illegal instruction exception. 
if (unlikely((a.read_mstatus() & MSTATUS_FS_MASK) == MSTATUS_FS_OFF)) { return read_csr_fail(status); @@ -1690,7 +1739,7 @@ static inline uint64_t read_csr_fflags(STATE_ACCESS &a, bool *status) { } template -static inline uint64_t read_csr_frm(STATE_ACCESS &a, bool *status) { +static inline uint64_t read_csr_frm(STATE_ACCESS a, bool *status) { // If FS is OFF, attempts to read or write the float state will cause an illegal instruction exception. if (unlikely((a.read_mstatus() & MSTATUS_FS_MASK) == MSTATUS_FS_OFF)) { return read_csr_fail(status); @@ -1700,7 +1749,7 @@ static inline uint64_t read_csr_frm(STATE_ACCESS &a, bool *status) { } template -static inline uint64_t read_csr_fcsr(STATE_ACCESS &a, bool *status) { +static inline uint64_t read_csr_fcsr(STATE_ACCESS a, bool *status) { // If FS is OFF, attempts to read or write the float state will cause an illegal instruction exception. if (unlikely((a.read_mstatus() & MSTATUS_FS_MASK) == MSTATUS_FS_OFF)) { return read_csr_fail(status); @@ -1715,7 +1764,7 @@ static inline uint64_t read_csr_fcsr(STATE_ACCESS &a, bool *status) { /// \returns Register value. /// \details This function is outlined to minimize host CPU code cache pressure. 
template -static NO_INLINE uint64_t read_csr(STATE_ACCESS &a, uint64_t mcycle, CSR_address csraddr, bool *status) { +static NO_INLINE uint64_t read_csr(STATE_ACCESS a, uint64_t mcycle, CSR_address csraddr, bool *status) { if (unlikely(csr_priv(csraddr) > a.read_iflags_PRV())) { return read_csr_fail(status); } @@ -1876,20 +1925,20 @@ static NO_INLINE uint64_t read_csr(STATE_ACCESS &a, uint64_t mcycle, CSR_address } template -static execute_status write_csr_sstatus(STATE_ACCESS &a, uint64_t val) { +static execute_status write_csr_sstatus(STATE_ACCESS a, uint64_t val) { const uint64_t mstatus = a.read_mstatus(); return write_csr_mstatus(a, (mstatus & ~SSTATUS_W_MASK) | (val & SSTATUS_W_MASK)); } template -static execute_status write_csr_senvcfg(STATE_ACCESS &a, uint64_t val) { +static execute_status write_csr_senvcfg(STATE_ACCESS a, uint64_t val) { const uint64_t senvcfg = a.read_senvcfg(); a.write_senvcfg((senvcfg & ~SENVCFG_W_MASK) | (val & SENVCFG_W_MASK)); return execute_status::success; } template -static execute_status write_csr_sie(STATE_ACCESS &a, uint64_t val) { +static execute_status write_csr_sie(STATE_ACCESS a, uint64_t val) { uint64_t mie = a.read_mie(); const uint64_t mask = a.read_mideleg(); mie = (mie & ~mask) | (val & mask); @@ -1898,43 +1947,43 @@ static execute_status write_csr_sie(STATE_ACCESS &a, uint64_t val) { } template -static execute_status write_csr_stvec(STATE_ACCESS &a, uint64_t val) { +static execute_status write_csr_stvec(STATE_ACCESS a, uint64_t val) { a.write_stvec(val & ~1); return execute_status::success; } template -static execute_status write_csr_scounteren(STATE_ACCESS &a, uint64_t val) { +static execute_status write_csr_scounteren(STATE_ACCESS a, uint64_t val) { a.write_scounteren(val & SCOUNTEREN_RW_MASK); return execute_status::success; } template -static execute_status write_csr_sscratch(STATE_ACCESS &a, uint64_t val) { +static execute_status write_csr_sscratch(STATE_ACCESS a, uint64_t val) { a.write_sscratch(val); return 
execute_status::success; } template -static execute_status write_csr_sepc(STATE_ACCESS &a, uint64_t val) { +static execute_status write_csr_sepc(STATE_ACCESS a, uint64_t val) { a.write_sepc(val & ~1); return execute_status::success; } template -static execute_status write_csr_scause(STATE_ACCESS &a, uint64_t val) { +static execute_status write_csr_scause(STATE_ACCESS a, uint64_t val) { a.write_scause(val); return execute_status::success; } template -static execute_status write_csr_stval(STATE_ACCESS &a, uint64_t val) { +static execute_status write_csr_stval(STATE_ACCESS a, uint64_t val) { a.write_stval(val); return execute_status::success; } template -static execute_status write_csr_sip(STATE_ACCESS &a, uint64_t val) { +static execute_status write_csr_sip(STATE_ACCESS a, uint64_t val) { const uint64_t mask = a.read_mideleg(); uint64_t mip = a.read_mip(); mip = (mip & ~mask) | (val & mask); @@ -1943,7 +1992,7 @@ static execute_status write_csr_sip(STATE_ACCESS &a, uint64_t val) { } template -static NO_INLINE execute_status write_csr_satp(STATE_ACCESS &a, uint64_t val) { +static NO_INLINE execute_status write_csr_satp(STATE_ACCESS a, uint64_t val) { const uint64_t mstatus = a.read_mstatus(); auto priv = a.read_iflags_PRV(); @@ -1996,7 +2045,7 @@ static NO_INLINE execute_status write_csr_satp(STATE_ACCESS &a, uint64_t val) { } template -static execute_status write_csr_mstatus(STATE_ACCESS &a, uint64_t val) { +static NO_INLINE execute_status write_csr_mstatus(STATE_ACCESS a, uint64_t val) { const uint64_t old_mstatus = a.read_mstatus() & MSTATUS_R_MASK; // M-mode software can determine whether a privilege mode is implemented @@ -2069,7 +2118,7 @@ static execute_status write_csr_mstatus(STATE_ACCESS &a, uint64_t val) { } template -static execute_status write_csr_menvcfg(STATE_ACCESS &a, uint64_t val) { +static execute_status write_csr_menvcfg(STATE_ACCESS a, uint64_t val) { uint64_t menvcfg = a.read_menvcfg() & MENVCFG_R_MASK; // Modify only bits that can be written to 
@@ -2080,7 +2129,7 @@ static execute_status write_csr_menvcfg(STATE_ACCESS &a, uint64_t val) { } template -static execute_status write_csr_medeleg(STATE_ACCESS &a, uint64_t val) { +static execute_status write_csr_medeleg(STATE_ACCESS a, uint64_t val) { // For exceptions that cannot occur in less privileged modes, // the corresponding medeleg bits should be read-only zero a.write_medeleg((a.read_medeleg() & ~MEDELEG_W_MASK) | (val & MEDELEG_W_MASK)); @@ -2088,7 +2137,7 @@ static execute_status write_csr_medeleg(STATE_ACCESS &a, uint64_t val) { } template -static execute_status write_csr_mideleg(STATE_ACCESS &a, uint64_t val) { +static execute_status write_csr_mideleg(STATE_ACCESS a, uint64_t val) { const uint64_t mask = MIP_SSIP_MASK | MIP_STIP_MASK | MIP_SEIP_MASK; uint64_t mideleg = a.read_mideleg(); mideleg = (mideleg & ~mask) | (val & mask); @@ -2097,7 +2146,7 @@ static execute_status write_csr_mideleg(STATE_ACCESS &a, uint64_t val) { } template -static execute_status write_csr_mie(STATE_ACCESS &a, uint64_t val) { +static execute_status write_csr_mie(STATE_ACCESS a, uint64_t val) { const uint64_t mask = MIP_MSIP_MASK | MIP_MTIP_MASK | MIP_MEIP_MASK | MIP_SSIP_MASK | MIP_STIP_MASK | MIP_SEIP_MASK; uint64_t mie = a.read_mie(); mie = (mie & ~mask) | (val & mask); @@ -2106,19 +2155,19 @@ static execute_status write_csr_mie(STATE_ACCESS &a, uint64_t val) { } template -static execute_status write_csr_mtvec(STATE_ACCESS &a, uint64_t val) { +static execute_status write_csr_mtvec(STATE_ACCESS a, uint64_t val) { a.write_mtvec(val & ~1); return execute_status::success; } template -static execute_status write_csr_mcounteren(STATE_ACCESS &a, uint64_t val) { +static execute_status write_csr_mcounteren(STATE_ACCESS a, uint64_t val) { a.write_mcounteren(val & MCOUNTEREN_RW_MASK); return execute_status::success; } template -static execute_status write_csr_minstret(STATE_ACCESS &a, uint64_t mcycle, uint64_t val) { +static execute_status write_csr_minstret(STATE_ACCESS a, uint64_t 
mcycle, uint64_t val) { // Note that mcycle will only be incremented after the instruction is executed, // but we have to compute this in advance const uint64_t icycleinstret = (mcycle + 1) - val; @@ -2127,7 +2176,7 @@ static execute_status write_csr_minstret(STATE_ACCESS &a, uint64_t mcycle, uint6 } template -static execute_status write_csr_mcycle(STATE_ACCESS & /*a*/, uint64_t /*val*/) { +static execute_status write_csr_mcycle(STATE_ACCESS /*a*/, uint64_t /*val*/) { // We can't allow writes to mcycle because we use it to measure the progress in machine execution. // The specs say it is an MRW CSR, read-writeable in M-mode. // BBL enables all counters in both M- and S-modes. @@ -2138,31 +2187,31 @@ static execute_status write_csr_mcycle(STATE_ACCESS & /*a*/, uint64_t /*val*/) { } template -static execute_status write_csr_mscratch(STATE_ACCESS &a, uint64_t val) { +static execute_status write_csr_mscratch(STATE_ACCESS a, uint64_t val) { a.write_mscratch(val); return execute_status::success; } template -static execute_status write_csr_mepc(STATE_ACCESS &a, uint64_t val) { +static execute_status write_csr_mepc(STATE_ACCESS a, uint64_t val) { a.write_mepc(val & ~1); return execute_status::success; } template -static execute_status write_csr_mcause(STATE_ACCESS &a, uint64_t val) { +static execute_status write_csr_mcause(STATE_ACCESS a, uint64_t val) { a.write_mcause(val); return execute_status::success; } template -static execute_status write_csr_mtval(STATE_ACCESS &a, uint64_t val) { +static execute_status write_csr_mtval(STATE_ACCESS a, uint64_t val) { a.write_mtval(val); return execute_status::success; } template -static execute_status write_csr_mip(STATE_ACCESS &a, uint64_t val) { +static execute_status write_csr_mip(STATE_ACCESS a, uint64_t val) { const uint64_t mask = MIP_SSIP_MASK | MIP_STIP_MASK | MIP_SEIP_MASK; auto mip = a.read_mip(); mip = (mip & ~mask) | (val & mask); @@ -2171,7 +2220,7 @@ static execute_status write_csr_mip(STATE_ACCESS &a, uint64_t val) { 
} template -static inline execute_status write_csr_fflags(STATE_ACCESS &a, uint64_t val) { +static inline execute_status write_csr_fflags(STATE_ACCESS a, uint64_t val) { const uint64_t mstatus = a.read_mstatus(); // If FS is OFF, attempts to read or write the float state will cause an illegal instruction exception. if (unlikely((mstatus & MSTATUS_FS_MASK) == MSTATUS_FS_OFF)) { @@ -2183,7 +2232,7 @@ static inline execute_status write_csr_fflags(STATE_ACCESS &a, uint64_t val) { } template -static inline execute_status write_csr_frm(STATE_ACCESS &a, uint64_t val) { +static inline execute_status write_csr_frm(STATE_ACCESS a, uint64_t val) { const uint64_t mstatus = a.read_mstatus(); // If FS is OFF, attempts to read or write the float state will cause an illegal instruction exception. if (unlikely((mstatus & MSTATUS_FS_MASK) == MSTATUS_FS_OFF)) { @@ -2195,7 +2244,7 @@ static inline execute_status write_csr_frm(STATE_ACCESS &a, uint64_t val) { } template -static inline execute_status write_csr_fcsr(STATE_ACCESS &a, uint64_t val) { +static inline execute_status write_csr_fcsr(STATE_ACCESS a, uint64_t val) { const uint64_t mstatus = a.read_mstatus(); // If FS is OFF, attempts to read or write the float state will cause an illegal instruction exception. if (unlikely((mstatus & MSTATUS_FS_MASK) == MSTATUS_FS_OFF)) { @@ -2213,7 +2262,7 @@ static inline execute_status write_csr_fcsr(STATE_ACCESS &a, uint64_t val) { /// \returns The status of the operation (true for success, false otherwise). /// \details This function is outlined to minimize host CPU code cache pressure. 
template -static NO_INLINE execute_status write_csr(STATE_ACCESS &a, uint64_t mcycle, CSR_address csraddr, uint64_t val) { +static NO_INLINE execute_status write_csr(STATE_ACCESS a, uint64_t mcycle, CSR_address csraddr, uint64_t val) { #if defined(DUMP_CSR) fprintf(stderr, "csr_write: csr=0x%03x val=0x", static_cast(csraddr)); print_uint64_t(val); @@ -2367,7 +2416,7 @@ static NO_INLINE execute_status write_csr(STATE_ACCESS &a, uint64_t mcycle, CSR_ } template -static FORCE_INLINE execute_status execute_csr_RW(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn, +static FORCE_INLINE execute_status execute_csr_RW(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn, const RS1VAL &rs1val) { auto csraddr = static_cast(insn_I_get_uimm(insn)); // Try to read old CSR value @@ -2400,22 +2449,22 @@ static FORCE_INLINE execute_status execute_csr_RW(STATE_ACCESS &a, uint64_t &pc, /// \brief Implementation of the CSRRW instruction. template -static FORCE_INLINE execute_status execute_CSRRW(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +static FORCE_INLINE execute_status execute_CSRRW(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { dump_insn(a, pc, insn, "csrrw"); return execute_csr_RW(a, pc, mcycle, insn, - [](STATE_ACCESS &a, uint32_t insn) -> uint64_t { return a.read_x(insn_get_rs1(insn)); }); + [](STATE_ACCESS a, uint32_t insn) -> uint64_t { return a.read_x(insn_get_rs1(insn)); }); } /// \brief Implementation of the CSRRWI instruction. 
template -static FORCE_INLINE execute_status execute_CSRRWI(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +static FORCE_INLINE execute_status execute_CSRRWI(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { dump_insn(a, pc, insn, "csrrwi"); return execute_csr_RW(a, pc, mcycle, insn, - [](STATE_ACCESS &, uint32_t insn) -> uint64_t { return static_cast(insn_get_rs1(insn)); }); + [](STATE_ACCESS, uint32_t insn) -> uint64_t { return static_cast(insn_get_rs1(insn)); }); } template -static FORCE_INLINE execute_status execute_csr_SC(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn, +static FORCE_INLINE execute_status execute_csr_SC(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn, const F &f) { auto csraddr = static_cast(insn_I_get_uimm(insn)); // Try to read old CSR value @@ -2450,20 +2499,20 @@ static FORCE_INLINE execute_status execute_csr_SC(STATE_ACCESS &a, uint64_t &pc, /// \brief Implementation of the CSRRS instruction. template -static FORCE_INLINE execute_status execute_CSRRS(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +static FORCE_INLINE execute_status execute_CSRRS(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { dump_insn(a, pc, insn, "csrrs"); return execute_csr_SC(a, pc, mcycle, insn, [](uint64_t csr, uint64_t rs1) -> uint64_t { return csr | rs1; }); } /// \brief Implementation of the CSRRC instruction. 
template -static FORCE_INLINE execute_status execute_CSRRC(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +static FORCE_INLINE execute_status execute_CSRRC(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { dump_insn(a, pc, insn, "csrrc"); return execute_csr_SC(a, pc, mcycle, insn, [](uint64_t csr, uint64_t rs1) -> uint64_t { return csr & ~rs1; }); } template -static FORCE_INLINE execute_status execute_csr_SCI(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn, +static FORCE_INLINE execute_status execute_csr_SCI(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn, const F &f) { auto csraddr = static_cast(insn_I_get_uimm(insn)); // Try to read old CSR value @@ -2495,21 +2544,21 @@ static FORCE_INLINE execute_status execute_csr_SCI(STATE_ACCESS &a, uint64_t &pc /// \brief Implementation of the CSRRSI instruction. template -static FORCE_INLINE execute_status execute_CSRRSI(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +static FORCE_INLINE execute_status execute_CSRRSI(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { dump_insn(a, pc, insn, "csrrsi"); return execute_csr_SCI(a, pc, mcycle, insn, [](uint64_t csr, uint32_t rs1) -> uint64_t { return csr | rs1; }); } /// \brief Implementation of the CSRRCI instruction. template -static FORCE_INLINE execute_status execute_CSRRCI(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +static FORCE_INLINE execute_status execute_CSRRCI(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { dump_insn(a, pc, insn, "csrrci"); return execute_csr_SCI(a, pc, mcycle, insn, [](uint64_t csr, uint32_t rs1) -> uint64_t { return csr & ~rs1; }); } /// \brief Implementation of the ECALL instruction. 
template -static FORCE_INLINE execute_status execute_ECALL(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_ECALL(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "ecall"); auto priv = a.read_iflags_PRV(); pc = raise_exception(a, pc, MCAUSE_ECALL_BASE + priv, 0); @@ -2518,7 +2567,7 @@ static FORCE_INLINE execute_status execute_ECALL(STATE_ACCESS &a, uint64_t &pc, /// \brief Implementation of the EBREAK instruction. template -static FORCE_INLINE execute_status execute_EBREAK(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_EBREAK(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "ebreak"); pc = raise_exception(a, pc, MCAUSE_BREAKPOINT, pc); return execute_status::failure; @@ -2526,7 +2575,7 @@ static FORCE_INLINE execute_status execute_EBREAK(STATE_ACCESS &a, uint64_t &pc, /// \brief Implementation of the SRET instruction. template -static FORCE_INLINE execute_status execute_SRET(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_SRET(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "sret"); auto priv = a.read_iflags_PRV(); uint64_t mstatus = a.read_mstatus(); @@ -2556,7 +2605,7 @@ static FORCE_INLINE execute_status execute_SRET(STATE_ACCESS &a, uint64_t &pc, u /// \brief Implementation of the MRET instruction. template -static FORCE_INLINE execute_status execute_MRET(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_MRET(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "mret"); auto priv = a.read_iflags_PRV(); if (unlikely(priv < PRV_M)) { @@ -2588,7 +2637,7 @@ static FORCE_INLINE execute_status execute_MRET(STATE_ACCESS &a, uint64_t &pc, u /// \brief Implementation of the WFI instruction. /// \details This function is outlined to minimize host CPU code cache pressure. 
template -static FORCE_INLINE execute_status execute_WFI(STATE_ACCESS &a, uint64_t &pc, uint64_t &mcycle, uint32_t insn) { +static FORCE_INLINE execute_status execute_WFI(STATE_ACCESS a, uint64_t &pc, uint64_t &mcycle, uint32_t insn) { dump_insn(a, pc, insn, "wfi"); // Check privileges and do nothing else auto priv = a.read_iflags_PRV(); @@ -2614,7 +2663,7 @@ static FORCE_INLINE execute_status execute_WFI(STATE_ACCESS &a, uint64_t &pc, ui /// \brief Implementation of the FENCE instruction. template -static FORCE_INLINE execute_status execute_FENCE(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FENCE(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { INC_COUNTER(a.get_statistics(), fence); dump_insn(a, pc, insn, "fence"); // Really do nothing @@ -2623,7 +2672,7 @@ static FORCE_INLINE execute_status execute_FENCE(STATE_ACCESS &a, uint64_t &pc, /// \brief Implementation of the FENCE.I instruction. template -static FORCE_INLINE execute_status execute_FENCE_I(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FENCE_I(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { INC_COUNTER(a.get_statistics(), fence_i); dump_insn(a, pc, insn, "fence.i"); // Really do nothing @@ -2631,11 +2680,8 @@ static FORCE_INLINE execute_status execute_FENCE_I(STATE_ACCESS &a, uint64_t &pc } template -static FORCE_INLINE execute_status execute_arithmetic(STATE_ACCESS &a, uint64_t &pc, uint32_t insn, const F &f) { +static FORCE_INLINE execute_status execute_arithmetic(STATE_ACCESS a, uint64_t &pc, uint32_t insn, const F &f) { const uint32_t rd = insn_get_rd(insn); - if (unlikely(rd == 0)) { - return advance_to_next_insn(a, pc); - } // Ensure rs1 and rs2 are loaded in order: do not nest with call to f() as // the order of evaluation of arguments in a function call is undefined. 
const uint64_t rs1 = a.read_x(insn_get_rs1(insn)); @@ -2646,9 +2692,12 @@ static FORCE_INLINE execute_status execute_arithmetic(STATE_ACCESS &a, uint64_t } /// \brief Implementation of the ADD instruction. -template -static FORCE_INLINE execute_status execute_ADD(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_ADD(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "add"); + if constexpr (rd_kind == rd_kind::x0) { + return advance_to_next_insn(a, pc); + } return execute_arithmetic(a, pc, insn, [](uint64_t rs1, uint64_t rs2) -> uint64_t { uint64_t val = 0; __builtin_add_overflow(rs1, rs2, &val); @@ -2657,9 +2706,12 @@ static FORCE_INLINE execute_status execute_ADD(STATE_ACCESS &a, uint64_t &pc, ui } /// \brief Implementation of the SUB instruction. -template -static FORCE_INLINE execute_status execute_SUB(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_SUB(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "sub"); + if constexpr (rd_kind == rd_kind::x0) { + return advance_to_next_insn(a, pc); + } return execute_arithmetic(a, pc, insn, [](uint64_t rs1, uint64_t rs2) -> uint64_t { uint64_t val = 0; __builtin_sub_overflow(rs1, rs2, &val); @@ -2668,70 +2720,97 @@ static FORCE_INLINE execute_status execute_SUB(STATE_ACCESS &a, uint64_t &pc, ui } /// \brief Implementation of the SLL instruction. -template -static FORCE_INLINE execute_status execute_SLL(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_SLL(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "sll"); + if constexpr (rd_kind == rd_kind::x0) { + return advance_to_next_insn(a, pc); + } return execute_arithmetic(a, pc, insn, [](uint64_t rs1, uint64_t rs2) -> uint64_t { return rs1 << (rs2 & (XLEN - 1)); }); } /// \brief Implementation of the SLT instruction. 
-template -static FORCE_INLINE execute_status execute_SLT(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_SLT(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "slt"); + if constexpr (rd_kind == rd_kind::x0) { + return advance_to_next_insn(a, pc); + } return execute_arithmetic(a, pc, insn, [](uint64_t rs1, uint64_t rs2) -> uint64_t { return static_cast(rs1) < static_cast(rs2); }); } /// \brief Implementation of the SLTU instruction. -template -static FORCE_INLINE execute_status execute_SLTU(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_SLTU(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "sltu"); + if constexpr (rd_kind == rd_kind::x0) { + return advance_to_next_insn(a, pc); + } return execute_arithmetic(a, pc, insn, [](uint64_t rs1, uint64_t rs2) -> uint64_t { return rs1 < rs2; }); } /// \brief Implementation of the XOR instruction. -template -static FORCE_INLINE execute_status execute_XOR(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_XOR(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "xor"); + if constexpr (rd_kind == rd_kind::x0) { + return advance_to_next_insn(a, pc); + } return execute_arithmetic(a, pc, insn, [](uint64_t rs1, uint64_t rs2) -> uint64_t { return rs1 ^ rs2; }); } /// \brief Implementation of the SRL instruction. -template -static FORCE_INLINE execute_status execute_SRL(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_SRL(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "srl"); + if constexpr (rd_kind == rd_kind::x0) { + return advance_to_next_insn(a, pc); + } return execute_arithmetic(a, pc, insn, [](uint64_t rs1, uint64_t rs2) -> uint64_t { return rs1 >> (rs2 & (XLEN - 1)); }); } /// \brief Implementation of the SRA instruction. 
-template -static FORCE_INLINE execute_status execute_SRA(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_SRA(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "sra"); + if constexpr (rd_kind == rd_kind::x0) { + return advance_to_next_insn(a, pc); + } return execute_arithmetic(a, pc, insn, [](uint64_t rs1, uint64_t rs2) -> uint64_t { return static_cast(static_cast(rs1) >> (rs2 & (XLEN - 1))); }); } /// \brief Implementation of the OR instruction. -template -static FORCE_INLINE execute_status execute_OR(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_OR(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "or"); + if constexpr (rd_kind == rd_kind::x0) { + return advance_to_next_insn(a, pc); + } return execute_arithmetic(a, pc, insn, [](uint64_t rs1, uint64_t rs2) -> uint64_t { return rs1 | rs2; }); } /// \brief Implementation of the AND instruction. -template -static FORCE_INLINE execute_status execute_AND(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_AND(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "and"); + if constexpr (rd_kind == rd_kind::x0) { + return advance_to_next_insn(a, pc); + } return execute_arithmetic(a, pc, insn, [](uint64_t rs1, uint64_t rs2) -> uint64_t { return rs1 & rs2; }); } /// \brief Implementation of the MUL instruction. 
-template -static FORCE_INLINE execute_status execute_MUL(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_MUL(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "mul"); + if constexpr (rd_kind == rd_kind::x0) { + return advance_to_next_insn(a, pc); + } return execute_arithmetic(a, pc, insn, [](uint64_t rs1, uint64_t rs2) -> uint64_t { auto srs1 = static_cast(rs1); auto srs2 = static_cast(rs2); @@ -2742,9 +2821,12 @@ static FORCE_INLINE execute_status execute_MUL(STATE_ACCESS &a, uint64_t &pc, ui } /// \brief Implementation of the MULH instruction. -template -static FORCE_INLINE execute_status execute_MULH(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_MULH(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "mulh"); + if constexpr (rd_kind == rd_kind::x0) { + return advance_to_next_insn(a, pc); + } return execute_arithmetic(a, pc, insn, [](uint64_t rs1, uint64_t rs2) -> uint64_t { auto srs1 = static_cast(rs1); auto srs2 = static_cast(rs2); @@ -2753,9 +2835,12 @@ static FORCE_INLINE execute_status execute_MULH(STATE_ACCESS &a, uint64_t &pc, u } /// \brief Implementation of the MULHSU instruction. -template -static FORCE_INLINE execute_status execute_MULHSU(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_MULHSU(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "mulhsu"); + if constexpr (rd_kind == rd_kind::x0) { + return advance_to_next_insn(a, pc); + } return execute_arithmetic(a, pc, insn, [](uint64_t rs1, uint64_t rs2) -> uint64_t { auto srs1 = static_cast(rs1); return static_cast( @@ -2764,18 +2849,24 @@ static FORCE_INLINE execute_status execute_MULHSU(STATE_ACCESS &a, uint64_t &pc, } /// \brief Implementation of the MULHU instruction. 
-template -static FORCE_INLINE execute_status execute_MULHU(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_MULHU(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "mulhu"); + if constexpr (rd_kind == rd_kind::x0) { + return advance_to_next_insn(a, pc); + } return execute_arithmetic(a, pc, insn, [](uint64_t rs1, uint64_t rs2) -> uint64_t { return static_cast((static_cast(rs1) * static_cast(rs2)) >> 64); }); } /// \brief Implementation of the DIV instruction. -template -static FORCE_INLINE execute_status execute_DIV(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_DIV(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "div"); + if constexpr (rd_kind == rd_kind::x0) { + return advance_to_next_insn(a, pc); + } return execute_arithmetic(a, pc, insn, [](uint64_t rs1, uint64_t rs2) -> uint64_t { auto srs1 = static_cast(rs1); auto srs2 = static_cast(rs2); @@ -2790,9 +2881,12 @@ static FORCE_INLINE execute_status execute_DIV(STATE_ACCESS &a, uint64_t &pc, ui } /// \brief Implementation of the DIVU instruction. -template -static FORCE_INLINE execute_status execute_DIVU(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_DIVU(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "divu"); + if constexpr (rd_kind == rd_kind::x0) { + return advance_to_next_insn(a, pc); + } return execute_arithmetic(a, pc, insn, [](uint64_t rs1, uint64_t rs2) -> uint64_t { if (unlikely(rs2 == 0)) { return static_cast(-1); @@ -2802,9 +2896,12 @@ static FORCE_INLINE execute_status execute_DIVU(STATE_ACCESS &a, uint64_t &pc, u } /// \brief Implementation of the REM instruction. 
-template -static FORCE_INLINE execute_status execute_REM(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_REM(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "rem"); + if constexpr (rd_kind == rd_kind::x0) { + return advance_to_next_insn(a, pc); + } return execute_arithmetic(a, pc, insn, [](uint64_t rs1, uint64_t rs2) -> uint64_t { auto srs1 = static_cast(rs1); auto srs2 = static_cast(rs2); @@ -2819,9 +2916,12 @@ static FORCE_INLINE execute_status execute_REM(STATE_ACCESS &a, uint64_t &pc, ui } /// \brief Implementation of the REMU instruction. -template -static FORCE_INLINE execute_status execute_REMU(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_REMU(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "remu"); + if constexpr (rd_kind == rd_kind::x0) { + return advance_to_next_insn(a, pc); + } return execute_arithmetic(a, pc, insn, [](uint64_t rs1, uint64_t rs2) -> uint64_t { if (unlikely(rs2 == 0)) { return rs1; @@ -2831,12 +2931,9 @@ static FORCE_INLINE execute_status execute_REMU(STATE_ACCESS &a, uint64_t &pc, u } template -static FORCE_INLINE execute_status execute_arithmetic_immediate(STATE_ACCESS &a, uint64_t &pc, uint32_t insn, +static FORCE_INLINE execute_status execute_arithmetic_immediate(STATE_ACCESS a, uint64_t &pc, uint32_t insn, const F &f) { const uint32_t rd = insn_get_rd(insn); - if (unlikely(rd == 0)) { - return advance_to_next_insn(a, pc); - } const uint64_t rs1 = a.read_x(insn_get_rs1(insn)); const int32_t imm = insn_I_get_imm(insn); a.write_x(rd, f(rs1, imm)); @@ -2844,26 +2941,35 @@ static FORCE_INLINE execute_status execute_arithmetic_immediate(STATE_ACCESS &a, } /// \brief Implementation of the SRLI instruction. 
-template -static FORCE_INLINE execute_status execute_SRLI(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_SRLI(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "srli"); + if constexpr (rd_kind == rd_kind::x0) { + return advance_to_next_insn(a, pc); + } return execute_arithmetic_immediate(a, pc, insn, [](uint64_t rs1, int32_t imm) -> uint64_t { return rs1 >> (imm & (XLEN - 1)); }); } /// \brief Implementation of the SRAI instruction. -template -static FORCE_INLINE execute_status execute_SRAI(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_SRAI(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "srai"); + if constexpr (rd_kind == rd_kind::x0) { + return advance_to_next_insn(a, pc); + } return execute_arithmetic_immediate(a, pc, insn, [](uint64_t rs1, int32_t imm) -> uint64_t { return static_cast(static_cast(rs1) >> (imm & (XLEN - 1))); }); } /// \brief Implementation of the ADDI instruction. -template -static FORCE_INLINE execute_status execute_ADDI(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_ADDI(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "addi"); + if constexpr (rd_kind == rd_kind::x0) { + return advance_to_next_insn(a, pc); + } return execute_arithmetic_immediate(a, pc, insn, [](uint64_t rs1, int32_t imm) -> uint64_t { int64_t val = 0; __builtin_add_overflow(static_cast(rs1), static_cast(imm), &val); @@ -2872,49 +2978,67 @@ static FORCE_INLINE execute_status execute_ADDI(STATE_ACCESS &a, uint64_t &pc, u } /// \brief Implementation of the SLTI instruction. 
-template -static FORCE_INLINE execute_status execute_SLTI(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_SLTI(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "slti"); + if constexpr (rd_kind == rd_kind::x0) { + return advance_to_next_insn(a, pc); + } return execute_arithmetic_immediate(a, pc, insn, [](uint64_t rs1, int32_t imm) -> uint64_t { return static_cast(rs1) < static_cast(imm); }); } /// \brief Implementation of the SLTIU instruction. -template -static FORCE_INLINE execute_status execute_SLTIU(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_SLTIU(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "sltiu"); + if constexpr (rd_kind == rd_kind::x0) { + return advance_to_next_insn(a, pc); + } return execute_arithmetic_immediate(a, pc, insn, [](uint64_t rs1, int32_t imm) -> uint64_t { return rs1 < static_cast(imm); }); } /// \brief Implementation of the XORI instruction. -template -static FORCE_INLINE execute_status execute_XORI(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_XORI(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "xori"); + if constexpr (rd_kind == rd_kind::x0) { + return advance_to_next_insn(a, pc); + } return execute_arithmetic_immediate(a, pc, insn, [](uint64_t rs1, int32_t imm) -> uint64_t { return rs1 ^ imm; }); } /// \brief Implementation of the ORI instruction. 
-template -static FORCE_INLINE execute_status execute_ORI(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_ORI(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "ori"); + if constexpr (rd_kind == rd_kind::x0) { + return advance_to_next_insn(a, pc); + } return execute_arithmetic_immediate(a, pc, insn, [](uint64_t rs1, int32_t imm) -> uint64_t { return rs1 | imm; }); } /// \brief Implementation of the ANDI instruction. -template -static FORCE_INLINE execute_status execute_ANDI(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_ANDI(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "andi"); + if constexpr (rd_kind == rd_kind::x0) { + return advance_to_next_insn(a, pc); + } return execute_arithmetic_immediate(a, pc, insn, [](uint64_t rs1, int32_t imm) -> uint64_t { return rs1 & imm; }); } /// \brief Implementation of the SLLI instruction. -template -static FORCE_INLINE execute_status execute_SLLI(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_SLLI(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { if (unlikely((insn & (0b111111 << 26)) != 0)) { return raise_illegal_insn_exception(a, pc, insn); } dump_insn(a, pc, insn, "slli"); + if constexpr (rd_kind == rd_kind::x0) { + return advance_to_next_insn(a, pc); + } return execute_arithmetic_immediate(a, pc, insn, [](uint64_t rs1, int32_t imm) -> uint64_t { // No need to mask lower 6 bits in imm because of the if condition a above // We do it anyway here to prevent problems if this code is moved @@ -2923,9 +3047,12 @@ static FORCE_INLINE execute_status execute_SLLI(STATE_ACCESS &a, uint64_t &pc, u } /// \brief Implementation of the ADDIW instruction. 
-template -static FORCE_INLINE execute_status execute_ADDIW(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_ADDIW(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "addiw"); + if constexpr (rd_kind == rd_kind::x0) { + return advance_to_next_insn(a, pc); + } return execute_arithmetic_immediate(a, pc, insn, [](uint64_t rs1, int32_t imm) -> uint64_t { int32_t val = 0; __builtin_add_overflow(static_cast(rs1), imm, &val); @@ -2934,12 +3061,15 @@ static FORCE_INLINE execute_status execute_ADDIW(STATE_ACCESS &a, uint64_t &pc, } /// \brief Implementation of the SLLIW instruction. -template -static FORCE_INLINE execute_status execute_SLLIW(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_SLLIW(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { if (unlikely(insn_get_funct7(insn) != 0)) { return raise_illegal_insn_exception(a, pc, insn); } dump_insn(a, pc, insn, "slliw"); + if constexpr (rd_kind == rd_kind::x0) { + return advance_to_next_insn(a, pc); + } return execute_arithmetic_immediate(a, pc, insn, [](uint64_t rs1, int32_t imm) -> uint64_t { // No need to mask lower 5 bits in imm because of the if condition a above // We do it anyway here to prevent problems if this code is moved @@ -2949,9 +3079,12 @@ static FORCE_INLINE execute_status execute_SLLIW(STATE_ACCESS &a, uint64_t &pc, } /// \brief Implementation of the SRLIW instruction. 
-template -static FORCE_INLINE execute_status execute_SRLIW(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_SRLIW(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "srliw"); + if constexpr (rd_kind == rd_kind::x0) { + return advance_to_next_insn(a, pc); + } return execute_arithmetic_immediate(a, pc, insn, [](uint64_t rs1, int32_t imm) -> uint64_t { // No need to mask lower 5 bits in imm because of funct7 test in caller // We do it anyway here to prevent problems if this code is moved @@ -2961,13 +3094,16 @@ static FORCE_INLINE execute_status execute_SRLIW(STATE_ACCESS &a, uint64_t &pc, } /// \brief Implementation of the SRAIW instruction. -template -static FORCE_INLINE execute_status execute_SRAIW(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_SRAIW(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "sraiw"); - // When rd=0 the instruction is a HINT, and we consider it as a soft yield when rs1 == 31 - if (unlikely(insn_get_rd(insn) == 0 && insn_get_rs1(insn) == 31 && a.get_soft_yield())) { - // Force the main interpreter loop to break - return advance_to_next_insn(a, pc, execute_status::success_and_yield); + if constexpr (rd_kind == rd_kind::x0) { + // When rd=0 the instruction is a HINT, and we consider it as a soft yield when rs1 == 31 + if (unlikely(insn_get_rs1(insn) == 31 && a.get_soft_yield())) { + // Force the main interpreter loop to break + return advance_to_next_insn(a, pc, execute_status::success_and_yield); + } + return advance_to_next_insn(a, pc); } return execute_arithmetic_immediate(a, pc, insn, [](uint64_t rs1, int32_t imm) -> uint64_t { const int32_t rs1w = static_cast(rs1) >> (imm & 0b11111); @@ -2976,47 +3112,50 @@ static FORCE_INLINE execute_status execute_SRAIW(STATE_ACCESS &a, uint64_t &pc, } template -static FORCE_INLINE execute_status execute_S(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, 
uint32_t insn) { +static FORCE_INLINE execute_status execute_S(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { const uint64_t vaddr = a.read_x(insn_get_rs1(insn)); const int32_t imm = insn_S_get_imm(insn); const uint64_t val = a.read_x(insn_get_rs2(insn)); const execute_status status = write_virtual_memory(a, pc, mcycle, vaddr + imm, val); - if (unlikely(status == execute_status::failure)) { - return advance_to_raised_exception(a, pc); + if (unlikely(status != execute_status::success)) { + if (status == execute_status::failure) { + return advance_to_raised_exception(a, pc); + } + return advance_to_next_insn(a, pc, status); } - return advance_to_next_insn(a, pc, status); + return advance_to_next_insn(a, pc); } /// \brief Implementation of the SB instruction. template -static FORCE_INLINE execute_status execute_SB(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +static FORCE_INLINE execute_status execute_SB(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { dump_insn(a, pc, insn, "sb"); return execute_S(a, pc, mcycle, insn); } /// \brief Implementation of the SH instruction. template -static FORCE_INLINE execute_status execute_SH(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +static FORCE_INLINE execute_status execute_SH(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { dump_insn(a, pc, insn, "sh"); return execute_S(a, pc, mcycle, insn); } /// \brief Implementation of the SW instruction. template -static FORCE_INLINE execute_status execute_SW(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +static FORCE_INLINE execute_status execute_SW(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { dump_insn(a, pc, insn, "sw"); return execute_S(a, pc, mcycle, insn); } /// \brief Implementation of the SD instruction. 
template -static FORCE_INLINE execute_status execute_SD(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +static FORCE_INLINE execute_status execute_SD(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { dump_insn(a, pc, insn, "sd"); return execute_S(a, pc, mcycle, insn); } -template -static FORCE_INLINE execute_status execute_L(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_L(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { const uint64_t vaddr = a.read_x(insn_get_rs1(insn)); const int32_t imm = insn_I_get_imm(insn); T val = 0; @@ -3025,7 +3164,7 @@ static FORCE_INLINE execute_status execute_L(STATE_ACCESS &a, uint64_t &pc, uint } const uint32_t rd = insn_get_rd(insn); // don't write x0 - if (unlikely(rd == 0)) { + if constexpr (rd_kind == rd_kind::x0) { return advance_to_next_insn(a, pc); } // This static branch is eliminated by the compiler @@ -3038,56 +3177,56 @@ static FORCE_INLINE execute_status execute_L(STATE_ACCESS &a, uint64_t &pc, uint } /// \brief Implementation of the LB instruction. -template -static FORCE_INLINE execute_status execute_LB(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_LB(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { dump_insn(a, pc, insn, "lb"); - return execute_L(a, pc, mcycle, insn); + return execute_L(a, pc, mcycle, insn); } /// \brief Implementation of the LH instruction. -template -static FORCE_INLINE execute_status execute_LH(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_LH(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { dump_insn(a, pc, insn, "lh"); - return execute_L(a, pc, mcycle, insn); + return execute_L(a, pc, mcycle, insn); } /// \brief Implementation of the LW instruction. 
-template -static FORCE_INLINE execute_status execute_LW(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_LW(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { dump_insn(a, pc, insn, "lw"); - return execute_L(a, pc, mcycle, insn); + return execute_L(a, pc, mcycle, insn); } /// \brief Implementation of the LD instruction. -template -static FORCE_INLINE execute_status execute_LD(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_LD(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { dump_insn(a, pc, insn, "ld"); - return execute_L(a, pc, mcycle, insn); + return execute_L(a, pc, mcycle, insn); } /// \brief Implementation of the LBU instruction. -template -static FORCE_INLINE execute_status execute_LBU(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_LBU(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { dump_insn(a, pc, insn, "lbu"); - return execute_L(a, pc, mcycle, insn); + return execute_L(a, pc, mcycle, insn); } /// \brief Implementation of the LHU instruction. -template -static FORCE_INLINE execute_status execute_LHU(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_LHU(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { dump_insn(a, pc, insn, "lhu"); - return execute_L(a, pc, mcycle, insn); + return execute_L(a, pc, mcycle, insn); } /// \brief Implementation of the LWU instruction. 
-template -static FORCE_INLINE execute_status execute_LWU(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_LWU(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { dump_insn(a, pc, insn, "lwu"); - return execute_L(a, pc, mcycle, insn); + return execute_L(a, pc, mcycle, insn); } template -static FORCE_INLINE execute_status execute_branch(STATE_ACCESS &a, uint64_t &pc, uint32_t insn, const F &f) { +static FORCE_INLINE execute_status execute_branch(STATE_ACCESS a, uint64_t &pc, uint32_t insn, const F &f) { const uint64_t rs1 = a.read_x(insn_get_rs1(insn)); const uint64_t rs2 = a.read_x(insn_get_rs2(insn)); if (f(rs1, rs2)) { @@ -3099,21 +3238,21 @@ static FORCE_INLINE execute_status execute_branch(STATE_ACCESS &a, uint64_t &pc, /// \brief Implementation of the BEQ instruction. template -static FORCE_INLINE execute_status execute_BEQ(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_BEQ(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "beq"); return execute_branch(a, pc, insn, [](uint64_t rs1, uint64_t rs2) -> bool { return rs1 == rs2; }); } /// \brief Implementation of the BNE instruction. template -static FORCE_INLINE execute_status execute_BNE(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_BNE(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "bne"); return execute_branch(a, pc, insn, [](uint64_t rs1, uint64_t rs2) -> bool { return rs1 != rs2; }); } /// \brief Implementation of the BLT instruction. 
template -static FORCE_INLINE execute_status execute_BLT(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_BLT(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "blt"); return execute_branch(a, pc, insn, [](uint64_t rs1, uint64_t rs2) -> bool { return static_cast(rs1) < static_cast(rs2); }); @@ -3121,7 +3260,7 @@ static FORCE_INLINE execute_status execute_BLT(STATE_ACCESS &a, uint64_t &pc, ui /// \brief Implementation of the BGE instruction. template -static FORCE_INLINE execute_status execute_BGE(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_BGE(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "bge"); return execute_branch(a, pc, insn, [](uint64_t rs1, uint64_t rs2) -> bool { return static_cast(rs1) >= static_cast(rs2); }); @@ -3129,64 +3268,64 @@ static FORCE_INLINE execute_status execute_BGE(STATE_ACCESS &a, uint64_t &pc, ui /// \brief Implementation of the BLTU instruction. template -static FORCE_INLINE execute_status execute_BLTU(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_BLTU(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "bltu"); return execute_branch(a, pc, insn, [](uint64_t rs1, uint64_t rs2) -> bool { return rs1 < rs2; }); } /// \brief Implementation of the BGEU instruction. template -static FORCE_INLINE execute_status execute_BGEU(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_BGEU(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "bgeu"); return execute_branch(a, pc, insn, [](uint64_t rs1, uint64_t rs2) -> bool { return rs1 >= rs2; }); } /// \brief Implementation of the LUI instruction. 
-template -static FORCE_INLINE execute_status execute_LUI(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_LUI(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "lui"); - const uint32_t rd = insn_get_rd(insn); - if (unlikely(rd == 0)) { + if constexpr (rd_kind == rd_kind::x0) { return advance_to_next_insn(a, pc); } + const uint32_t rd = insn_get_rd(insn); a.write_x(rd, insn_U_get_imm(insn)); return advance_to_next_insn(a, pc); } /// \brief Implementation of the AUIPC instruction. -template -static FORCE_INLINE execute_status execute_AUIPC(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_AUIPC(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "auipc"); - const uint32_t rd = insn_get_rd(insn); - if (unlikely(rd == 0)) { + if constexpr (rd_kind == rd_kind::x0) { return advance_to_next_insn(a, pc); } + const uint32_t rd = insn_get_rd(insn); a.write_x(rd, pc + insn_U_get_imm(insn)); return advance_to_next_insn(a, pc); } /// \brief Implementation of the JAL instruction. -template -static FORCE_INLINE execute_status execute_JAL(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_JAL(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "jal"); const uint64_t new_pc = pc + insn_J_get_imm(insn); - const uint32_t rd = insn_get_rd(insn); - if (unlikely(rd == 0)) { + if constexpr (rd_kind == rd_kind::x0) { return execute_jump(a, pc, new_pc); } + const uint32_t rd = insn_get_rd(insn); a.write_x(rd, pc + 4); return execute_jump(a, pc, new_pc); } /// \brief Implementation of the JALR instruction. 
-template -static FORCE_INLINE execute_status execute_JALR(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_JALR(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "jalr"); const uint64_t val = pc + 4; const uint64_t new_pc = static_cast(a.read_x(insn_get_rs1(insn)) + insn_I_get_imm(insn)) & ~static_cast(1); const uint32_t rd = insn_get_rd(insn); - if (unlikely(rd != 0)) { + if constexpr (rd_kind != rd_kind::x0) { a.write_x(rd, val); return execute_jump(a, pc, new_pc); } @@ -3196,7 +3335,7 @@ static FORCE_INLINE execute_status execute_JALR(STATE_ACCESS &a, uint64_t &pc, u /// \brief Implementation of the SFENCE.VMA instruction. /// \details This function is outlined to minimize host CPU code cache pressure. template -static FORCE_INLINE execute_status execute_SFENCE_VMA(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_SFENCE_VMA(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { // rs1 and rs2 are arbitrary, rest is set if (unlikely((insn & 0b11111110000000000111111111111111) != 0b00010010000000000000000001110011)) { return raise_illegal_insn_exception(a, pc, insn); @@ -3248,32 +3387,34 @@ static FORCE_INLINE execute_status execute_SFENCE_VMA(STATE_ACCESS &a, uint64_t return advance_to_next_insn(a, pc, execute_status::success_and_flush_fetch); } -template -static FORCE_INLINE execute_status execute_SRLI_SRAI(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { - switch (static_cast(insn_get_funct7_sr1(insn))) { - case insn_SRLI_SRAI_funct7_sr1::SRLI: - return execute_SRLI(a, pc, insn); - case insn_SRLI_SRAI_funct7_sr1::SRAI: - return execute_SRAI(a, pc, insn); - default: - return raise_illegal_insn_exception(a, pc, insn); +template +static FORCE_INLINE execute_status execute_SRLI_SRAI(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { + // Use ifs instead of a switch to produce fewer branches for the most frequent instructions + const auto funct7_sr1 = 
static_cast(insn_get_funct7_sr1(insn)); + if (funct7_sr1 == insn_SRLI_SRAI_funct7_sr1::SRLI) { + return execute_SRLI(a, pc, insn); + } + if (funct7_sr1 == insn_SRLI_SRAI_funct7_sr1::SRAI) { + return execute_SRAI(a, pc, insn); } + return raise_illegal_insn_exception(a, pc, insn); } -template -static FORCE_INLINE execute_status execute_SRLIW_SRAIW(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { - switch (static_cast(insn_get_funct7(insn))) { - case insn_SRLIW_SRAIW_funct7::SRLIW: - return execute_SRLIW(a, pc, insn); - case insn_SRLIW_SRAIW_funct7::SRAIW: - return execute_SRAIW(a, pc, insn); - default: - return raise_illegal_insn_exception(a, pc, insn); +template +static FORCE_INLINE execute_status execute_SRLIW_SRAIW(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { + // Use ifs instead of a switch to produce fewer branches for the most frequent instructions + const auto funct7 = static_cast(insn_get_funct7(insn)); + if (funct7 == insn_SRLIW_SRAIW_funct7::SRLIW) { + return execute_SRLIW(a, pc, insn); + } + if (funct7 == insn_SRLIW_SRAIW_funct7::SRAIW) { + return execute_SRAIW(a, pc, insn); } + return raise_illegal_insn_exception(a, pc, insn); } template -static FORCE_INLINE execute_status execute_AMO_W(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +static FORCE_INLINE execute_status execute_AMO_W(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { switch (static_cast(insn_get_funct7_sr2(insn))) { case insn_AMO_funct7_sr2::AMOADD: return execute_AMOADD_W(a, pc, mcycle, insn); @@ -3303,7 +3444,7 @@ static FORCE_INLINE execute_status execute_AMO_W(STATE_ACCESS &a, uint64_t &pc, } template -static FORCE_INLINE execute_status execute_AMO_D(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +static FORCE_INLINE execute_status execute_AMO_D(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { switch (static_cast(insn_get_funct7_sr2(insn))) { case insn_AMO_funct7_sr2::AMOADD: return execute_AMOADD_D(a, pc, mcycle, insn); 
@@ -3332,136 +3473,150 @@ static FORCE_INLINE execute_status execute_AMO_D(STATE_ACCESS &a, uint64_t &pc, } } -template -static FORCE_INLINE execute_status execute_ADD_MUL_SUB(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { - switch (static_cast(insn_get_funct7(insn))) { - case insn_ADD_MUL_SUB_funct7::ADD: - return execute_ADD(a, pc, insn); - case insn_ADD_MUL_SUB_funct7::MUL: - return execute_MUL(a, pc, insn); - case insn_ADD_MUL_SUB_funct7::SUB: - return execute_SUB(a, pc, insn); - default: - return raise_illegal_insn_exception(a, pc, insn); +template +static FORCE_INLINE execute_status execute_ADD_MUL_SUB(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { + // Use ifs instead of a switch to produce fewer branches for the most frequent instructions + const auto funct7 = static_cast(insn_get_funct7(insn)); + if (funct7 == insn_ADD_MUL_SUB_funct7::ADD) { + return execute_ADD(a, pc, insn); + } + if (funct7 == insn_ADD_MUL_SUB_funct7::MUL) { + return execute_MUL(a, pc, insn); + } + if (funct7 == insn_ADD_MUL_SUB_funct7::SUB) { + return execute_SUB(a, pc, insn); } + return raise_illegal_insn_exception(a, pc, insn); } -template -static FORCE_INLINE execute_status execute_SLL_MULH(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { - switch (static_cast(insn_get_funct7(insn))) { - case insn_SLL_MULH_funct7::SLL: - return execute_SLL(a, pc, insn); - case insn_SLL_MULH_funct7::MULH: - return execute_MULH(a, pc, insn); - default: - return raise_illegal_insn_exception(a, pc, insn); +template +static FORCE_INLINE execute_status execute_SLL_MULH(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { + // Use ifs instead of a switch to produce fewer branches for the most frequent instructions + const auto funct7 = static_cast(insn_get_funct7(insn)); + if (funct7 == insn_SLL_MULH_funct7::SLL) { + return execute_SLL(a, pc, insn); + } + if (funct7 == insn_SLL_MULH_funct7::MULH) { + return execute_MULH(a, pc, insn); } + return raise_illegal_insn_exception(a, pc, insn); } -template -static 
FORCE_INLINE execute_status execute_SLT_MULHSU(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { - switch (static_cast(insn_get_funct7(insn))) { - case insn_SLT_MULHSU_funct7::SLT: - return execute_SLT(a, pc, insn); - case insn_SLT_MULHSU_funct7::MULHSU: - return execute_MULHSU(a, pc, insn); - default: - return raise_illegal_insn_exception(a, pc, insn); +template +static FORCE_INLINE execute_status execute_SLT_MULHSU(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { + // Use ifs instead of a switch to produce fewer branches for the most frequent instructions + const auto funct7 = static_cast(insn_get_funct7(insn)); + if (funct7 == insn_SLT_MULHSU_funct7::SLT) { + return execute_SLT(a, pc, insn); } + if (funct7 == insn_SLT_MULHSU_funct7::MULHSU) { + return execute_MULHSU(a, pc, insn); + } + return raise_illegal_insn_exception(a, pc, insn); } -template -static FORCE_INLINE execute_status execute_SLTU_MULHU(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { - switch (static_cast(insn_get_funct7(insn))) { - case insn_SLTU_MULHU_funct7::SLTU: - return execute_SLTU(a, pc, insn); - case insn_SLTU_MULHU_funct7::MULHU: - return execute_MULHU(a, pc, insn); - default: - return raise_illegal_insn_exception(a, pc, insn); +template +static FORCE_INLINE execute_status execute_SLTU_MULHU(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { + // Use ifs instead of a switch to produce fewer branches for the most frequent instructions + const auto funct7 = static_cast(insn_get_funct7(insn)); + if (funct7 == insn_SLTU_MULHU_funct7::SLTU) { + return execute_SLTU(a, pc, insn); + } + if (funct7 == insn_SLTU_MULHU_funct7::MULHU) { + return execute_MULHU(a, pc, insn); } + return raise_illegal_insn_exception(a, pc, insn); } -template -static FORCE_INLINE execute_status execute_XOR_DIV(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { - switch (static_cast(insn_get_funct7(insn))) { - case insn_XOR_DIV_funct7::XOR: - return execute_XOR(a, pc, insn); - case insn_XOR_DIV_funct7::DIV: - return execute_DIV(a, 
pc, insn); - default: - return raise_illegal_insn_exception(a, pc, insn); +template +static FORCE_INLINE execute_status execute_XOR_DIV(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { + // Use ifs instead of a switch to produce fewer branches for the most frequent instructions + const auto funct7 = static_cast(insn_get_funct7(insn)); + if (funct7 == insn_XOR_DIV_funct7::XOR) { + return execute_XOR(a, pc, insn); } + if (funct7 == insn_XOR_DIV_funct7::DIV) { + return execute_DIV(a, pc, insn); + } + return raise_illegal_insn_exception(a, pc, insn); } -template -static FORCE_INLINE execute_status execute_SRL_DIVU_SRA(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { - switch (static_cast(insn_get_funct7(insn))) { - case insn_SRL_DIVU_SRA_funct7::SRL: - return execute_SRL(a, pc, insn); - case insn_SRL_DIVU_SRA_funct7::DIVU: - return execute_DIVU(a, pc, insn); - case insn_SRL_DIVU_SRA_funct7::SRA: - return execute_SRA(a, pc, insn); - default: - return raise_illegal_insn_exception(a, pc, insn); +template +static FORCE_INLINE execute_status execute_SRL_DIVU_SRA(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { + // Use ifs instead of a switch to produce fewer branches for the most frequent instructions + const auto funct7 = static_cast(insn_get_funct7(insn)); + if (funct7 == insn_SRL_DIVU_SRA_funct7::SRL) { + return execute_SRL(a, pc, insn); + } + if (funct7 == insn_SRL_DIVU_SRA_funct7::SRA) { + return execute_SRA(a, pc, insn); } + if (funct7 == insn_SRL_DIVU_SRA_funct7::DIVU) { + return execute_DIVU(a, pc, insn); + } + return raise_illegal_insn_exception(a, pc, insn); } -template -static FORCE_INLINE execute_status execute_OR_REM(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { - switch (static_cast(insn_get_funct7(insn))) { - case insn_OR_REM_funct7::OR: - return execute_OR(a, pc, insn); - case insn_OR_REM_funct7::REM: - return execute_REM(a, pc, insn); - default: - return raise_illegal_insn_exception(a, pc, insn); +template +static FORCE_INLINE execute_status 
execute_OR_REM(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { + // Use ifs instead of a switch to produce fewer branches for the most frequent instructions + const auto funct7 = static_cast(insn_get_funct7(insn)); + if (funct7 == insn_OR_REM_funct7::OR) { + return execute_OR(a, pc, insn); } + if (funct7 == insn_OR_REM_funct7::REM) { + return execute_REM(a, pc, insn); + } + return raise_illegal_insn_exception(a, pc, insn); } -template -static FORCE_INLINE execute_status execute_AND_REMU(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { - switch (static_cast(insn_get_funct7(insn))) { - case insn_AND_REMU_funct7::AND: - return execute_AND(a, pc, insn); - case insn_AND_REMU_funct7::REMU: - return execute_REMU(a, pc, insn); - default: - return raise_illegal_insn_exception(a, pc, insn); +template +static FORCE_INLINE execute_status execute_AND_REMU(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { + // Use ifs instead of a switch to produce fewer branches for the most frequent instructions + const auto funct7 = static_cast(insn_get_funct7(insn)); + if (funct7 == insn_AND_REMU_funct7::AND) { + return execute_AND(a, pc, insn); } + if (funct7 == insn_AND_REMU_funct7::REMU) { + return execute_REMU(a, pc, insn); + } + return raise_illegal_insn_exception(a, pc, insn); } -template -static FORCE_INLINE execute_status execute_ADDW_MULW_SUBW(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { - switch (static_cast(insn_get_funct7(insn))) { - case insn_ADDW_MULW_SUBW_funct7::ADDW: - return execute_ADDW(a, pc, insn); - case insn_ADDW_MULW_SUBW_funct7::MULW: - return execute_MULW(a, pc, insn); - case insn_ADDW_MULW_SUBW_funct7::SUBW: - return execute_SUBW(a, pc, insn); - default: - return raise_illegal_insn_exception(a, pc, insn); +template +static FORCE_INLINE execute_status execute_ADDW_MULW_SUBW(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { + // Use ifs instead of a switch to produce fewer branches for the most frequent instructions + const auto funct7 = 
static_cast(insn_get_funct7(insn)); + if (funct7 == insn_ADDW_MULW_SUBW_funct7::ADDW) { + return execute_ADDW(a, pc, insn); + } + if (funct7 == insn_ADDW_MULW_SUBW_funct7::MULW) { + return execute_MULW(a, pc, insn); } + if (funct7 == insn_ADDW_MULW_SUBW_funct7::SUBW) { + return execute_SUBW(a, pc, insn); + } + return raise_illegal_insn_exception(a, pc, insn); } -template -static FORCE_INLINE execute_status execute_SRLW_DIVUW_SRAW(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { - switch (static_cast(insn_get_funct7(insn))) { - case insn_SRLW_DIVUW_SRAW_funct7::SRLW: - return execute_SRLW(a, pc, insn); - case insn_SRLW_DIVUW_SRAW_funct7::DIVUW: - return execute_DIVUW(a, pc, insn); - case insn_SRLW_DIVUW_SRAW_funct7::SRAW: - return execute_SRAW(a, pc, insn); - default: - return raise_illegal_insn_exception(a, pc, insn); +template +static FORCE_INLINE execute_status execute_SRLW_DIVUW_SRAW(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { + // Use ifs instead of a switch to produce fewer branches for the most frequent instructions + const auto funct7 = static_cast(insn_get_funct7(insn)); + if (funct7 == insn_SRLW_DIVUW_SRAW_funct7::SRLW) { + return execute_SRLW(a, pc, insn); + } + if (funct7 == insn_SRLW_DIVUW_SRAW_funct7::DIVUW) { + return execute_DIVUW(a, pc, insn); + } + if (funct7 == insn_SRLW_DIVUW_SRAW_funct7::SRAW) { + return execute_SRAW(a, pc, insn); } + return raise_illegal_insn_exception(a, pc, insn); } template -static FORCE_INLINE execute_status execute_privileged(STATE_ACCESS &a, uint64_t &pc, uint64_t &mcycle, uint32_t insn) { +static FORCE_INLINE execute_status execute_privileged(STATE_ACCESS a, uint64_t &pc, uint64_t &mcycle, uint32_t insn) { switch (static_cast(insn)) { case insn_privileged::ECALL: return execute_ECALL(a, pc, insn); @@ -3521,7 +3676,7 @@ static inline T float_unbox(uint64_t val) { } template -static FORCE_INLINE execute_status execute_float_ternary_op_rm(STATE_ACCESS &a, uint64_t &pc, uint32_t insn, +static FORCE_INLINE execute_status 
execute_float_ternary_op_rm(STATE_ACCESS a, uint64_t &pc, uint32_t insn, const F &f) { const uint64_t fcsr = a.read_fcsr(); // The rounding mode comes from the insn @@ -3543,8 +3698,7 @@ static FORCE_INLINE execute_status execute_float_ternary_op_rm(STATE_ACCESS &a, } template -static FORCE_INLINE execute_status execute_float_binary_op_rm(STATE_ACCESS &a, uint64_t &pc, uint32_t insn, - const F &f) { +static FORCE_INLINE execute_status execute_float_binary_op_rm(STATE_ACCESS a, uint64_t &pc, uint32_t insn, const F &f) { const uint64_t fcsr = a.read_fcsr(); // The rounding mode comes from the insn const uint32_t rm = insn_get_rm(insn, fcsr); @@ -3564,7 +3718,7 @@ static FORCE_INLINE execute_status execute_float_binary_op_rm(STATE_ACCESS &a, u } template -static FORCE_INLINE execute_status execute_float_unary_op_rm(STATE_ACCESS &a, uint64_t &pc, uint32_t insn, const F &f) { +static FORCE_INLINE execute_status execute_float_unary_op_rm(STATE_ACCESS a, uint64_t &pc, uint32_t insn, const F &f) { const uint64_t fcsr = a.read_fcsr(); // Unary operation should have rs2 set to 0 if (unlikely(insn_get_rs2(insn) != 0)) { @@ -3587,33 +3741,44 @@ static FORCE_INLINE execute_status execute_float_unary_op_rm(STATE_ACCESS &a, ui } template -static FORCE_INLINE execute_status execute_FS(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +static FORCE_INLINE execute_status execute_FS(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { const uint64_t vaddr = a.read_x(insn_get_rs1(insn)); const int32_t imm = insn_S_get_imm(insn); // A narrower n-bit transfer out of the floating-point // registers will transfer the lower n bits of the register ignoring the upper FLEN−n bits. 
T val = static_cast(a.read_f(insn_get_rs2(insn))); const execute_status status = write_virtual_memory(a, pc, mcycle, vaddr + imm, val); - if (unlikely(status == execute_status::failure)) { - return advance_to_raised_exception(a, pc); + if (unlikely(status != execute_status::success)) { + if (status == execute_status::failure) { + return advance_to_raised_exception(a, pc); + } + return advance_to_next_insn(a, pc, status); } - return advance_to_next_insn(a, pc, status); + return advance_to_next_insn(a, pc); } template -static FORCE_INLINE execute_status execute_FSW(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +static FORCE_INLINE execute_status execute_FSW(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { dump_insn(a, pc, insn, "fsw"); + // If FS is OFF, attempts to read or write the float state will cause an illegal instruction exception. + if (unlikely((a.read_mstatus() & MSTATUS_FS_MASK) == MSTATUS_FS_OFF)) { + return raise_illegal_insn_exception(a, pc, insn); + } return execute_FS(a, pc, mcycle, insn); } template -static FORCE_INLINE execute_status execute_FSD(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +static FORCE_INLINE execute_status execute_FSD(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { dump_insn(a, pc, insn, "fsd"); + // If FS is OFF, attempts to read or write the float state will cause an illegal instruction exception. 
+ if (unlikely((a.read_mstatus() & MSTATUS_FS_MASK) == MSTATUS_FS_OFF)) { + return raise_illegal_insn_exception(a, pc, insn); + } return execute_FS(a, pc, mcycle, insn); } template -static FORCE_INLINE execute_status execute_FL(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +static FORCE_INLINE execute_status execute_FL(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { // Loads the float value from virtual memory const uint64_t vaddr = a.read_x(insn_get_rs1(insn)); const int32_t imm = insn_I_get_imm(insn); @@ -3629,19 +3794,27 @@ static FORCE_INLINE execute_status execute_FL(STATE_ACCESS &a, uint64_t &pc, uin } template -static FORCE_INLINE execute_status execute_FLW(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +static FORCE_INLINE execute_status execute_FLW(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { dump_insn(a, pc, insn, "flw"); + // If FS is OFF, attempts to read or write the float state will cause an illegal instruction exception. + if (unlikely((a.read_mstatus() & MSTATUS_FS_MASK) == MSTATUS_FS_OFF)) { + return raise_illegal_insn_exception(a, pc, insn); + } return execute_FL(a, pc, mcycle, insn); } template -static FORCE_INLINE execute_status execute_FLD(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +static FORCE_INLINE execute_status execute_FLD(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { dump_insn(a, pc, insn, "fld"); + // If FS is OFF, attempts to read or write the float state will cause an illegal instruction exception. 
+ if (unlikely((a.read_mstatus() & MSTATUS_FS_MASK) == MSTATUS_FS_OFF)) { + return raise_illegal_insn_exception(a, pc, insn); + } return execute_FL(a, pc, mcycle, insn); } template -static FORCE_INLINE execute_status execute_FMADD_S(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FMADD_S(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fmadd.s"); return execute_float_ternary_op_rm(a, pc, insn, [](uint32_t s1, uint32_t s2, uint32_t s3, uint32_t rm, uint32_t *fflags) -> uint32_t { @@ -3650,7 +3823,7 @@ static FORCE_INLINE execute_status execute_FMADD_S(STATE_ACCESS &a, uint64_t &pc } template -static FORCE_INLINE execute_status execute_FMADD_D(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FMADD_D(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fmadd.d"); return execute_float_ternary_op_rm(a, pc, insn, [](uint64_t s1, uint64_t s2, uint64_t s3, uint32_t rm, uint32_t *fflags) -> uint64_t { @@ -3659,7 +3832,11 @@ static FORCE_INLINE execute_status execute_FMADD_D(STATE_ACCESS &a, uint64_t &pc } template -static FORCE_INLINE execute_status execute_FMADD(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FMADD(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { + // If FS is OFF, attempts to read or write the float state will cause an illegal instruction exception. 
+ if (unlikely((a.read_mstatus() & MSTATUS_FS_MASK) == MSTATUS_FS_OFF)) { + return raise_illegal_insn_exception(a, pc, insn); + } switch (static_cast(insn_get_funct2_0000000000000000000000000(insn))) { case insn_FM_funct2_0000000000000000000000000::S: return execute_FMADD_S(a, pc, insn); @@ -3671,7 +3848,7 @@ static FORCE_INLINE execute_status execute_FMADD(STATE_ACCESS &a, uint64_t &pc, } template -static FORCE_INLINE execute_status execute_FMSUB_S(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FMSUB_S(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fmsub.s"); return execute_float_ternary_op_rm(a, pc, insn, [](uint32_t s1, uint32_t s2, uint32_t s3, uint32_t rm, uint32_t *fflags) -> uint32_t { @@ -3680,7 +3857,7 @@ static FORCE_INLINE execute_status execute_FMSUB_S(STATE_ACCESS &a, uint64_t &pc } template -static FORCE_INLINE execute_status execute_FMSUB_D(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FMSUB_D(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fmsub.d"); return execute_float_ternary_op_rm(a, pc, insn, [](uint64_t s1, uint64_t s2, uint64_t s3, uint32_t rm, uint32_t *fflags) -> uint64_t { @@ -3689,7 +3866,11 @@ static FORCE_INLINE execute_status execute_FMSUB_D(STATE_ACCESS &a, uint64_t &pc } template -static FORCE_INLINE execute_status execute_FMSUB(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FMSUB(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { + // If FS is OFF, attempts to read or write the float state will cause an illegal instruction exception. 
+ if (unlikely((a.read_mstatus() & MSTATUS_FS_MASK) == MSTATUS_FS_OFF)) { + return raise_illegal_insn_exception(a, pc, insn); + } switch (static_cast(insn_get_funct2_0000000000000000000000000(insn))) { case insn_FM_funct2_0000000000000000000000000::S: return execute_FMSUB_S(a, pc, insn); @@ -3701,7 +3882,7 @@ static FORCE_INLINE execute_status execute_FMSUB(STATE_ACCESS &a, uint64_t &pc, } template -static FORCE_INLINE execute_status execute_FNMADD_S(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FNMADD_S(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fnmadd.s"); return execute_float_ternary_op_rm(a, pc, insn, [](uint32_t s1, uint32_t s2, uint32_t s3, uint32_t rm, uint32_t *fflags) -> uint32_t { @@ -3711,7 +3892,7 @@ static FORCE_INLINE execute_status execute_FNMADD_S(STATE_ACCESS &a, uint64_t &p } template -static FORCE_INLINE execute_status execute_FNMADD_D(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FNMADD_D(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fnmadd.d"); return execute_float_ternary_op_rm(a, pc, insn, [](uint64_t s1, uint64_t s2, uint64_t s3, uint32_t rm, uint32_t *fflags) -> uint64_t { @@ -3721,7 +3902,11 @@ static FORCE_INLINE execute_status execute_FNMADD_D(STATE_ACCESS &a, uint64_t &p } template -static FORCE_INLINE execute_status execute_FNMADD(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FNMADD(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { + // If FS is OFF, attempts to read or write the float state will cause an illegal instruction exception. 
+ if (unlikely((a.read_mstatus() & MSTATUS_FS_MASK) == MSTATUS_FS_OFF)) { + return raise_illegal_insn_exception(a, pc, insn); + } switch (static_cast(insn_get_funct2_0000000000000000000000000(insn))) { case insn_FM_funct2_0000000000000000000000000::S: return execute_FNMADD_S(a, pc, insn); @@ -3733,7 +3918,7 @@ static FORCE_INLINE execute_status execute_FNMADD(STATE_ACCESS &a, uint64_t &pc, } template -static FORCE_INLINE execute_status execute_FNMSUB_S(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FNMSUB_S(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fnmsub.s"); return execute_float_ternary_op_rm(a, pc, insn, [](uint32_t s1, uint32_t s2, uint32_t s3, uint32_t rm, uint32_t *fflags) -> uint32_t { @@ -3742,7 +3927,7 @@ static FORCE_INLINE execute_status execute_FNMSUB_S(STATE_ACCESS &a, uint64_t &p } template -static FORCE_INLINE execute_status execute_FNMSUB_D(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FNMSUB_D(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fnmsub.d"); return execute_float_ternary_op_rm(a, pc, insn, [](uint64_t s1, uint64_t s2, uint64_t s3, uint32_t rm, uint32_t *fflags) -> uint64_t { @@ -3751,7 +3936,11 @@ static FORCE_INLINE execute_status execute_FNMSUB_D(STATE_ACCESS &a, uint64_t &p } template -static FORCE_INLINE execute_status execute_FNMSUB(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FNMSUB(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { + // If FS is OFF, attempts to read or write the float state will cause an illegal instruction exception. 
+ if (unlikely((a.read_mstatus() & MSTATUS_FS_MASK) == MSTATUS_FS_OFF)) { + return raise_illegal_insn_exception(a, pc, insn); + } switch (static_cast(insn_get_funct2_0000000000000000000000000(insn))) { case insn_FM_funct2_0000000000000000000000000::S: return execute_FNMSUB_S(a, pc, insn); @@ -3763,7 +3952,7 @@ static FORCE_INLINE execute_status execute_FNMSUB(STATE_ACCESS &a, uint64_t &pc, } template -static FORCE_INLINE execute_status execute_FADD_S(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FADD_S(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fadd.s"); return execute_float_binary_op_rm(a, pc, insn, [](uint32_t s1, uint32_t s2, uint32_t rm, uint32_t *fflags) -> uint32_t { @@ -3772,7 +3961,7 @@ static FORCE_INLINE execute_status execute_FADD_S(STATE_ACCESS &a, uint64_t &pc, } template -static FORCE_INLINE execute_status execute_FADD_D(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FADD_D(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fadd.d"); return execute_float_binary_op_rm(a, pc, insn, [](uint64_t s1, uint64_t s2, uint32_t rm, uint32_t *fflags) -> uint64_t { @@ -3781,7 +3970,7 @@ static FORCE_INLINE execute_status execute_FADD_D(STATE_ACCESS &a, uint64_t &pc, } template -static FORCE_INLINE execute_status execute_FSUB_S(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FSUB_S(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fsub.s"); return execute_float_binary_op_rm(a, pc, insn, [](uint32_t s1, uint32_t s2, uint32_t rm, uint32_t *fflags) -> uint32_t { @@ -3790,7 +3979,7 @@ static FORCE_INLINE execute_status execute_FSUB_S(STATE_ACCESS &a, uint64_t &pc, } template -static FORCE_INLINE execute_status execute_FSUB_D(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FSUB_D(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { 
dump_insn(a, pc, insn, "fsub.d"); return execute_float_binary_op_rm(a, pc, insn, [](uint64_t s1, uint64_t s2, uint32_t rm, uint32_t *fflags) -> uint64_t { @@ -3799,7 +3988,7 @@ static FORCE_INLINE execute_status execute_FSUB_D(STATE_ACCESS &a, uint64_t &pc, } template -static FORCE_INLINE execute_status execute_FMUL_S(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FMUL_S(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fmul.s"); return execute_float_binary_op_rm(a, pc, insn, [](uint32_t s1, uint32_t s2, uint32_t rm, uint32_t *fflags) -> uint32_t { @@ -3808,7 +3997,7 @@ static FORCE_INLINE execute_status execute_FMUL_S(STATE_ACCESS &a, uint64_t &pc, } template -static FORCE_INLINE execute_status execute_FMUL_D(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FMUL_D(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fmul.d"); return execute_float_binary_op_rm(a, pc, insn, [](uint64_t s1, uint64_t s2, uint32_t rm, uint32_t *fflags) -> uint64_t { @@ -3817,7 +4006,7 @@ static FORCE_INLINE execute_status execute_FMUL_D(STATE_ACCESS &a, uint64_t &pc, } template -static FORCE_INLINE execute_status execute_FDIV_S(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FDIV_S(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fdiv.s"); return execute_float_binary_op_rm(a, pc, insn, [](uint32_t s1, uint32_t s2, uint32_t rm, uint32_t *fflags) -> uint32_t { @@ -3826,7 +4015,7 @@ static FORCE_INLINE execute_status execute_FDIV_S(STATE_ACCESS &a, uint64_t &pc, } template -static FORCE_INLINE execute_status execute_FDIV_D(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FDIV_D(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fdiv.d"); return execute_float_binary_op_rm(a, pc, insn, [](uint64_t s1, uint64_t s2, uint32_t rm, uint32_t 
*fflags) -> uint64_t { @@ -3835,7 +4024,7 @@ static FORCE_INLINE execute_status execute_FDIV_D(STATE_ACCESS &a, uint64_t &pc, } template -static FORCE_INLINE execute_status execute_FCLASS(STATE_ACCESS &a, uint64_t &pc, uint32_t insn, const F &f) { +static FORCE_INLINE execute_status execute_FCLASS(STATE_ACCESS a, uint64_t &pc, uint32_t insn, const F &f) { const uint32_t rd = insn_get_rd(insn); if (unlikely(rd == 0)) { return advance_to_next_insn(a, pc); @@ -3847,7 +4036,7 @@ static FORCE_INLINE execute_status execute_FCLASS(STATE_ACCESS &a, uint64_t &pc, } template -static FORCE_INLINE execute_status execute_float_binary_op(STATE_ACCESS &a, uint64_t &pc, uint32_t insn, const F &f) { +static FORCE_INLINE execute_status execute_float_binary_op(STATE_ACCESS a, uint64_t &pc, uint32_t insn, const F &f) { const uint64_t fcsr = a.read_fcsr(); // We must always check if input operands are properly NaN-boxed. T s1 = float_unbox(a.read_f(insn_get_rs1(insn))); @@ -3861,7 +4050,7 @@ static FORCE_INLINE execute_status execute_float_binary_op(STATE_ACCESS &a, uint } template -static FORCE_INLINE execute_status execute_float_cmp_op(STATE_ACCESS &a, uint64_t &pc, uint32_t insn, const F &f) { +static FORCE_INLINE execute_status execute_float_cmp_op(STATE_ACCESS a, uint64_t &pc, uint32_t insn, const F &f) { const uint64_t fcsr = a.read_fcsr(); // We must always check if input operands are properly NaN-boxed. 
T s1 = float_unbox(a.read_f(insn_get_rs1(insn))); @@ -3879,7 +4068,7 @@ static FORCE_INLINE execute_status execute_float_cmp_op(STATE_ACCESS &a, uint64_ } template -static FORCE_INLINE execute_status execute_FSGNJ_S(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FSGNJ_S(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fsgnj.s"); return execute_float_binary_op(a, pc, insn, [](uint32_t s1, uint32_t s2, const uint32_t * /*fflags*/) -> uint32_t { @@ -3888,7 +4077,7 @@ static FORCE_INLINE execute_status execute_FSGNJ_S(STATE_ACCESS &a, uint64_t &pc } template -static FORCE_INLINE execute_status execute_FSGNJN_S(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FSGNJN_S(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fsgnjn.s"); return execute_float_binary_op(a, pc, insn, [](uint32_t s1, uint32_t s2, const uint32_t * /*fflags*/) -> uint32_t { @@ -3897,7 +4086,7 @@ static FORCE_INLINE execute_status execute_FSGNJN_S(STATE_ACCESS &a, uint64_t &p } template -static FORCE_INLINE execute_status execute_FSGNJX_S(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FSGNJX_S(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fsgnjx.s"); return execute_float_binary_op(a, pc, insn, [](uint32_t s1, uint32_t s2, const uint32_t * /*fflags*/) -> uint32_t { @@ -3906,7 +4095,7 @@ static FORCE_INLINE execute_status execute_FSGNJX_S(STATE_ACCESS &a, uint64_t &p } template -static FORCE_INLINE execute_status execute_FSGN_S(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FSGN_S(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { switch (static_cast(insn_get_funct3_000000000000(insn))) { case insn_FSGN_funct3_000000000000::J: return execute_FSGNJ_S(a, pc, insn); @@ -3920,7 +4109,7 @@ static FORCE_INLINE execute_status execute_FSGN_S(STATE_ACCESS &a, uint64_t &pc, } 
template -static FORCE_INLINE execute_status execute_FSGNJ_D(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FSGNJ_D(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fsgnj.d"); return execute_float_binary_op(a, pc, insn, [](uint64_t s1, uint64_t s2, const uint32_t * /*fflags*/) -> uint64_t { @@ -3929,7 +4118,7 @@ static FORCE_INLINE execute_status execute_FSGNJ_D(STATE_ACCESS &a, uint64_t &pc } template -static FORCE_INLINE execute_status execute_FSGNJN_D(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FSGNJN_D(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fsgnjn.d"); return execute_float_binary_op(a, pc, insn, [](uint64_t s1, uint64_t s2, const uint32_t * /*fflags*/) -> uint64_t { @@ -3938,7 +4127,7 @@ static FORCE_INLINE execute_status execute_FSGNJN_D(STATE_ACCESS &a, uint64_t &p } template -static FORCE_INLINE execute_status execute_FSGNJX_D(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FSGNJX_D(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fsgnjx.d"); return execute_float_binary_op(a, pc, insn, [](uint64_t s1, uint64_t s2, const uint32_t * /*fflags*/) -> uint64_t { @@ -3947,7 +4136,7 @@ static FORCE_INLINE execute_status execute_FSGNJX_D(STATE_ACCESS &a, uint64_t &p } template -static FORCE_INLINE execute_status execute_FSGN_D(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FSGN_D(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { switch (static_cast(insn_get_funct3_000000000000(insn))) { case insn_FSGN_funct3_000000000000::J: return execute_FSGNJ_D(a, pc, insn); @@ -3961,21 +4150,21 @@ static FORCE_INLINE execute_status execute_FSGN_D(STATE_ACCESS &a, uint64_t &pc, } template -static FORCE_INLINE execute_status execute_FMIN_S(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status 
execute_FMIN_S(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fmin.s"); return execute_float_binary_op(a, pc, insn, [](uint32_t s1, uint32_t s2, uint32_t *fflags) -> uint32_t { return i_sfloat32::min(s1, s2, fflags); }); } template -static FORCE_INLINE execute_status execute_FMAX_S(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FMAX_S(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fmax.s"); return execute_float_binary_op(a, pc, insn, [](uint32_t s1, uint32_t s2, uint32_t *fflags) -> uint32_t { return i_sfloat32::max(s1, s2, fflags); }); } template -static FORCE_INLINE execute_status execute_FMINMAX_S(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FMINMAX_S(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { switch (static_cast(insn_get_funct3_000000000000(insn))) { case insn_FMIN_FMAX_funct3_000000000000::MIN: return execute_FMIN_S(a, pc, insn); @@ -3987,21 +4176,21 @@ static FORCE_INLINE execute_status execute_FMINMAX_S(STATE_ACCESS &a, uint64_t & } template -static FORCE_INLINE execute_status execute_FMIN_D(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FMIN_D(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fmin.d"); return execute_float_binary_op(a, pc, insn, [](uint64_t s1, uint64_t s2, uint32_t *fflags) -> uint64_t { return i_sfloat64::min(s1, s2, fflags); }); } template -static FORCE_INLINE execute_status execute_FMAX_D(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FMAX_D(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fmax.d"); return execute_float_binary_op(a, pc, insn, [](uint64_t s1, uint64_t s2, uint32_t *fflags) -> uint64_t { return i_sfloat64::max(s1, s2, fflags); }); } template -static FORCE_INLINE execute_status execute_FMINMAX_D(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static 
FORCE_INLINE execute_status execute_FMINMAX_D(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { switch (static_cast(insn_get_funct3_000000000000(insn))) { case insn_FMIN_FMAX_funct3_000000000000::MIN: return execute_FMIN_D(a, pc, insn); @@ -4013,7 +4202,7 @@ static FORCE_INLINE execute_status execute_FMINMAX_D(STATE_ACCESS &a, uint64_t & } template -static FORCE_INLINE execute_status execute_FCVT_F_F(STATE_ACCESS &a, uint64_t &pc, uint32_t insn, const F &f) { +static FORCE_INLINE execute_status execute_FCVT_F_F(STATE_ACCESS a, uint64_t &pc, uint32_t insn, const F &f) { const uint64_t fcsr = a.read_fcsr(); // The rounding mode comes from the insn const uint32_t rm = insn_get_rm(insn, fcsr); @@ -4033,7 +4222,7 @@ static FORCE_INLINE execute_status execute_FCVT_F_F(STATE_ACCESS &a, uint64_t &p } template -static FORCE_INLINE execute_status execute_FCVT_X_F(STATE_ACCESS &a, uint64_t &pc, uint32_t insn, const F &f) { +static FORCE_INLINE execute_status execute_FCVT_X_F(STATE_ACCESS a, uint64_t &pc, uint32_t insn, const F &f) { const uint64_t fcsr = a.read_fcsr(); // The rounding mode comes from the insn const uint32_t rm = insn_get_rm(insn, fcsr); @@ -4055,7 +4244,7 @@ static FORCE_INLINE execute_status execute_FCVT_X_F(STATE_ACCESS &a, uint64_t &p } template -static FORCE_INLINE execute_status execute_FCVT_F_X(STATE_ACCESS &a, uint64_t &pc, uint32_t insn, const F &f) { +static FORCE_INLINE execute_status execute_FCVT_F_X(STATE_ACCESS a, uint64_t &pc, uint32_t insn, const F &f) { const uint64_t fcsr = a.read_fcsr(); // The rounding mode comes from the insn const uint32_t rm = insn_get_rm(insn, fcsr); @@ -4074,7 +4263,7 @@ static FORCE_INLINE execute_status execute_FCVT_F_X(STATE_ACCESS &a, uint64_t &p } template -static FORCE_INLINE execute_status execute_FCVT_S_D(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FCVT_S_D(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fcvt.s.d"); return execute_FCVT_F_F(a, 
pc, insn, [](uint64_t s1, uint32_t rm, uint32_t *fflags) -> uint32_t { @@ -4083,7 +4272,7 @@ static FORCE_INLINE execute_status execute_FCVT_S_D(STATE_ACCESS &a, uint64_t &p } template -static FORCE_INLINE execute_status execute_FCVT_D_S(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FCVT_D_S(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fcvt.d.s"); return execute_FCVT_F_F(a, pc, insn, [](uint32_t s1, uint32_t /*rm*/, uint32_t *fflags) -> uint64_t { @@ -4093,7 +4282,7 @@ static FORCE_INLINE execute_status execute_FCVT_D_S(STATE_ACCESS &a, uint64_t &p } template -static FORCE_INLINE execute_status execute_FSQRT_S(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FSQRT_S(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fsqrt.s"); return execute_float_unary_op_rm(a, pc, insn, [](uint32_t s1, uint32_t rm, uint32_t *fflags) -> uint32_t { return i_sfloat32::sqrt(s1, static_cast(rm), fflags); @@ -4101,7 +4290,7 @@ static FORCE_INLINE execute_status execute_FSQRT_S(STATE_ACCESS &a, uint64_t &pc } template -static FORCE_INLINE execute_status execute_FSQRT_D(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FSQRT_D(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fsqrt.d"); return execute_float_unary_op_rm(a, pc, insn, [](uint64_t s1, uint32_t rm, uint32_t *fflags) -> uint64_t { return i_sfloat64::sqrt(s1, static_cast(rm), fflags); @@ -4109,7 +4298,7 @@ static FORCE_INLINE execute_status execute_FSQRT_D(STATE_ACCESS &a, uint64_t &pc } template -static FORCE_INLINE execute_status execute_FLE_S(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FLE_S(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fle.s"); return execute_float_cmp_op(a, pc, insn, [](uint32_t s1, uint32_t s2, uint32_t *fflags) -> uint64_t { return 
static_cast(i_sfloat32::le(s1, s2, fflags)); @@ -4117,7 +4306,7 @@ static FORCE_INLINE execute_status execute_FLE_S(STATE_ACCESS &a, uint64_t &pc, } template -static FORCE_INLINE execute_status execute_FLT_S(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FLT_S(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "flt.s"); return execute_float_cmp_op(a, pc, insn, [](uint32_t s1, uint32_t s2, uint32_t *fflags) -> uint64_t { return static_cast(i_sfloat32::lt(s1, s2, fflags)); @@ -4125,7 +4314,7 @@ static FORCE_INLINE execute_status execute_FLT_S(STATE_ACCESS &a, uint64_t &pc, } template -static FORCE_INLINE execute_status execute_FEQ_S(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FEQ_S(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "feq.s"); return execute_float_cmp_op(a, pc, insn, [](uint32_t s1, uint32_t s2, uint32_t *fflags) -> uint64_t { return static_cast(i_sfloat32::eq(s1, s2, fflags)); @@ -4133,7 +4322,7 @@ static FORCE_INLINE execute_status execute_FEQ_S(STATE_ACCESS &a, uint64_t &pc, } template -static FORCE_INLINE execute_status execute_FCMP_S(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FCMP_S(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { switch (static_cast(insn_get_funct3_000000000000(insn))) { case insn_FCMP_funct3_000000000000::LT: return execute_FLT_S(a, pc, insn); @@ -4147,7 +4336,7 @@ static FORCE_INLINE execute_status execute_FCMP_S(STATE_ACCESS &a, uint64_t &pc, } template -static FORCE_INLINE execute_status execute_FLE_D(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FLE_D(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fle.d"); return execute_float_cmp_op(a, pc, insn, [](uint64_t s1, uint64_t s2, uint32_t *fflags) -> uint64_t { return static_cast(i_sfloat64::le(s1, s2, fflags)); @@ -4155,7 +4344,7 @@ 
static FORCE_INLINE execute_status execute_FLE_D(STATE_ACCESS &a, uint64_t &pc, } template -static FORCE_INLINE execute_status execute_FLT_D(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FLT_D(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "flt.d"); return execute_float_cmp_op(a, pc, insn, [](uint64_t s1, uint64_t s2, uint32_t *fflags) -> uint64_t { return static_cast(i_sfloat64::lt(s1, s2, fflags)); @@ -4163,7 +4352,7 @@ static FORCE_INLINE execute_status execute_FLT_D(STATE_ACCESS &a, uint64_t &pc, } template -static FORCE_INLINE execute_status execute_FEQ_D(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FEQ_D(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "feq.d"); return execute_float_cmp_op(a, pc, insn, [](uint64_t s1, uint64_t s2, uint32_t *fflags) -> uint64_t { return static_cast(i_sfloat64::eq(s1, s2, fflags)); @@ -4171,7 +4360,7 @@ static FORCE_INLINE execute_status execute_FEQ_D(STATE_ACCESS &a, uint64_t &pc, } template -static FORCE_INLINE execute_status execute_FCMP_D(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FCMP_D(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { switch (static_cast(insn_get_funct3_000000000000(insn))) { case insn_FCMP_funct3_000000000000::LT: return execute_FLT_D(a, pc, insn); @@ -4185,7 +4374,7 @@ static FORCE_INLINE execute_status execute_FCMP_D(STATE_ACCESS &a, uint64_t &pc, } template -static FORCE_INLINE execute_status execute_FCVT_W_S(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FCVT_W_S(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fcvt.w.s"); return execute_FCVT_X_F(a, pc, insn, [](uint32_t s1, uint32_t rm, uint32_t *fflags) -> uint64_t { const auto val = i_sfloat32::cvt_f_i(s1, static_cast(rm), fflags); @@ -4195,7 +4384,7 @@ static FORCE_INLINE execute_status 
execute_FCVT_W_S(STATE_ACCESS &a, uint64_t &p } template -static FORCE_INLINE execute_status execute_FCVT_WU_S(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FCVT_WU_S(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fcvt.wu.s"); return execute_FCVT_X_F(a, pc, insn, [](uint32_t s1, uint32_t rm, uint32_t *fflags) -> uint64_t { const auto val = i_sfloat32::cvt_f_i(s1, static_cast(rm), fflags); @@ -4205,7 +4394,7 @@ static FORCE_INLINE execute_status execute_FCVT_WU_S(STATE_ACCESS &a, uint64_t & } template -static FORCE_INLINE execute_status execute_FCVT_L_S(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FCVT_L_S(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fcvt.l.s"); return execute_FCVT_X_F(a, pc, insn, [](uint32_t s1, uint32_t rm, uint32_t *fflags) -> uint64_t { const auto val = i_sfloat32::cvt_f_i(s1, static_cast(rm), fflags); @@ -4214,7 +4403,7 @@ static FORCE_INLINE execute_status execute_FCVT_L_S(STATE_ACCESS &a, uint64_t &p } template -static FORCE_INLINE execute_status execute_FCVT_LU_S(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FCVT_LU_S(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fcvt.lu.s"); return execute_FCVT_X_F(a, pc, insn, [](uint32_t s1, uint32_t rm, uint32_t *fflags) -> uint64_t { return i_sfloat32::cvt_f_i(s1, static_cast(rm), fflags); @@ -4222,7 +4411,7 @@ static FORCE_INLINE execute_status execute_FCVT_LU_S(STATE_ACCESS &a, uint64_t & } template -static FORCE_INLINE execute_status execute_FCVT_W_D(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FCVT_W_D(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fcvt.w.d"); return execute_FCVT_X_F(a, pc, insn, [](uint64_t s1, uint32_t rm, uint32_t *fflags) -> uint64_t { const auto val = i_sfloat64::cvt_f_i(s1, static_cast(rm), 
fflags); @@ -4232,7 +4421,7 @@ static FORCE_INLINE execute_status execute_FCVT_W_D(STATE_ACCESS &a, uint64_t &p } template -static FORCE_INLINE execute_status execute_FCVT_WU_D(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FCVT_WU_D(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fcvt.wu.d"); return execute_FCVT_X_F(a, pc, insn, [](uint64_t s1, uint32_t rm, uint32_t *fflags) -> uint64_t { const auto val = i_sfloat64::cvt_f_i(s1, static_cast(rm), fflags); @@ -4242,7 +4431,7 @@ static FORCE_INLINE execute_status execute_FCVT_WU_D(STATE_ACCESS &a, uint64_t & } template -static FORCE_INLINE execute_status execute_FCVT_L_D(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FCVT_L_D(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fcvt.l.d"); return execute_FCVT_X_F(a, pc, insn, [](uint64_t s1, uint32_t rm, uint32_t *fflags) -> uint64_t { const auto val = i_sfloat64::cvt_f_i(s1, static_cast(rm), fflags); @@ -4251,7 +4440,7 @@ static FORCE_INLINE execute_status execute_FCVT_L_D(STATE_ACCESS &a, uint64_t &p } template -static FORCE_INLINE execute_status execute_FCVT_LU_D(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FCVT_LU_D(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fcvt.lu.d"); return execute_FCVT_X_F(a, pc, insn, [](uint64_t s1, uint32_t rm, uint32_t *fflags) -> uint64_t { return i_sfloat64::cvt_f_i(s1, static_cast(rm), fflags); @@ -4259,7 +4448,7 @@ static FORCE_INLINE execute_status execute_FCVT_LU_D(STATE_ACCESS &a, uint64_t & } template -static FORCE_INLINE execute_status execute_FCVT_S_W(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FCVT_S_W(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fcvt.s.w"); return execute_FCVT_F_X(a, pc, insn, [](uint64_t s1, uint32_t rm, uint32_t *fflags) -> uint32_t 
{ return i_sfloat32::cvt_i_f(static_cast(s1), static_cast(rm), fflags); @@ -4267,7 +4456,7 @@ static FORCE_INLINE execute_status execute_FCVT_S_W(STATE_ACCESS &a, uint64_t &p } template -static FORCE_INLINE execute_status execute_FCVT_S_WU(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FCVT_S_WU(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fcvt.s.wu"); return execute_FCVT_F_X(a, pc, insn, [](uint64_t s1, uint32_t rm, uint32_t *fflags) -> uint32_t { return i_sfloat32::cvt_i_f(static_cast(s1), static_cast(rm), fflags); @@ -4275,7 +4464,7 @@ static FORCE_INLINE execute_status execute_FCVT_S_WU(STATE_ACCESS &a, uint64_t & } template -static FORCE_INLINE execute_status execute_FCVT_S_L(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FCVT_S_L(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fcvt.s.l"); return execute_FCVT_F_X(a, pc, insn, [](uint64_t s1, uint32_t rm, uint32_t *fflags) -> uint32_t { return i_sfloat32::cvt_i_f(static_cast(s1), static_cast(rm), fflags); @@ -4283,7 +4472,7 @@ static FORCE_INLINE execute_status execute_FCVT_S_L(STATE_ACCESS &a, uint64_t &p } template -static FORCE_INLINE execute_status execute_FCVT_S_LU(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FCVT_S_LU(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fcvt.s.lu"); return execute_FCVT_F_X(a, pc, insn, [](uint64_t s1, uint32_t rm, uint32_t *fflags) -> uint32_t { return i_sfloat32::cvt_i_f(s1, static_cast(rm), fflags); @@ -4291,7 +4480,7 @@ static FORCE_INLINE execute_status execute_FCVT_S_LU(STATE_ACCESS &a, uint64_t & } template -static FORCE_INLINE execute_status execute_FCVT_D_W(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FCVT_D_W(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fcvt.d.w"); return execute_FCVT_F_X(a, 
pc, insn, [](uint64_t s1, uint32_t rm, uint32_t *fflags) -> uint64_t { return i_sfloat64::cvt_i_f(static_cast(s1), static_cast(rm), fflags); @@ -4299,7 +4488,7 @@ static FORCE_INLINE execute_status execute_FCVT_D_W(STATE_ACCESS &a, uint64_t &p } template -static FORCE_INLINE execute_status execute_FCVT_D_WU(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FCVT_D_WU(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fcvt.d.wu"); return execute_FCVT_F_X(a, pc, insn, [](uint64_t s1, uint32_t rm, uint32_t *fflags) -> uint64_t { return i_sfloat64::cvt_i_f(static_cast(s1), static_cast(rm), fflags); @@ -4307,7 +4496,7 @@ static FORCE_INLINE execute_status execute_FCVT_D_WU(STATE_ACCESS &a, uint64_t & } template -static FORCE_INLINE execute_status execute_FCVT_D_L(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FCVT_D_L(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fcvt.d.l"); return execute_FCVT_F_X(a, pc, insn, [](uint64_t s1, uint32_t rm, uint32_t *fflags) -> uint64_t { return i_sfloat64::cvt_i_f(static_cast(s1), static_cast(rm), fflags); @@ -4315,7 +4504,7 @@ static FORCE_INLINE execute_status execute_FCVT_D_L(STATE_ACCESS &a, uint64_t &p } template -static FORCE_INLINE execute_status execute_FCVT_D_LU(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FCVT_D_LU(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fcvt.d.lu"); return execute_FCVT_F_X(a, pc, insn, [](uint64_t s1, uint32_t rm, uint32_t *fflags) -> uint64_t { return i_sfloat64::cvt_i_f(s1, static_cast(rm), fflags); @@ -4323,7 +4512,7 @@ static FORCE_INLINE execute_status execute_FCVT_D_LU(STATE_ACCESS &a, uint64_t & } template -static FORCE_INLINE execute_status execute_FMV_F_X(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FMV_F_X(STATE_ACCESS a, uint64_t &pc, uint32_t 
insn) { // Should have funct3 set to 0 if (unlikely(insn_get_funct3(insn) != 0)) { return raise_illegal_insn_exception(a, pc, insn); @@ -4336,25 +4525,25 @@ static FORCE_INLINE execute_status execute_FMV_F_X(STATE_ACCESS &a, uint64_t &pc } template -static FORCE_INLINE execute_status execute_FMV_W_X(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FMV_W_X(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fmv.w.x"); return execute_FMV_F_X(a, pc, insn); } template -static FORCE_INLINE execute_status execute_FMV_D_X(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FMV_D_X(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fmv.d.x"); return execute_FMV_F_X(a, pc, insn); } template -static FORCE_INLINE execute_status execute_FCLASS_S(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FCLASS_S(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fclass.s"); return execute_FCLASS(a, pc, insn, [](uint32_t s1) -> uint64_t { return i_sfloat32::fclass(s1); }); } template -static FORCE_INLINE execute_status execute_FMV_X_W(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FMV_X_W(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fmv.x.w"); const uint32_t rd = insn_get_rd(insn); if (unlikely(rd == 0)) { @@ -4369,7 +4558,7 @@ static FORCE_INLINE execute_status execute_FMV_X_W(STATE_ACCESS &a, uint64_t &pc } template -static FORCE_INLINE execute_status execute_FMV_FCLASS_S(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FMV_FCLASS_S(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { switch (static_cast(insn_get_funct3_000000000000(insn))) { case insn_FMV_FCLASS_funct3_000000000000::FMV: return execute_FMV_X_W(a, pc, insn); @@ -4381,13 +4570,13 @@ static FORCE_INLINE execute_status 
execute_FMV_FCLASS_S(STATE_ACCESS &a, uint64_ } template -static FORCE_INLINE execute_status execute_FCLASS_D(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FCLASS_D(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fclass.d"); return execute_FCLASS(a, pc, insn, [](uint64_t s1) -> uint64_t { return i_sfloat64::fclass(s1); }); } template -static FORCE_INLINE execute_status execute_FMV_X_D(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FMV_X_D(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fmv.x.d"); const uint32_t rd = insn_get_rd(insn); if (unlikely(rd == 0)) { @@ -4399,7 +4588,7 @@ static FORCE_INLINE execute_status execute_FMV_X_D(STATE_ACCESS &a, uint64_t &pc } template -static FORCE_INLINE execute_status execute_FMV_FCLASS_D(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FMV_FCLASS_D(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { switch (static_cast(insn_get_funct3_000000000000(insn))) { case insn_FMV_FCLASS_funct3_000000000000::FMV: return execute_FMV_X_D(a, pc, insn); @@ -4411,7 +4600,7 @@ static FORCE_INLINE execute_status execute_FMV_FCLASS_D(STATE_ACCESS &a, uint64_ } template -static FORCE_INLINE execute_status execute_FCVT_FMV_FCLASS(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FCVT_FMV_FCLASS(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { switch (static_cast(insn_get_funct7_rs2(insn))) { case insn_FD_funct7_rs2::FCVT_W_S: return execute_FCVT_W_S(a, pc, insn); @@ -4463,7 +4652,11 @@ static FORCE_INLINE execute_status execute_FCVT_FMV_FCLASS(STATE_ACCESS &a, uint } template -static FORCE_INLINE execute_status execute_FD(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FD(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { + // If FS is OFF, attempts to read or write the float state will cause an 
illegal instruction exception. + if (unlikely((a.read_mstatus() & MSTATUS_FS_MASK) == MSTATUS_FS_OFF)) { + return raise_illegal_insn_exception(a, pc, insn); + } switch (static_cast(insn_get_funct7(insn))) { case insn_FD_funct7::FADD_S: return execute_FADD_S(a, pc, insn); @@ -4503,8 +4696,8 @@ static FORCE_INLINE execute_status execute_FD(STATE_ACCESS &a, uint64_t &pc, uin } template -static FORCE_INLINE execute_status execute_C_L(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t rd, - uint32_t rs1, int32_t imm) { +static FORCE_INLINE execute_status execute_C_L(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t rd, uint32_t rs1, + int32_t imm) { const uint64_t vaddr = a.read_x(rs1); T val = 0; if (unlikely(!read_virtual_memory(a, pc, mcycle, vaddr + imm, &val))) { @@ -4520,19 +4713,22 @@ static FORCE_INLINE execute_status execute_C_L(STATE_ACCESS &a, uint64_t &pc, ui } template -static FORCE_INLINE execute_status execute_C_S(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t rs2, +static FORCE_INLINE execute_status execute_C_S(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t rs2, uint32_t rs1, int32_t imm) { const uint64_t vaddr = a.read_x(rs1); const uint64_t val = a.read_x(rs2); const execute_status status = write_virtual_memory(a, pc, mcycle, vaddr + imm, val); - if (unlikely(status == execute_status::failure)) { - return advance_to_raised_exception(a, pc); + if (unlikely(status != execute_status::success)) { + if (status == execute_status::failure) { + return advance_to_raised_exception(a, pc); + } + return advance_to_next_insn<2>(a, pc, status); } - return advance_to_next_insn<2>(a, pc, status); + return advance_to_next_insn<2>(a, pc); } template -static FORCE_INLINE execute_status execute_C_FL(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t rd, +static FORCE_INLINE execute_status execute_C_FL(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t rd, uint32_t rs1, int32_t imm) { // Loads the float value from virtual memory 
const uint64_t vaddr = a.read_x(rs1); @@ -4547,33 +4743,30 @@ static FORCE_INLINE execute_status execute_C_FL(STATE_ACCESS &a, uint64_t &pc, u } template -static FORCE_INLINE execute_status execute_C_FS(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t rs2, +static FORCE_INLINE execute_status execute_C_FS(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t rs2, uint32_t rs1, int32_t imm) { const uint64_t vaddr = a.read_x(rs1); // A narrower n-bit transfer out of the floating-point // registers will transfer the lower n bits of the register ignoring the upper FLEN−n bits. T val = static_cast(a.read_f(rs2)); const execute_status status = write_virtual_memory(a, pc, mcycle, vaddr + imm, val); - if (unlikely(status == execute_status::failure)) { - return advance_to_raised_exception(a, pc); + if (unlikely(status != execute_status::success)) { + if (status == execute_status::failure) { + return advance_to_raised_exception(a, pc); + } + return advance_to_next_insn<2>(a, pc, status); } - return advance_to_next_insn<2>(a, pc, status); + return advance_to_next_insn<2>(a, pc); } /// \brief Implementation of the C.ADDI4SPN instruction. template -static FORCE_INLINE execute_status execute_C_ADDI4SPN(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { - // "A 16-bit instruction with all bits zero is permanently reserved as an illegal instruction." 
- if (unlikely(insn == 0)) { - return raise_illegal_insn_exception(a, pc, insn); - } - dump_insn(a, pc, insn, "c.addi4spn"); - // rd cannot be zero +static FORCE_INLINE execute_status execute_C_ADDI4SPN(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { + dump_insn(a, pc, static_cast(insn), "c.addi4spn"); + // rd cannot be zero (guaranteed by RISC-V spec design) const uint32_t rd = insn_get_CIW_CL_rd_CS_CA_rs2(insn); + // imm cannot be zero (guaranteed by the jump table) const uint32_t imm = insn_get_CIW_imm(insn); - if (unlikely(imm == 0)) { - return raise_illegal_insn_exception(a, pc, insn); - } const uint64_t rs1 = a.read_x(2); int64_t val = 0; __builtin_add_overflow(static_cast(rs1), static_cast(imm), &val); @@ -4583,8 +4776,13 @@ static FORCE_INLINE execute_status execute_C_ADDI4SPN(STATE_ACCESS &a, uint64_t /// \brief Implementation of the C.FLD instruction. template -static FORCE_INLINE execute_status execute_C_FLD(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { - dump_insn(a, pc, insn, "c.fld"); +static FORCE_INLINE execute_status execute_C_FLD(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { + dump_insn(a, pc, static_cast(insn), "c.fld"); + // If FS is OFF, attempts to read or write the float state will cause an illegal instruction + // exception. + if (unlikely((a.read_mstatus() & MSTATUS_FS_MASK) == MSTATUS_FS_OFF)) { + return raise_illegal_insn_exception(a, pc, insn); + } const uint32_t rd = insn_get_CIW_CL_rd_CS_CA_rs2(insn); const uint32_t rs1 = insn_get_CL_CS_CA_CB_rs1(insn); const int32_t imm = insn_get_CL_CS_imm(insn); @@ -4593,8 +4791,8 @@ static FORCE_INLINE execute_status execute_C_FLD(STATE_ACCESS &a, uint64_t &pc, /// \brief Implementation of the C.LW instruction. 
template -static FORCE_INLINE execute_status execute_C_LW(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { - dump_insn(a, pc, insn, "c.lw"); +static FORCE_INLINE execute_status execute_C_LW(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { + dump_insn(a, pc, static_cast(insn), "c.lw"); const uint32_t rd = insn_get_CIW_CL_rd_CS_CA_rs2(insn); const uint32_t rs1 = insn_get_CL_CS_CA_CB_rs1(insn); const int32_t imm = insn_get_C_LW_C_SW_imm(insn); @@ -4603,8 +4801,8 @@ static FORCE_INLINE execute_status execute_C_LW(STATE_ACCESS &a, uint64_t &pc, u /// \brief Implementation of the C.LD instruction. template -static FORCE_INLINE execute_status execute_C_LD(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { - dump_insn(a, pc, insn, "c.ld"); +static FORCE_INLINE execute_status execute_C_LD(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { + dump_insn(a, pc, static_cast(insn), "c.ld"); const uint32_t rd = insn_get_CIW_CL_rd_CS_CA_rs2(insn); const uint32_t rs1 = insn_get_CL_CS_CA_CB_rs1(insn); const int32_t imm = insn_get_CL_CS_imm(insn); @@ -4613,8 +4811,13 @@ static FORCE_INLINE execute_status execute_C_LD(STATE_ACCESS &a, uint64_t &pc, u /// \brief Implementation of the C.FSD instruction. template -static FORCE_INLINE execute_status execute_C_FSD(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { - dump_insn(a, pc, insn, "c.fsd"); +static FORCE_INLINE execute_status execute_C_FSD(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { + dump_insn(a, pc, static_cast(insn), "c.fsd"); + // If FS is OFF, attempts to read or write the float state will cause an illegal instruction + // exception. 
+ if (unlikely((a.read_mstatus() & MSTATUS_FS_MASK) == MSTATUS_FS_OFF)) { + return raise_illegal_insn_exception(a, pc, insn); + } const uint32_t rs1 = insn_get_CL_CS_CA_CB_rs1(insn); const uint32_t rs2 = insn_get_CIW_CL_rd_CS_CA_rs2(insn); const int32_t imm = insn_get_CL_CS_imm(insn); @@ -4623,8 +4826,8 @@ static FORCE_INLINE execute_status execute_C_FSD(STATE_ACCESS &a, uint64_t &pc, /// \brief Implementation of the C.SW instruction. template -static FORCE_INLINE execute_status execute_C_SW(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { - dump_insn(a, pc, insn, "c.sw"); +static FORCE_INLINE execute_status execute_C_SW(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { + dump_insn(a, pc, static_cast(insn), "c.sw"); const uint32_t rs1 = insn_get_CL_CS_CA_CB_rs1(insn); const uint32_t rs2 = insn_get_CIW_CL_rd_CS_CA_rs2(insn); const int32_t imm = insn_get_C_LW_C_SW_imm(insn); @@ -4633,8 +4836,8 @@ static FORCE_INLINE execute_status execute_C_SW(STATE_ACCESS &a, uint64_t &pc, u /// \brief Implementation of the C.SD instruction. template -static FORCE_INLINE execute_status execute_C_SD(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { - dump_insn(a, pc, insn, "c.sd"); +static FORCE_INLINE execute_status execute_C_SD(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { + dump_insn(a, pc, static_cast(insn), "c.sd"); const uint32_t rs1 = insn_get_CL_CS_CA_CB_rs1(insn); const uint32_t rs2 = insn_get_CIW_CL_rd_CS_CA_rs2(insn); const int32_t imm = insn_get_CL_CS_imm(insn); @@ -4643,22 +4846,20 @@ static FORCE_INLINE execute_status execute_C_SD(STATE_ACCESS &a, uint64_t &pc, u /// \brief Implementation of the C.NOP instruction. 
template -static FORCE_INLINE execute_status execute_C_NOP(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { - dump_insn(a, pc, insn, "c.nop"); - // C.NOP with imm != 0 is just a HINT that must execute as no-op (see RISC-V spec) +static FORCE_INLINE execute_status execute_C_NOP(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { + dump_insn(a, pc, static_cast(insn), "c.nop"); // Really do nothing return advance_to_next_insn<2>(a, pc); } /// \brief Implementation of the C.ADDI instruction. template -static FORCE_INLINE execute_status execute_C_ADDI(STATE_ACCESS &a, uint64_t &pc, uint32_t insn, uint32_t rd) { - dump_insn(a, pc, insn, "c.addi"); +static FORCE_INLINE execute_status execute_C_ADDI(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { + dump_insn(a, pc, static_cast(insn), "c.addi"); + // rd cannot be zero (guaranteed by jump table) + const uint32_t rd = insn_get_rd(insn); const int32_t imm = insn_get_CI_CB_imm_se(insn); - // C.ADDI with imm == 0 is just a HINT that must execute as no-op (see RISC-V spec) - if (unlikely(imm == 0)) { - return advance_to_next_insn<2>(a, pc); - } + // imm cannot be zero (guaranteed by jump table) const uint64_t rd_value = a.read_x(rd); int64_t val = 0; __builtin_add_overflow(static_cast(rd_value), static_cast(imm), &val); @@ -4666,23 +4867,12 @@ static FORCE_INLINE execute_status execute_C_ADDI(STATE_ACCESS &a, uint64_t &pc, return advance_to_next_insn<2>(a, pc); } -template -static FORCE_INLINE execute_status execute_C_Q1_SET0(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { - const uint32_t rd = insn_get_rd(insn); - if (unlikely(rd == 0)) { - return execute_C_NOP(a, pc, insn); - } - return execute_C_ADDI(a, pc, insn, rd); -} - /// \brief Implementation of the C.addiw instruction. 
template -static FORCE_INLINE execute_status execute_C_ADDIW(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { - dump_insn(a, pc, insn, "c.addiw"); +static FORCE_INLINE execute_status execute_C_ADDIW(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { + dump_insn(a, pc, static_cast(insn), "c.addiw"); + // rd cannot be zero (guaranteed by jump table) const uint32_t rd = insn_get_rd(insn); - if (unlikely(rd == 0)) { - return raise_illegal_insn_exception(a, pc, insn); - } const uint64_t rd_value = a.read_x(rd); const int32_t imm = insn_get_CI_CB_imm_se(insn); int32_t val = 0; @@ -4693,13 +4883,10 @@ static FORCE_INLINE execute_status execute_C_ADDIW(STATE_ACCESS &a, uint64_t &pc /// \brief Implementation of the C.LI instruction. template -static FORCE_INLINE execute_status execute_C_LI(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { - dump_insn(a, pc, insn, "c.li"); +static FORCE_INLINE execute_status execute_C_LI(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { + dump_insn(a, pc, static_cast(insn), "c.li"); + // rd cannot be zero (guaranteed by jump table) const uint32_t rd = insn_get_rd(insn); - // C.LI with rd == 0 is just a HINT that must execute as no-op (see RISC-V spec) - if (unlikely(rd == 0)) { - return advance_to_next_insn<2>(a, pc); - } const int32_t imm = insn_get_CI_CB_imm_se(insn); a.write_x(rd, static_cast(imm)); return advance_to_next_insn<2>(a, pc); @@ -4707,12 +4894,10 @@ static FORCE_INLINE execute_status execute_C_LI(STATE_ACCESS &a, uint64_t &pc, u /// \brief Implementation of the C.ADDI16SP instruction. 
template -static FORCE_INLINE execute_status execute_C_ADDI16SP(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { - dump_insn(a, pc, insn, "c.addi16sp"); +static FORCE_INLINE execute_status execute_C_ADDI16SP(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { + dump_insn(a, pc, static_cast(insn), "c.addi16sp"); + // imm cannot be zero (guaranteed by the jump table) const int32_t imm = insn_get_C_ADDI16SP_imm(insn); - if (unlikely(imm == 0)) { - return raise_illegal_insn_exception(a, pc, insn); - } const uint64_t rs1_value = a.read_x(2); int64_t val = 0; __builtin_add_overflow(static_cast(rs1_value), static_cast(imm), &val); @@ -4722,39 +4907,23 @@ static FORCE_INLINE execute_status execute_C_ADDI16SP(STATE_ACCESS &a, uint64_t /// \brief Implementation of the C.LUI instruction. template -static FORCE_INLINE execute_status execute_C_LUI(STATE_ACCESS &a, uint64_t &pc, uint32_t insn, uint32_t rd) { - dump_insn(a, pc, insn, "c.lui"); +static FORCE_INLINE execute_status execute_C_LUI(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { + dump_insn(a, pc, static_cast(insn), "c.lui"); + // imm cannot be zero (guaranteed by the jump table) const int32_t imm = insn_get_C_LUI_imm(insn); - if (unlikely(imm == 0)) { - return raise_illegal_insn_exception(a, pc, insn); - } - // C.LUI with rd == 0 is just a HINT that must execute as no-op (see RISC-V spec) - if (unlikely(rd == 0)) { - return advance_to_next_insn<2>(a, pc); - } + // rd cannot be zero (guaranteed by the jump table) + const uint32_t rd = insn_get_rd(insn); a.write_x(rd, static_cast(imm)); return advance_to_next_insn<2>(a, pc); } -template -static FORCE_INLINE execute_status execute_C_Q1_SET1(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { - const uint32_t rd = insn_get_rd(insn); - if (rd == 2) { - return execute_C_ADDI16SP(a, pc, insn); - } - return execute_C_LUI(a, pc, insn, rd); -} - /// \brief Implementation of the C.SRLI instruction. 
template -static FORCE_INLINE execute_status execute_C_SRLI(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { - dump_insn(a, pc, insn, "c.srli"); +static FORCE_INLINE execute_status execute_C_SRLI(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { + dump_insn(a, pc, static_cast(insn), "c.srli"); const uint32_t rs1 = insn_get_CL_CS_CA_CB_rs1(insn); + // imm cannot be zero (guaranteed by the jump table) const uint32_t imm = insn_get_CI_CB_imm(insn); - // C.SRLI with imm == 0 is just a HINT that must execute as no-op (see RISC-V spec) - if (unlikely(imm == 0)) { - return advance_to_next_insn<2>(a, pc); - } const uint64_t rs1_value = a.read_x(rs1); a.write_x(rs1, rs1_value >> imm); return advance_to_next_insn<2>(a, pc); @@ -4762,14 +4931,11 @@ static FORCE_INLINE execute_status execute_C_SRLI(STATE_ACCESS &a, uint64_t &pc, /// \brief Implementation of the C.SRAI instruction. template -static FORCE_INLINE execute_status execute_C_SRAI(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { - dump_insn(a, pc, insn, "c.srai"); +static FORCE_INLINE execute_status execute_C_SRAI(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { + dump_insn(a, pc, static_cast(insn), "c.srai"); const uint32_t rs1 = insn_get_CL_CS_CA_CB_rs1(insn); + // imm cannot be zero (guaranteed by the jump table) const uint32_t imm = insn_get_CI_CB_imm(insn); - // C.SRAI with imm == 0 is just a HINT that must execute as no-op (see RISC-V spec) - if (unlikely(imm == 0)) { - return advance_to_next_insn<2>(a, pc); - } const auto rs1_value = static_cast(a.read_x(rs1)); a.write_x(rs1, static_cast(rs1_value >> imm)); return advance_to_next_insn<2>(a, pc); @@ -4777,8 +4943,8 @@ static FORCE_INLINE execute_status execute_C_SRAI(STATE_ACCESS &a, uint64_t &pc, /// \brief Implementation of the C.ANDI instruction. 
template -static FORCE_INLINE execute_status execute_C_ANDI(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { - dump_insn(a, pc, insn, "c.andi"); +static FORCE_INLINE execute_status execute_C_ANDI(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { + dump_insn(a, pc, static_cast(insn), "c.andi"); const uint32_t rs1 = insn_get_CL_CS_CA_CB_rs1(insn); const int32_t imm = insn_get_CI_CB_imm_se(insn); const uint64_t rs1_value = a.read_x(rs1); @@ -4787,7 +4953,7 @@ static FORCE_INLINE execute_status execute_C_ANDI(STATE_ACCESS &a, uint64_t &pc, } template -static FORCE_INLINE execute_status execute_C_arithmetic(STATE_ACCESS &a, uint64_t &pc, uint32_t insn, const F &f) { +static FORCE_INLINE execute_status execute_C_arithmetic(STATE_ACCESS a, uint64_t &pc, uint32_t insn, const F &f) { // Ensure rs1 and rs2 are loaded in order: do not nest with call to f() as // the order of evaluation of arguments in a function call is undefined. const uint32_t rs1 = insn_get_CL_CS_CA_CB_rs1(insn); @@ -4800,8 +4966,8 @@ static FORCE_INLINE execute_status execute_C_arithmetic(STATE_ACCESS &a, uint64_ /// \brief Implementation of the C.SUB instruction. template -static FORCE_INLINE execute_status execute_C_SUB(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { - dump_insn(a, pc, insn, "c.sub"); +static FORCE_INLINE execute_status execute_C_SUB(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { + dump_insn(a, pc, static_cast(insn), "c.sub"); return execute_C_arithmetic(a, pc, insn, [](uint64_t rs1_value, uint64_t rs2_value) -> uint64_t { uint64_t val = 0; __builtin_sub_overflow(rs1_value, rs2_value, &val); @@ -4811,32 +4977,32 @@ static FORCE_INLINE execute_status execute_C_SUB(STATE_ACCESS &a, uint64_t &pc, /// \brief Implementation of the C.XOR instruction. 
template -static FORCE_INLINE execute_status execute_C_XOR(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { - dump_insn(a, pc, insn, "c.xor"); +static FORCE_INLINE execute_status execute_C_XOR(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { + dump_insn(a, pc, static_cast(insn), "c.xor"); return execute_C_arithmetic(a, pc, insn, [](uint64_t rs1_value, uint64_t rs2_value) -> uint64_t { return rs1_value ^ rs2_value; }); } /// \brief Implementation of the C.OR instruction. template -static FORCE_INLINE execute_status execute_C_OR(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { - dump_insn(a, pc, insn, "c.or"); +static FORCE_INLINE execute_status execute_C_OR(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { + dump_insn(a, pc, static_cast(insn), "c.or"); return execute_C_arithmetic(a, pc, insn, [](uint64_t rs1_value, uint64_t rs2_value) -> uint64_t { return rs1_value | rs2_value; }); } /// \brief Implementation of the C.AND instruction. template -static FORCE_INLINE execute_status execute_C_AND(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { - dump_insn(a, pc, insn, "c.and"); +static FORCE_INLINE execute_status execute_C_AND(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { + dump_insn(a, pc, static_cast(insn), "c.and"); return execute_C_arithmetic(a, pc, insn, [](uint64_t rs1_value, uint64_t rs2_value) -> uint64_t { return rs1_value & rs2_value; }); } /// \brief Implementation of the C.SUBW instruction. 
template -static FORCE_INLINE execute_status execute_C_SUBW(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { - dump_insn(a, pc, insn, "c.subw"); +static FORCE_INLINE execute_status execute_C_SUBW(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { + dump_insn(a, pc, static_cast(insn), "c.subw"); return execute_C_arithmetic(a, pc, insn, [](uint64_t rs1_value, uint64_t rs2_value) -> uint64_t { // Convert 64-bit to 32-bit auto rs1w = static_cast(rs1_value); @@ -4849,8 +5015,8 @@ static FORCE_INLINE execute_status execute_C_SUBW(STATE_ACCESS &a, uint64_t &pc, /// \brief Implementation of the C.ADDW instruction. template -static FORCE_INLINE execute_status execute_C_ADDW(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { - dump_insn(a, pc, insn, "c.addw"); +static FORCE_INLINE execute_status execute_C_ADDW(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { + dump_insn(a, pc, static_cast(insn), "c.addw"); return execute_C_arithmetic(a, pc, insn, [](uint64_t rs1_value, uint64_t rs2_value) -> uint64_t { // Discard upper 32 bits auto rs1w = static_cast(rs1_value); @@ -4861,54 +5027,18 @@ static FORCE_INLINE execute_status execute_C_ADDW(STATE_ACCESS &a, uint64_t &pc, }); } -template -static FORCE_INLINE execute_status execute_CB_funct2(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { - auto cb_funct2 = static_cast(insn_get_CB_funct2(insn)); - switch (cb_funct2) { - case insn_CB_funct2::C_SRLI: - return execute_C_SRLI(a, pc, insn); - case insn_CB_funct2::C_SRAI: - return execute_C_SRAI(a, pc, insn); - case insn_CB_funct2::C_ANDI: - return execute_C_ANDI(a, pc, insn); - } - return raise_illegal_insn_exception(a, pc, insn); -} - -template -static FORCE_INLINE execute_status execute_C_Q1_SET2(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { - auto ca_funct6_funct2 = static_cast(insn_get_CA_funct6_funct2(insn)); - switch (ca_funct6_funct2) { - case insn_CA_funct6_funct2::C_SUB: - return execute_C_SUB(a, pc, insn); - case insn_CA_funct6_funct2::C_XOR: - return execute_C_XOR(a, pc, insn); - case 
insn_CA_funct6_funct2::C_OR: - return execute_C_OR(a, pc, insn); - case insn_CA_funct6_funct2::C_AND: - return execute_C_AND(a, pc, insn); - case insn_CA_funct6_funct2::C_SUBW: - return execute_C_SUBW(a, pc, insn); - case insn_CA_funct6_funct2::C_ADDW: - return execute_C_ADDW(a, pc, insn); - default: - return execute_CB_funct2(a, pc, insn); - } - return raise_illegal_insn_exception(a, pc, insn); -} - /// \brief Implementation of the C_J instruction. template -static FORCE_INLINE execute_status execute_C_J(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { - dump_insn(a, pc, insn, "c.j"); +static FORCE_INLINE execute_status execute_C_J(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { + dump_insn(a, pc, static_cast(insn), "c.j"); const uint64_t new_pc = pc + static_cast(insn_get_C_J_imm(insn)); return execute_jump(a, pc, new_pc); } /// \brief Implementation of the C.BEQZ instruction. template -static FORCE_INLINE execute_status execute_C_BEQZ(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { - dump_insn(a, pc, insn, "c.beqz"); +static FORCE_INLINE execute_status execute_C_BEQZ(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { + dump_insn(a, pc, static_cast(insn), "c.beqz"); const uint32_t rs1 = insn_get_CL_CS_CA_CB_rs1(insn); if (a.read_x(rs1) == 0) { const int32_t imm = insn_get_C_BEQZ_BNEZ_imm(insn); @@ -4920,8 +5050,8 @@ static FORCE_INLINE execute_status execute_C_BEQZ(STATE_ACCESS &a, uint64_t &pc, /// \brief Implementation of the C.BNEZ instruction. 
template -static FORCE_INLINE execute_status execute_C_BNEZ(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { - dump_insn(a, pc, insn, "c.bnez"); +static FORCE_INLINE execute_status execute_C_BNEZ(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { + dump_insn(a, pc, static_cast(insn), "c.bnez"); const uint32_t rs1 = insn_get_CL_CS_CA_CB_rs1(insn); if (a.read_x(rs1) != 0) { const int32_t imm = insn_get_C_BEQZ_BNEZ_imm(insn); @@ -4933,18 +5063,12 @@ static FORCE_INLINE execute_status execute_C_BNEZ(STATE_ACCESS &a, uint64_t &pc, /// \brief Implementation of the C.SLLI instruction. template -static FORCE_INLINE execute_status execute_C_SLLI(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { - dump_insn(a, pc, insn, "c.slli"); +static FORCE_INLINE execute_status execute_C_SLLI(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { + dump_insn(a, pc, static_cast(insn), "c.slli"); + // rd cannot be zero (guaranteed by jump table) const uint32_t rd = insn_get_rd(insn); - // C.SLLI with rd == 0 is just a HINT that must execute as no-op (see RISC-V spec) - if (unlikely(rd == 0)) { - return advance_to_next_insn<2>(a, pc); - } + // imm cannot be zero (guaranteed by jump table) const uint32_t imm = insn_get_CI_CB_imm(insn); - // C.SLLI with imm == 0 is just a HINT that must execute as no-op (see RISC-V spec) - if (unlikely(imm == 0)) { - return advance_to_next_insn<2>(a, pc); - } const uint64_t rs1_value = a.read_x(rd); a.write_x(rd, rs1_value << imm); return advance_to_next_insn<2>(a, pc); @@ -4952,8 +5076,13 @@ static FORCE_INLINE execute_status execute_C_SLLI(STATE_ACCESS &a, uint64_t &pc, /// \brief Implementation of the C.FLDSP instruction. 
template -static FORCE_INLINE execute_status execute_C_FLDSP(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { - dump_insn(a, pc, insn, "c.fldsp"); +static FORCE_INLINE execute_status execute_C_FLDSP(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { + dump_insn(a, pc, static_cast(insn), "c.fldsp"); + // If FS is OFF, attempts to read or write the float state will cause an illegal instruction + // exception. + if (unlikely((a.read_mstatus() & MSTATUS_FS_MASK) == MSTATUS_FS_OFF)) { + return raise_illegal_insn_exception(a, pc, insn); + } const uint32_t rd = insn_get_rd(insn); const int32_t imm = insn_get_C_FLDSP_LDSP_imm(insn); return execute_C_FL(a, pc, mcycle, rd, 0x2, imm); @@ -4961,48 +5090,41 @@ static FORCE_INLINE execute_status execute_C_FLDSP(STATE_ACCESS &a, uint64_t &pc /// \brief Implementation of the C.LWSP instruction. template -static FORCE_INLINE execute_status execute_C_LWSP(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { - dump_insn(a, pc, insn, "c.lwsp"); +static FORCE_INLINE execute_status execute_C_LWSP(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { + dump_insn(a, pc, static_cast(insn), "c.lwsp"); + // rd cannot be zero (guaranteed by jump table) const uint32_t rd = insn_get_rd(insn); - if (unlikely(rd == 0)) { - return raise_illegal_insn_exception(a, pc, insn); - } const int32_t imm = insn_get_C_LWSP_imm(insn); return execute_C_L(a, pc, mcycle, rd, 0x2, imm); } /// \brief Implementation of the C.LDSP instruction. 
template -static FORCE_INLINE execute_status execute_C_LDSP(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { - dump_insn(a, pc, insn, "c.ldsp"); +static FORCE_INLINE execute_status execute_C_LDSP(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { + dump_insn(a, pc, static_cast(insn), "c.ldsp"); + // rd cannot be zero (guaranteed by jump table) const uint32_t rd = insn_get_rd(insn); - if (unlikely(rd == 0)) { - return raise_illegal_insn_exception(a, pc, insn); - } const int32_t imm = insn_get_C_FLDSP_LDSP_imm(insn); return execute_C_L(a, pc, mcycle, rd, 0x2, imm); } /// \brief Implementation of the C.JR instruction. template -static FORCE_INLINE execute_status execute_C_JR(STATE_ACCESS &a, uint64_t &pc, uint32_t insn, uint32_t rs1) { - dump_insn(a, pc, insn, "c.jr"); - if (unlikely(rs1 == 0)) { - return raise_illegal_insn_exception(a, pc, insn); - } +static FORCE_INLINE execute_status execute_C_JR(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { + dump_insn(a, pc, static_cast(insn), "c.jr"); + // rs1 cannot be zero (guaranteed by the jump table) + const uint32_t rs1 = insn_get_rd(insn); const uint64_t new_pc = a.read_x(rs1) & ~static_cast(1); return execute_jump(a, pc, new_pc); } /// \brief Implementation of the C.MV instruction. 
template -static FORCE_INLINE execute_status execute_C_MV(STATE_ACCESS &a, uint64_t &pc, uint32_t insn, uint32_t rd, - uint32_t rs2) { - dump_insn(a, pc, insn, "c.mv"); - // C.SLLI with rd == 0 is just a HINT that must execute as no-op (see RISC-V spec) - if (unlikely(rd == 0)) { - return advance_to_next_insn<2>(a, pc); - } +static FORCE_INLINE execute_status execute_C_MV(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { + dump_insn(a, pc, static_cast(insn), "c.mv"); + // rd cannot be zero (guaranteed by the jump table) + const uint32_t rd = insn_get_rd(insn); + const uint32_t rs2 = insn_get_CR_CSS_rs2(insn); const uint64_t val = a.read_x(rs2); a.write_x(rd, val); return advance_to_next_insn<2>(a, pc); @@ -5010,16 +5132,17 @@ static FORCE_INLINE execute_status execute_C_MV(STATE_ACCESS &a, uint64_t &pc, u /// \brief Implementation of the C.EBREAK instruction. template -static FORCE_INLINE execute_status execute_C_EBREAK(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { - dump_insn(a, pc, insn, "c.ebreak"); +static FORCE_INLINE execute_status execute_C_EBREAK(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { + dump_insn(a, pc, static_cast(insn), "c.ebreak"); pc = raise_exception(a, pc, MCAUSE_BREAKPOINT, pc); return advance_to_raised_exception(a, pc); } /// \brief Implementation of the C.JALR instruction. template -static FORCE_INLINE execute_status execute_C_JALR(STATE_ACCESS &a, uint64_t &pc, uint32_t insn, uint32_t rs1) { - dump_insn(a, pc, insn, "c.jalr"); +static FORCE_INLINE execute_status execute_C_JALR(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { + dump_insn(a, pc, static_cast(insn), "c.jalr"); + const uint32_t rs1 = insn_get_rd(insn); const uint64_t new_pc = a.read_x(rs1) & ~static_cast(1); const uint64_t val = pc + 2; a.write_x(0x1, val); @@ -5028,13 +5151,11 @@ static FORCE_INLINE execute_status execute_C_JALR(STATE_ACCESS &a, uint64_t &pc, /// \brief Implementation of the C.ADD instruction. 
template -static FORCE_INLINE execute_status execute_C_ADD(STATE_ACCESS &a, uint64_t &pc, uint32_t insn, uint32_t rd, - uint32_t rs2) { - dump_insn(a, pc, insn, "c.add"); - // C.ADD with rd == 0 is just a HINT that must execute as no-op (see RISC-V spec) - if (unlikely(rd == 0)) { - return advance_to_next_insn<2>(a, pc); - } +static FORCE_INLINE execute_status execute_C_ADD(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { + dump_insn(a, pc, static_cast(insn), "c.add"); + // rd cannot be zero (guaranteed by the jump table) + const uint32_t rd = insn_get_rd(insn); + const uint32_t rs2 = insn_get_CR_CSS_rs2(insn); const uint64_t rd_value = a.read_x(rd); const uint64_t rs2_value = a.read_x(rs2); uint64_t val = 0; @@ -5043,29 +5164,15 @@ static FORCE_INLINE execute_status execute_C_ADD(STATE_ACCESS &a, uint64_t &pc, return advance_to_next_insn<2>(a, pc); } -template -static FORCE_INLINE execute_status execute_C_Q2_SET0(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { - const uint32_t rs1 = insn_get_rd(insn); - const uint32_t rs2 = insn_get_CR_CSS_rs2(insn); - if (insn & 0b0001000000000000) { - if (rs2 == 0) { - if (rs1 == 0) { - return execute_C_EBREAK(a, pc, insn); - } - return execute_C_JALR(a, pc, insn, rs1); - } - return execute_C_ADD(a, pc, insn, rs1, rs2); - } - if (rs2 == 0) { - return execute_C_JR(a, pc, insn, rs1); - } - return execute_C_MV(a, pc, insn, rs1, rs2); -} - /// \brief Implementation of the C.FSDSP instruction. template -static FORCE_INLINE execute_status execute_C_FSDSP(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { - dump_insn(a, pc, insn, "c.fsdsp"); +static FORCE_INLINE execute_status execute_C_FSDSP(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { + dump_insn(a, pc, static_cast(insn), "c.fsdsp"); + // If FS is OFF, attempts to read or write the float state will cause an illegal instruction + // exception. 
+ if (unlikely((a.read_mstatus() & MSTATUS_FS_MASK) == MSTATUS_FS_OFF)) { + return raise_illegal_insn_exception(a, pc, insn); + } const uint32_t rs2 = insn_get_CR_CSS_rs2(insn); const int32_t imm = insn_get_C_FSDSP_SDSP_imm(insn); return execute_C_FS(a, pc, mcycle, rs2, 0x2, imm); @@ -5073,8 +5180,8 @@ static FORCE_INLINE execute_status execute_C_FSDSP(STATE_ACCESS &a, uint64_t &pc /// \brief Implementation of the C.SWSP instruction. template -static FORCE_INLINE execute_status execute_C_SWSP(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { - dump_insn(a, pc, insn, "c.swsp"); +static FORCE_INLINE execute_status execute_C_SWSP(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { + dump_insn(a, pc, static_cast(insn), "c.swsp"); const uint32_t rs2 = insn_get_CR_CSS_rs2(insn); const int32_t imm = insn_get_C_SWSP_imm(insn); return execute_C_S(a, pc, mcycle, rs2, 0x2, imm); @@ -5082,297 +5189,13 @@ static FORCE_INLINE execute_status execute_C_SWSP(STATE_ACCESS &a, uint64_t &pc, /// \brief Implementation of the C.SDSP instruction. template -static FORCE_INLINE execute_status execute_C_SDSP(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { - dump_insn(a, pc, insn, "c.sdsp"); +static FORCE_INLINE execute_status execute_C_SDSP(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { + dump_insn(a, pc, static_cast(insn), "c.sdsp"); const uint32_t rs2 = insn_get_CR_CSS_rs2(insn); const int32_t imm = insn_get_C_FSDSP_SDSP_imm(insn); return execute_C_S(a, pc, mcycle, rs2, 0x2, imm); } -/// \brief Decodes and executes an instruction. -/// \tparam STATE_ACCESS Class of machine state accessor object. -/// \param a Machine state accessor object. -/// \param pc Current pc. -/// \param insn Instruction. -/// \return execute_status::failure if an exception was raised, or -/// execute_status::success otherwise. -/// \details The execute_insn function decodes the instruction in multiple levels. 
When we know for sure that -/// the instruction could only be a <FOO>, a function with the name execute_<FOO> will be called. -/// See [RV32/64G Instruction Set -/// Listings](https://content.riscv.org/wp-content/uploads/2017/05/riscv-spec-v2.2.pdf#chapter.19) and [Instruction -/// listings for RISC-V](https://content.riscv.org/wp-content/uploads/2017/05/riscv-spec-v2.2.pdf#table.19.2). -template -static FORCE_INLINE execute_status execute_insn(STATE_ACCESS &a, uint64_t &pc, uint64_t &mcycle, uint32_t insn) { - // Is compressed instruction - if ((insn & 3) != 3) { - // The fetch may read 4 bytes as an optimization, - // but the compressed instruction uses only the 2 less significant bytes - insn = static_cast(insn); - auto c_funct3 = static_cast(insn_get_c_funct3(insn)); - switch (c_funct3) { - case insn_c_funct3::C_ADDI4SPN: - return execute_C_ADDI4SPN(a, pc, insn); - case insn_c_funct3::C_LW: - return execute_C_LW(a, pc, mcycle, insn); - case insn_c_funct3::C_LD: - return execute_C_LD(a, pc, mcycle, insn); - case insn_c_funct3::C_SW: - return execute_C_SW(a, pc, mcycle, insn); - case insn_c_funct3::C_SD: - return execute_C_SD(a, pc, mcycle, insn); - case insn_c_funct3::C_Q1_SET0: - return execute_C_Q1_SET0(a, pc, insn); - case insn_c_funct3::C_ADDIW: - return execute_C_ADDIW(a, pc, insn); - case insn_c_funct3::C_LI: - return execute_C_LI(a, pc, insn); - case insn_c_funct3::C_Q1_SET1: - return execute_C_Q1_SET1(a, pc, insn); - case insn_c_funct3::C_Q1_SET2: - return execute_C_Q1_SET2(a, pc, insn); - case insn_c_funct3::C_J: - return execute_C_J(a, pc, insn); - case insn_c_funct3::C_BEQZ: - return execute_C_BEQZ(a, pc, insn); - case insn_c_funct3::C_BNEZ: - return execute_C_BNEZ(a, pc, insn); - case insn_c_funct3::C_SLLI: - return execute_C_SLLI(a, pc, insn); - case insn_c_funct3::C_LWSP: - return execute_C_LWSP(a, pc, mcycle, insn); - case insn_c_funct3::C_LDSP: - return execute_C_LDSP(a, pc, mcycle, insn); - case insn_c_funct3::C_Q2_SET0: - return 
execute_C_Q2_SET0(a, pc, insn); - case insn_c_funct3::C_SWSP: - return execute_C_SWSP(a, pc, mcycle, insn); - case insn_c_funct3::C_SDSP: - return execute_C_SDSP(a, pc, mcycle, insn); - default: { - // Here we are sure that the next instruction, at best, can only be a floating point instruction, - // or, at worst, an illegal instruction. - // Since all float instructions try to read the float state, - // we can put the next check before all of them. - // If FS is OFF, attempts to read or write the float state will cause an illegal instruction - // exception. - if (unlikely((a.read_mstatus() & MSTATUS_FS_MASK) == MSTATUS_FS_OFF)) { - return raise_illegal_insn_exception(a, pc, insn); - } - switch (c_funct3) { - case insn_c_funct3::C_FLD: - return execute_C_FLD(a, pc, mcycle, insn); - case insn_c_funct3::C_FSD: - return execute_C_FSD(a, pc, mcycle, insn); - case insn_c_funct3::C_FLDSP: - return execute_C_FLDSP(a, pc, mcycle, insn); - case insn_c_funct3::C_FSDSP: - return execute_C_FSDSP(a, pc, mcycle, insn); - default: - return raise_illegal_insn_exception(a, pc, insn); - } - } - } - } else { - //??D We should probably try doing the first branch on the combined opcode, funct3, and funct7. - // Maybe it reduces the number of levels needed to decode most instructions. 
- auto funct3_00000_opcode = static_cast(insn_get_funct3_00000_opcode(insn)); - switch (funct3_00000_opcode) { - case insn_funct3_00000_opcode::LB: - return execute_LB(a, pc, mcycle, insn); - case insn_funct3_00000_opcode::LH: - return execute_LH(a, pc, mcycle, insn); - case insn_funct3_00000_opcode::LW: - return execute_LW(a, pc, mcycle, insn); - case insn_funct3_00000_opcode::LD: - return execute_LD(a, pc, mcycle, insn); - case insn_funct3_00000_opcode::LBU: - return execute_LBU(a, pc, mcycle, insn); - case insn_funct3_00000_opcode::LHU: - return execute_LHU(a, pc, mcycle, insn); - case insn_funct3_00000_opcode::LWU: - return execute_LWU(a, pc, mcycle, insn); - case insn_funct3_00000_opcode::SB: - return execute_SB(a, pc, mcycle, insn); - case insn_funct3_00000_opcode::SH: - return execute_SH(a, pc, mcycle, insn); - case insn_funct3_00000_opcode::SW: - return execute_SW(a, pc, mcycle, insn); - case insn_funct3_00000_opcode::SD: - return execute_SD(a, pc, mcycle, insn); - case insn_funct3_00000_opcode::FENCE: - return execute_FENCE(a, pc, insn); - case insn_funct3_00000_opcode::FENCE_I: - return execute_FENCE_I(a, pc, insn); - case insn_funct3_00000_opcode::ADDI: - return execute_ADDI(a, pc, insn); - case insn_funct3_00000_opcode::SLLI: - return execute_SLLI(a, pc, insn); - case insn_funct3_00000_opcode::SLTI: - return execute_SLTI(a, pc, insn); - case insn_funct3_00000_opcode::SLTIU: - return execute_SLTIU(a, pc, insn); - case insn_funct3_00000_opcode::XORI: - return execute_XORI(a, pc, insn); - case insn_funct3_00000_opcode::ORI: - return execute_ORI(a, pc, insn); - case insn_funct3_00000_opcode::ANDI: - return execute_ANDI(a, pc, insn); - case insn_funct3_00000_opcode::ADDIW: - return execute_ADDIW(a, pc, insn); - case insn_funct3_00000_opcode::SLLIW: - return execute_SLLIW(a, pc, insn); - case insn_funct3_00000_opcode::SLLW: - return execute_SLLW(a, pc, insn); - case insn_funct3_00000_opcode::DIVW: - return execute_DIVW(a, pc, insn); - case 
insn_funct3_00000_opcode::REMW: - return execute_REMW(a, pc, insn); - case insn_funct3_00000_opcode::REMUW: - return execute_REMUW(a, pc, insn); - case insn_funct3_00000_opcode::BEQ: - return execute_BEQ(a, pc, insn); - case insn_funct3_00000_opcode::BNE: - return execute_BNE(a, pc, insn); - case insn_funct3_00000_opcode::BLT: - return execute_BLT(a, pc, insn); - case insn_funct3_00000_opcode::BGE: - return execute_BGE(a, pc, insn); - case insn_funct3_00000_opcode::BLTU: - return execute_BLTU(a, pc, insn); - case insn_funct3_00000_opcode::BGEU: - return execute_BGEU(a, pc, insn); - case insn_funct3_00000_opcode::JALR: - return execute_JALR(a, pc, insn); - case insn_funct3_00000_opcode::CSRRW: - return execute_CSRRW(a, pc, mcycle, insn); - case insn_funct3_00000_opcode::CSRRS: - return execute_CSRRS(a, pc, mcycle, insn); - case insn_funct3_00000_opcode::CSRRC: - return execute_CSRRC(a, pc, mcycle, insn); - case insn_funct3_00000_opcode::CSRRWI: - return execute_CSRRWI(a, pc, mcycle, insn); - case insn_funct3_00000_opcode::CSRRSI: - return execute_CSRRSI(a, pc, mcycle, insn); - case insn_funct3_00000_opcode::CSRRCI: - return execute_CSRRCI(a, pc, mcycle, insn); - case insn_funct3_00000_opcode::AUIPC_000: - case insn_funct3_00000_opcode::AUIPC_001: - case insn_funct3_00000_opcode::AUIPC_010: - case insn_funct3_00000_opcode::AUIPC_011: - case insn_funct3_00000_opcode::AUIPC_100: - case insn_funct3_00000_opcode::AUIPC_101: - case insn_funct3_00000_opcode::AUIPC_110: - case insn_funct3_00000_opcode::AUIPC_111: - return execute_AUIPC(a, pc, insn); - case insn_funct3_00000_opcode::LUI_000: - case insn_funct3_00000_opcode::LUI_001: - case insn_funct3_00000_opcode::LUI_010: - case insn_funct3_00000_opcode::LUI_011: - case insn_funct3_00000_opcode::LUI_100: - case insn_funct3_00000_opcode::LUI_101: - case insn_funct3_00000_opcode::LUI_110: - case insn_funct3_00000_opcode::LUI_111: - return execute_LUI(a, pc, insn); - case insn_funct3_00000_opcode::JAL_000: - case 
insn_funct3_00000_opcode::JAL_001: - case insn_funct3_00000_opcode::JAL_010: - case insn_funct3_00000_opcode::JAL_011: - case insn_funct3_00000_opcode::JAL_100: - case insn_funct3_00000_opcode::JAL_101: - case insn_funct3_00000_opcode::JAL_110: - case insn_funct3_00000_opcode::JAL_111: - return execute_JAL(a, pc, insn); - case insn_funct3_00000_opcode::SRLI_SRAI: - return execute_SRLI_SRAI(a, pc, insn); - case insn_funct3_00000_opcode::SRLIW_SRAIW: - return execute_SRLIW_SRAIW(a, pc, insn); - case insn_funct3_00000_opcode::AMO_W: - return execute_AMO_W(a, pc, mcycle, insn); - case insn_funct3_00000_opcode::AMO_D: - return execute_AMO_D(a, pc, mcycle, insn); - case insn_funct3_00000_opcode::ADD_MUL_SUB: - return execute_ADD_MUL_SUB(a, pc, insn); - case insn_funct3_00000_opcode::SLL_MULH: - return execute_SLL_MULH(a, pc, insn); - case insn_funct3_00000_opcode::SLT_MULHSU: - return execute_SLT_MULHSU(a, pc, insn); - case insn_funct3_00000_opcode::SLTU_MULHU: - return execute_SLTU_MULHU(a, pc, insn); - case insn_funct3_00000_opcode::XOR_DIV: - return execute_XOR_DIV(a, pc, insn); - case insn_funct3_00000_opcode::SRL_DIVU_SRA: - return execute_SRL_DIVU_SRA(a, pc, insn); - case insn_funct3_00000_opcode::OR_REM: - return execute_OR_REM(a, pc, insn); - case insn_funct3_00000_opcode::AND_REMU: - return execute_AND_REMU(a, pc, insn); - case insn_funct3_00000_opcode::ADDW_MULW_SUBW: - return execute_ADDW_MULW_SUBW(a, pc, insn); - case insn_funct3_00000_opcode::SRLW_DIVUW_SRAW: - return execute_SRLW_DIVUW_SRAW(a, pc, insn); - case insn_funct3_00000_opcode::PRIVILEGED: - return execute_privileged(a, pc, mcycle, insn); - default: { - // Here we are sure that the next instruction, at best, can only be a floating point instruction, - // or, at worst, an illegal instruction. - // Since all float instructions try to read the float state, - // we can put the next check before all of them. 
- // If FS is OFF, attempts to read or write the float state will cause an illegal instruction exception. - if (unlikely((a.read_mstatus() & MSTATUS_FS_MASK) == MSTATUS_FS_OFF)) { - return raise_illegal_insn_exception(a, pc, insn); - } - switch (funct3_00000_opcode) { - case insn_funct3_00000_opcode::FSW: - return execute_FSW(a, pc, mcycle, insn); - case insn_funct3_00000_opcode::FSD: - return execute_FSD(a, pc, mcycle, insn); - case insn_funct3_00000_opcode::FLW: - return execute_FLW(a, pc, mcycle, insn); - case insn_funct3_00000_opcode::FLD: - return execute_FLD(a, pc, mcycle, insn); - case insn_funct3_00000_opcode::FMADD_RNE: - case insn_funct3_00000_opcode::FMADD_RTZ: - case insn_funct3_00000_opcode::FMADD_RDN: - case insn_funct3_00000_opcode::FMADD_RUP: - case insn_funct3_00000_opcode::FMADD_RMM: - case insn_funct3_00000_opcode::FMADD_DYN: - return execute_FMADD(a, pc, insn); - case insn_funct3_00000_opcode::FMSUB_RNE: - case insn_funct3_00000_opcode::FMSUB_RTZ: - case insn_funct3_00000_opcode::FMSUB_RDN: - case insn_funct3_00000_opcode::FMSUB_RUP: - case insn_funct3_00000_opcode::FMSUB_RMM: - case insn_funct3_00000_opcode::FMSUB_DYN: - return execute_FMSUB(a, pc, insn); - case insn_funct3_00000_opcode::FNMSUB_RNE: - case insn_funct3_00000_opcode::FNMSUB_RTZ: - case insn_funct3_00000_opcode::FNMSUB_RDN: - case insn_funct3_00000_opcode::FNMSUB_RUP: - case insn_funct3_00000_opcode::FNMSUB_RMM: - case insn_funct3_00000_opcode::FNMSUB_DYN: - return execute_FNMSUB(a, pc, insn); - case insn_funct3_00000_opcode::FNMADD_RNE: - case insn_funct3_00000_opcode::FNMADD_RTZ: - case insn_funct3_00000_opcode::FNMADD_RDN: - case insn_funct3_00000_opcode::FNMADD_RUP: - case insn_funct3_00000_opcode::FNMADD_RMM: - case insn_funct3_00000_opcode::FNMADD_DYN: - return execute_FNMADD(a, pc, insn); - case insn_funct3_00000_opcode::FD_000: - case insn_funct3_00000_opcode::FD_001: - case insn_funct3_00000_opcode::FD_010: - case insn_funct3_00000_opcode::FD_011: - case 
insn_funct3_00000_opcode::FD_100: - case insn_funct3_00000_opcode::FD_111: - return execute_FD(a, pc, insn); - default: - return raise_illegal_insn_exception(a, pc, insn); - } - } - } - } -} - /// \brief Instruction fetch status code enum class fetch_status : int { exception, ///< Instruction fetch failed: exception raised @@ -5388,7 +5211,7 @@ enum class fetch_status : int { /// \return Returns fetch_status::success if load succeeded, fetch_status::exception if it caused an exception. // In that case, raise the exception. template -static FORCE_INLINE fetch_status fetch_translate_pc_slow(STATE_ACCESS &a, uint64_t &pc, uint64_t vaddr, +static FORCE_INLINE fetch_status fetch_translate_pc_slow(STATE_ACCESS a, uint64_t &pc, uint64_t vaddr, unsigned char **phptr) { uint64_t paddr{}; // Walk page table and obtain the physical address @@ -5419,7 +5242,7 @@ static FORCE_INLINE fetch_status fetch_translate_pc_slow(STATE_ACCESS &a, uint64 /// \return Returns fetch_status::success if load succeeded, fetch_status::exception if it caused an exception. // In that case, raise the exception. template -static FORCE_INLINE fetch_status fetch_translate_pc(STATE_ACCESS &a, uint64_t &pc, uint64_t vaddr, +static FORCE_INLINE fetch_status fetch_translate_pc(STATE_ACCESS a, uint64_t &pc, uint64_t vaddr, unsigned char **phptr) { // Try to perform the address translation via TLB first if (unlikely(!(a.template translate_vaddr_via_tlb(vaddr, phptr)))) { @@ -5441,43 +5264,55 @@ static FORCE_INLINE fetch_status fetch_translate_pc(STATE_ACCESS &a, uint64_t &p /// \return Returns fetch_status::success if load succeeded, fetch_status::exception if it caused an exception. // In that case, raise the exception. 
template -static FORCE_INLINE fetch_status fetch_insn(STATE_ACCESS &a, uint64_t &pc, uint32_t &insn, uint64_t &fetch_vaddr_page, +static FORCE_INLINE fetch_status fetch_insn(STATE_ACCESS a, uint64_t &pc, uint32_t &insn, uint64_t &fetch_vaddr_page, uint64_t &fetch_vh_offset) { + // Efficiently checks if current pc is in the same page as last pc fetch + // and it's not crossing a page boundary. + if (likely((pc ^ fetch_vaddr_page) < (PMA_PAGE_SIZE - 2))) { + // Fetch pc is in the same page as the last pc fetch and it's not crossing a page boundary, + // we can just reuse last fetch translation, skipping TLB or slow address translation altogether. + const unsigned char *hptr = cast_addr_to_ptr(pc + fetch_vh_offset); + + // Here we are sure that reading 4 bytes won't cross a page boundary. + // However pc may not be 4 byte aligned, at best it can only be 2-byte aligned, + // therefore we must perform a misaligned 4 byte read on a 2 byte aligned pointer. + // In case pc holds a compressed instruction, insn will store 2 additional bytes, + // but this is fine because later the instruction decoder will discard them. + insn = aliased_unaligned_read(hptr); + return fetch_status::success; + } + // Fetch pc is either not the same as last cache or crossing a page boundary. + + // Perform address translation unsigned char *hptr = nullptr; - const uint64_t vaddr_page = pc & ~PAGE_OFFSET_MASK; - // If pc is in the same page as the last pc fetch, - // we can just reuse last fetch translation, skipping TLB or slow address translation altogether. 
- if (likely(vaddr_page == fetch_vaddr_page)) { - hptr = cast_addr_to_ptr(pc + fetch_vh_offset); - } else { - // Not in the same page as last the fetch, we need to perform address translation - if (unlikely(fetch_translate_pc(a, pc, pc, &hptr) == fetch_status::exception)) { - return fetch_status::exception; - } - // Update fetch address translation cache - fetch_vaddr_page = vaddr_page; - fetch_vh_offset = cast_ptr_to_addr(hptr) - pc; + if (unlikely(fetch_translate_pc(a, pc, pc, &hptr) == fetch_status::exception)) { + return fetch_status::exception; } + // Update fetch address translation cache + fetch_vaddr_page = pc & ~PAGE_OFFSET_MASK; + fetch_vh_offset = cast_ptr_to_addr(hptr) - pc; + // The following code assumes pc is always 2-byte aligned, this is guaranteed by RISC-V spec. // If pc is pointing to the very last 2 bytes of a page, it's crossing a page boundary. if (unlikely(((~pc & PAGE_OFFSET_MASK) >> 1) == 0)) { // Here we are crossing page boundary, this is unlikely (1 in 2048 possible cases) insn = aliased_aligned_read(hptr); // If not a compressed instruction, we must read 2 additional bytes from the next page. - if (unlikely((insn & 3) == 3)) { + if (unlikely(insn_is_uncompressed(insn))) { // We have to perform a new address translation to read the next 2 bytes since we changed pages. const uint64_t vaddr = pc + 2; if (unlikely(fetch_translate_pc(a, pc, vaddr, &hptr) == fetch_status::exception)) { return fetch_status::exception; } // Update fetch translation cache - fetch_vaddr_page = vaddr & ~PAGE_OFFSET_MASK; + fetch_vaddr_page = vaddr; fetch_vh_offset = cast_ptr_to_addr(hptr) - vaddr; // Produce the final 4-byte instruction insn |= aliased_aligned_read(hptr) << 16; } return fetch_status::success; } + // Here we are sure that reading 4 bytes won't cross a page boundary. // However pc may not be 4 byte aligned, at best it can only be 2-byte aligned, // therefore we must perform a misaligned 4 byte read on a 2 byte aligned pointer. 
@@ -5489,7 +5324,7 @@ static FORCE_INLINE fetch_status fetch_insn(STATE_ACCESS &a, uint64_t &pc, uint3 /// \brief Checks that false brk is consistent with rest of state template -static void assert_no_brk([[maybe_unused]] STATE_ACCESS &a) { +static void assert_no_brk([[maybe_unused]] STATE_ACCESS a) { assert(get_pending_irq_mask(a) == 0); // LCOV_EXCL_LINE assert(a.read_iflags_X() == 0); // LCOV_EXCL_LINE assert(a.read_iflags_Y() == 0); // LCOV_EXCL_LINE @@ -5498,7 +5333,7 @@ static void assert_no_brk([[maybe_unused]] STATE_ACCESS &a) { /// \brief Interpreter hot loop template -static NO_INLINE execute_status interpret_loop(STATE_ACCESS &a, uint64_t mcycle_end, uint64_t mcycle) { +static NO_INLINE execute_status interpret_loop(STATE_ACCESS a, uint64_t mcycle_end, uint64_t mcycle) { // The interpret loop is constantly reading and modifying the pc and mcycle variables, // because of this care is taken to make them stack variables that are propagated across inline functions, // helping the C++ compiler optimize them into registers instead of stack variables when compiling, @@ -5512,7 +5347,7 @@ static NO_INLINE execute_status interpret_loop(STATE_ACCESS &a, uint64_t mcycle_ uint64_t pc = a.read_pc(); // Initialize fetch address translation cache invalidated - uint64_t fetch_vaddr_page = PAGE_OFFSET_MASK; + uint64_t fetch_vaddr_page = ~pc; uint64_t fetch_vh_offset = 0; // The outer loop continues until there is an interruption that should be handled @@ -5551,8 +5386,462 @@ static NO_INLINE execute_status interpret_loop(STATE_ACCESS &a, uint64_t mcycle_ // Try to fetch the next instruction if (likely(fetch_insn(a, pc, insn, fetch_vaddr_page, fetch_vh_offset) == fetch_status::success)) { - // Try to execute it - const execute_status status = execute_insn(a, pc, mcycle, insn); + // clang-format off + // NOLINTBEGIN + execute_status status; // explicit uninitialized as an optimization + + // This header define the instruction jump table table, which is very large. 
+ // It also defines the jump table related macros used in the next big switch. + #include "interpret-jump-table.h" + + // This will use computed goto on supported compilers, + // otherwise normal switch in unsupported platforms. + INSN_SWITCH(insn_get_id(insn)) { + // The instructions is this switch are ordered so + // infrequent instructions are placed at the end. + + // IM extensions + INSN_CASE(LUI_rdN): + status = execute_LUI(a, pc, insn); + INSN_BREAK(); + INSN_CASE(AUIPC_rdN): + status = execute_AUIPC(a, pc, insn); + INSN_BREAK(); + INSN_CASE(JAL_rd0): + status = execute_JAL(a, pc, insn); + INSN_BREAK(); + INSN_CASE(JAL_rdN): + status = execute_JAL(a, pc, insn); + INSN_BREAK(); + INSN_CASE(JALR_rd0): + status = execute_JALR(a, pc, insn); + INSN_BREAK(); + INSN_CASE(JALR_rdN): + status = execute_JALR(a, pc, insn); + INSN_BREAK(); + INSN_CASE(BEQ): + status = execute_BEQ(a, pc, insn); + INSN_BREAK(); + INSN_CASE(BNE): + status = execute_BNE(a, pc, insn); + INSN_BREAK(); + INSN_CASE(BLT): + status = execute_BLT(a, pc, insn); + INSN_BREAK(); + INSN_CASE(BGE): + status = execute_BGE(a, pc, insn); + INSN_BREAK(); + INSN_CASE(BLTU): + status = execute_BLTU(a, pc, insn); + INSN_BREAK(); + INSN_CASE(BGEU): + status = execute_BGEU(a, pc, insn); + INSN_BREAK(); + INSN_CASE(ADDI_rdN): + status = execute_ADDI(a, pc, insn); + INSN_BREAK(); + INSN_CASE(SLTI_rdN): + status = execute_SLTI(a, pc, insn); + INSN_BREAK(); + INSN_CASE(SLTIU_rdN): + status = execute_SLTIU(a, pc, insn); + INSN_BREAK(); + INSN_CASE(XORI_rdN): + status = execute_XORI(a, pc, insn); + INSN_BREAK(); + INSN_CASE(ORI_rdN): + status = execute_ORI(a, pc, insn); + INSN_BREAK(); + INSN_CASE(ANDI_rdN): + status = execute_ANDI(a, pc, insn); + INSN_BREAK(); + INSN_CASE(SLLI_rdN): + status = execute_SLLI(a, pc, insn); + INSN_BREAK(); + INSN_CASE(SRLI_SRAI_rdN): + status = execute_SRLI_SRAI(a, pc, insn); + INSN_BREAK(); + INSN_CASE(ADD_MUL_SUB_rdN): + status = execute_ADD_MUL_SUB(a, pc, insn); + INSN_BREAK(); + 
INSN_CASE(SLL_MULH_rdN): + status = execute_SLL_MULH(a, pc, insn); + INSN_BREAK(); + INSN_CASE(SLT_MULHSU_rdN): + status = execute_SLT_MULHSU(a, pc, insn); + INSN_BREAK(); + INSN_CASE(SLTU_MULHU_rdN): + status = execute_SLTU_MULHU(a, pc, insn); + INSN_BREAK(); + INSN_CASE(XOR_DIV_rdN): + status = execute_XOR_DIV(a, pc, insn); + INSN_BREAK(); + INSN_CASE(SRL_DIVU_SRA_rdN): + status = execute_SRL_DIVU_SRA(a, pc, insn); + INSN_BREAK(); + INSN_CASE(OR_REM_rdN): + status = execute_OR_REM(a, pc, insn); + INSN_BREAK(); + INSN_CASE(AND_REMU_rdN): + status = execute_AND_REMU(a, pc, insn); + INSN_BREAK(); + INSN_CASE(ADDIW_rdN): + status = execute_ADDIW(a, pc, insn); + INSN_BREAK(); + INSN_CASE(SLLIW_rdN): + status = execute_SLLIW(a, pc, insn); + INSN_BREAK(); + INSN_CASE(SRLIW_SRAIW_rdN): + status = execute_SRLIW_SRAIW(a, pc, insn); + INSN_BREAK(); + INSN_CASE(ADDW_MULW_SUBW_rdN): + status = execute_ADDW_MULW_SUBW(a, pc, insn); + INSN_BREAK(); + INSN_CASE(SLLW_rdN): + status = execute_SLLW(a, pc, insn); + INSN_BREAK(); + INSN_CASE(SRLW_DIVUW_SRAW_rdN): + status = execute_SRLW_DIVUW_SRAW(a, pc, insn); + INSN_BREAK(); + INSN_CASE(DIVW_rdN): + status = execute_DIVW(a, pc, insn); + INSN_BREAK(); + INSN_CASE(REMW_rdN): + status = execute_REMW(a, pc, insn); + INSN_BREAK(); + INSN_CASE(REMUW_rdN): + status = execute_REMUW(a, pc, insn); + INSN_BREAK(); + INSN_CASE(LD_rdN): + status = execute_LD(a, pc, mcycle, insn); + INSN_BREAK(); + INSN_CASE(LW_rdN): + status = execute_LW(a, pc, mcycle, insn); + INSN_BREAK(); + INSN_CASE(LWU_rdN): + status = execute_LWU(a, pc, mcycle, insn); + INSN_BREAK(); + INSN_CASE(LH_rdN): + status = execute_LH(a, pc, mcycle, insn); + INSN_BREAK(); + INSN_CASE(LHU_rdN): + status = execute_LHU(a, pc, mcycle, insn); + INSN_BREAK(); + INSN_CASE(LB_rdN): + status = execute_LB(a, pc, mcycle, insn); + INSN_BREAK(); + INSN_CASE(LBU_rdN): + status = execute_LBU(a, pc, mcycle, insn); + INSN_BREAK(); + INSN_CASE(SD): + status = execute_SD(a, pc, mcycle, insn); + 
INSN_BREAK(); + INSN_CASE(SW): + status = execute_SW(a, pc, mcycle, insn); + INSN_BREAK(); + INSN_CASE(SH): + status = execute_SH(a, pc, mcycle, insn); + INSN_BREAK(); + INSN_CASE(SB): + status = execute_SB(a, pc, mcycle, insn); + INSN_BREAK(); + // C extension + INSN_CASE(C_HINT): + INSN_CASE(C_NOP): + status = execute_C_NOP(a, pc, insn); + INSN_BREAK(); + INSN_CASE(C_LUI): + status = execute_C_LUI(a, pc, insn); + INSN_BREAK(); + INSN_CASE(C_LI): + status = execute_C_LI(a, pc, insn); + INSN_BREAK(); + INSN_CASE(C_J): + status = execute_C_J(a, pc, insn); + INSN_BREAK(); + INSN_CASE(C_JR): + status = execute_C_JR(a, pc, insn); + INSN_BREAK(); + INSN_CASE(C_JALR): + status = execute_C_JALR(a, pc, insn); + INSN_BREAK(); + INSN_CASE(C_MV): + status = execute_C_MV(a, pc, insn); + INSN_BREAK(); + INSN_CASE(C_BEQZ): + status = execute_C_BEQZ(a, pc, insn); + INSN_BREAK(); + INSN_CASE(C_BNEZ): + status = execute_C_BNEZ(a, pc, insn); + INSN_BREAK(); + INSN_CASE(C_ADDI): + status = execute_C_ADDI(a, pc, insn); + INSN_BREAK(); + INSN_CASE(C_ADDIW): + status = execute_C_ADDIW(a, pc, insn); + INSN_BREAK(); + INSN_CASE(C_ADDI4SPN): + status = execute_C_ADDI4SPN(a, pc, insn); + INSN_BREAK(); + INSN_CASE(C_ADDI16SP): + status = execute_C_ADDI16SP(a, pc, insn); + INSN_BREAK(); + INSN_CASE(C_ANDI): + status = execute_C_ANDI(a, pc, insn); + INSN_BREAK(); + INSN_CASE(C_SLLI): + status = execute_C_SLLI(a, pc, insn); + INSN_BREAK(); + INSN_CASE(C_SRAI): + status = execute_C_SRAI(a, pc, insn); + INSN_BREAK(); + INSN_CASE(C_SRLI): + status = execute_C_SRLI(a, pc, insn); + INSN_BREAK(); + INSN_CASE(C_ADD): + status = execute_C_ADD(a, pc, insn); + INSN_BREAK(); + INSN_CASE(C_SUB): + status = execute_C_SUB(a, pc, insn); + INSN_BREAK(); + INSN_CASE(C_XOR): + status = execute_C_XOR(a, pc, insn); + INSN_BREAK(); + INSN_CASE(C_OR): + status = execute_C_OR(a, pc, insn); + INSN_BREAK(); + INSN_CASE(C_AND): + status = execute_C_AND(a, pc, insn); + INSN_BREAK(); + INSN_CASE(C_ADDW): + status = 
execute_C_ADDW(a, pc, insn); + INSN_BREAK(); + INSN_CASE(C_SUBW): + status = execute_C_SUBW(a, pc, insn); + INSN_BREAK(); + INSN_CASE(C_LD): + status = execute_C_LD(a, pc, mcycle, insn); + INSN_BREAK(); + INSN_CASE(C_LW): + status = execute_C_LW(a, pc, mcycle, insn); + INSN_BREAK(); + INSN_CASE(C_LDSP): + status = execute_C_LDSP(a, pc, mcycle, insn); + INSN_BREAK(); + INSN_CASE(C_LWSP): + status = execute_C_LWSP(a, pc, mcycle, insn); + INSN_BREAK(); + INSN_CASE(C_SD): + status = execute_C_SD(a, pc, mcycle, insn); + INSN_BREAK(); + INSN_CASE(C_SW): + status = execute_C_SW(a, pc, mcycle, insn); + INSN_BREAK(); + INSN_CASE(C_SDSP): + status = execute_C_SDSP(a, pc, mcycle, insn); + INSN_BREAK(); + INSN_CASE(C_SWSP): + status = execute_C_SWSP(a, pc, mcycle, insn); + INSN_BREAK(); + INSN_CASE(C_FLD): + status = execute_C_FLD(a, pc, mcycle, insn); + INSN_BREAK(); + INSN_CASE(C_FLDSP): + status = execute_C_FLDSP(a, pc, mcycle, insn); + INSN_BREAK(); + INSN_CASE(C_FSD): + status = execute_C_FSD(a, pc, mcycle, insn); + INSN_BREAK(); + INSN_CASE(C_FSDSP): + status = execute_C_FSDSP(a, pc, mcycle, insn); + INSN_BREAK(); + INSN_CASE(C_EBREAK): + status = execute_C_EBREAK(a, pc, insn); + INSN_BREAK(); + // FD extensions + INSN_CASE(FD): + status = execute_FD(a, pc, insn); + INSN_BREAK(); + INSN_CASE(FLD): + status = execute_FLD(a, pc, mcycle, insn); + INSN_BREAK(); + INSN_CASE(FLW): + status = execute_FLW(a, pc, mcycle, insn); + INSN_BREAK(); + INSN_CASE(FSD): + status = execute_FSD(a, pc, mcycle, insn); + INSN_BREAK(); + INSN_CASE(FSW): + status = execute_FSW(a, pc, mcycle, insn); + INSN_BREAK(); + INSN_CASE(FMADD): + status = execute_FMADD(a, pc, insn); + INSN_BREAK(); + INSN_CASE(FMSUB): + status = execute_FMSUB(a, pc, insn); + INSN_BREAK(); + INSN_CASE(FNMADD): + status = execute_FNMADD(a, pc, insn); + INSN_BREAK(); + INSN_CASE(FNMSUB): + status = execute_FNMSUB(a, pc, insn); + INSN_BREAK(); + // A extension + INSN_CASE(AMO_D): + status = execute_AMO_D(a, pc, mcycle, insn); 
+ INSN_BREAK(); + INSN_CASE(AMO_W): + status = execute_AMO_W(a, pc, mcycle, insn); + INSN_BREAK(); + // Zicsr extension + INSN_CASE(CSRRW): + status = execute_CSRRW(a, pc, mcycle, insn); + INSN_BREAK(); + INSN_CASE(CSRRS): + status = execute_CSRRS(a, pc, mcycle, insn); + INSN_BREAK(); + INSN_CASE(CSRRC): + status = execute_CSRRC(a, pc, mcycle, insn); + INSN_BREAK(); + INSN_CASE(CSRRWI): + status = execute_CSRRWI(a, pc, mcycle, insn); + INSN_BREAK(); + INSN_CASE(CSRRSI): + status = execute_CSRRSI(a, pc, mcycle, insn); + INSN_BREAK(); + INSN_CASE(CSRRCI): + status = execute_CSRRCI(a, pc, mcycle, insn); + INSN_BREAK(); + // Special instructions that are less frequent + INSN_CASE(FENCE): + status = execute_FENCE(a, pc, insn); + INSN_BREAK(); + INSN_CASE(FENCE_I): + status = execute_FENCE_I(a, pc, insn); + INSN_BREAK(); + INSN_CASE(PRIVILEGED): + status = execute_privileged(a, pc, mcycle, insn); + INSN_BREAK(); + // Instructions with hints where rd=0 + INSN_CASE(LUI_rd0): + status = execute_LUI(a, pc, insn); + INSN_BREAK(); + INSN_CASE(AUIPC_rd0): + status = execute_AUIPC(a, pc, insn); + INSN_BREAK(); + INSN_CASE(ADDI_rd0): + status = execute_ADDI(a, pc, insn); + INSN_BREAK(); + INSN_CASE(SLTI_rd0): + status = execute_SLTI(a, pc, insn); + INSN_BREAK(); + INSN_CASE(SLTIU_rd0): + status = execute_SLTIU(a, pc, insn); + INSN_BREAK(); + INSN_CASE(XORI_rd0): + status = execute_XORI(a, pc, insn); + INSN_BREAK(); + INSN_CASE(ORI_rd0): + status = execute_ORI(a, pc, insn); + INSN_BREAK(); + INSN_CASE(ANDI_rd0): + status = execute_ANDI(a, pc, insn); + INSN_BREAK(); + INSN_CASE(SLLI_rd0): + status = execute_SLLI(a, pc, insn); + INSN_BREAK(); + INSN_CASE(SRLI_SRAI_rd0): + status = execute_SRLI_SRAI(a, pc, insn); + INSN_BREAK(); + INSN_CASE(ADD_MUL_SUB_rd0): + status = execute_ADD_MUL_SUB(a, pc, insn); + INSN_BREAK(); + INSN_CASE(SLL_MULH_rd0): + status = execute_SLL_MULH(a, pc, insn); + INSN_BREAK(); + INSN_CASE(SLT_MULHSU_rd0): + status = execute_SLT_MULHSU(a, pc, insn); + 
INSN_BREAK(); + INSN_CASE(SLTU_MULHU_rd0): + status = execute_SLTU_MULHU(a, pc, insn); + INSN_BREAK(); + INSN_CASE(XOR_DIV_rd0): + status = execute_XOR_DIV(a, pc, insn); + INSN_BREAK(); + INSN_CASE(SRL_DIVU_SRA_rd0): + status = execute_SRL_DIVU_SRA(a, pc, insn); + INSN_BREAK(); + INSN_CASE(OR_REM_rd0): + status = execute_OR_REM(a, pc, insn); + INSN_BREAK(); + INSN_CASE(AND_REMU_rd0): + status = execute_AND_REMU(a, pc, insn); + INSN_BREAK(); + INSN_CASE(ADDIW_rd0): + status = execute_ADDIW(a, pc, insn); + INSN_BREAK(); + INSN_CASE(SLLIW_rd0): + status = execute_SLLIW(a, pc, insn); + INSN_BREAK(); + INSN_CASE(SRLIW_SRAIW_rd0): + status = execute_SRLIW_SRAIW(a, pc, insn); + INSN_BREAK(); + INSN_CASE(ADDW_MULW_SUBW_rd0): + status = execute_ADDW_MULW_SUBW(a, pc, insn); + INSN_BREAK(); + INSN_CASE(SLLW_rd0): + status = execute_SLLW(a, pc, insn); + INSN_BREAK(); + INSN_CASE(SRLW_DIVUW_SRAW_rd0): + status = execute_SRLW_DIVUW_SRAW(a, pc, insn); + INSN_BREAK(); + INSN_CASE(DIVW_rd0): + status = execute_DIVW(a, pc, insn); + INSN_BREAK(); + INSN_CASE(REMW_rd0): + status = execute_REMW(a, pc, insn); + INSN_BREAK(); + INSN_CASE(REMUW_rd0): + status = execute_REMUW(a, pc, insn); + INSN_BREAK(); + INSN_CASE(LD_rd0): + status = execute_LD(a, pc, mcycle, insn); + INSN_BREAK(); + INSN_CASE(LW_rd0): + status = execute_LW(a, pc, mcycle, insn); + INSN_BREAK(); + INSN_CASE(LWU_rd0): + status = execute_LWU(a, pc, mcycle, insn); + INSN_BREAK(); + INSN_CASE(LH_rd0): + status = execute_LH(a, pc, mcycle, insn); + INSN_BREAK(); + INSN_CASE(LHU_rd0): + status = execute_LHU(a, pc, mcycle, insn); + INSN_BREAK(); + INSN_CASE(LB_rd0): + status = execute_LB(a, pc, mcycle, insn); + INSN_BREAK(); + INSN_CASE(LBU_rd0): + status = execute_LBU(a, pc, mcycle, insn); + INSN_BREAK(); + // Illegal instructions + INSN_CASE(ILLEGAL): + status = raise_illegal_insn_exception(a, pc, insn); + INSN_BREAK(); +#ifndef USE_COMPUTED_GOTO + // When using a naive switch statement, other cases are impossible. 
+ // The following will give a hint to the compiler that it can remove range checks + // (relevant for the WebAssembly target, which cannot use computed gotos). + default: + __builtin_unreachable(); + break; +#endif + } + INSN_SWITCH_OUT(); + + // NOLINTEND + // clang-format on // When execute status is above success, we have to deal with special loop conditions, // this is very unlikely to happen most of the time @@ -5562,7 +5851,7 @@ static NO_INLINE execute_status interpret_loop(STATE_ACCESS &a, uint64_t mcycle_ // due to MRET/SRET instructions (execute_status::success_and_serve_interrupts) // As a simplification (and optimization), the next line will also invalidate in more cases, // but this it's fine. - fetch_vaddr_page = PAGE_OFFSET_MASK; + fetch_vaddr_page = ~pc; // All status above execute_status::success_and_serve_interrupts will require breaking the loop if (unlikely(status >= execute_status::success_and_serve_interrupts)) { // Increment the cycle counter mcycle @@ -5598,7 +5887,7 @@ static NO_INLINE execute_status interpret_loop(STATE_ACCESS &a, uint64_t mcycle_ } template -interpreter_break_reason interpret(STATE_ACCESS &a, uint64_t mcycle_end) { +interpreter_break_reason interpret(STATE_ACCESS a, uint64_t mcycle_end) { static_assert(__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__, "code assumes little-endian byte ordering"); static_assert(is_an_i_state_access::value, "not an i_state_access"); @@ -5645,14 +5934,14 @@ interpreter_break_reason interpret(STATE_ACCESS &a, uint64_t mcycle_end) { #ifdef MICROARCHITECTURE // Explicit instantiation for uarch_machine_state_access -template interpreter_break_reason interpret(uarch_machine_state_access &a, uint64_t mcycle_end); +template interpreter_break_reason interpret(uarch_machine_state_access a, uint64_t mcycle_end); #else // Explicit instantiation for state_access -template interpreter_break_reason interpret(state_access &a, uint64_t mcycle_end); +template interpreter_break_reason interpret(state_access a, 
uint64_t mcycle_end); // Explicit instantiation for record_step_state_access -template interpreter_break_reason interpret(record_step_state_access &a, uint64_t mcycle_end); +template interpreter_break_reason interpret(record_step_state_access a, uint64_t mcycle_end); // Explicit instantiation for replay_step_state_access -template interpreter_break_reason interpret(replay_step_state_access &a, uint64_t mcycle_end); +template interpreter_break_reason interpret(replay_step_state_access a, uint64_t mcycle_end); #endif // MICROARCHITECTURE } // namespace cartesi diff --git a/src/interpret.h b/src/interpret.h index 0e02d6dac..8e5297f93 100644 --- a/src/interpret.h +++ b/src/interpret.h @@ -54,7 +54,7 @@ enum class interpreter_break_reason { /// \details The interpret may stop early if the machine halts permanently or becomes temporarily idle (waiting for /// interrupts). template -interpreter_break_reason interpret(STATE_ACCESS &a, uint64_t mcycle_end); +interpreter_break_reason interpret(STATE_ACCESS a, uint64_t mcycle_end); } // namespace cartesi diff --git a/src/machine-state.h b/src/machine-state.h index 915b9c8b2..f587c348e 100644 --- a/src/machine-state.h +++ b/src/machine-state.h @@ -59,9 +59,10 @@ struct machine_state { // The following state fields are very hot, // and are carefully ordered to have better data locality in the interpreter loop. + // The X registers are the very first to optimize access of registers in the interpreter. + std::array x{}; ///< Register file uint64_t mcycle{}; ///< CSR mcycle. uint64_t pc{}; ///< Program counter. - std::array x{}; ///< Register file. uint64_t fcsr{}; ///< CSR fcsr. std::array f{}; ///< Floating-point register file. 
diff --git a/src/machine.cpp b/src/machine.cpp index 17b1c2179..8bbf0a9d7 100644 --- a/src/machine.cpp +++ b/src/machine.cpp @@ -18,12 +18,14 @@ #include #include +#include #include #include #include #include #include #include +#include #include #include #include @@ -837,7 +839,7 @@ machine::~machine() { #ifdef DUMP_HIST std::ignore = fprintf(stderr, "\nInstruction Histogram:\n"); for (auto v : m_s.insn_hist) { - std::ignore = fprintf(stderr, "%s: %" PRIu64 "\n", v.first.c_str(), v.second); + std::ignore = fprintf(stderr, "%12" PRIu64 " %s\n", v.second, v.first.c_str()); } #endif #if DUMP_COUNTERS @@ -2258,7 +2260,7 @@ void machine::fill_memory(uint64_t address, uint8_t data, uint64_t length) { } void machine::read_virtual_memory(uint64_t vaddr_start, unsigned char *data, uint64_t length) { - state_access a(*this); + const state_access a(*this); if (length == 0) { return; } @@ -2289,7 +2291,7 @@ void machine::read_virtual_memory(uint64_t vaddr_start, unsigned char *data, uin } void machine::write_virtual_memory(uint64_t vaddr_start, const unsigned char *data, uint64_t length) { - state_access a(*this); + const state_access a(*this); if (length == 0) { return; } @@ -2322,7 +2324,7 @@ void machine::write_virtual_memory(uint64_t vaddr_start, const unsigned char *da } uint64_t machine::translate_virtual_address(uint64_t vaddr) { - state_access a(*this); + const state_access a(*this); // perform address translation using read access mode uint64_t paddr = 0; if (!cartesi::translate_virtual_address(a, &paddr, vaddr, PTE_XWR_R_SHIFT)) { @@ -2357,7 +2359,7 @@ uint64_t machine::read_word(uint64_t word_address) const { } void machine::send_cmio_response(uint16_t reason, const unsigned char *data, uint64_t length) { - state_access a(*this); + const state_access a(*this); cartesi::send_cmio_response(a, reason, data, length); } @@ -2365,8 +2367,9 @@ access_log machine::log_send_cmio_response(uint16_t reason, const unsigned char const access_log::type &log_type) { hash_type 
root_hash_before; get_root_hash(root_hash_before); + access_log log(log_type); // Call send_cmio_response with the recording state accessor - record_state_access a(*this, log_type); + record_state_access a(*this, log); a.push_bracket(bracket_type::begin, "send cmio response"); cartesi::send_cmio_response(a, reason, data, length); a.push_bracket(bracket_type::end, "send cmio response"); @@ -2374,8 +2377,8 @@ access_log machine::log_send_cmio_response(uint16_t reason, const unsigned char hash_type root_hash_after; update_merkle_tree(); get_root_hash(root_hash_after); - verify_send_cmio_response(reason, data, length, root_hash_before, *a.get_log(), root_hash_after); - return std::move(*a.get_log()); + verify_send_cmio_response(reason, data, length, root_hash_before, log, root_hash_after); + return log; } void machine::verify_send_cmio_response(uint16_t reason, const unsigned char *data, uint64_t length, @@ -2384,9 +2387,9 @@ void machine::verify_send_cmio_response(uint16_t reason, const unsigned char *da if (log.get_accesses().empty()) { throw std::invalid_argument{"too few accesses in log"}; } - + replay_state_access::context context(log, root_hash_before); // Verify all intermediate state transitions - replay_state_access a(log, root_hash_before); + replay_state_access a(context); cartesi::send_cmio_response(a, reason, data, length); a.finish(); @@ -2507,7 +2510,8 @@ interpreter_break_reason machine::log_step(uint64_t mcycle_count, const std::str } hash_type root_hash_before; get_root_hash(root_hash_before); - record_step_state_access a(*this, filename); + record_step_state_access::context context(filename); + record_step_state_access a(context, *this); uint64_t mcycle_end{}; if (__builtin_add_overflow(a.read_mcycle(), mcycle_count, &mcycle_end)) { mcycle_end = UINT64_MAX; @@ -2524,7 +2528,8 @@ interpreter_break_reason machine::verify_step(const hash_type &root_hash_before, uint64_t mcycle_count, const hash_type &root_hash_after) { auto data_length = 
os_get_file_length(filename.c_str(), "step log file"); auto *data = os_map_file(filename.c_str(), data_length, false /* not shared */); - replay_step_state_access a(data, data_length, root_hash_before); + replay_step_state_access::context context; + replay_step_state_access a(context, data, data_length, root_hash_before); uint64_t mcycle_end{}; if (__builtin_add_overflow(a.read_mcycle(), mcycle_count, &mcycle_end)) { mcycle_end = UINT64_MAX; @@ -2539,7 +2544,7 @@ interpreter_break_reason machine::run(uint64_t mcycle_end) { if (mcycle_end < read_reg(reg::mcycle)) { throw std::invalid_argument{"mcycle is past"}; } - state_access a(*this); + const state_access a(*this); return interpret(a, mcycle_end); } diff --git a/src/record-state-access.h b/src/record-state-access.h index fcc0800dd..2c5b24853 100644 --- a/src/record-state-access.h +++ b/src/record-state-access.h @@ -45,10 +45,9 @@ class record_state_access : public i_state_access m_log; ///< Pointer to access log static void get_hash(const access_data &data, hash_type &hash) { hasher_type hasher; @@ -57,35 +56,12 @@ class record_state_access : public i_state_access(log_type)) { + /// \param m Reference to machine state. + /// \param log Reference to access log. + explicit record_state_access(machine &m, access_log &log) : m_m(m), m_log(log) { ; } - /// \brief No copy constructor - record_state_access(const record_state_access &) = delete; - /// \brief No copy assignment - record_state_access &operator=(const record_state_access &) = delete; - /// \brief No move constructor - record_state_access(record_state_access &&) = delete; - /// \brief No move assignment - record_state_access &operator=(record_state_access &&) = delete; - /// \brief Default destructor - ~record_state_access() = default; - - /// \brief Returns const pointer to access log. - std::shared_ptr get_log() const { - return m_log; - } - - /// \brief Returns pointer to access log. 
- std::shared_ptr get_log() { - return m_log; - } - private: /// \brief Logs a read access of a uint64_t word from the machine state. /// \param paligned Physical address in the machine state, aligned to a 64-bit word. @@ -119,7 +95,7 @@ class record_state_access : public i_state_accesspush_access(std::move(a), text); + m_log.push_access(std::move(a), text); } /// \brief Logs a write access before it happens. @@ -162,7 +138,7 @@ class record_state_access : public i_state_accesspush_access(std::move(a), text); + m_log.push_access(std::move(a), text); } /// \brief Updates the Merkle tree after the modification of a word in the machine state. @@ -196,7 +172,7 @@ class record_state_access : public i_state_access; void do_push_bracket(bracket_type &type, const char *text) { - m_log->push_bracket(type, text); + m_log.push_bracket(type, text); } void do_reset_iflags_Y() { @@ -248,7 +224,7 @@ class record_state_access : public i_state_accessget_log_type().has_large_data()) { + if (m_log.get_log_type().has_large_data()) { access_data &data = a.get_read().emplace(write_length); memcpy(data.data(), pma.get_memory().get_host_memory(), write_length); } @@ -270,12 +246,12 @@ class record_state_access : public i_state_accessget_log_type().has_large_data()) { + if (m_log.get_log_type().has_large_data()) { access_data &data = a.get_written().emplace(write_length); memcpy(data.data(), pma.get_memory().get_host_memory(), write_length); } // NOLINTEND(bugprone-unchecked-optional-access) - m_log->push_access(a, "cmio rx buffer"); + m_log.push_access(a, "cmio rx buffer"); } }; diff --git a/src/record-step-state-access.h b/src/record-step-state-access.h index 323b0f1ce..0ef701520 100644 --- a/src/record-step-state-access.h +++ b/src/record-step-state-access.h @@ -33,6 +33,7 @@ namespace cartesi { /// \class record_step_state_access /// \brief Records machine state access into a step log file class record_step_state_access : public i_state_access { +public: constexpr static int 
LOG2_ROOT_SIZE = machine_merkle_tree::get_log2_root_size(); constexpr static int LOG2_PAGE_SIZE = machine_merkle_tree::get_log2_page_size(); constexpr static uint64_t PAGE_SIZE = UINT64_C(1) << LOG2_PAGE_SIZE; @@ -44,32 +45,38 @@ class record_step_state_access : public i_state_access; using page_indices_type = std::vector; - // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members) - machine &m_m; ///< reference to machine - std::string m_filename; ///< where to save the log - mutable pages_type m_touched_pages; ///< copy of all pages touched during execution + struct context { + /// \brief Constructor of record step state access context + /// \param filename where to save the log + explicit context(std::string filename) : filename(std::move(filename)) { + ; + } + std::string filename; ///< where to save the log + mutable pages_type touched_pages; ///< copy of all pages touched during execution + }; + +private: + // NOLINTBEGIN(cppcoreguidelines-avoid-const-or-ref-data-members) + context &m_context; ///< context for the recording + machine &m_m; ///< reference to machine + // NOLINTEND(cppcoreguidelines-avoid-const-or-ref-data-members) public: - /// \brief Constructor + /// \brief Constructor of record step state access + /// \param context Context for the recording with the log filename /// \param m reference to machine - /// \param filename where to save the log /// \details The log file is saved when finish() is called - record_step_state_access(machine &m, const std::string &filename) : m_m(m), m_filename(filename) { - if (os_file_exists(filename.c_str())) { + record_step_state_access(context &context, machine &m) : m_context(context), m_m(m) { + if (os_file_exists(m_context.filename.c_str())) { throw std::runtime_error("file already exists"); } } - record_step_state_access(const record_step_state_access &) = delete; - record_step_state_access(record_step_state_access &&) = delete; - record_step_state_access &operator=(const record_step_state_access 
&) = delete; - record_step_state_access &operator=(record_step_state_access &&) = delete; - ~record_step_state_access() = default; /// \brief Finish recording and save the log file void finish() { // get sibling hashes of all touched pages auto sibling_hashes = get_sibling_hashes(); - uint64_t page_count = m_touched_pages.size(); + uint64_t page_count = m_context.touched_pages.size(); uint64_t sibling_count = sibling_hashes.size(); // Write log file. @@ -78,11 +85,11 @@ class record_step_state_access : public i_state_access> LOG2_PAGE_SIZE; if (fwrite(&page_index, sizeof(page_index), 1, fp.get()) != 1) { throw std::runtime_error("Could not write page index to log file"); @@ -112,10 +119,10 @@ class record_step_state_access : public i_state_accesssecond.data(), it->second.size()); } @@ -125,7 +132,7 @@ class record_step_state_access : public i_state_access> LOG2_PAGE_SIZE); } auto next_page_index = page_indices.cbegin(); diff --git a/src/replay-state-access.h b/src/replay-state-access.h index 8ef97074f..991c2c0c3 100644 --- a/src/replay-state-access.h +++ b/src/replay-state-access.h @@ -43,50 +43,49 @@ namespace cartesi { /// \brief Allows replaying a uarch reset operation from an access log. 
class replay_state_access : public i_state_access { +public: using tree_type = machine_merkle_tree; using hash_type = tree_type::hash_type; using hasher_type = tree_type::hasher_type; using proof_type = tree_type::proof_type; - ///< Access log generated by log_reset_uarch - const std::vector &m_accesses; // NOLINT(cppcoreguidelines-avoid-const-or-ref-data-members) - ///< Index of next access to ne consumed - unsigned m_next_access{}; - ///< Root hash before next access - machine_merkle_tree::hash_type m_root_hash; - ///< Hasher needed to verify proofs - machine_merkle_tree::hasher_type m_hasher; + struct context { + /// \brief Constructor of replay_state_access context + /// \param log Access log to be replayed + /// \param initial_hash Initial root hash + context(const access_log &log, machine_merkle_tree::hash_type initial_hash) : + accesses(log.get_accesses()), + root_hash(initial_hash) { + ; + } + const std::vector &accesses; // NOLINT(cppcoreguidelines-avoid-const-or-ref-data-members) + ///< Index of next access to be consumed + unsigned int next_access{}; + ///< Root hash before next access + machine_merkle_tree::hash_type root_hash; + ///< Hasher needed to verify proofs + machine_merkle_tree::hasher_type hasher; + }; + +private: + context &m_context; // NOLINT(cppcoreguidelines-avoid-const-or-ref-data-members) public: /// \brief Constructor from access log - /// \param log Access log to be replayed - /// \param initial_hash Initial root hash - explicit replay_state_access(const access_log &log, const hash_type &initial_hash) : - m_accesses(log.get_accesses()), - m_root_hash{initial_hash} { - if (m_accesses.empty()) { + /// \param context Context with access log and initial root hash + explicit replay_state_access(replay_state_access::context &context) : m_context{context} { + if (m_context.accesses.empty()) { throw std::invalid_argument{"the access log has no accesses"}; } } - /// \brief No copy constructor - replay_state_access(const replay_state_access &) =
delete; - /// \brief No copy assignment - replay_state_access &operator=(const replay_state_access &) = delete; - /// \brief No move constructor - replay_state_access(replay_state_access &&) = delete; - /// \brief No move assignment - replay_state_access &operator=(replay_state_access &&) = delete; - /// \brief Default destructor - ~replay_state_access() = default; - void get_root_hash(machine_merkle_tree::hash_type &hash) const { - hash = m_root_hash; + hash = m_context.root_hash; } /// \brief Checks if access log was fully consumed after reset operation is finished void finish() { - if (m_next_access != m_accesses.size()) { + if (m_context.next_access != m_context.accesses.size()) { throw std::invalid_argument{"access log was not fully consumed"}; } } @@ -95,7 +94,7 @@ class replay_state_access : public i_state_access; std::string access_to_report() const { - auto index = m_next_access + 1; + auto index = m_context.next_access + 1; switch (index) { case 1: return "1st access"; @@ -125,10 +124,10 @@ class replay_state_access : public i_state_access= m_accesses.size()) { + if (m_context.next_access >= m_context.accesses.size()) { throw std::invalid_argument{"too few accesses in log"}; } - const auto &access = m_accesses[m_next_access]; + const auto &access = m_context.accesses[m_context.next_access]; if (access.get_type() != access_type::read) { throw std::invalid_argument{"expected " + access_to_report() + " to read " + text}; } @@ -153,18 +152,18 @@ class replay_state_access : public i_state_access(paligned - pleaf_aligned); return get_word_access_data(read_data, word_offset); @@ -181,10 +180,10 @@ class replay_state_access : public i_state_access= m_accesses.size()) { + if (m_context.next_access >= m_context.accesses.size()) { throw std::invalid_argument{"too few accesses in log"}; } - const auto &access = m_accesses[m_next_access]; + const auto &access = m_context.accesses[m_context.next_access]; if (access.get_type() != access_type::write) { throw 
std::invalid_argument{"expected " + access_to_report() + " to write " + text}; } @@ -210,7 +209,7 @@ class replay_state_access : public i_state_access= m_accesses.size()) { + if (m_context.next_access >= m_context.accesses.size()) { throw std::invalid_argument{"too few accesses in log"}; } - const auto &access = m_accesses[m_next_access]; + const auto &access = m_context.accesses[m_context.next_access]; if (access.get_address() != paddr) { throw std::invalid_argument{"expected address of " + access_to_report() + " to match address of " + text}; } @@ -344,13 +343,13 @@ class replay_state_access : public i_state_access { public: - using hash_type = std::array; - static_assert(sizeof(hash_type) == interop_machine_hash_byte_size); - -private: using address_type = uint64_t; using data_type = unsigned char[PMA_PAGE_SIZE]; + using hash_type = std::array; + static_assert(sizeof(hash_type) == interop_machine_hash_byte_size); struct PACKED page_type { address_type index; @@ -178,19 +177,27 @@ class replay_step_state_access : public i_state_access, PMA_MAX> m_pmas{}; ///< Array of PMA entries + struct context { + uint64_t page_count{0}; ///< Number of pages in the step log + page_type *pages{nullptr}; ///< Array of page data + uint64_t sibling_count{0}; ///< Number of sibling hashes in the step log + hash_type *sibling_hashes{nullptr}; ///< Array of sibling hashes + std::array, PMA_MAX> pmas{}; ///< Array of PMA entries + }; + +private: + context &m_context; // NOLINT(cppcoreguidelines-avoid-const-or-ref-data-members) public: // \brief Construct a replay_step_state_access object from a log image and expected initial root hash + // \param context The context object to be filled with the replay step log data // \param log_image Image of the step log file // \param log_size The size of the log data // \param root_hash_before The expected machine root hash before the replay // \throw runtime_error if the initial root hash does not match or the log data is invalid - 
replay_step_state_access(unsigned char *log_image, uint64_t log_size, const hash_type &root_hash_before) { + replay_step_state_access(context &context, unsigned char *log_image, uint64_t log_size, + const hash_type &root_hash_before) : + m_context(context) { // relevant offsets in the log data uint64_t first_page_offset{}; uint64_t first_siblng_offset{}; @@ -203,36 +210,36 @@ class replay_step_state_access : public i_state_access(log_image + first_page_offset); + m_context.pages = reinterpret_cast(log_image + first_page_offset); // set sibling count and hashes - if (!validate_and_advance_offset(log_size, sibling_count_offset, sizeof(m_sibling_count), 1, + if (!validate_and_advance_offset(log_size, sibling_count_offset, sizeof(m_context.sibling_count), 1, &first_siblng_offset)) { interop_throw_runtime_error("sibling count past end of step log"); } - memcpy(&m_sibling_count, log_image + sibling_count_offset, sizeof(m_sibling_count)); + memcpy(&m_context.sibling_count, log_image + sibling_count_offset, sizeof(m_context.sibling_count)); // set sibling hashes - if (!validate_and_advance_offset(log_size, first_siblng_offset, sizeof(hash_type), m_sibling_count, + if (!validate_and_advance_offset(log_size, first_siblng_offset, sizeof(hash_type), m_context.sibling_count, &end_offset)) { interop_throw_runtime_error("sibling hashes past end of step log"); } // NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast) - m_sibling_hashes = reinterpret_cast(log_image + first_siblng_offset); + m_context.sibling_hashes = reinterpret_cast(log_image + first_siblng_offset); // ensure that we read exactly the expected log size if (end_offset != log_size) { @@ -242,11 +249,11 @@ class replay_step_state_access : public i_state_access 0 && m_pages[i - 1].index >= m_pages[i].index) { + for (uint64_t i = 0; i < m_context.page_count; i++) { + if (i > 0 && m_context.pages[i - 1].index >= m_context.pages[i].index) { interop_throw_runtime_error("invalid log format: page index is not in 
increasing order"); } - if (m_pages[i].hash != all_zeros) { + if (m_context.pages[i].hash != all_zeros) { interop_throw_runtime_error("invalid log format: page scratch hash area is not zero"); } } @@ -262,12 +269,6 @@ class replay_step_state_access : public i_state_access(); } - replay_step_state_access(const replay_step_state_access &) = delete; - replay_step_state_access(replay_step_state_access &&) = delete; - replay_step_state_access &operator=(const replay_step_state_access &) = delete; - replay_step_state_access &operator=(replay_step_state_access &&) = delete; - ~replay_step_state_access() = default; - // \brief Finish the replay and check the final machine root hash // \param final_root_hash The expected final machine root hash // \throw runtime_error if the final root hash does not match @@ -344,13 +345,13 @@ class replay_step_state_access : public i_state_access> PMA_PAGE_SIZE_LOG2; uint64_t min{0}; - uint64_t max{m_page_count}; + uint64_t max{m_context.page_count}; while (min < max) { auto mid = (min + max) >> 1; - if (m_pages[mid].index == page_index) { - return &m_pages[mid]; + if (m_context.pages[mid].index == page_index) { + return &m_context.pages[mid]; } - if (m_pages[mid].index < page_index) { + if (m_context.pages[mid].index < page_index) { min = mid + 1; } else { max = mid; @@ -420,20 +421,20 @@ class replay_step_state_access : public i_state_access(&m_pages[i].hash)); + reinterpret_cast(&m_context.pages[i].hash)); } size_t next_page = 0; size_t next_sibling = 0; auto root_hash = compute_root_hash_impl(0, interop_log2_root_size - PMA_PAGE_SIZE_LOG2, next_page, next_sibling); - if (next_page != m_page_count) { - interop_throw_runtime_error("compute_root_hash: next_page != m_page_count"); + if (next_page != m_context.page_count) { + interop_throw_runtime_error("compute_root_hash: next_page != m_context.page_count"); } - if (next_sibling != m_sibling_count) { + if (next_sibling != m_context.sibling_count) { 
interop_throw_runtime_error("compute_root_hash: sibling hashes not totally consumed"); } return root_hash; @@ -449,12 +450,12 @@ class replay_step_state_access : public i_state_access= m_page_count || page_index + page_count <= m_pages[next_page].index) { - if (next_sibling >= m_sibling_count) { + if (next_page >= m_context.page_count || page_index + page_count <= m_context.pages[next_page].index) { + if (next_sibling >= m_context.sibling_count) { interop_throw_runtime_error( "compute_root_hash_impl: trying to access beyond sibling count while skipping range"); } - return m_sibling_hashes[next_sibling++]; + return m_context.sibling_hashes[next_sibling++]; } if (page_count_log2_size > 0) { auto left = compute_root_hash_impl(page_index, page_count_log2_size - 1, next_page, next_sibling); @@ -465,13 +466,13 @@ class replay_step_state_access : public i_state_access(&hash)); return hash; } - if (m_pages[next_page].index == page_index) { - return m_pages[next_page++].hash; + if (m_context.pages[next_page].index == page_index) { + return m_context.pages[next_page++].hash; } - if (next_sibling >= m_sibling_count) { + if (next_sibling >= m_context.sibling_count) { interop_throw_runtime_error("compute_root_hash_impl: trying to access beyond sibling count"); } - return m_sibling_hashes[next_sibling++]; + return m_context.sibling_hashes[next_sibling++]; // NOLINTEND(cppcoreguidelines-pro-type-reinterpret-cast)) } @@ -893,7 +894,7 @@ class replay_step_state_access : public i_state_access mock_pma_entry &do_find_pma_entry(uint64_t paddr) { - for (size_t i = 0; i < m_pmas.size(); i++) { + for (size_t i = 0; i < m_context.pmas.size(); i++) { auto &pma = get_pma_entry(static_cast(i)); if (pma.get_istart_E()) { return pma; @@ -908,11 +909,11 @@ class replay_step_state_access : public i_state_access> 26 (6 most significant bits of funct7) can be /// used to identify the SRI instructions enum insn_SRLI_SRAI_funct7_sr1 : uint32_t { SRLI = 0b000000, SRAI = 0b010000 }; diff --git 
a/src/send-cmio-response.cpp b/src/send-cmio-response.cpp index e94304413..d03279a93 100644 --- a/src/send-cmio-response.cpp +++ b/src/send-cmio-response.cpp @@ -32,7 +32,7 @@ namespace cartesi { template -void send_cmio_response(STATE_ACCESS &a, uint16 reason, bytes data, uint32 dataLength) { +void send_cmio_response(STATE_ACCESS a, uint16 reason, bytes data, uint32 dataLength) { if (!readIflagsY(a)) { throwRuntimeError(a, "iflags.Y is not set"); } @@ -61,13 +61,13 @@ void send_cmio_response(STATE_ACCESS &a, uint16 reason, bytes data, uint32 dataL } // Explicit instantiation for state_access -template void send_cmio_response(state_access &a, uint16_t reason, const unsigned char *data, uint32 length); +template void send_cmio_response(state_access a, uint16_t reason, const unsigned char *data, uint32 length); // Explicit instantiation for record_state_access -template void send_cmio_response(record_state_access &a, uint16_t reason, const unsigned char *data, uint32 length); +template void send_cmio_response(record_state_access a, uint16_t reason, const unsigned char *data, uint32 length); // Explicit instantiation for replay_state_access -template void send_cmio_response(replay_state_access &a, uint16_t reason, const unsigned char *data, uint32 length); +template void send_cmio_response(replay_state_access a, uint16_t reason, const unsigned char *data, uint32 length); } // namespace cartesi // NOLINTEND(google-readability-casting,misc-const-correctness,modernize-use-auto,hicpp-use-auto) diff --git a/src/send-cmio-response.h b/src/send-cmio-response.h index 11fb4c923..1f8d4e4e4 100644 --- a/src/send-cmio-response.h +++ b/src/send-cmio-response.h @@ -28,22 +28,22 @@ namespace cartesi { /// \param data Response data /// \param length Response data length template -void send_cmio_response(STATE_ACCESS &a, uint16_t reason, const unsigned char *data, uint32_t dataLength); +void send_cmio_response(STATE_ACCESS a, uint16_t reason, const unsigned char *data, uint32_t 
dataLength); class state_access; class record_state_access; class replay_state_access; // Declaration of explicit instantiation in module send_cmio_response.cpp -extern template void send_cmio_response(state_access &a, uint16_t reason, const unsigned char *data, +extern template void send_cmio_response(state_access a, uint16_t reason, const unsigned char *data, uint32_t dataLength); // Declaration of explicit instantiation in module uarch-reset-state.cpp -extern template void send_cmio_response(record_state_access &a, uint16_t reason, const unsigned char *data, +extern template void send_cmio_response(record_state_access a, uint16_t reason, const unsigned char *data, uint32_t dataLength); // Declaration of explicit instantiation in module uarch-reset-state.cpp -extern template void send_cmio_response(replay_state_access &a, uint16_t reason, const unsigned char *data, +extern template void send_cmio_response(replay_state_access a, uint16_t reason, const unsigned char *data, uint32_t dataLength); } // namespace cartesi diff --git a/src/soft-float.h b/src/soft-float.h index 0354c36ae..d87040be8 100644 --- a/src/soft-float.h +++ b/src/soft-float.h @@ -194,7 +194,7 @@ struct i_sfloat { /// \brief Right shift that takes rounding in account, used for adjust mantissa. static F_UINT mant_rshift_rnd(F_UINT a, int d) { - if (d != 0) { + if (d > 0) { if (d >= F_SIZE) { return (a != 0); } @@ -326,7 +326,7 @@ struct i_sfloat { } /// \brief Addition operation. - static F_UINT add(F_UINT a, F_UINT b, FRM_modes rm, uint32_t *pfflags) { + static NO_INLINE F_UINT add(F_UINT a, F_UINT b, FRM_modes rm, uint32_t *pfflags) { // swap so that abs(a) >= abs(b) if ((a & ~SIGN_MASK) < (b & ~SIGN_MASK)) { const F_UINT tmp = a; @@ -379,7 +379,7 @@ struct i_sfloat { } /// \brief Multiply operation. 
- static F_UINT mul(F_UINT a, F_UINT b, FRM_modes rm, uint32_t *pfflags) { + static NO_INLINE F_UINT mul(F_UINT a, F_UINT b, FRM_modes rm, uint32_t *pfflags) { const uint32_t a_sign = a >> (F_SIZE - 1); const uint32_t b_sign = b >> (F_SIZE - 1); const uint32_t r_sign = a_sign ^ b_sign; @@ -425,7 +425,7 @@ struct i_sfloat { } /// \brief Fused multiply and add operation. - static F_UINT fma(F_UINT a, F_UINT b, F_UINT c, FRM_modes rm, uint32_t *pfflags) { + static NO_INLINE F_UINT fma(F_UINT a, F_UINT b, F_UINT c, FRM_modes rm, uint32_t *pfflags) { const uint32_t a_sign = a >> (F_SIZE - 1); const uint32_t b_sign = b >> (F_SIZE - 1); uint32_t c_sign = c >> (F_SIZE - 1); @@ -551,7 +551,7 @@ struct i_sfloat { } /// \brief Division operation. - static F_UINT div(F_UINT a, F_UINT b, FRM_modes rm, uint32_t *pfflags) { + static NO_INLINE F_UINT div(F_UINT a, F_UINT b, FRM_modes rm, uint32_t *pfflags) { const uint32_t a_sign = a >> (F_SIZE - 1); const uint32_t b_sign = b >> (F_SIZE - 1); const uint32_t r_sign = a_sign ^ b_sign; @@ -612,7 +612,7 @@ struct i_sfloat { } /// \brief Square root operation. - static F_UINT sqrt(F_UINT a, FRM_modes rm, uint32_t *pfflags) { + static NO_INLINE F_UINT sqrt(F_UINT a, FRM_modes rm, uint32_t *pfflags) { const uint32_t a_sign = a >> (F_SIZE - 1); int32_t a_exp = (a >> MANT_SIZE) & EXP_MASK; F_UINT a_mant = a & MANT_MASK; @@ -673,7 +673,7 @@ struct i_sfloat { } /// \brief Min operation. - static F_UINT min(F_UINT a, F_UINT b, uint32_t *pfflags) { + static NO_INLINE F_UINT min(F_UINT a, F_UINT b, uint32_t *pfflags) { if (isnan(a) || isnan(b)) { return min_max_nan(a, b, pfflags); } @@ -686,7 +686,7 @@ struct i_sfloat { } /// \brief Max operation. - static F_UINT max(F_UINT a, F_UINT b, uint32_t *pfflags) { + static NO_INLINE F_UINT max(F_UINT a, F_UINT b, uint32_t *pfflags) { if (isnan(a) || isnan(b)) { return min_max_nan(a, b, pfflags); } @@ -699,7 +699,7 @@ struct i_sfloat { } /// \brief Equal operation. 
- static bool eq(F_UINT a, F_UINT b, uint32_t *pfflags) { + static NO_INLINE bool eq(F_UINT a, F_UINT b, uint32_t *pfflags) { if (unlikely(isnan(a) || isnan(b))) { if (issignan(a) || issignan(b)) { *pfflags |= FFLAGS_NV_MASK; @@ -713,7 +713,7 @@ struct i_sfloat { } /// \brief Less or equal than operation. - static bool le(F_UINT a, F_UINT b, uint32_t *pfflags) { + static NO_INLINE bool le(F_UINT a, F_UINT b, uint32_t *pfflags) { if (unlikely(isnan(a) || isnan(b))) { *pfflags |= FFLAGS_NV_MASK; return false; @@ -727,7 +727,7 @@ struct i_sfloat { } /// \brief Less than operation. - static bool lt(F_UINT a, F_UINT b, uint32_t *pfflags) { // NOLINT(misc-confusable-identifiers) + static NO_INLINE bool lt(F_UINT a, F_UINT b, uint32_t *pfflags) { // NOLINT(misc-confusable-identifiers) if (unlikely(isnan(a) || isnan(b))) { *pfflags |= FFLAGS_NV_MASK; return false; @@ -741,7 +741,7 @@ struct i_sfloat { } /// \brief Retrieves float class. - static uint32_t fclass(F_UINT a) { + static NO_INLINE uint32_t fclass(F_UINT a) { const uint32_t a_sign = a >> (F_SIZE - 1); const int32_t a_exp = (a >> MANT_SIZE) & EXP_MASK; const F_UINT a_mant = a & MANT_MASK; @@ -762,7 +762,7 @@ struct i_sfloat { /// \brief Conversion from float to integer. template - static ICVT_INT cvt_f_i(F_UINT a, FRM_modes rm, uint32_t *pfflags) { + static NO_INLINE ICVT_INT cvt_f_i(F_UINT a, FRM_modes rm, uint32_t *pfflags) { using ICVT_UINT = std::make_unsigned_t; constexpr bool IS_UNSIGNED = std::is_unsigned_v; constexpr int ICVT_SIZE = sizeof(ICVT_UINT) * 8; @@ -841,7 +841,7 @@ struct i_sfloat { /// \brief Conversion from integer to float. 
template - static F_UINT cvt_i_f(ICVT_INT a, FRM_modes rm, uint32_t *pfflags) { + static NO_INLINE F_UINT cvt_i_f(ICVT_INT a, FRM_modes rm, uint32_t *pfflags) { using ICVT_UINT = std::make_unsigned_t; constexpr bool IS_UNSIGNED = std::is_unsigned_v; constexpr int ICVT_SIZE = sizeof(ICVT_UINT) * 8; @@ -870,7 +870,7 @@ using i_sfloat32 = i_sfloat; // Interface for single-precision using i_sfloat64 = i_sfloat; // Interface for double-precision floating-point /// \brief Conversion from float32 to float64. -static uint64_t sfloat_cvt_f32_f64(uint32_t a, uint32_t *pfflags) { +static NO_INLINE uint64_t sfloat_cvt_f32_f64(uint32_t a, uint32_t *pfflags) { uint32_t a_sign = 0; int32_t a_exp = 0; i_sfloat64::F_UINT a_mant = i_sfloat32::unpack(&a_sign, &a_exp, a); @@ -899,7 +899,7 @@ static uint64_t sfloat_cvt_f32_f64(uint32_t a, uint32_t *pfflags) { } /// \brief Conversion from float64 to float32. -static uint32_t sfloat_cvt_f64_f32(uint64_t a, FRM_modes rm, uint32_t *pfflags) { +static NO_INLINE uint32_t sfloat_cvt_f64_f32(uint64_t a, FRM_modes rm, uint32_t *pfflags) { uint32_t a_sign = 0; int32_t a_exp = 0; i_sfloat64::F_UINT a_mant = i_sfloat64::unpack(&a_sign, &a_exp, a); diff --git a/src/state-access.h b/src/state-access.h index 71a48a15e..0ebabe3d2 100644 --- a/src/state-access.h +++ b/src/state-access.h @@ -56,17 +56,6 @@ class state_access : public i_state_access { ; } - /// \brief No copy constructor - state_access(const state_access &) = delete; - /// \brief No copy assignment - state_access &operator=(const state_access &) = delete; - /// \brief No move constructor - state_access(state_access &&) = delete; - /// \brief No move assignment - state_access &operator=(state_access &&) = delete; - /// \brief Default destructor - ~state_access() = default; - const machine &get_naked_machine() const { return m_m; } diff --git a/src/translate-virtual-address.h b/src/translate-virtual-address.h index 5044fdfd4..b5e7f8fce 100644 --- a/src/translate-virtual-address.h +++ 
b/src/translate-virtual-address.h @@ -57,7 +57,7 @@ namespace cartesi { /// \param val Value to write. /// \returns True if succeeded, false otherwise. template -static inline bool write_ram_uint64(STATE_ACCESS &a, uint64_t paddr, uint64_t val) { +static inline bool write_ram_uint64(STATE_ACCESS a, uint64_t paddr, uint64_t val) { auto &pma = a.template find_pma_entry(paddr); if (unlikely(!pma.get_istart_M() || !pma.get_istart_W())) { return false; @@ -79,7 +79,7 @@ static inline bool write_ram_uint64(STATE_ACCESS &a, uint64_t paddr, uint64_t va /// \param pval Pointer to word. /// \returns True if succeeded, false otherwise. template -static inline bool read_ram_uint64(STATE_ACCESS &a, uint64_t paddr, uint64_t *pval) { +static inline bool read_ram_uint64(STATE_ACCESS a, uint64_t paddr, uint64_t *pval) { auto &pma = a.template find_pma_entry(paddr); if (unlikely(!pma.get_istart_M() || !pma.get_istart_R())) { return false; @@ -102,7 +102,7 @@ static inline bool read_ram_uint64(STATE_ACCESS &a, uint64_t paddr, uint64_t *pv /// \details This function is outlined to minimize host CPU code cache pressure. /// \returns True if succeeded, false otherwise. template -static NO_INLINE bool translate_virtual_address(STATE_ACCESS &a, uint64_t *ppaddr, uint64_t vaddr, int xwr_shift) { +static NO_INLINE bool translate_virtual_address(STATE_ACCESS a, uint64_t *ppaddr, uint64_t vaddr, int xwr_shift) { auto priv = a.read_iflags_PRV(); const uint64_t mstatus = a.read_mstatus(); diff --git a/tools/gen-interpret-jump-table.lua b/tools/gen-interpret-jump-table.lua new file mode 100755 index 000000000..173173d3e --- /dev/null +++ b/tools/gen-interpret-jump-table.lua @@ -0,0 +1,602 @@ +#!/usr/bin/env lua5.4 + +--[[ +This file is used to generate interpret-jump-table.h header used in the interpreter. +It's purpose is to generate a big jump table covering most RISC-V instructions, +so we can decode most instructions with a single jump. 
+ +At the moment this file is a bit hacky and slow, it could be optimized in the future. +]] + +--[[ +List of RISC-V instructions taken from RISC-V specification. +Bits marked as 0 and 1 are fixed, bits marked as _ is a placeholder accepting both 0 or 1. +When `rd0_special` is set means the instruction has specialization when rd == 0. +When `rm = true` is set means the instruction has specialization for floating-point rounding modes. +]] +local insns = { + -- RV32I + { bits = "_________________________0110111", name = "LUI", rd0_special = true }, + { bits = "_________________________0010111", name = "AUIPC", rd0_special = true }, + { bits = "_________________________1101111", name = "JAL", rd0_special = true }, + { bits = "_________________000_____1100111", name = "JALR", rd0_special = true }, + { bits = "_________________000_____1100011", name = "BEQ" }, + { bits = "_________________001_____1100011", name = "BNE" }, + { bits = "_________________100_____1100011", name = "BLT" }, + { bits = "_________________101_____1100011", name = "BGE" }, + { bits = "_________________110_____1100011", name = "BLTU" }, + { bits = "_________________111_____1100011", name = "BGEU" }, + { bits = "_________________000_____0000011", name = "LB", rd0_special = true }, + { bits = "_________________001_____0000011", name = "LH", rd0_special = true }, + { bits = "_________________010_____0000011", name = "LW", rd0_special = true }, + { bits = "_________________100_____0000011", name = "LBU", rd0_special = true }, + { bits = "_________________101_____0000011", name = "LHU", rd0_special = true }, + { bits = "_________________000_____0100011", name = "SB" }, + { bits = "_________________001_____0100011", name = "SH" }, + { bits = "_________________010_____0100011", name = "SW" }, + { bits = "_________________000_____0010011", name = "ADDI", rd0_special = true }, + { bits = "_________________010_____0010011", name = "SLTI", rd0_special = true }, + { bits = "_________________011_____0010011", 
name = "SLTIU", rd0_special = true }, + { bits = "_________________100_____0010011", name = "XORI", rd0_special = true }, + { bits = "_________________110_____0010011", name = "ORI", rd0_special = true }, + { bits = "_________________111_____0010011", name = "ANDI", rd0_special = true }, + { bits = "000000___________001_____0010011", name = "SLLI", rd0_special = true }, + { bits = "000000___________101_____0010011", name = "SRLI", rd0_special = true }, + { bits = "010000___________101_____0010011", name = "SRAI", rd0_special = true }, + { bits = "0000000__________000_____0110011", name = "ADD", rd0_special = true }, + { bits = "0100000__________000_____0110011", name = "SUB", rd0_special = true }, + { bits = "0000000__________001_____0110011", name = "SLL", rd0_special = true }, + { bits = "0000000__________010_____0110011", name = "SLT", rd0_special = true }, + { bits = "0000000__________011_____0110011", name = "SLTU", rd0_special = true }, + { bits = "0000000__________100_____0110011", name = "XOR", rd0_special = true }, + { bits = "0000000__________101_____0110011", name = "SRL", rd0_special = true }, + { bits = "0100000__________101_____0110011", name = "SRA", rd0_special = true }, + { bits = "0000000__________110_____0110011", name = "OR", rd0_special = true }, + { bits = "0000000__________111_____0110011", name = "AND", rd0_special = true }, + { bits = "_________________000_____0001111", name = "FENCE" }, + { bits = "00000000000000000000000001110011", name = "ECALL" }, + { bits = "00000000000100000000000001110011", name = "EBREAK" }, + + -- RV64I + { bits = "_________________110_____0000011", name = "LWU", rd0_special = true }, + { bits = "_________________011_____0000011", name = "LD", rd0_special = true }, + { bits = "_________________011_____0100011", name = "SD" }, + { bits = "_________________000_____0011011", name = "ADDIW", rd0_special = true }, + { bits = "0000000__________001_____0011011", name = "SLLIW", rd0_special = true }, + { bits = 
"0000000__________101_____0011011", name = "SRLIW", rd0_special = true }, + { bits = "0100000__________101_____0011011", name = "SRAIW", rd0_special = true }, + { bits = "0000000__________000_____0111011", name = "ADDW", rd0_special = true }, + { bits = "0100000__________000_____0111011", name = "SUBW", rd0_special = true }, + { bits = "0000000__________001_____0111011", name = "SLLW", rd0_special = true }, + { bits = "0000000__________101_____0111011", name = "SRLW", rd0_special = true }, + { bits = "0100000__________101_____0111011", name = "SRAW", rd0_special = true }, + + -- RV32M extension + { bits = "0000001__________000_____0110011", name = "MUL", rd0_special = true }, + { bits = "0000001__________001_____0110011", name = "MULH", rd0_special = true }, + { bits = "0000001__________010_____0110011", name = "MULHSU", rd0_special = true }, + { bits = "0000001__________011_____0110011", name = "MULHU", rd0_special = true }, + { bits = "0000001__________100_____0110011", name = "DIV", rd0_special = true }, + { bits = "0000001__________101_____0110011", name = "DIVU", rd0_special = true }, + { bits = "0000001__________110_____0110011", name = "REM", rd0_special = true }, + { bits = "0000001__________111_____0110011", name = "REMU", rd0_special = true }, + + -- RV64M + { bits = "0000001__________000_____0111011", name = "MULW", rd0_special = true }, + { bits = "0000001__________100_____0111011", name = "DIVW", rd0_special = true }, + { bits = "0000001__________101_____0111011", name = "DIVUW", rd0_special = true }, + { bits = "0000001__________110_____0111011", name = "REMW", rd0_special = true }, + { bits = "0000001__________111_____0111011", name = "REMUW", rd0_special = true }, + + -- RV32A + { bits = "00010__00000_____010_____0101111", name = "LR.W" }, + { bits = "00011____________010_____0101111", name = "SC.W" }, + { bits = "00001____________010_____0101111", name = "AMOSWAP.W" }, + { bits = "00000____________010_____0101111", name = "AMOADD.W" }, + { bits = 
"00100____________010_____0101111", name = "AMOXOR.W" }, + { bits = "01100____________010_____0101111", name = "AMOAND.W" }, + { bits = "01000____________010_____0101111", name = "AMOOR.W" }, + { bits = "10000____________010_____0101111", name = "AMOMIN.W" }, + { bits = "10100____________010_____0101111", name = "AMOMAX.W" }, + { bits = "11000____________010_____0101111", name = "AMOMINU.W" }, + { bits = "11100____________010_____0101111", name = "AMOMAXU.W" }, + + -- RV64A + { bits = "00010__00000_____011_____0101111", name = "LR.D" }, + { bits = "00011____________011_____0101111", name = "SC.D" }, + { bits = "00001____________011_____0101111", name = "AMOSWAP.D" }, + { bits = "00000____________011_____0101111", name = "AMOADD.D" }, + { bits = "00100____________011_____0101111", name = "AMOXOR.D" }, + { bits = "01100____________011_____0101111", name = "AMOAND.D" }, + { bits = "01000____________011_____0101111", name = "AMOOR.D" }, + { bits = "10000____________011_____0101111", name = "AMOMIN.D" }, + { bits = "10100____________011_____0101111", name = "AMOMAX.D" }, + { bits = "11000____________011_____0101111", name = "AMOMINU.D" }, + { bits = "11100____________011_____0101111", name = "AMOMAXU.D" }, + + -- RV32F extension + { bits = "_________________010_____0000111", name = "FLW" }, + { bits = "_________________010_____0100111", name = "FSW" }, + { bits = "_____00__________________1000011", name = "FMADD.S", rm = true }, + { bits = "_____00__________________1000111", name = "FMSUB.S", rm = true }, + { bits = "_____00__________________1001011", name = "FNMSUB.S", rm = true }, + { bits = "_____00__________________1001111", name = "FNMADD.S", rm = true }, + { bits = "0000000__________________1010011", name = "FADD.S", rm = true }, + { bits = "0000100__________________1010011", name = "FSUB.S", rm = true }, + { bits = "0001000__________________1010011", name = "FMUL.S", rm = true }, + { bits = "0001100__________________1010011", name = "FDIV.S", rm = true }, + { 
bits = "010110000000_____________1010011", name = "FSQRT.S", rm = true }, + { bits = "0010000__________000_____1010011", name = "FSGNJ.S" }, + { bits = "0010000__________001_____1010011", name = "FSGNJN.S" }, + { bits = "0010000__________010_____1010011", name = "FSGNJX.S" }, + { bits = "0010100__________000_____1010011", name = "FMIN.S" }, + { bits = "0010100__________001_____1010011", name = "FMAX.S" }, + { bits = "110000000000_____________1010011", name = "FCVT.W.S", rm = true }, + { bits = "110000000001_____________1010011", name = "FCVT.WU.S", rm = true }, + { bits = "111000000000_____000_____1010011", name = "FMV.X.W" }, + { bits = "1010000__________010_____1010011", name = "FEQ.S" }, + { bits = "1010000__________001_____1010011", name = "FLT.S" }, + { bits = "1010000__________000_____1010011", name = "FLE.S" }, + { bits = "111000000000_____001_____1010011", name = "FCLASS.S" }, + { bits = "110100000000_____________1010011", name = "FCVT.S.W", rm = true }, + { bits = "110100000001_____________1010011", name = "FCVT.S.WU", rm = true }, + { bits = "111100000000_____000_____1010011", name = "FMV.W.X" }, + + -- RV64F + { bits = "110000000010_____________1010011", name = "FCVT.L.S", rm = true }, + { bits = "110000000011_____________1010011", name = "FCVT.LU.S", rm = true }, + { bits = "110100000010_____________1010011", name = "FCVT.S.L", rm = true }, + { bits = "110100000011_____________1010011", name = "FCVT.S.LU", rm = true }, + + -- RV32D + { bits = "_________________011_____0000111", name = "FLD" }, + { bits = "_________________011_____0100111", name = "FSD" }, + { bits = "_____01__________________1000011", name = "FMADD.D", rm = true }, + { bits = "_____01__________________1000111", name = "FMSUB.D", rm = true }, + { bits = "_____01__________________1001011", name = "FNMSUB.D", rm = true }, + { bits = "_____01__________________1001111", name = "FNMADD.D", rm = true }, + { bits = "0000001__________________1010011", name = "FADD.D", rm = true }, + { bits = 
"0000101__________________1010011", name = "FSUB.D", rm = true }, + { bits = "0001001__________________1010011", name = "FMUL.D", rm = true }, + { bits = "0001101__________________1010011", name = "FDIV.D", rm = true }, + { bits = "010110100000_____________1010011", name = "FSQRT.D", rm = true }, + { bits = "0010001__________000_____1010011", name = "FSGNJ.D" }, + { bits = "0010001__________001_____1010011", name = "FSGNJN.D" }, + { bits = "0010001__________010_____1010011", name = "FSGNJX.D" }, + { bits = "0010101__________000_____1010011", name = "FMIN.D" }, + { bits = "0010101__________001_____1010011", name = "FMAX.D" }, + { bits = "010000000001_____________1010011", name = "FCVT.S.D", rm = true }, + { bits = "010000100000_____________1010011", name = "FCVT.D.S", rm = true }, + { bits = "1010001__________010_____1010011", name = "FEQ.D" }, + { bits = "1010001__________001_____1010011", name = "FLT.D" }, + { bits = "1010001__________000_____1010011", name = "FLE.D" }, + { bits = "111000100000_____001_____1010011", name = "FCLASS.D", rm = true }, + { bits = "110000100000_____________1010011", name = "FCVT.W.D", rm = true }, + { bits = "110000100001_____________1010011", name = "FCVT.WU.D", rm = true }, + { bits = "110100100000_____________1010011", name = "FCVT.D.W", rm = true }, + { bits = "110100100001_____________1010011", name = "FCVT.D.WU", rm = true }, + -- RV64D + { bits = "110000100010_____________1010011", name = "FCVT.L.D", rm = true }, + { bits = "110000100011_____________1010011", name = "FCVT.LU.D", rm = true }, + { bits = "111000100000_____000_____1010011", name = "FMV.X.D" }, + { bits = "110100100010_____________1010011", name = "FCVT.D.L", rm = true }, + { bits = "110100100011_____________1010011", name = "FCVT.D.LU", rm = true }, + { bits = "111100100000_____000_____1010011", name = "FMV.D.X" }, + + -- Zifencei extension + { bits = "_________________001_____0001111", name = "FENCE.I" }, + + -- Zicsr extension + { bits = 
"_________________001_____1110011", name = "CSRRW" }, + { bits = "_________________010_____1110011", name = "CSRRS" }, + { bits = "_________________011_____1110011", name = "CSRRC" }, + { bits = "_________________101_____1110011", name = "CSRRWI" }, + { bits = "_________________110_____1110011", name = "CSRRSI" }, + { bits = "_________________111_____1110011", name = "CSRRCI" }, + + -- Privileged + { bits = "00010000001000000000000001110011", name = "SRET" }, + { bits = "00110000001000000000000001110011", name = "MRET" }, + { bits = "01110000001000000000000001110011", name = "MNRET" }, + { bits = "00010000010100000000000001110011", name = "WFI" }, + { bits = "0001001__________000000001110011", name = "SFENCE.VMA" }, +} + +-- Converts an integer `num` to a base 2 string of length `nbits` +local function tobase2(num, nbits) + local t = {} + local bit = 1 << (nbits - 1) + for _ = 1, nbits do + table.insert(t, ((num & bit) ~= 0) and "1" or "0") + bit = bit >> 1 + end + return table.concat(t) +end + +-- Converts a string in base 2 to an integer. +local function frombase2(s) return tonumber(s, 2) end + +local c_insns = {} +local c_insn_by_idx = {} + +-- Fill compressed instructions table according to the RISC-V spec +do + local function add_c_insn(c_insn) + assert(#c_insn.bits == 16) + c_insn.name = c_insn.name:gsub("%.", "_") + table.insert(c_insns, c_insn) + local num_bits = frombase2(c_insn.bits) + assert(c_insn_by_idx[num_bits] == nil, "duplicated compressed instruction") + c_insn_by_idx[num_bits] = c_insn.name + end + do -- quadrant 0 + for rd = 0, (1 << 3) - 1 do + for imm = 1, (1 << 8) - 1 do + add_c_insn({ bits = "000" .. tobase2(imm, 8) .. tobase2(rd, 3) .. "00", name = "C.ADDI4SPN" }) + end + end + + for mid = 0, (1 << 11) - 1 do + add_c_insn({ bits = "001" .. tobase2(mid, 11) .. "00", name = "C.FLD" }) + add_c_insn({ bits = "010" .. tobase2(mid, 11) .. "00", name = "C.LW" }) + add_c_insn({ bits = "011" .. tobase2(mid, 11) .. 
"00", name = "C.LD" }) + add_c_insn({ bits = "101" .. tobase2(mid, 11) .. "00", name = "C.FSD" }) + add_c_insn({ bits = "110" .. tobase2(mid, 11) .. "00", name = "C.SW" }) + add_c_insn({ bits = "111" .. tobase2(mid, 11) .. "00", name = "C.SD" }) + end + end + + do -- quadrant 1 + for rd = 0, (1 << 5) - 1 do + for imm = 0, (1 << 6) - 1 do + if rd == 0 then + add_c_insn({ + bits = "000" .. tobase2(imm & 1, 1) .. tobase2(rd, 5) .. tobase2(imm >> 1, 5) .. "01", + name = imm ~= 0 and "C.NOP" or "C.HINT", + }) + else + add_c_insn({ + bits = "000" .. tobase2(imm & 1, 1) .. tobase2(rd, 5) .. tobase2(imm >> 1, 5) .. "01", + name = imm ~= 0 and "C.ADDI" or "C.HINT", + }) + add_c_insn({ + bits = "001" .. tobase2(imm & 1, 1) .. tobase2(rd, 5) .. tobase2(imm >> 1, 5) .. "01", + name = "C.ADDIW", + }) + end + add_c_insn({ + bits = "010" .. tobase2(imm & 1, 1) .. tobase2(rd, 5) .. tobase2(imm >> 1, 5) .. "01", + name = rd ~= 0 and "C.LI" or "C.HINT", + }) + + if imm ~= 0 then + if rd == 2 then + add_c_insn({ + bits = "011" .. tobase2(imm & 1, 1) .. tobase2(rd, 5) .. tobase2(imm >> 1, 5) .. "01", + name = rd ~= 0 and "C.ADDI16SP" or "C.HINT", + }) + else + add_c_insn({ + bits = "011" .. tobase2(imm & 1, 1) .. tobase2(rd, 5) .. tobase2(imm >> 1, 5) .. "01", + name = rd ~= 0 and "C.LUI" or "C.HINT", + }) + end + end + end + end + + for rd = 0, (1 << 3) - 1 do + for imm = 0, (1 << 6) - 1 do + add_c_insn({ + bits = "100" .. tobase2(imm & 1, 1) .. "00" .. tobase2(rd, 3) .. tobase2(imm >> 1, 5) .. "01", + name = imm ~= 0 and "C.SRLI" or "C.HINT", + }) + add_c_insn({ + bits = "100" .. tobase2(imm & 1, 1) .. "01" .. tobase2(rd, 3) .. tobase2(imm >> 1, 5) .. "01", + name = imm ~= 0 and "C.SRAI" or "C.HINT", + }) + add_c_insn({ + bits = "100" .. tobase2(imm & 1, 1) .. "10" .. tobase2(rd, 3) .. tobase2(imm >> 1, 5) .. "01", + name = "C.ANDI", + }) + end + end + + for rs1 = 0, (1 << 3) - 1 do + for rs2 = 0, (1 << 3) - 1 do + add_c_insn({ bits = "100011" .. tobase2(rs1, 3) .. "00" .. 
tobase2(rs2, 3) .. "01", name = "C.SUB" }) + add_c_insn({ bits = "100011" .. tobase2(rs1, 3) .. "01" .. tobase2(rs2, 3) .. "01", name = "C.XOR" }) + add_c_insn({ bits = "100011" .. tobase2(rs1, 3) .. "10" .. tobase2(rs2, 3) .. "01", name = "C.OR" }) + add_c_insn({ bits = "100011" .. tobase2(rs1, 3) .. "11" .. tobase2(rs2, 3) .. "01", name = "C.AND" }) + add_c_insn({ bits = "100111" .. tobase2(rs1, 3) .. "00" .. tobase2(rs2, 3) .. "01", name = "C.SUBW" }) + add_c_insn({ bits = "100111" .. tobase2(rs1, 3) .. "01" .. tobase2(rs2, 3) .. "01", name = "C.ADDW" }) + end + end + + for mid = 0, (1 << 11) - 1 do + add_c_insn({ bits = "101" .. tobase2(mid, 11) .. "01", name = "C.J" }) + add_c_insn({ bits = "110" .. tobase2(mid, 11) .. "01", name = "C.BEQZ" }) + add_c_insn({ bits = "111" .. tobase2(mid, 11) .. "01", name = "C.BNEZ" }) + end + end + + do -- quadrant 2 + for rd = 0, (1 << 5) - 1 do + for imm = 0, (1 << 6) - 1 do + add_c_insn({ + bits = "000" .. tobase2(imm & 1, 1) .. tobase2(rd, 5) .. tobase2(imm >> 1, 5) .. "10", + name = (imm ~= 0 and rd ~= 0) and "C.SLLI" or "C.HINT", + }) + + add_c_insn({ + bits = "001" .. tobase2(imm & 1, 1) .. tobase2(rd, 5) .. tobase2(imm >> 1, 5) .. "10", + name = "C.FLDSP", + }) + + if rd ~= 0 then + add_c_insn({ + bits = "010" .. tobase2(imm & 1, 1) .. tobase2(rd, 5) .. tobase2(imm >> 1, 5) .. "10", + name = "C.LWSP", + }) + add_c_insn({ + bits = "011" .. tobase2(imm & 1, 1) .. tobase2(rd, 5) .. tobase2(imm >> 1, 5) .. "10", + name = "C.LDSP", + }) + end + + do + local rs2 = rd + add_c_insn({ bits = "101" .. tobase2(imm, 6) .. tobase2(rs2, 5) .. "10", name = "C.FSDSP" }) + add_c_insn({ bits = "110" .. tobase2(imm, 6) .. tobase2(rs2, 5) .. "10", name = "C.SWSP" }) + add_c_insn({ bits = "111" .. tobase2(imm, 6) .. tobase2(rs2, 5) .. "10", name = "C.SDSP" }) + end + end + end + + for rs1 = 0, (1 << 5) - 1 do + for rs2 = 0, (1 << 5) - 1 do + if rs2 == 0 then + if rs1 == 0 then + add_c_insn({ bits = "1001" .. tobase2(rs1, 5) .. 
tobase2(rs2, 5) .. "10", name = "C.EBREAK" }) + else + add_c_insn({ bits = "1000" .. tobase2(rs1, 5) .. tobase2(rs2, 5) .. "10", name = "C.JR" }) + add_c_insn({ bits = "1001" .. tobase2(rs1, 5) .. tobase2(rs2, 5) .. "10", name = "C.JALR" }) + end + elseif rs2 ~= 0 then + add_c_insn({ + bits = "1000" .. tobase2(rs1, 5) .. tobase2(rs2, 5) .. "10", + name = rs1 ~= 0 and "C.MV" or "C.HINT", + }) + add_c_insn({ + bits = "1001" .. tobase2(rs1, 5) .. tobase2(rs2, 5) .. "10", + name = rs1 ~= 0 and "C.ADD" or "C.HINT", + }) + end + end + end + end +end + +-- Replace FD instructions that needs rounding discarding invalid round modes +local valid_rms = { + "000", -- rne + "001", -- rtz + "010", -- rdn + "011", -- rup + "100", -- rmm + "111", -- dyn +} +for _, insn in ipairs(insns) do + if insn.rm then + local lbits, rbits = insn.bits:sub(1, 17), insn.bits:sub(21, 32) + insn.bits = lbits .. valid_rms[1] .. rbits + insn.rm = nil + for i = 2, #valid_rms do + table.insert(insns, { bits = lbits .. valid_rms[i] .. rbits, name = insn.name }) + end + end +end + +-- Table use to rename a group of instructions to a single name. 
+local group_names = { + -- I + ["ADD|SUB|MUL"] = "ADD_MUL_SUB", + ["ADDW|SUBW|MULW"] = "ADDW_MULW_SUBW", + ["SRL|SRA|DIVU"] = "SRL_DIVU_SRA", + ["SRLW|SRAW|DIVUW"] = "SRLW_DIVUW_SRAW", + -- A + ["LR.W|SC.W|AMOSWAP.W|AMOADD.W|AMOXOR.W|AMOAND.W|AMOOR.W|AMOMIN.W|AMOMAX.W|AMOMINU.W|AMOMAXU.W"] = "AMO_W", + ["LR.D|SC.D|AMOSWAP.D|AMOADD.D|AMOXOR.D|AMOAND.D|AMOOR.D|AMOMIN.D|AMOMAX.D|AMOMINU.D|AMOMAXU.D"] = "AMO_D", + -- FD + ["FMADD.S|FMADD.D"] = "FMADD", + ["FMSUB.S|FMSUB.D"] = "FMSUB", + ["FNMADD.S|FNMADD.D"] = "FNMADD", + ["FNMSUB.S|FNMSUB.D"] = "FNMSUB", + ["FADD.S|FSUB.S|FMUL.S|FDIV.S|FSQRT.S|FSGNJ.S|FMIN.S|FCVT.W.S|FCVT.WU.S|FMV.X.W|FLE.S|FCVT.S.W|FCVT.S.WU|FMV.W.X|FCVT.L.S|FCVT.LU.S|FCVT.S.L|FCVT.S.LU|FADD.D|FSUB.D|FMUL.D|FDIV.D|FSQRT.D|FSGNJ.D|FMIN.D|FCVT.S.D|FCVT.D.S|FLE.D|FCLASS.D|FCVT.W.D|FCVT.WU.D|FCVT.D.W|FCVT.D.WU|FCVT.L.D|FCVT.LU.D|FMV.X.D|FCVT.D.L|FCVT.D.LU|FMV.D.X"] = "FD", + ["FSGNJN.S|FMAX.S|FLT.S|FCLASS.S|FSGNJN.D|FMAX.D|FLT.D|FADD.S|FSUB.S|FMUL.S|FDIV.S|FSQRT.S|FCVT.W.S|FCVT.WU.S|FCVT.S.W|FCVT.S.WU|FCVT.L.S|FCVT.LU.S|FCVT.S.L|FCVT.S.LU|FADD.D|FSUB.D|FMUL.D|FDIV.D|FSQRT.D|FCVT.S.D|FCVT.D.S|FCLASS.D|FCVT.W.D|FCVT.WU.D|FCVT.D.W|FCVT.D.WU|FCVT.L.D|FCVT.LU.D|FCVT.D.L|FCVT.D.LU"] = "FD", + ["FSGNJX.S|FEQ.S|FSGNJX.D|FEQ.D|FADD.S|FSUB.S|FMUL.S|FDIV.S|FSQRT.S|FCVT.W.S|FCVT.WU.S|FCVT.S.W|FCVT.S.WU|FCVT.L.S|FCVT.LU.S|FCVT.S.L|FCVT.S.LU|FADD.D|FSUB.D|FMUL.D|FDIV.D|FSQRT.D|FCVT.S.D|FCVT.D.S|FCLASS.D|FCVT.W.D|FCVT.WU.D|FCVT.D.W|FCVT.D.WU|FCVT.L.D|FCVT.LU.D|FCVT.D.L|FCVT.D.LU"] = "FD", + ["FADD.S|FSUB.S|FMUL.S|FDIV.S|FSQRT.S|FCVT.W.S|FCVT.WU.S|FCVT.S.W|FCVT.S.WU|FCVT.L.S|FCVT.LU.S|FCVT.S.L|FCVT.S.LU|FADD.D|FSUB.D|FMUL.D|FDIV.D|FSQRT.D|FCVT.S.D|FCVT.D.S|FCLASS.D|FCVT.W.D|FCVT.WU.D|FCVT.D.W|FCVT.D.WU|FCVT.L.D|FCVT.LU.D|FCVT.D.L|FCVT.D.LU"] = "FD", + -- privileged + ["ECALL|EBREAK|SRET|MRET|MNRET|WFI|SFENCE.VMA"] = "PRIVILEGED", + ["SFENCE.VMA"] = "PRIVILEGED", +} + +--[[ +Instruction mask bits +- 4 bits on the left (1 bit + funct3) +- 7 bits on the right (funtc7) +- 
Checking these bits is enough to make a big switch covering most uncompressed/compressed instructions. +]] +local lmask_bits = 4 +local rmask_bits = 7 +local mask_bits = lmask_bits + rmask_bits +local lmask = (1 << lmask_bits) - 1 +local rmask = (1 << rmask_bits) - 1 + +-- Labels +local labels = { ["ILLEGAL"] = true, [1] = { name = "ILLEGAL", i = 1 << (mask_bits + 1) } } + +-- Checks if the a string of bits match the mask of string bits, "_" is accepted as bit placeholder. +local function matchmask(bits, mask) + assert(#bits == 32 and #mask == 32) + for i = 1, 32 do + local b, m = bits:sub(i, i), mask:sub(i, i) + if b ~= "_" and m ~= "_" and b ~= m then return false end + end + return true +end + +-- Generate the jump table +local jumptable = {} +for i = 0, ((1 << mask_bits) - 1) do + local mask = "________________" + .. tobase2((i >> rmask_bits) & lmask, lmask_bits) + .. "_____" + .. tobase2(i & rmask, rmask_bits) + local matches = {} + local firstindex + local rd0_special + for j, insn in ipairs(insns) do + if matchmask(insn.bits, mask) and not matches[insn.name] then + if #matches == 0 then + rd0_special = insn.rd0_special + elseif rd0_special ~= insn.rd0_special then + rd0_special = nil + end + matches[insn.name] = true + table.insert(matches, insn.name) + firstindex = math.min(firstindex or j, j) + end + end + local namekey = table.concat(matches, "|") + local name = group_names[namekey] or namekey:gsub("%.", "_"):gsub("|", "_") + if #name == 0 then name = "ILLEGAL" end + if not labels[name] then + labels[name] = true + if rd0_special then + table.insert(labels, { name = name .. "_rd0", i = firstindex * 10 + 1 }) + table.insert(labels, { name = name .. "_rdN", i = firstindex * 10 + 2 }) + else + table.insert(labels, { name = name, i = firstindex * 10 }) + end + end + assert(#name < 18, namekey) + for rd = 0, 31 do + local ename = name + if rd0_special then + if rd == 0 then + ename = ename .. "_rd0" + else + ename = ename .. 
"_rdN" + end + end + local emask = mask:sub(1, 20) .. tobase2(rd, 5) .. mask:sub(26, 32) + local idx = frombase2(emask:match("[0-1]+")) + if ename == "ILLEGAL" then -- check for compressed instruction + ename = c_insn_by_idx[idx] or ename + end + jumptable[idx + 1] = ename + end +end +-- Make sure the jump table has exactly 64KB +assert(#jumptable == 65536) + +-- Sort labels by its definition order +table.sort(labels, function(a, b) return a.i < b.i end) + +-- Add compressed instructions to the labels +for _, c_insn in ipairs(c_insns) do + if not labels[c_insn.name] then + labels[c_insn.name] = true + table.insert(labels, #labels, { name = c_insn.name }) + end +end + +-- Make sure labels can fit a byte +assert(#labels <= 256) + +-- Emit the jump table header +io.write([[ +// Copyright Cartesi and individual authors (see AUTHORS) +// SPDX-License-Identifier: LGPL-3.0-or-later +// +// This program is free software: you can redistribute it and/or modify it under +// the terms of the GNU Lesser General Public License as published by the Free +// Software Foundation, either version 3 of the License, or (at your option) any +// later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT ANY +// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A +// PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License along +// with this program (see COPYING). If not, see . +// + +// THIS FILE WAS GENERATED BY "lua tools/gen-interpret-jump-table.lua", +// DO NOT EDIT IT DIRECTLY, EDIT THE GENERATOR SCRIPT INSTEAD. 
+ +#ifndef INTERPRET_JUMP_TABLE_H +#define INTERPRET_JUMP_TABLE_H + +// NOLINTBEGIN(cppcoreguidelines-macro-usage) +#pragma GCC diagnostic ignored "-Wpedantic" +#pragma GCC diagnostic push + +#if !defined(NO_COMPUTED_GOTO) && defined(__GNUC__) && !defined(__wasm__) +#define USE_COMPUTED_GOTO +#endif + +#ifdef USE_COMPUTED_GOTO + +#define INSN_LABEL(x) &&x +#define INSN_CASE(x) x +#define INSN_BREAK() goto NEXT_INSN +#define INSN_SWITCH(x) goto *insn_jumptable[x]; +#define INSN_SWITCH_OUT() \ + NEXT_INSN: +#define INSN_JUMPTABLE_TYPE void * + +#else + +#define INSN_LABEL(x) insn_label_id::x +#define INSN_CASE(x) case insn_label_id::x +#define INSN_BREAK() break +#define INSN_SWITCH(x) switch (insn_jumptable[x]) +#define INSN_SWITCH_OUT() +#define INSN_JUMPTABLE_TYPE insn_label_id + +]]) + +-- Emit labels +io.write("enum class insn_label_id : unsigned char {\n") +for _, label in ipairs(labels) do + io.write(" " .. label.name .. ",\n") +end +io.write([[}; + +#endif // USE_COMPUTED_GOTO + +]]) + +-- Emit the jump table +io.write("static const INSN_JUMPTABLE_TYPE insn_jumptable[", #jumptable, "] = {\n") +io.write("#ifndef CLANG_TIDY_LINT // Disable clang-tidy via an ifdef because it's too slow\n") +for i, name in ipairs(jumptable) do + io.write(string.format("%-40s", " INSN_LABEL(" .. name .. "),"), " // " .. string.format("%4d", (i - 1)) .. 
"\n") +end +io.write("#else\n") +io.write(" INSN_LABEL(ILLEGAL)\n") +io.write("#endif\n") +io.write("};\n") + +-- Emit the jump table footer +io.write([[ + +#pragma GCC diagnostic pop +// NOLINTEND(cppcoreguidelines-macro-usage) + +#endif // INTERPRET_JUMP_TABLE_H +]]) + +io.flush() diff --git a/uarch/uarch-machine-state-access.h b/uarch/uarch-machine-state-access.h index 01254115a..4e29a7c5a 100644 --- a/uarch/uarch-machine-state-access.h +++ b/uarch/uarch-machine-state-access.h @@ -148,12 +148,12 @@ class uarch_pma_entry final { // Provides access to the state of the big emulator from microcode class uarch_machine_state_access : public i_state_access { - std::array, PMA_MAX> m_pmas; + std::array, PMA_MAX> &m_pmas; //NOLINT(cppcoreguidelines-avoid-const-or-ref-data-members) public: - uarch_machine_state_access() = default; - uarch_machine_state_access(const uarch_machine_state_access &other) = delete; - uarch_machine_state_access(uarch_machine_state_access &&other) = delete; + explicit uarch_machine_state_access(std::array, PMA_MAX>& pmas) : m_pmas(pmas) {} + uarch_machine_state_access(const uarch_machine_state_access &other) = default; + uarch_machine_state_access(uarch_machine_state_access &&other) = default; uarch_machine_state_access &operator=(const uarch_machine_state_access &other) = delete; uarch_machine_state_access &operator=(uarch_machine_state_access &&other) = delete; ~uarch_machine_state_access() = default; @@ -534,7 +534,7 @@ class uarch_machine_state_access : public i_state_access do_poll_external_interrupts(uint64_t mcycle, uint64_t /*mcycle_max*/) { return {mcycle, false}; } - + uint64_t do_read_pma_istart(int i) { return raw_read_memory(shadow_pmas_get_pma_abs_addr(i)); } diff --git a/uarch/uarch-run.cpp b/uarch/uarch-run.cpp index f077771f0..39222d25e 100644 --- a/uarch/uarch-run.cpp +++ b/uarch/uarch-run.cpp @@ -36,18 +36,20 @@ static void set_uarch_halt_flag() { } // Let the state accessor be on static memory storage to speed up uarch 
initialization -static uarch_machine_state_access a; // NOLINT(cppcoreguidelines-avoid-non-const-global-variables) +static std::array, PMA_MAX> + pmas; // NOLINT(cppcoreguidelines-avoid-non-const-global-variables) namespace cartesi { // Declaration of explicit instantiation in module interpret.cpp when compiled with microarchitecture -extern template interpreter_break_reason interpret(uarch_machine_state_access &a, uint64_t mcycle_end); +extern template interpreter_break_reason interpret(uarch_machine_state_access a, uint64_t mcycle_end); } // namespace cartesi /// \brief Advances one mcycle by executing the "big machine interpreter" compiled to the microarchitecture /// \return This function never returns extern "C" NO_RETURN void interpret_next_mcycle_with_uarch() { + uarch_machine_state_access a(pmas); const uint64_t mcycle_end = a.read_mcycle() + 1; interpret(a, mcycle_end); // Finished executing a whole mcycle: halt the microarchitecture