Skip to content

Commit

Permalink
feat: optimize instruction fetch and decoding with big jump tables
Browse files Browse the repository at this point in the history
  • Loading branch information
edubart committed Dec 18, 2024
1 parent 3cc7963 commit ccd366b
Show file tree
Hide file tree
Showing 23 changed files with 2,181 additions and 1,462 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@ jobs:
uarch-pristine-ram.c
uarch-pristine-hash.c
machine-c-version.h
interpret-jump-table.h
cartesi-machine-v${{ env.MACHINE_EMULATOR_VERSION }}_amd64.deb
cartesi-machine-v${{ env.MACHINE_EMULATOR_VERSION }}_arm64.deb
Expand Down Expand Up @@ -726,6 +727,7 @@ jobs:
if: ${{ startsWith(github.ref, 'refs/tags/v') }}
run: |
mv artifacts/machine-c-version.h src
mv artifacts/interpret-jump-table.h src
mv artifacts/uarch-pristine-ram.c uarch
mv artifacts/uarch-pristine-hash.c uarch
make create-generated-files-patch
Expand Down
9 changes: 6 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,7 @@ export CXX=g++

endif

GENERATED_FILES= uarch/uarch-pristine-hash.c uarch/uarch-pristine-ram.c src/machine-c-version.h
GENERATED_FILES= uarch/uarch-pristine-hash.c uarch/uarch-pristine-ram.c src/machine-c-version.h src/interpret-jump-table.h
ADD_GENERATED_FILES_DIFF= add-generated-files.diff

all: source-default
Expand Down Expand Up @@ -244,12 +244,15 @@ lint-% check-format-% format-% check-format-lua-% check-lua-% format-lua-%:
source-default:
@eval $$($(MAKE) -s --no-print-directory env); $(MAKE) -C $(SRCDIR)

uarch: $(SRCDIR)/machine-c-version.h
uarch: $(SRCDIR)/machine-c-version.h $(SRCDIR)/interpret-jump-table.h
@eval $$($(MAKE) -s --no-print-directory env); $(MAKE) -C uarch

$(SRCDIR)/machine-c-version.h:
@eval $$($(MAKE) -s --no-print-directory env); $(MAKE) -C $(SRCDIR) machine-c-version.h

# Produce the generated header interpret-jump-table.h inside $(SRCDIR) by
# delegating to the Makefile in that directory, after loading the variables
# printed by this Makefile's `env` target into the recipe's shell.
# The rule has no prerequisites, so the recipe only runs when the file is missing.
$(SRCDIR)/interpret-jump-table.h:
@eval $$($(MAKE) -s --no-print-directory env); $(MAKE) -C $(SRCDIR) interpret-jump-table.h

build-emulator-builder-image:
docker build $(DOCKER_PLATFORM) --build-arg DEBUG=$(debug) --build-arg COVERAGE=$(coverage) --build-arg SANITIZE=$(sanitize) --target builder -t cartesi/machine-emulator:builder -f Dockerfile .

Expand Down Expand Up @@ -282,6 +285,7 @@ copy:
docker create --name uarch-ram-bin $(DOCKER_PLATFORM) $(DEBIAN_IMG)
docker cp uarch-ram-bin:/usr/src/emulator/$(DEB_FILENAME) .
docker cp uarch-ram-bin:/usr/src/emulator/src/machine-c-version.h .
docker cp uarch-ram-bin:/usr/src/emulator/src/interpret-jump-table.h .
docker cp uarch-ram-bin:/usr/src/emulator/uarch/uarch-ram.bin .
docker cp uarch-ram-bin:/usr/src/emulator/uarch/uarch-pristine-ram.c .
docker cp uarch-ram-bin:/usr/src/emulator/uarch/uarch-pristine-hash.c .
Expand Down Expand Up @@ -399,4 +403,3 @@ $(ADD_GENERATED_FILES_DIFF): $(GENERATED_FILES)

.PHONY: help all submodules doc clean distclean src luacartesi hash uarch \
create-generated-files-patch $(SUBDIRS) $(SUBCLEAN)

1 change: 1 addition & 0 deletions src/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,4 @@ compile_flags.txt
coverage*
jsonrpc-discover.cpp
machine-c-version.h
interpret-jump-table.h
44 changes: 28 additions & 16 deletions src/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -214,18 +214,27 @@ SHA3_CFLAGS=-O3

# Optimization flags for the interpreter
ifneq (,$(filter yes,$(relwithdebinfo) $(release)))
ifneq (,$(filter gcc,$(CC)))
# The following flag helps GCC to eliminate more redundant computations in the interpret loop,
# saving some host instructions and improving performance.
# This flag is usually enabled by default at -O3,
# but we don't use -O3 because it enables some other flags that are not worth for the interpreter.
INTERPRET_CXXFLAGS+=-fgcse-after-reload -fpredictive-commoning -fsplit-paths -ftree-partial-pre
ifneq (,$(findstring gcc,$(CC)))
# The following improves computed goto dispatch, as stated in the GCC manual
INTERPRET_CXXFLAGS+=-fno-gcse
# The following removes extra jumps in the computed goto dispatch
INTERPRET_CXXFLAGS+=-fno-crossjumping
# The following removes extra NOPs before jumping back to the interpret hot loop
INTERPRET_CXXFLAGS+=-fno-align-loops
# The interpreter dispatch loop performs better as a big inlined function
INTERPRET_CXXFLAGS+=-finline-limit=1024
# The interpreter hot loop is big and puts pressure on register allocation; the following improves register use
INTERPRET_CXXFLAGS+=-frename-registers -fweb
# The interpreter instruction dispatch is big; the following reduces its size, minimizing CPU cache pressure
INTERPRET_CXXFLAGS+=-freorder-blocks-algorithm=simple
# Some distributions enable the stack protector by default; make sure it's disabled
INTERPRET_CXXFLAGS+=-fno-stack-protector
endif
# Disable jump tables, because it degrades the instruction decoding performance in the interpret loop,
# since it generates a memory indirection that has a high cost in opcode switches.
INTERPRET_CXXFLAGS+=-fno-jump-tables
endif

# Make testing new optimization options easier
INTERPRET_CXXFLAGS+=$(MYINTERPRET_CXXFLAGS)

# Link time optimizations
ifeq ($(lto),yes)
OPTFLAGS+=-flto=auto
Expand Down Expand Up @@ -262,7 +271,7 @@ PGO_WORKLOAD=\
whetstone 25000

LINTER_IGNORE_SOURCES=
LINTER_IGNORE_HEADERS=
LINTER_IGNORE_HEADERS=interpret-jump-table.h
LINTER_SOURCES=$(filter-out $(LINTER_IGNORE_SOURCES),$(strip $(wildcard *.cpp) $(wildcard *.c)))
LINTER_HEADERS=$(filter-out $(LINTER_IGNORE_HEADERS),$(strip $(wildcard *.hpp) $(wildcard *.h)))

Expand All @@ -273,7 +282,7 @@ CLANG_FORMAT=clang-format
CLANG_FORMAT_UARCH_FILES:=$(wildcard ../uarch/*.cpp)
CLANG_FORMAT_UARCH_FILES:=$(filter-out %uarch-printf%,$(strip $(CLANG_FORMAT_UARCH_FILES)))
CLANG_FORMAT_FILES:=$(wildcard *.cpp) $(wildcard *.c) $(wildcard *.h) $(wildcard *.hpp) $(CLANG_FORMAT_UARCH_FILES)
CLANG_FORMAT_IGNORE_FILES:=
CLANG_FORMAT_IGNORE_FILES:=interpret-jump-table.h
CLANG_FORMAT_FILES:=$(strip $(CLANG_FORMAT_FILES))
CLANG_FORMAT_FILES:=$(filter-out $(CLANG_FORMAT_IGNORE_FILES),$(strip $(CLANG_FORMAT_FILES)))

Expand Down Expand Up @@ -542,12 +551,12 @@ jsonrpc-discover.cpp: jsonrpc-discover.json
echo '} // namespace cartesi' >> jsonrpc-discover.cpp

%.clang-tidy: %.cpp machine-c-version.h
@$(CLANG_TIDY) --header-filter='$(CLANG_TIDY_HEADER_FILTER)' $(CLANG_TIDY_FLAGS) $< -- $(CXXFLAGS) $(LUA_INC) 2>/dev/null
@$(CLANG_TIDY) --header-filter='$(CLANG_TIDY_HEADER_FILTER)' $(CLANG_TIDY_FLAGS) $< -- $(CXXFLAGS) $(LUA_INC) -DCLANG_TIDY_LINT 2>/dev/null
@$(CXX) $(CXXFLAGS) $(LUA_INC) $< -MM -MT $@ -MF $@.d > /dev/null 2>&1
@touch $@

%.clang-tidy: %.c
@$(CLANG_TIDY) --header-filter='$(CLANG_TIDY_HEADER_FILTER)' $(CLANG_TIDY_FLAGS) $< -- $(CFLAGS) 2>/dev/null
@$(CLANG_TIDY) --header-filter='$(CLANG_TIDY_HEADER_FILTER)' $(CLANG_TIDY_FLAGS) $< -- $(CFLAGS) -DCLANG_TIDY_LINT 2>/dev/null
@$(CC) $(CFLAGS) $< -MM -MT $@ -MF $@.d > /dev/null 2>&1
@touch $@

Expand All @@ -560,7 +569,10 @@ uarch-pristine-ram.o: $(UARCH_PRISTINE_RAM_C)
uarch-pristine-hash.o: $(UARCH_PRISTINE_HASH_C)
$(CC) $(CFLAGS) -c -o $@ $<

interpret.o: interpret.cpp machine-c-version.h
# Generate interpret-jump-table.h by executing the Lua generator script and
# capturing its standard output.
# The output goes to a temporary file that is renamed onto the target only when
# the generator succeeds, so a failed run never leaves behind a truncated
# header that make would consider up to date.
interpret-jump-table.h: ../tools/gen-interpret-jump-table.lua
	$< > $@.tmp || { rm -f $@.tmp; exit 1; }
	mv -f $@.tmp $@

# Compile the interpreter translation unit with the regular CXXFLAGS plus the
# interpreter-specific INTERPRET_CXXFLAGS.
# The generated headers are listed as prerequisites so that they are built
# before interpret.cpp is compiled; only the first prerequisite ($<) is passed
# to the compiler.
interpret.o: interpret.cpp interpret-jump-table.h machine-c-version.h
$(CXX) $(CXXFLAGS) $(INTERPRET_CXXFLAGS) -c -o $@ $<

%.o: %.cpp machine-c-version.h
Expand All @@ -571,7 +583,7 @@ interpret.o: interpret.cpp machine-c-version.h

../uarch/uarch-pristine-ram.c ../uarch/uarch-pristine-hash.c: generate-uarch-pristine

generate-uarch-pristine:
generate-uarch-pristine: machine-c-version.h interpret-jump-table.h
ifeq (,$(wildcard ../uarch/uarch-pristine-hash.c))
@if [ "$(DEV_ENV_HAS_TOOLCHAIN)" = "yes" ]; then \
$(MAKE) -C .. uarch; \
Expand All @@ -583,7 +595,7 @@ endif
clean: clean-auto-generated clean-coverage clean-profile clean-tidy clean-libcartesi clean-executables

clean-auto-generated:
@rm -f jsonrpc-discover.cpp machine-c-version.h
@rm -f jsonrpc-discover.cpp machine-c-version.h interpret-jump-table.h

clean-tidy:
@rm -f *.clang-tidy
Expand Down
4 changes: 2 additions & 2 deletions src/device-state-access.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ namespace cartesi {
template <typename STATE_ACCESS>
class device_state_access : public i_device_state_access {
public:
explicit device_state_access(STATE_ACCESS &a, uint64_t mcycle) : m_a(a), m_mcycle(mcycle) {
explicit device_state_access(STATE_ACCESS a, uint64_t mcycle) : m_a(a), m_mcycle(mcycle) {
static_assert(is_an_i_state_access<STATE_ACCESS>::value, "not an i_state_access");
}

Expand All @@ -52,7 +52,7 @@ class device_state_access : public i_device_state_access {
~device_state_access() override = default;

private:
STATE_ACCESS &m_a; // NOLINT(cppcoreguidelines-avoid-const-or-ref-data-members)
STATE_ACCESS m_a; // NOLINT(cppcoreguidelines-avoid-const-or-ref-data-members)
uint64_t m_mcycle;

void do_set_mip(uint64_t mask) override {
Expand Down
5 changes: 3 additions & 2 deletions src/i-state-access.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#include <type_traits>
#include <utility>

#include "compiler-defines.h"
#include "meta.h"
#include "shadow-tlb.h"

Expand Down Expand Up @@ -773,15 +774,15 @@ class i_state_access { // CRTP
}

/// \brief Invalidates all TLB entries of all types.
void flush_all_tlb() {
NO_INLINE void flush_all_tlb() {
derived().template flush_tlb_type<TLB_CODE>();
derived().template flush_tlb_type<TLB_READ>();
derived().template flush_tlb_type<TLB_WRITE>();
}

/// \brief Invalidates TLB entries for a specific virtual address.
/// \param vaddr Target virtual address.
void flush_tlb_vaddr(uint64_t vaddr) {
NO_INLINE void flush_tlb_vaddr(uint64_t vaddr) {
return derived().do_flush_tlb_vaddr(vaddr);
}

Expand Down
Loading

0 comments on commit ccd366b

Please sign in to comment.