diff --git a/.github/workflows/build-syso.yml b/.github/workflows/build-syso.yml new file mode 100644 index 0000000..8c6568c --- /dev/null +++ b/.github/workflows/build-syso.yml @@ -0,0 +1,99 @@ +name: Build syso files + +on: + workflow_dispatch: # Manual trigger + push: + paths: + - 'src/**' + +jobs: + build-syso: + name: Build ${{ matrix.settings.name }} + runs-on: ${{ matrix.settings.runner }} + strategy: + fail-fast: false + matrix: + settings: + - name: Linux x86 + runner: ubuntu-latest + syso_name: hashtree_amd64.syso + - name: macOS arm64 + runner: macos-latest + syso_name: hashtree_darwin_arm64.syso + - name: Linux arm64 + runner: ubuntu-latest + cc: aarch64-linux-gnu-gcc + cross_pkg: gcc-aarch64-linux-gnu + syso_name: hashtree_linux_arm64.syso + - name: Windows x86 + runner: windows-latest + syso_name: hashtree_windows_amd64.syso + - name: Linux RISC-V 64 + runner: ubuntu-latest + cc: riscv64-linux-gnu-gcc + cross_pkg: gcc-riscv64-linux-gnu + syso_name: hashtree_linux_riscv64.syso + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Setup Go + uses: actions/setup-go@v5 + with: + go-version: '1.22.x' + + - name: Install cross-compiler + if: ${{ matrix.settings.cross_pkg }} + run: | + sudo apt-get update + sudo apt-get install -y ${{ matrix.settings.cross_pkg }} + + - name: Build (cross-compile) + if: ${{ matrix.settings.cc }} + shell: bash + run: | + CC=${{ matrix.settings.cc }} make go_bindings + mv -f hashtree.syso ${{ matrix.settings.syso_name }} + + - name: Build (native) + if: ${{ !matrix.settings.cc }} + shell: bash + run: | + make go_bindings + mv -f hashtree.syso ${{ matrix.settings.syso_name }} + + - name: Upload artifact + uses: actions/upload-artifact@v4 + with: + name: ${{ matrix.settings.syso_name }} + path: ${{ matrix.settings.syso_name }} + + commit-syso: + name: Commit syso files + needs: build-syso + runs-on: ubuntu-latest + permissions: + contents: write + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Download all artifacts + uses: actions/download-artifact@v4 + with: + path: artifacts + + - name: Move syso files to root + run: | + for dir in artifacts/*/; do + mv "$dir"/*.syso . 
+ done + ls -la *.syso + + - name: Commit and push + run: | + git config user.name "github-actions[bot]" + git config user.email "github-actions[bot]@users.noreply.github.com" + git add *.syso + git diff --staged --quiet || git commit -m "Update syso files" + git push diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 8301f44..99031d1 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -32,16 +32,30 @@ jobs: - name: Windows arm64 target: aarch64-pc-windows-msvc runner: windows-latest + - name: Linux RISC-V 64 + target: riscv64-unknown-linux-gnu + runner: ubuntu-latest + cc: riscv64-linux-gnu-gcc + qemu: qemu-riscv64 steps: - name: Checkout uses: actions/checkout@v4 + - name: Install RISC-V toolchain and QEMU + if: ${{ matrix.settings.qemu }} + run: | + sudo apt-get update + sudo apt-get install -y gcc-riscv64-linux-gnu qemu-user - name: Cross-Compile Build if: ${{ matrix.settings.cc != '' }} run: CC=${{ matrix.settings.cc }} make all - name: Build if: ${{ !matrix.settings.cc }} run: make all + - name: Run tests with QEMU + if: ${{ matrix.settings.qemu }} + run: ${{ matrix.settings.qemu }} -cpu rv64,zbb=true,zbc=true,zbkb=true,zbkc=true,zbkx=true,zknd=true,zkne=true,zknh=true,zkt=true -L /usr/riscv64-linux-gnu ./build/test - name: Run tests + if: ${{ !matrix.settings.qemu }} run: ./build/test go-bindings: @@ -75,6 +89,12 @@ jobs: - name: Windows arm64 target: aarch64-pc-windows-msvc runner: windows-latest + - name: Linux RISC-V 64 + target: riscv64-unknown-linux-gnu + runner: ubuntu-latest + cc: riscv64-linux-gnu-gcc + qemu: qemu-riscv64 + goarch: riscv64 steps: - name: Checkout uses: actions/checkout@v4 @@ -82,15 +102,32 @@ jobs: uses: actions/setup-go@v5 with: go-version: ${{ matrix.go-version }} + - name: Install RISC-V toolchain and QEMU + if: ${{ matrix.settings.qemu }} + run: | + sudo apt-get update + sudo apt-get install -y gcc-riscv64-linux-gnu qemu-user - name: Cross-Compile Build - if: ${{ matrix.settings.cc != '' }} + if: ${{ matrix.settings.cc != '' && !matrix.settings.qemu }} run: CC=${{ matrix.settings.cc }} make go_bindings - name: Build if: ${{ !matrix.settings.cc }} run: make go_bindings + - name: Cross-Compile Build for RISC-V + if: ${{ matrix.settings.qemu }} + run: | + CC=${{ matrix.settings.cc }} make go_bindings + mv hashtree.syso hashtree_linux_riscv64.syso - name: Run tests + if: ${{ !matrix.settings.qemu }} run: go test . + - name: Run tests with QEMU + if: ${{ matrix.settings.qemu }} + run: | + CGO_ENABLED=1 CC=${{ matrix.settings.cc }} GOOS=linux GOARCH=${{ matrix.settings.goarch }} go test -c -o test.riscv64 + ${{ matrix.settings.qemu }} -cpu rv64,zbb=true,zbc=true,zbkb=true,zbkc=true,zbkx=true,zknd=true,zkne=true,zknh=true,zkt=true -L /usr/riscv64-linux-gnu ./test.riscv64 - name: Run benchmarks + if: ${{ !matrix.settings.qemu }} run: go test -bench=. 
  rust-bindings:
diff --git a/.gitignore b/.gitignore
index 1ab6a3f..bc9e8d5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,7 @@
 Cargo.lock
 target
 build
+tests
+.gitignore
+.vscode
+CLAUDE.md
diff --git a/bindings_riscv64.go b/bindings_riscv64.go
new file mode 100644
index 0000000..6d88db1
--- /dev/null
+++ b/bindings_riscv64.go
@@ -0,0 +1,6 @@
+//go:build riscv64
+// +build riscv64
+
+package hashtree
+
+var supportedCPU = true
diff --git a/hashtree_amd64.syso b/hashtree_amd64.syso
index 0a5be28..1144ced 100644
Binary files a/hashtree_amd64.syso and b/hashtree_amd64.syso differ
diff --git a/hashtree_darwin_arm64.syso b/hashtree_darwin_arm64.syso
index c9e606f..4c54a7e 100644
Binary files a/hashtree_darwin_arm64.syso and b/hashtree_darwin_arm64.syso differ
diff --git a/hashtree_linux_arm64.syso b/hashtree_linux_arm64.syso
index 943c87e..a88f956 100644
Binary files a/hashtree_linux_arm64.syso and b/hashtree_linux_arm64.syso differ
diff --git a/hashtree_linux_riscv64.syso b/hashtree_linux_riscv64.syso
new file mode 100644
index 0000000..ed1c5b6
Binary files /dev/null and b/hashtree_linux_riscv64.syso differ
diff --git a/hashtree_windows_amd64.syso b/hashtree_windows_amd64.syso
index 12a436d..f71271d 100644
Binary files a/hashtree_windows_amd64.syso and b/hashtree_windows_amd64.syso differ
diff --git a/src/Makefile b/src/Makefile
index 913fdf5..b768bd4 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -58,9 +58,10 @@ else
 endif
 endif
 
-# ARM architecture detection
+# architecture detection
 ifdef CC
     ARM = $(shell $(CC) -dM -E - < /dev/null | grep "aarch" | awk '{ print $$3 }')
+    RISCV = $(shell $(CC) -dM -E - < /dev/null | grep "riscv 1" | awk '{ print $$3 }')
     ifneq ($(findstring mingw, $(CC)),)
         ifneq ($(ARM),1)
             PLATFORM = Windows
@@ -69,6 +70,7 @@ ifdef CC
 else
     ARCH = $(shell uname -m)
     ARM = $(shell echo $(ARCH) | grep -E '^(arm|aarch64)' >/dev/null && echo 1 || echo 0)
+    RISCV = $(shell echo $(ARCH) | grep -E '^riscv' >/dev/null && echo 1 || echo 0)
 endif
 
 # Cross-platform compiler selection
@@ -105,6 +107,14 @@ ifeq ($(ARM), 1)
                $(OBJ_DIR)/sha256_armv8_crypto.o\
                $(OBJ_DIR)/sha256_generic.o\
                $(OBJ_DIR)/hashtree.o
+else ifeq ($(RISCV), 1)
+    CFLAGS += -march=rv64gc_zbb_zk
+    ASFLAGS += -march=rv64gc_zbb_zk
+    OBJ_LIST = $(OBJ_DIR)/sha256_riscv_x1.o\
+               $(OBJ_DIR)/sha256_riscv_zbb_x1.o\
+               $(OBJ_DIR)/sha256_riscv_crypto.o\
+               $(OBJ_DIR)/sha256_generic.o\
+               $(OBJ_DIR)/hashtree.o
 else
     OBJ_LIST = $(OBJ_DIR)/sha256_shani.o\
                $(OBJ_DIR)/sha256_avx_x16.o\
diff --git a/src/bench.c b/src/bench.c
index 313dea3..3540916 100644
--- a/src/bench.c
+++ b/src/bench.c
@@ -247,6 +247,45 @@ UBENCH_EX(shani, shani_one_at_time) {
     free(buffer);
 }
 #endif
+#ifdef __riscv
+UBENCH_EX(riscv, riscv_x1) {
+    int * buffer = (int *) malloc(buffer_size);
+    unsigned char * digest = (unsigned char *) malloc(buffer_size/2);
+    for (int i = 0; i < buffer_size/sizeof(int); i++) {
+        buffer[i] = rand();
+    }
+    UBENCH_DO_BENCHMARK() {
+        hashtree_sha256_riscv_x1(digest, (unsigned char *)buffer, buffer_size/64);
+    }
+    free(buffer);
+    free(digest);
+}
+UBENCH_EX(riscv, riscv_zbb_x1) {
+    int * buffer = (int *) malloc(buffer_size);
+    unsigned char * digest = (unsigned char *) malloc(buffer_size/2);
+    for (int i = 0; i < buffer_size/sizeof(int); i++) {
+        buffer[i] = rand();
+    }
+    UBENCH_DO_BENCHMARK() {
+        hashtree_sha256_riscv_zbb_x1(digest, (unsigned char *)buffer, buffer_size/64);
+    }
+    free(buffer);
+    free(digest);
+}
+UBENCH_EX(riscv, riscv_crypto) {
+    int * buffer = (int *) malloc(buffer_size);
+    unsigned char * digest = (unsigned char *) malloc(buffer_size/2);
+    for (int i = 0; i < buffer_size/sizeof(int); i++) {
+        buffer[i] = rand();
+    }
+    UBENCH_DO_BENCHMARK() {
+        hashtree_sha256_riscv_crypto(digest, (unsigned char *)buffer, buffer_size/64);
+    }
+    free(buffer);
+    free(digest);
+}
+#endif
+
 #ifdef HAVE_OPENSSL
 UBENCH_EX(openssl, openssl_one_at_time) {
     int *buffer = (int *)malloc(buffer_size);
diff --git a/src/hashtree.c b/src/hashtree.c
index c3bd817..f201ac1 100644
--- a/src/hashtree.c
+++ b/src/hashtree.c
@@ -34,6 +34,33 @@ SOFTWARE.
 #include
 #endif
 #endif
+#ifdef __riscv
+#include <sys/syscall.h>
+#include <unistd.h>
+
+/* riscv_hwprobe syscall interface - available in Linux 6.4+; bit values as in <asm/hwprobe.h> */
+#ifndef __NR_riscv_hwprobe
+#define __NR_riscv_hwprobe 258
+#endif
+
+struct riscv_hwprobe {
+    int64_t key;
+    uint64_t value;
+};
+
+#define RISCV_HWPROBE_KEY_IMA_EXT_0 4
+#define RISCV_HWPROBE_EXT_ZBB (1ULL << 4)
+#define RISCV_HWPROBE_EXT_ZBC (1ULL << 7)
+#define RISCV_HWPROBE_EXT_ZBKB (1ULL << 8)
+#define RISCV_HWPROBE_EXT_ZBKC (1ULL << 9)
+#define RISCV_HWPROBE_EXT_ZBKX (1ULL << 10)
+#define RISCV_HWPROBE_EXT_ZKND (1ULL << 11)
+#define RISCV_HWPROBE_EXT_ZKNE (1ULL << 12)
+#define RISCV_HWPROBE_EXT_ZKNH (1ULL << 13)
+#define RISCV_HWPROBE_EXT_ZKSED (1ULL << 14)
+#define RISCV_HWPROBE_EXT_ZKSH (1ULL << 15)
+#define RISCV_HWPROBE_EXT_ZKT (1ULL << 16)
+#endif
 
 static void init_and_hash(unsigned char *output, const unsigned char *input, uint64_t count);
 
@@ -64,6 +91,29 @@ static hashtree_hash_fcn hashtree_detect() {
         return &hashtree_sha256_sse_x1;
     }
 #endif
+#ifdef __riscv
+    struct riscv_hwprobe pairs[1] = {
+        { .key = RISCV_HWPROBE_KEY_IMA_EXT_0 }
+    };
+
+    long ret = syscall(__NR_riscv_hwprobe, pairs, 1, 0, NULL, 0);
+    if (ret == 0) {
+        uint64_t ext = pairs[0].value;
+
+        /* Check for SHA-256 crypto extension (Zknh) + related extensions */
+        if ((ext & RISCV_HWPROBE_EXT_ZKNH) && (ext & RISCV_HWPROBE_EXT_ZBKB)) {
+            return &hashtree_sha256_riscv_crypto;
+        }
+
+        /* Check for Zbb bit manipulation extension */
+        if (ext & RISCV_HWPROBE_EXT_ZBB) {
+            return &hashtree_sha256_riscv_zbb_x1;
+        }
+    }
+
+    /* Fall back to basic RISC-V implementation */
+    return &hashtree_sha256_riscv_x1;
+#endif
 #ifdef __aarch64__
 #ifdef __APPLE__
     return &hashtree_sha256_sha_x1;
diff --git a/src/hashtree.h b/src/hashtree.h
index e740319..2db486c 100644
--- a/src/hashtree.h
+++ b/src/hashtree.h
@@ -50,6 +50,12 @@ void hashtree_sha256_neon_x4(unsigned char* output, const unsigned char* input,
 void hashtree_sha256_sha_x1(unsigned char* output, const unsigned char* input, uint64_t count);
 #endif
+
+#ifdef __riscv
+void hashtree_sha256_riscv_x1(unsigned char* output, const unsigned char* input, uint64_t count);
+void hashtree_sha256_riscv_zbb_x1(unsigned char* output, const unsigned char* input, uint64_t count);
+void hashtree_sha256_riscv_crypto(unsigned char* output, const unsigned char* input, uint64_t count);
+#endif
+
 #ifdef __x86_64__
 void hashtree_sha256_sse_x1(unsigned char* output, const unsigned char* input, uint64_t count);
 void hashtree_sha256_avx_x1(unsigned char* output, const unsigned char* input, uint64_t count);
diff --git a/src/sha256_riscv_crypto.S b/src/sha256_riscv_crypto.S
new file mode 100644
index 0000000..bf12f87
--- /dev/null
+++ b/src/sha256_riscv_crypto.S
@@ -0,0 +1,298 @@
+/*
+MIT License
+
+Copyright (c) 2021-2025 Prysmatic Labs
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy,
modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +#ifdef __riscv +.section .rodata +.align 4 +.LK256: + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +.LDIGEST: + .long 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a + .long 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 + +.LPADDING: + .long 0xc28a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5 + .long 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5 + .long 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3 + .long 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf374 + .long 0x649b69c1, 0xf0fe4786, 0x0fe1edc6, 0x240cf254 + .long 0x4fe9346f, 0x6cc984be, 0x61b9411e, 0x16f988fa + .long 0xf2c65152, 0xa88e5a6d, 0xb019fc65, 0xb9d99ec7 + .long 0x9a1231c3, 0xe70eeaa0, 0xfdb1232b, 0xc7353eb0 + .long 0x3069bad5, 0xcb976d5f, 0x5a0f118f, 0xdc1eeefd + .long 0x0a35b689, 0xde0b7a04, 0x58f4ca9d, 0xe15d5b16 + .long 0x007f3e86, 0x37088980, 0xa507ea32, 0x6fab9537 + .long 0x17406110, 0x0d8cd6f1, 0xcdaa3b6d, 0xc0bbbe37 + .long 0x83613bda, 0xdb48a363, 0x0b02e931, 0x6fd15ca7 + .long 0x521afaca, 0x31338431, 0x6ed41a95, 0x6d437890 + .long 0xc39c91f2, 0x9eccabbd, 0xb5c9a0e6, 0x532fb63c + .long 0xd2c741c6, 0x07237ea3, 0xa4954b68, 0x4c191d76 + +#define FRAMEZ 80 // Stack size: 16 * 4 + 16 bytes for one clobbered register and alignment +#define DIGEST_OFFSET 256 // Offset for the digest constants +#define PADDING_OFFSET 288 // Offset for padding constants + +.macro BSWP rd + rev8 \rd, \rd + srai \rd, \rd, 32 +.endm + +// Schedules a word +// s10 has the scheduled word to consume +.macro SCHED offset + .set .Lmyoff, (\offset + 56) % 64 + lwu a5, (.Lmyoff)(sp) // a5 = w[i-2] + .set .Lmyoff, (\offset + 36) % 64 + lwu s10, (.Lmyoff)(sp) // s10 = w[i-7] + sha256sig1 a4,a5 // a4 = \sigma_1(w_{i-2}) + .set .Lmyoff, (\offset + 4) % 64 + lwu a6, (.Lmyoff)(sp) // a6 = w[i-15] + addw s10, s10, a4 // s10 = w_{i-7} + \sigma_1 w_{i-2} + .set .Lmyoff, (\offset + 0) % 64 + lwu a5, (.Lmyoff)(sp) // a5 = w[i-16] + sha256sig0 a4, a6 // a4 = \sigma_0(w_{i-15}) + addw s10, s10, a5 // s10 = w_{i-7} + 
w_{i-16} + \sigma_1 w_{i-2} + addw s10, s10, a4 + sw s10, (.Lmyoff)(sp) // w[i] is stored and kept in s10 +.endm + +// Performs a full round. +// @w: the scheduled word to use +// @offset: the offset to the K constants to use. +// @base: the base address of the K constants. +// The word to consume is passed in s10 +.macro DO_ROUND base, offset, a, b, c, d, e, f, g, h, USE_SCHED + xor a4, \f, \g // a4 = f ^ g + or a5, \a, \c // a5 = a | c + sha256sum1 a6, \e // a6 = \Sigma_1 e + and a4, a4, \e // a4 = (f ^ g) & e + and a5, a5, \b // a5 = (a | c) & b + xor a4, a4, \g // a4 = ((f ^ g) & e ) ^ g + addw \h, \h, a6 // h = h + \Sigma_1 e + lwu a6, (\offset)(\base) // a6 = K[offset] +.ifnb \USE_SCHED + addw \h, \h, s10 // h = h + \Sigma_1 e + w +.endif + sha256sum0 s10, \a // s10 = \Sigma_0 a + addw \h, \h, a6 // h = h + \Sigma_1 e + K[offset] + w + and a6, \a, \c // a6 = a & c + addw \h, \h, a4 // h = h + \Sigma_1 e + ((f ^ g) & e) ^g + K[offset] + w + or a5, a5, a6 // a5 = (a | c) & b | (a & c) + addw \d, \d, \h // d = d + h + \Sigma_1 e + ((f ^ g) & e) ^g + K[offset] + w + addw a5, a5, s10 // a5 = \Sigma_0 a + ((a | c) & b) | (a & c) + addw \h, \h, a5 +.endm + +.macro ROUND_FROM_INPUT offset, a, b, c, d, e, f, g, h + lwu s10, \offset(a1) + BSWP s10 + sw s10, \offset(sp) + DO_ROUND a3, \offset, \a, \b, \c, \d, \e, \f, \g, \h, 1 +.endm + +.macro FIRST_SIXTEEN_ROUNDS a, b, c, d, e, f, g, h + ROUND_FROM_INPUT 0, \a, \b, \c, \d, \e, \f, \g, \h + ROUND_FROM_INPUT 4, \h, \a, \b, \c, \d, \e, \f, \g + ROUND_FROM_INPUT 8, \g, \h, \a, \b, \c, \d, \e, \f + ROUND_FROM_INPUT 12, \f, \g, \h, \a, \b, \c, \d, \e + ROUND_FROM_INPUT 16, \e, \f, \g, \h, \a, \b, \c, \d + ROUND_FROM_INPUT 20, \d, \e, \f, \g, \h, \a, \b, \c + ROUND_FROM_INPUT 24, \c, \d, \e, \f, \g, \h, \a, \b + ROUND_FROM_INPUT 28, \b, \c, \d, \e, \f, \g, \h, \a + ROUND_FROM_INPUT 32, \a, \b, \c, \d, \e, \f, \g, \h + ROUND_FROM_INPUT 36, \h, \a, \b, \c, \d, \e, \f, \g + ROUND_FROM_INPUT 40, \g, \h, \a, \b, \c, \d, \e, \f + ROUND_FROM_INPUT 44, \f, \g, \h, \a, \b, \c, \d, \e + ROUND_FROM_INPUT 48, \e, \f, \g, \h, \a, \b, \c, \d + ROUND_FROM_INPUT 52, \d, \e, \f, \g, \h, \a, \b, \c + ROUND_FROM_INPUT 56, \c, \d, \e, \f, \g, \h, \a, \b + ROUND_FROM_INPUT 60, \b, \c, \d, \e, \f, \g, \h, \a +.endm + +.macro ROUND_AND_SCHEDULE offset, a, b, c, d, e, f, g, h + SCHED \offset + DO_ROUND a3, \offset, \a, \b, \c, \d, \e, \f, \g, \h, 1 +.endm + +.macro EIGHT_ROUNDS_AND_SCHED offset, a, b, c, d, e, f, g, h + ROUND_AND_SCHEDULE \offset, \a, \b, \c, \d, \e, \f, \g, \h + ROUND_AND_SCHEDULE \offset + 4, \h, \a, \b, \c, \d, \e, \f, \g + ROUND_AND_SCHEDULE \offset + 8, \g, \h, \a, \b, \c, \d, \e, \f + ROUND_AND_SCHEDULE \offset + 12, \f, \g, \h, \a, \b, \c, \d, \e + ROUND_AND_SCHEDULE \offset + 16, \e, \f, \g, \h, \a, \b, \c, \d + ROUND_AND_SCHEDULE \offset + 20, \d, \e, \f, \g, \h, \a, \b, \c + ROUND_AND_SCHEDULE \offset + 24, \c, \d, \e, \f, \g, \h, \a ,\b + ROUND_AND_SCHEDULE \offset + 28, \b ,\c ,\d ,\e ,\f ,\g ,\h ,\a +.endm + +.macro EIGHT_ROUNDS offset, a, b, c, d, e, f, g, h + DO_ROUND a3, \offset, \a, \b, \c, \d, \e, \f, \g, \h, + DO_ROUND a3, \offset + 4, \h, \a, \b, \c, \d, \e, \f, \g, + DO_ROUND a3, \offset + 8, \g, \h, \a, \b, \c, \d, \e, \f, + DO_ROUND a3, \offset + 12, \f, \g, \h, \a, \b, \c, \d, \e, + DO_ROUND a3, \offset + 16, \e, \f, \g ,\h ,\a ,\b ,\c ,\d, + DO_ROUND a3, \offset + 20, \d ,\e ,\f ,\g ,\h ,\a ,\b ,\c, + DO_ROUND a3, \offset + 24, \c ,\d ,\e ,\f ,\g ,\h ,\a ,\b, + DO_ROUND a3, \offset + 28, \b ,\c ,\d ,\e ,\f ,\g ,\h ,\a, +.endm + 
+.text +.global hashtree_sha256_riscv_crypto +.type hashtree_sha256_riscv_crypto,%function +.align 4 +hashtree_sha256_riscv_crypto: + // set the stack (assumed to be aligned) + addi sp,sp,-FRAMEZ + sd s10, 64(sp) + + //initialize the constants + la a3, .LK256 + slli a2, a2, 5 + add a2, a2, a0 + +.Lsha256_riscv_crypto_loop: + beq a0, a2, .Lsha256_riscv_crypto_epilogue + // initialize the digest variables + lwu t0,DIGEST_OFFSET(a3) + lwu t1,DIGEST_OFFSET+4(a3) + lwu t2,DIGEST_OFFSET+8(a3) + lwu t3,DIGEST_OFFSET+12(a3) + lwu t4,DIGEST_OFFSET+16(a3) + lwu t5,DIGEST_OFFSET+20(a3) + lwu t6,DIGEST_OFFSET+24(a3) + lwu a7,DIGEST_OFFSET+28(a3) + + // First 16 rounds we consume the incoming words directly + FIRST_SIXTEEN_ROUNDS t0, t1, t2, t3, t4, t5, t6, a7 + // update the input pointer + addi a1, a1, 64 + + // The next 48 rounds we consume the scheduled words + .set .Lround_offset, 64 + .rept 6 + EIGHT_ROUNDS_AND_SCHED .Lround_offset, t0, t1, t2, t3, t4, t5, t6, a7 + .set .Lround_offset, .Lround_offset + 32 + .endr + + // Add the original digest + lwu a6, DIGEST_OFFSET(a3) + lwu a5, DIGEST_OFFSET+4(a3) + addw t0, t0, a6 + lwu a4, DIGEST_OFFSET+8(a3) + addw t1, t1, a5 + lwu a6, DIGEST_OFFSET+12(a3) + addw t2, t2, a4 + lwu a5, DIGEST_OFFSET+16(a3) + addw t3, t3, a6 + lwu a4, DIGEST_OFFSET+20(a3) + addw t4, t4, a5 + lwu a6, DIGEST_OFFSET+24(a3) + addw t5, t5, a4 + lwu a5, DIGEST_OFFSET+28(a3) + addw t6, t6, a6 + addw a7, a7, a5 + + // Save the digest, we can use the scheduled word storage + sw t0, 0(sp) + sw t1, 4(sp) + sw t2, 8(sp) + sw t3, 12(sp) + sw t4, 16(sp) + sw t5, 20(sp) + sw t6, 24(sp) + sw a7, 28(sp) + + // Rounds with padding + .set .Lround_offset, PADDING_OFFSET + .rept 8 + EIGHT_ROUNDS .Lround_offset, t0, t1, t2, t3, t4, t5, t6, a7 + .set .Lround_offset, .Lround_offset + 32 + .endr + + // Add the previous digest + lwu a6, 0(sp) + lwu a5, 4(sp) + addw t0, t0, a6 + lwu a4, 8(sp) + addw t1, t1, a5 + lwu a6, 12(sp) + addw t2, t2, a4 + lwu a5, 16(sp) + addw t3, t3, a6 + lwu a4, 20(sp) + addw t4, t4, a5 + lwu a6, 24(sp) + addw t5, t5, a4 + lwu a5, 28(sp) + addw t6, t6, a6 + addw a7, a7, a5 + + // Byte swap to little-endian and save + BSWP t0 + sw t0, 0(a0) + BSWP t1 + sw t1, 4(a0) + BSWP t2 + sw t2, 8(a0) + BSWP t3 + sw t3, 12(a0) + BSWP t4 + sw t4, 16(a0) + BSWP t5 + sw t5, 20(a0) + BSWP t6 + sw t6, 24(a0) + BSWP a7 + sw a7, 28(a0) + addi a0, a0, 32 + j .Lsha256_riscv_crypto_loop + +.Lsha256_riscv_crypto_epilogue: + // restore the stack + ld s10, 64(sp) + addi sp,sp,FRAMEZ + ret + +#ifdef __linux__ +.size hashtree_sha256_riscv_crypto,.-hashtree_sha256_riscv_crypto +.section .note.GNU-stack,"",@progbits +#endif + +#endif // riscv diff --git a/src/sha256_riscv_x1.S b/src/sha256_riscv_x1.S new file mode 100644 index 0000000..866e408 --- /dev/null +++ b/src/sha256_riscv_x1.S @@ -0,0 +1,354 @@ +/* +MIT License + +Copyright (c) 2021-2025 Prysmatic Labs + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +/* +Registers used: + a0 output pointer + a1 input pointer + a2 count of 32-byte blocks to process + a3 pointer to the K constants + a4 temporary register + a5 temporary register + a6 temporary register, used for digest rotations + + t0-t6 and a7 re used as digest variables: + +Clobbers + s10 (SCHD) keeps a scheduled word + s11 (a63) +*/ + +#ifdef __riscv +.section .rodata +.align 4 +.LK256: + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +.LDIGEST: + .long 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a + .long 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 + +.LPADDING: + .long 0xc28a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5 + .long 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5 + .long 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3 + .long 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf374 + .long 0x649b69c1, 0xf0fe4786, 0x0fe1edc6, 0x240cf254 + .long 0x4fe9346f, 0x6cc984be, 0x61b9411e, 0x16f988fa + .long 0xf2c65152, 0xa88e5a6d, 0xb019fc65, 0xb9d99ec7 + .long 0x9a1231c3, 0xe70eeaa0, 0xfdb1232b, 0xc7353eb0 + .long 0x3069bad5, 0xcb976d5f, 0x5a0f118f, 0xdc1eeefd + .long 0x0a35b689, 0xde0b7a04, 0x58f4ca9d, 0xe15d5b16 + .long 0x007f3e86, 0x37088980, 0xa507ea32, 0x6fab9537 + .long 0x17406110, 0x0d8cd6f1, 0xcdaa3b6d, 0xc0bbbe37 + .long 0x83613bda, 0xdb48a363, 0x0b02e931, 0x6fd15ca7 + .long 0x521afaca, 0x31338431, 0x6ed41a95, 0x6d437890 + .long 0xc39c91f2, 0x9eccabbd, 0xb5c9a0e6, 0x532fb63c + .long 0xd2c741c6, 0x07237ea3, 0xa4954b68, 0x4c191d76 + +#define FRAMEZ 80 // Stack size: 16 * 4 bytes for the scheduled words + 16 for two clobbered registers +#define DIGEST_OFFSET 256 // Offset for the digest constants +#define PADDING_OFFSET 288 // Offset for padding constants + +// Rotates rd = rs >> shift. +// Uses a6 as a temporary register. +.macro WROR rd, rs, shift + slliw a6, \rs, (32 - \shift) + srliw \rd, \rs, \shift + or \rd, \rd, a6 +.endm + +// Changes endianness of a word. 
+// returns the byteswapped word in rd and clobbers a6, a5, a4 and s11 +.macro BSWP rd + slliw a6, \rd, 24 + srliw a5, \rd, 24 + or a5, a5, a6 + lui a6, 0x10 + sraiw a4, \rd, 8 + addi a6, a6, -256 + and a4, a4, a6 + or a5, a5, a4 + slliw \rd, \rd, 8 + lui a4, 0xff0 + and \rd, \rd, a4 + or \rd, \rd, a5 +.endm + +// Schedules a word +// s10 has the scheduled word to consume +.macro SCHED offset + .set .Lmyoff, (\offset + 56) % 64 + lwu a5, (.Lmyoff)(sp) // a5 = w[i-2] + WROR s11, a5, 2 // s11 = w[i-2] >> 2 + .set .Lmyoff, (\offset + 4) % 64 + lwu s10, (.Lmyoff)(sp) // s10 = w[i-15] + xor s11, s11, a5 // s11 = (w[i-2] >> 2) ^ w[i-2] + WROR a4, s10, 11 // a4 = w[i-15] >> 11 + srliw a5, a5, 10 // a5 = w[i-2] > 10 + xor a4, a4, s10 // a4 = (w[i-15] >> 11) ^ w[i-15] + srliw s10, s10, 3 // s10 = w[i-15] >> 3 + WROR s11, s11, 17 // s11 = (w[i-2] >> 19) ^ (w[i-2] >> 17) + WROR a4, a4, 7 // a4 = (w[i-15] >> 18) ^ (w[i-15] >> 7) + xor a5, s11, a5 // a5 = (w[i-2] >> 19) ^ (w[i-2] >> 17) ^ (w[i-2] >> 10) + .set .Lmyoff, (\offset + 0) % 64 + lwu s11, (.Lmyoff)(sp) // s11 = w[i-16] + xor a4, a4, s10 // a4 = (w[i-15] >> 18) ^ (w[i-15] >> 7) ^ (w[i-15] >> 3) + .set .Lmyoff, (\offset + 36) % 64 + lwu s10, (.Lmyoff)(sp) // s10 = w[i-7] + addw s11, s11, a5 // s11 = w[i-16] + ((w[i-2] >> 19) ^ (w[i-2] >> 17) ^ (w[i-2] >> 10)) + addw s10, s10, a4 + addw s10, s10, s11 + .set .Lmyoff, (\offset + 0) % 64 + sw s10, (.Lmyoff)(sp) // w[i] is stored and kept in s10 +.endm + +// Performs a full round. +// @w: the scheduled word to use +// @offset: the offset to the K constants to use. +// @base: the base address of the K constants. +// The word to consume is passed in s10 +.macro DO_ROUND base, offset, a, b, c, d, e, f, g, h, USE_SCHED + WROR a5, \e, 14 // a5 = e >> 14 + WROR a4, \a, 9 // a4 = a >> 9 + xor a5, a5, \e // a5 = (e >> 14) ^ e + xor s11, \f, \g // s11 = f ^ g + xor a4, a4, \a // a4 = (a >> 9) ^ a + WROR a5, a5, 5 // a5 = (e >> 19)^(e >> 5) + WROR a4, a4, 11 // a4 = (a >> 20)^(a >> 11) + xor a5, a5, \e // a5 = (e >> 14)^(e >> 5) ^ e + and s11, s11, \e // s11 = (f ^ g) & e + WROR a5, a5, 6 // a5 = (e >> 25)^(e >> 11)^(e >> 6) + xor s11, s11, \g // s11 = ((f ^ g) & e) ^ g + xor a4, a4, \a // a4 = (a >> 20)^(a >> 11) ^ a + addw s11, s11, a5 // s11 = ((f ^ g) & e) ^ g + (e >> 25)^(e >> 11)^(e >> 6) + lwu a5, (\offset)(\base) // a5 = K[offset] + WROR a4, a4, 2 // a4 = (a >> 22)^(a >> 13)^(a >> 2) +.ifnb \USE_SCHED + addw a5, a5, s10 // a5 = K[offset] + w +.endif + addw \h, \h, s11 // h = h + ((f ^ g) & e) ^ g + (e >> 25)^(e >> 11)^(e >> 6) + and s11, \a, \c // s11 = a & c + or a6, \a, \c // a6 = a | c + addw \h, \h, a5 // h = h + ((f ^ g) & e) ^ g + (e >> 25)^(e >> 11)^(e >> 6) + K[offset] + w + and a6, a6, \b // a6 = (a | c) & b + addw \d, \d, \h // d = d + h + ((f ^ g) & e) ^ g + (e >> 25)^(e >> 11)^(e >> 6) + K[offset] + w + addw \h, \h, a4 // h = h + ((f ^ g) & e) ^ g + (e >> 25)^(e >> 11)^(e >> 6) + K[offset] + w + (a >> 22)^(a >> 13)^(a >> 2) + or a6, a6, s11 // a6 = ((a | c) & b) | (a & c) + addw \h, \h, a6 // h = h + ((f ^ g) & e) ^ g + (e >> 25)^(e >> 11)^(e >> 6) + K[offset] + w + (((a | c) & b) | (a & c)) + (a >> 22)^(a >> 13)^(a >> 2) +.endm + +.macro ROUND_FROM_INPUT offset, a, b, c, d, e, f, g, h + lwu s10, \offset(a1) + BSWP s10 + DO_ROUND a3, \offset, \a, \b, \c, \d, \e, \f, \g, \h, 1 + sw s10, \offset(sp) +.endm + +.macro FIRST_SIXTEEN_ROUNDS a, b, c, d, e, f, g, h + ROUND_FROM_INPUT 0, \a, \b, \c, \d, \e, \f, \g, \h + ROUND_FROM_INPUT 4, \h, \a, \b, \c, \d, \e, \f, \g + ROUND_FROM_INPUT 8, \g, 
\h, \a, \b, \c, \d, \e, \f + ROUND_FROM_INPUT 12, \f, \g, \h, \a, \b, \c, \d, \e + ROUND_FROM_INPUT 16, \e, \f, \g, \h, \a, \b, \c, \d + ROUND_FROM_INPUT 20, \d, \e, \f, \g, \h, \a, \b, \c + ROUND_FROM_INPUT 24, \c, \d, \e, \f, \g, \h, \a, \b + ROUND_FROM_INPUT 28, \b, \c, \d, \e, \f, \g, \h, \a + ROUND_FROM_INPUT 32, \a, \b, \c, \d, \e, \f, \g, \h + ROUND_FROM_INPUT 36, \h, \a, \b, \c, \d, \e, \f, \g + ROUND_FROM_INPUT 40, \g, \h, \a, \b, \c, \d, \e, \f + ROUND_FROM_INPUT 44, \f, \g, \h, \a, \b, \c, \d, \e + ROUND_FROM_INPUT 48, \e, \f, \g, \h, \a, \b, \c, \d + ROUND_FROM_INPUT 52, \d, \e, \f, \g, \h, \a, \b, \c + ROUND_FROM_INPUT 56, \c, \d, \e, \f, \g, \h, \a, \b + ROUND_FROM_INPUT 60, \b, \c, \d, \e, \f, \g, \h, \a +.endm + +.macro ROUND_AND_SCHEDULE offset, a, b, c, d, e, f, g, h + SCHED \offset + DO_ROUND a3, \offset, \a, \b, \c, \d, \e, \f, \g, \h, 1 +.endm + +.macro EIGHT_ROUNDS_AND_SCHED offset, a, b, c, d, e, f, g, h + ROUND_AND_SCHEDULE \offset, \a, \b, \c, \d, \e, \f, \g, \h + ROUND_AND_SCHEDULE \offset + 4, \h, \a, \b, \c, \d, \e, \f, \g + ROUND_AND_SCHEDULE \offset + 8, \g, \h, \a, \b, \c, \d, \e, \f + ROUND_AND_SCHEDULE \offset + 12, \f, \g, \h, \a, \b, \c, \d, \e + ROUND_AND_SCHEDULE \offset + 16, \e, \f, \g, \h, \a, \b, \c, \d + ROUND_AND_SCHEDULE \offset + 20, \d, \e, \f, \g, \h, \a, \b, \c + ROUND_AND_SCHEDULE \offset + 24, \c, \d, \e, \f, \g, \h, \a ,\b + ROUND_AND_SCHEDULE \offset + 28, \b ,\c ,\d ,\e ,\f ,\g ,\h ,\a +.endm + +.macro EIGHT_ROUNDS offset, a, b, c, d, e, f, g, h + DO_ROUND a3, \offset, \a, \b, \c, \d, \e, \f, \g, \h, + DO_ROUND a3, \offset + 4, \h, \a, \b, \c, \d, \e, \f, \g, + DO_ROUND a3, \offset + 8, \g, \h, \a, \b, \c, \d, \e, \f, + DO_ROUND a3, \offset + 12, \f, \g, \h, \a, \b, \c, \d, \e, + DO_ROUND a3, \offset + 16, \e, \f, \g ,\h ,\a ,\b ,\c ,\d, + DO_ROUND a3, \offset + 20, \d ,\e ,\f ,\g ,\h ,\a ,\b ,\c, + DO_ROUND a3, \offset + 24, \c ,\d ,\e ,\f ,\g ,\h ,\a ,\b, + DO_ROUND a3, \offset + 28, \b ,\c ,\d ,\e ,\f ,\g ,\h ,\a, +.endm + +.text +.global hashtree_sha256_riscv_x1 +.type hashtree_sha256_riscv_x1,%function +.align 4 +hashtree_sha256_riscv_x1: + // set the stack (assumed to be aligned) + addi sp,sp,-FRAMEZ + sd s10, 64(sp) + sd s11, 72(sp) + + //initialize the constants + la a3, .LK256 + slli a2, a2, 5 + add a2, a2, a0 + +.Lsha256_riscv_x1_loop: + beq a0, a2, .Lsha256_riscv_x1_epilogue + // initialize the digest variables + lwu t0,DIGEST_OFFSET(a3) + lwu t1,DIGEST_OFFSET+4(a3) + lwu t2,DIGEST_OFFSET+8(a3) + lwu t3,DIGEST_OFFSET+12(a3) + lwu t4,DIGEST_OFFSET+16(a3) + lwu t5,DIGEST_OFFSET+20(a3) + lwu t6,DIGEST_OFFSET+24(a3) + lwu a7,DIGEST_OFFSET+28(a3) + + // First 16 rounds we consume the incoming words directly + FIRST_SIXTEEN_ROUNDS t0, t1, t2, t3, t4, t5, t6, a7 + // update the input pointer + addi a1, a1, 64 + + // The next 48 rounds we consume the scheduled words + .set .Lround_offset, 64 + .rept 6 + EIGHT_ROUNDS_AND_SCHED .Lround_offset, t0, t1, t2, t3, t4, t5, t6, a7 + .set .Lround_offset, .Lround_offset + 32 + .endr + + // Add the original digest + lwu a6, DIGEST_OFFSET(a3) + lwu a5, DIGEST_OFFSET+4(a3) + addw t0, t0, a6 + lwu s11, DIGEST_OFFSET+8(a3) + addw t1, t1, a5 + lwu a6, DIGEST_OFFSET+12(a3) + addw t2, t2, s11 + lwu a5, DIGEST_OFFSET+16(a3) + addw t3, t3, a6 + lwu s11, DIGEST_OFFSET+20(a3) + addw t4, t4, a5 + lwu a6, DIGEST_OFFSET+24(a3) + addw t5, t5, s11 + lwu a5, DIGEST_OFFSET+28(a3) + addw t6, t6, a6 + addw a7, a7, a5 + + // Save the digest, we can use the scheduled word storage + sw t0, 0(sp) + sw t1, 4(sp) + sw t2, 
8(sp) + sw t3, 12(sp) + sw t4, 16(sp) + sw t5, 20(sp) + sw t6, 24(sp) + sw a7, 28(sp) + + // Rounds with padding + .set .Lround_offset, PADDING_OFFSET + .rept 8 + EIGHT_ROUNDS .Lround_offset, t0, t1, t2, t3, t4, t5, t6, a7 + .set .Lround_offset, .Lround_offset + 32 + .endr + + // Add the previous digest + lwu a6, 0(sp) + lwu a5, 4(sp) + addw t0, t0, a6 + lwu s11, 8(sp) + addw t1, t1, a5 + lwu a6, 12(sp) + addw t2, t2, s11 + lwu a5, 16(sp) + addw t3, t3, a6 + lwu s11, 20(sp) + addw t4, t4, a5 + lwu a6, 24(sp) + addw t5, t5, s11 + lwu a5, 28(sp) + addw t6, t6, a6 + addw a7, a7, a5 + + // Byte swap to little-endian and save + BSWP t0 + sw t0, 0(a0) + BSWP t1 + sw t1, 4(a0) + BSWP t2 + sw t2, 8(a0) + BSWP t3 + sw t3, 12(a0) + BSWP t4 + sw t4, 16(a0) + BSWP t5 + sw t5, 20(a0) + BSWP t6 + sw t6, 24(a0) + BSWP a7 + sw a7, 28(a0) + addi a0, a0, 32 + j .Lsha256_riscv_x1_loop + +.Lsha256_riscv_x1_epilogue: + // restore the stack + ld s10, 64(sp) + ld s11, 72(sp) + addi sp,sp,FRAMEZ + ret + +#ifdef __linux__ +.size hashtree_sha256_riscv_x1,.-hashtree_sha256_riscv_x1 +.section .note.GNU-stack,"",@progbits +#endif + +#endif // riscv diff --git a/src/sha256_riscv_zbb_x1.S b/src/sha256_riscv_zbb_x1.S new file mode 100644 index 0000000..664aa2c --- /dev/null +++ b/src/sha256_riscv_zbb_x1.S @@ -0,0 +1,336 @@ +/* +MIT License + +Copyright (c) 2021-2025 Prysmatic Labs + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+*/ + +/* +Registers used: + a0 output pointer + a1 input pointer + a2 count of 32-byte blocks to process + a3 pointer to the K constants + a4 temporary register + a5 temporary register + a6 temporary register, used for digest rotations + + t0-t6 and a7 re used as digest variables: + +Clobbers + s10 (SCHD) keeps a scheduled word + s11 (a63) +*/ + +#ifdef __riscv +.section .rodata +.align 4 +.LK256: + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +.LDIGEST: + .long 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a + .long 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 + +.LPADDING: + .long 0xc28a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5 + .long 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5 + .long 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3 + .long 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf374 + .long 0x649b69c1, 0xf0fe4786, 0x0fe1edc6, 0x240cf254 + .long 0x4fe9346f, 0x6cc984be, 0x61b9411e, 0x16f988fa + .long 0xf2c65152, 0xa88e5a6d, 0xb019fc65, 0xb9d99ec7 + .long 0x9a1231c3, 0xe70eeaa0, 0xfdb1232b, 0xc7353eb0 + .long 0x3069bad5, 0xcb976d5f, 0x5a0f118f, 0xdc1eeefd + .long 0x0a35b689, 0xde0b7a04, 0x58f4ca9d, 0xe15d5b16 + .long 0x007f3e86, 0x37088980, 0xa507ea32, 0x6fab9537 + .long 0x17406110, 0x0d8cd6f1, 0xcdaa3b6d, 0xc0bbbe37 + .long 0x83613bda, 0xdb48a363, 0x0b02e931, 0x6fd15ca7 + .long 0x521afaca, 0x31338431, 0x6ed41a95, 0x6d437890 + .long 0xc39c91f2, 0x9eccabbd, 0xb5c9a0e6, 0x532fb63c + .long 0xd2c741c6, 0x07237ea3, 0xa4954b68, 0x4c191d76 + +#define FRAMEZ 80 // Stack size: 16 * 4 bytes for the scheduled words + 16 for two clobbered registers +#define DIGEST_OFFSET 256 // Offset for the digest constants +#define PADDING_OFFSET 288 // Offset for padding constants + +// Changes endianness of a word. 
+// returns the byteswapped word in rd and clobbers a6, a5, a4 and s11 +.macro BSWP rd + rev8 \rd, \rd + srai \rd, \rd, 32 +.endm + +// Schedules a word +// s10 has the scheduled word to consume +.macro SCHED offset + .set .Lmyoff, (\offset + 56) % 64 + lwu a5, (.Lmyoff)(sp) // a5 = w[i-2] + roriw s11, a5, 2 // s11 = w[i-2] >> 2 + .set .Lmyoff, (\offset + 4) % 64 + lwu s10, (.Lmyoff)(sp) // s10 = w[i-15] + xor s11, s11, a5 // s11 = (w[i-2] >> 2) ^ w[i-2] + roriw a4, s10, 11 // a4 = w[i-15] >> 11 + srliw a5, a5, 10 // a5 = w[i-2] > 10 + xor a4, a4, s10 // a4 = (w[i-15] >> 11) ^ w[i-15] + srliw s10, s10, 3 // s10 = w[i-15] >> 3 + roriw s11, s11, 17 // s11 = (w[i-2] >> 19) ^ (w[i-2] >> 17) + roriw a4, a4, 7 // a4 = (w[i-15] >> 18) ^ (w[i-15] >> 7) + xor a5, s11, a5 // a5 = (w[i-2] >> 19) ^ (w[i-2] >> 17) ^ (w[i-2] >> 10) + .set .Lmyoff, (\offset + 0) % 64 + lwu s11, (.Lmyoff)(sp) // s11 = w[i-16] + xor a4, a4, s10 // a4 = (w[i-15] >> 18) ^ (w[i-15] >> 7) ^ (w[i-15] >> 3) + .set .Lmyoff, (\offset + 36) % 64 + lwu s10, (.Lmyoff)(sp) // s10 = w[i-7] + addw s11, s11, a5 // s11 = w[i-16] + ((w[i-2] >> 19) ^ (w[i-2] >> 17) ^ (w[i-2] >> 10)) + addw s10, s10, a4 + addw s10, s10, s11 + .set .Lmyoff, (\offset + 0) % 64 + sw s10, (.Lmyoff)(sp) // w[i] is stored and kept in s10 +.endm + +// Performs a full round. +// @w: the scheduled word to use +// @offset: the offset to the K constants to use. +// @base: the base address of the K constants. +// The word to consume is passed in s10 +.macro DO_ROUND base, offset, a, b, c, d, e, f, g, h, USE_SCHED + roriw a5, \e, 14 // a5 = e >> 14 + roriw a4, \a, 9 // a4 = a >> 9 + xor a5, a5, \e // a5 = (e >> 14) ^ e + xor s11, \f, \g // s11 = f ^ g + xor a4, a4, \a // a4 = (a >> 9) ^ a + roriw a5, a5, 5 // a5 = (e >> 19)^(e >> 5) + roriw a4, a4, 11 // a4 = (a >> 20)^(a >> 11) + xor a5, a5, \e // a5 = (e >> 14)^(e >> 5) ^ e + and s11, s11, \e // s11 = (f ^ g) & e + roriw a5, a5, 6 // a5 = (e >> 25)^(e >> 11)^(e >> 6) + xor s11, s11, \g // s11 = ((f ^ g) & e) ^ g + xor a4, a4, \a // a4 = (a >> 20)^(a >> 11) ^ a + addw s11, s11, a5 // s11 = ((f ^ g) & e) ^ g + (e >> 25)^(e >> 11)^(e >> 6) + lwu a5, (\offset)(\base) // a5 = K[offset] + roriw a4, a4, 2 // a4 = (a >> 22)^(a >> 13)^(a >> 2) +.ifnb \USE_SCHED + addw a5, a5, s10 // a5 = K[offset] + w +.endif + addw \h, \h, s11 // h = h + ((f ^ g) & e) ^ g + (e >> 25)^(e >> 11)^(e >> 6) + and s11, \a, \c // s11 = a & c + or a6, \a, \c // a6 = a | c + addw \h, \h, a5 // h = h + ((f ^ g) & e) ^ g + (e >> 25)^(e >> 11)^(e >> 6) + K[offset] + w + and a6, a6, \b // a6 = (a | c) & b + addw \d, \d, \h // d = d + h + ((f ^ g) & e) ^ g + (e >> 25)^(e >> 11)^(e >> 6) + K[offset] + w + addw \h, \h, a4 // h = h + ((f ^ g) & e) ^ g + (e >> 25)^(e >> 11)^(e >> 6) + K[offset] + w + (a >> 22)^(a >> 13)^(a >> 2) + or a6, a6, s11 // a6 = ((a | c) & b) | (a & c) + addw \h, \h, a6 // h = h + ((f ^ g) & e) ^ g + (e >> 25)^(e >> 11)^(e >> 6) + K[offset] + w + (((a | c) & b) | (a & c)) + (a >> 22)^(a >> 13)^(a >> 2) +.endm + +.macro ROUND_FROM_INPUT offset, a, b, c, d, e, f, g, h + lwu s10, \offset(a1) + BSWP s10 + DO_ROUND a3, \offset, \a, \b, \c, \d, \e, \f, \g, \h, 1 + sw s10, \offset(sp) +.endm + +.macro FIRST_SIXTEEN_ROUNDS a, b, c, d, e, f, g, h + ROUND_FROM_INPUT 0, \a, \b, \c, \d, \e, \f, \g, \h + ROUND_FROM_INPUT 4, \h, \a, \b, \c, \d, \e, \f, \g + ROUND_FROM_INPUT 8, \g, \h, \a, \b, \c, \d, \e, \f + ROUND_FROM_INPUT 12, \f, \g, \h, \a, \b, \c, \d, \e + ROUND_FROM_INPUT 16, \e, \f, \g, \h, \a, \b, \c, \d + ROUND_FROM_INPUT 20, \d, \e, \f, 
\g, \h, \a, \b, \c + ROUND_FROM_INPUT 24, \c, \d, \e, \f, \g, \h, \a, \b + ROUND_FROM_INPUT 28, \b, \c, \d, \e, \f, \g, \h, \a + ROUND_FROM_INPUT 32, \a, \b, \c, \d, \e, \f, \g, \h + ROUND_FROM_INPUT 36, \h, \a, \b, \c, \d, \e, \f, \g + ROUND_FROM_INPUT 40, \g, \h, \a, \b, \c, \d, \e, \f + ROUND_FROM_INPUT 44, \f, \g, \h, \a, \b, \c, \d, \e + ROUND_FROM_INPUT 48, \e, \f, \g, \h, \a, \b, \c, \d + ROUND_FROM_INPUT 52, \d, \e, \f, \g, \h, \a, \b, \c + ROUND_FROM_INPUT 56, \c, \d, \e, \f, \g, \h, \a, \b + ROUND_FROM_INPUT 60, \b, \c, \d, \e, \f, \g, \h, \a +.endm + +.macro ROUND_AND_SCHEDULE offset, a, b, c, d, e, f, g, h + SCHED \offset + DO_ROUND a3, \offset, \a, \b, \c, \d, \e, \f, \g, \h, 1 +.endm + +.macro EIGHT_ROUNDS_AND_SCHED offset, a, b, c, d, e, f, g, h + ROUND_AND_SCHEDULE \offset, \a, \b, \c, \d, \e, \f, \g, \h + ROUND_AND_SCHEDULE \offset + 4, \h, \a, \b, \c, \d, \e, \f, \g + ROUND_AND_SCHEDULE \offset + 8, \g, \h, \a, \b, \c, \d, \e, \f + ROUND_AND_SCHEDULE \offset + 12, \f, \g, \h, \a, \b, \c, \d, \e + ROUND_AND_SCHEDULE \offset + 16, \e, \f, \g, \h, \a, \b, \c, \d + ROUND_AND_SCHEDULE \offset + 20, \d, \e, \f, \g, \h, \a, \b, \c + ROUND_AND_SCHEDULE \offset + 24, \c, \d, \e, \f, \g, \h, \a ,\b + ROUND_AND_SCHEDULE \offset + 28, \b ,\c ,\d ,\e ,\f ,\g ,\h ,\a +.endm + +.macro EIGHT_ROUNDS offset, a, b, c, d, e, f, g, h + DO_ROUND a3, \offset, \a, \b, \c, \d, \e, \f, \g, \h, + DO_ROUND a3, \offset + 4, \h, \a, \b, \c, \d, \e, \f, \g, + DO_ROUND a3, \offset + 8, \g, \h, \a, \b, \c, \d, \e, \f, + DO_ROUND a3, \offset + 12, \f, \g, \h, \a, \b, \c, \d, \e, + DO_ROUND a3, \offset + 16, \e, \f, \g ,\h ,\a ,\b ,\c ,\d, + DO_ROUND a3, \offset + 20, \d ,\e ,\f ,\g ,\h ,\a ,\b ,\c, + DO_ROUND a3, \offset + 24, \c ,\d ,\e ,\f ,\g ,\h ,\a ,\b, + DO_ROUND a3, \offset + 28, \b ,\c ,\d ,\e ,\f ,\g ,\h ,\a, +.endm + +.text +.global hashtree_sha256_riscv_zbb_x1 +.type hashtree_sha256_riscv_zbb_x1,%function +.align 4 +hashtree_sha256_riscv_zbb_x1: + // set the stack (assumed to be aligned) + addi sp,sp,-FRAMEZ + sd s10, 64(sp) + sd s11, 72(sp) + + //initialize the constants + la a3, .LK256 + slli a2, a2, 5 + add a2, a2, a0 + +.Lsha256_riscv_zbb_x1_loop: + beq a0, a2, .Lsha256_riscv_zbb_x1_epilogue + // initialize the digest variables + lwu t0,DIGEST_OFFSET(a3) + lwu t1,DIGEST_OFFSET+4(a3) + lwu t2,DIGEST_OFFSET+8(a3) + lwu t3,DIGEST_OFFSET+12(a3) + lwu t4,DIGEST_OFFSET+16(a3) + lwu t5,DIGEST_OFFSET+20(a3) + lwu t6,DIGEST_OFFSET+24(a3) + lwu a7,DIGEST_OFFSET+28(a3) + + // First 16 rounds we consume the incoming words directly + FIRST_SIXTEEN_ROUNDS t0, t1, t2, t3, t4, t5, t6, a7 + // update the input pointer + addi a1, a1, 64 + + // The next 48 rounds we consume the scheduled words + .set .Lround_offset, 64 + .rept 6 + EIGHT_ROUNDS_AND_SCHED .Lround_offset, t0, t1, t2, t3, t4, t5, t6, a7 + .set .Lround_offset, .Lround_offset + 32 + .endr + + // Add the original digest + lwu a6, DIGEST_OFFSET(a3) + lwu a5, DIGEST_OFFSET+4(a3) + addw t0, t0, a6 + lwu s11, DIGEST_OFFSET+8(a3) + addw t1, t1, a5 + lwu a6, DIGEST_OFFSET+12(a3) + addw t2, t2, s11 + lwu a5, DIGEST_OFFSET+16(a3) + addw t3, t3, a6 + lwu s11, DIGEST_OFFSET+20(a3) + addw t4, t4, a5 + lwu a6, DIGEST_OFFSET+24(a3) + addw t5, t5, s11 + lwu a5, DIGEST_OFFSET+28(a3) + addw t6, t6, a6 + addw a7, a7, a5 + + // Save the digest, we can use the scheduled word storage + sw t0, 0(sp) + sw t1, 4(sp) + sw t2, 8(sp) + sw t3, 12(sp) + sw t4, 16(sp) + sw t5, 20(sp) + sw t6, 24(sp) + sw a7, 28(sp) + + // Rounds with padding + .set .Lround_offset, 
PADDING_OFFSET + .rept 8 + EIGHT_ROUNDS .Lround_offset, t0, t1, t2, t3, t4, t5, t6, a7 + .set .Lround_offset, .Lround_offset + 32 + .endr + + // Add the previous digest + lwu a6, 0(sp) + lwu a5, 4(sp) + addw t0, t0, a6 + lwu s11, 8(sp) + addw t1, t1, a5 + lwu a6, 12(sp) + addw t2, t2, s11 + lwu a5, 16(sp) + addw t3, t3, a6 + lwu s11, 20(sp) + addw t4, t4, a5 + lwu a6, 24(sp) + addw t5, t5, s11 + lwu a5, 28(sp) + addw t6, t6, a6 + addw a7, a7, a5 + + // Byte swap to little-endian and save + BSWP t0 + sw t0, 0(a0) + BSWP t1 + sw t1, 4(a0) + BSWP t2 + sw t2, 8(a0) + BSWP t3 + sw t3, 12(a0) + BSWP t4 + sw t4, 16(a0) + BSWP t5 + sw t5, 20(a0) + BSWP t6 + sw t6, 24(a0) + BSWP a7 + sw a7, 28(a0) + addi a0, a0, 32 + j .Lsha256_riscv_zbb_x1_loop + +.Lsha256_riscv_zbb_x1_epilogue: + // restore the stack + ld s10, 64(sp) + ld s11, 72(sp) + addi sp,sp,FRAMEZ + ret + +#ifdef __linux__ +.size hashtree_sha256_riscv_zbb_x1,.-hashtree_sha256_riscv_zbb_x1 +.section .note.GNU-stack,"",@progbits +#endif + +#endif // riscv diff --git a/src/test.c b/src/test.c index cd12c12..dd6a501 100644 --- a/src/test.c +++ b/src/test.c @@ -504,6 +504,78 @@ void test_hash_openssl() { } #endif +#ifdef __riscv +void test_hash_riscv_x1_one_block() { + unsigned char digest[32]; + + hashtree_sha256_riscv_x1(digest, test_16_block, 1); + + TEST_CHECK(digests_equal(digest, test_1_digest, sizeof(digest))); + TEST_DUMP("Expected: ", test_1_digest, sizeof(test_1_digest)); + TEST_DUMP("Produced: ", digest, sizeof(digest)); +} + +void test_hash_riscv_x1_multiple_blocks() { + unsigned char digest[128]; + + hashtree_sha256_riscv_x1(digest, test_16_block, 4); + + TEST_CHECK(sizeof(digest) == sizeof(test_4_digests)); + TEST_MSG("Expected: %lu", sizeof(test_4_digests)); + TEST_MSG("Produced: %lu", sizeof(digest)); + + TEST_CHECK(digests_equal(digest, test_4_digests, sizeof(digest))); + TEST_DUMP("Expected: ", test_4_digests, sizeof(test_4_digests)); + TEST_DUMP("Produced: ", digest, sizeof(digest)); +} +void test_hash_riscv_zbb_x1_one_block() { + unsigned char digest[32]; + + hashtree_sha256_riscv_zbb_x1(digest, test_16_block, 1); + + TEST_CHECK(digests_equal(digest, test_1_digest, sizeof(digest))); + TEST_DUMP("Expected: ", test_1_digest, sizeof(test_1_digest)); + TEST_DUMP("Produced: ", digest, sizeof(digest)); +} + +void test_hash_riscv_zbb_x1_multiple_blocks() { + unsigned char digest[128]; + + hashtree_sha256_riscv_zbb_x1(digest, test_16_block, 4); + + TEST_CHECK(sizeof(digest) == sizeof(test_4_digests)); + TEST_MSG("Expected: %lu", sizeof(test_4_digests)); + TEST_MSG("Produced: %lu", sizeof(digest)); + + TEST_CHECK(digests_equal(digest, test_4_digests, sizeof(digest))); + TEST_DUMP("Expected: ", test_4_digests, sizeof(test_4_digests)); + TEST_DUMP("Produced: ", digest, sizeof(digest)); +} +void test_hash_riscv_crypto_one_block() { + unsigned char digest[32]; + + hashtree_sha256_riscv_crypto(digest, test_16_block, 1); + + TEST_CHECK(digests_equal(digest, test_1_digest, sizeof(digest))); + TEST_DUMP("Expected: ", test_1_digest, sizeof(test_1_digest)); + TEST_DUMP("Produced: ", digest, sizeof(digest)); +} + +void test_hash_riscv_crypto_multiple_blocks() { + unsigned char digest[128]; + + hashtree_sha256_riscv_crypto(digest, test_16_block, 4); + + TEST_CHECK(sizeof(digest) == sizeof(test_4_digests)); + TEST_MSG("Expected: %lu", sizeof(test_4_digests)); + TEST_MSG("Produced: %lu", sizeof(digest)); + + TEST_CHECK(digests_equal(digest, test_4_digests, sizeof(digest))); + TEST_DUMP("Expected: ", test_4_digests, sizeof(test_4_digests)); + 
TEST_DUMP("Produced: ", digest, sizeof(digest)); +} +#endif + #ifdef __aarch64__ void test_hash_armv8_neon_x1_one_block() { unsigned char digest[32]; @@ -595,6 +667,14 @@ TEST_LIST = {{"hash", test_hash}, {"hash_avx_16", test_hash_avx_16}, {"hash_avx_16_30blocks", test_hash_avx512_30_blocks}, #endif +#ifdef __riscv + {"hash_riscv_x1_one_block", test_hash_riscv_x1_one_block}, + {"hash_riscv_x1_multiple_blocks", test_hash_riscv_x1_multiple_blocks}, + {"hash_riscv_zbb_x1_one_block", test_hash_riscv_zbb_x1_one_block}, + {"hash_riscv_zbb_x1_multiple_blocks", test_hash_riscv_zbb_x1_multiple_blocks}, + {"hash_riscv_crypto_one_block", test_hash_riscv_crypto_one_block}, + {"hash_riscv_crypto_multiple_blocks", test_hash_riscv_crypto_multiple_blocks}, +#endif #ifdef __aarch64__ {"hash_armv8_neon_one_block", test_hash_armv8_neon_x1_one_block}, {"hash_armv8_neon_multiple_blocks", test_hash_armv8_neon_x1_multiple_blocks}, diff --git a/wrapper_linux_riscv64.s b/wrapper_linux_riscv64.s new file mode 100644 index 0000000..9ec047a --- /dev/null +++ b/wrapper_linux_riscv64.s @@ -0,0 +1,9 @@ +// +build linux,riscv64 + +TEXT ·HashtreeHash(SB), 0, $0-24 + MOV output+0(FP), A0 + MOV input+8(FP), A1 + MOV count+16(FP), A2 + + CALL hashtree_hash(SB) + RET