From 2d5f45306a7ce4c0ca5fe5c273964e82ca43ddb1 Mon Sep 17 00:00:00 2001 From: Aman Date: Mon, 8 Sep 2025 15:20:13 +0100 Subject: [PATCH] polkavm-compiler: Uninline division and remainder instructions Instead of compiling individual division and remainder instructions inline, we now compile them as calls to trampoline functions. This will reduce the maximum number of compiled instruction per PolkaVM instruction. We are doing this because division and remainder instructions are quite rarely used in practice, and inlining them bloats the compiled code size significantly. Additionally, we have made division and remainder operations branchless. Signed-off-by: Aman --- crates/polkavm/src/compiler.rs | 25 ++ crates/polkavm/src/compiler/amd64.rs | 345 +++++++++++------- crates/polkavm/src/tests.rs | 2 +- .../riscv-tests/output/rv64um/div.elf | Bin 2880 -> 2968 bytes .../riscv-tests/output/rv64um/divw.elf | Bin 2856 -> 2936 bytes guest-programs/riscv-tests/tests/rv64um/div.S | 2 + .../riscv-tests/tests/rv64um/divw.S | 2 + 7 files changed, 243 insertions(+), 133 deletions(-) diff --git a/crates/polkavm/src/compiler.rs b/crates/polkavm/src/compiler.rs index e3550ee1..84ae819b 100644 --- a/crates/polkavm/src/compiler.rs +++ b/crates/polkavm/src/compiler.rs @@ -94,6 +94,14 @@ where step_label: Label, trap_label: Label, memset_label: Label, + div32u_label: Label, + div32s_label: Label, + div64u_label: Label, + div64s_label: Label, + rem32u_label: Label, + rem32s_label: Label, + rem64u_label: Label, + rem64s_label: Label, invalid_jump_label: Label, instruction_set: RuntimeInstructionSet, memset_trampoline_start: usize, @@ -210,6 +218,14 @@ where let jump_table_label = asm.forward_declare_label(); let sbrk_label = asm.forward_declare_label(); let memset_label = asm.forward_declare_label(); + let div32u_label = asm.forward_declare_label(); + let div32s_label = asm.forward_declare_label(); + let div64u_label = asm.forward_declare_label(); + let div64s_label = asm.forward_declare_label(); + let rem32u_label = asm.forward_declare_label(); + let rem32s_label = asm.forward_declare_label(); + let rem64u_label = asm.forward_declare_label(); + let rem64s_label = asm.forward_declare_label(); polkavm_common::static_assert!(polkavm_common::zygote::VM_SANDBOX_MAXIMUM_NATIVE_CODE_SIZE < u32::MAX); @@ -237,6 +253,14 @@ where jump_table_label, sbrk_label, memset_label, + div32u_label, + div32s_label, + div64u_label, + div64s_label, + rem32u_label, + rem32s_label, + rem64u_label, + rem64s_label, gas_metering: config.gas_metering, step_tracing, program_counter_to_machine_code_offset_list, @@ -254,6 +278,7 @@ where ArchVisitor(&mut visitor).emit_trap_trampoline(); ArchVisitor(&mut visitor).emit_ecall_trampoline(); ArchVisitor(&mut visitor).emit_sbrk_trampoline(); + ArchVisitor(&mut visitor).emit_divrem_trampoline(); if config.gas_metering.is_some() { visitor.memset_trampoline_start = visitor.asm.len(); diff --git a/crates/polkavm/src/compiler/amd64.rs b/crates/polkavm/src/compiler/amd64.rs index 014c4d24..91268515 100644 --- a/crates/polkavm/src/compiler/amd64.rs +++ b/crates/polkavm/src/compiler/amd64.rs @@ -147,11 +147,13 @@ macro_rules! load_store_operand { }; } +#[derive(Copy, Clone)] enum Signedness { Signed, Unsigned, } +#[derive(Copy, Clone)] enum DivRem { Div, Rem, @@ -352,24 +354,6 @@ where }); } - #[cfg_attr(not(debug_assertions), inline(always))] - fn clear_reg(&mut self, reg: RawReg) { - let reg = conv_reg(reg); - self.push(xor((RegSize::R32, reg, reg))); - } - - #[cfg_attr(not(debug_assertions), inline(always))] - fn fill_with_ones(&mut self, reg: RawReg) { - match self.reg_size() { - RegSize::R32 => { - self.push(mov_imm(conv_reg(reg), imm32(0xffffffff))); - } - RegSize::R64 => { - self.push(mov_imm(conv_reg(reg), imm64(cast(0xffffffff_u32).to_signed()))); - } - } - } - #[cfg_attr(not(debug_assertions), inline(always))] fn compare_reg_reg(&mut self, d: RawReg, s1: RawReg, s2: RawReg, condition: Condition) { let reg_size = self.reg_size(); @@ -552,112 +536,6 @@ where asm.assert_reserved_exactly_as_needed(); } - fn div_rem(&mut self, reg_size: RegSize, d: RawReg, s1: RawReg, s2: RawReg, div_rem: DivRem, kind: Signedness) { - // Unlike most other architectures RISC-V doesn't trap on division by zero - // nor on division with overflow, and has well defined results in such cases. - - let label_divisor_is_zero = self.asm.forward_declare_label(); - let label_next = self.asm.forward_declare_label(); - - self.push(test((reg_size, conv_reg(s2), conv_reg(s2)))); - self.push(jcc_label8(Condition::Equal, label_divisor_is_zero)); - - if matches!(kind, Signedness::Signed) { - let label_normal = self.asm.forward_declare_label(); - match reg_size { - RegSize::R32 => { - self.push(cmp((conv_reg(s1), imm32(cast(i32::MIN).to_unsigned())))); - self.push(jcc_label8(Condition::NotEqual, label_normal)); - } - RegSize::R64 => { - self.push(mov(RegSize::R64, TMP_REG, conv_reg(s1))); - self.push(cmp((TMP_REG, imm32(0)))); - self.push(jcc_label8(Condition::NotEqual, label_normal)); - self.push(shr_imm(RegSize::R64, TMP_REG, 32)); - self.push(cmp((TMP_REG, imm32(cast(i32::MIN).to_unsigned())))); - self.push(jcc_label8(Condition::NotEqual, label_normal)); - } - } - - match reg_size { - RegSize::R32 => self.push(cmp((conv_reg(s2), imm32(cast(-1_i32).to_unsigned())))), - RegSize::R64 => self.push(cmp((conv_reg(s2), imm64(-1_i32)))), - } - self.push(jcc_label8(Condition::NotEqual, label_normal)); - - match div_rem { - DivRem::Div => self.mov(d, s1), - DivRem::Rem => self.clear_reg(d), - } - self.push(jmp_label8(label_next)); - self.define_label(label_normal); - } - - // The division instruction always accepts the dividend and returns the result in rdx:rax. - // This isn't great because we're using these registers for the VM's registers, so we need - // to do all of this in such a way that those won't be accidentally overwritten. - - const _: () = { - assert!(TMP_REG as u32 != rdx as u32); - assert!(TMP_REG as u32 != rax as u32); - }; - - // Push the registers that will be clobbered. - self.push(push(rdx)); - self.push(push(rax)); - - // Push the operands. - self.push(push(conv_reg(s1))); - self.push(push(conv_reg(s2))); - - // Pop the divisor. - self.push(pop(TMP_REG)); - - // Pop the dividend. - self.push(xor((RegSize::R32, rdx, rdx))); - self.push(pop(rax)); - - match kind { - Signedness::Unsigned => self.push(div(reg_size, TMP_REG)), - Signedness::Signed => { - match reg_size { - RegSize::R32 => self.push(cdq()), - RegSize::R64 => self.push(cqo()), - } - self.push(idiv(reg_size, TMP_REG)) - } - } - - // Move the result to the temporary register. - match div_rem { - DivRem::Div => self.push(mov(reg_size, TMP_REG, rax)), - DivRem::Rem => self.push(mov(reg_size, TMP_REG, rdx)), - } - - // Restore the original registers. - self.push(pop(rax)); - self.push(pop(rdx)); - - // Move the output into the destination registers. - self.push(mov(reg_size, conv_reg(d), TMP_REG)); - - // Go to the next instruction. - self.push(jmp_label8(label_next)); - - self.define_label(label_divisor_is_zero); - match div_rem { - DivRem::Div => self.fill_with_ones(d), - DivRem::Rem if d == s1 => {} - DivRem::Rem => self.mov(d, s1), - } - - self.define_label(label_next); - - if (B::BITNESS, reg_size) == (Bitness::B64, RegSize::R32) { - self.push(movsxd_32_to_64(conv_reg(d), conv_reg(d))) - } - } - #[cfg_attr(not(debug_assertions), inline(always))] fn vmctx_field(offset: usize) -> MemOp { match S::KIND { @@ -831,6 +709,187 @@ where self.push(jmp(TMP_REG)); } + fn emit_divrem_trampoline_common(&mut self, reg_size: RegSize, div_rem: DivRem, kind: Signedness) { + let label = match (reg_size, div_rem, kind) { + (RegSize::R32, DivRem::Div, Signedness::Unsigned) => { + log::trace!("Emitting trampoline: divu32"); + self.div32u_label + } + (RegSize::R32, DivRem::Div, Signedness::Signed) => { + log::trace!("Emitting trampoline: divs32"); + self.div32s_label + } + (RegSize::R32, DivRem::Rem, Signedness::Unsigned) => { + log::trace!("Emitting trampoline: remu32"); + self.rem32u_label + } + (RegSize::R32, DivRem::Rem, Signedness::Signed) => { + log::trace!("Emitting trampoline: rems32"); + self.rem32s_label + } + (RegSize::R64, DivRem::Div, Signedness::Unsigned) => { + log::trace!("Emitting trampoline: divu64"); + self.div64u_label + } + (RegSize::R64, DivRem::Div, Signedness::Signed) => { + log::trace!("Emitting trampoline: divs64"); + self.div64s_label + } + (RegSize::R64, DivRem::Rem, Signedness::Unsigned) => { + log::trace!("Emitting trampoline: remu64"); + self.rem64u_label + } + (RegSize::R64, DivRem::Rem, Signedness::Signed) => { + log::trace!("Emitting trampoline: rems64"); + self.rem64s_label + } + }; + + self.define_label(label); + self.push(push(rax)); + self.push(push(rdx)); + self.push(push(rbx)); + + let dividend_address = if matches!(kind, Signedness::Signed) { + self.push(push(r12)); + + reg_indirect(RegSize::R64, rsp + 40) + } else { + reg_indirect(RegSize::R64, rsp + 32) + }; + + // Dividend in rax, divisor in rcx (TMP_REG). + self.push(load(LoadKind::U64, rax, dividend_address)); + + // Create mask for division-by-zero check. + // if divisor == 0 then rbx = 1 else rbx = 0 + self.push(xor((RegSize::R32, rbx, rbx))); + self.push(test((reg_size, TMP_REG, TMP_REG))); + self.push(setcc(Condition::Equal, rbx)); + + if matches!(kind, Signedness::Signed) { + // rdx = (dividend == i32::MIN) ? 1 : 0 + match reg_size { + RegSize::R32 => self.push(mov_imm(rdx, imm32(cast(i32::MIN).to_unsigned()))), + RegSize::R64 => self.push(mov_imm64(rdx, cast(i64::MIN).to_unsigned())), + } + + self.push(cmp((reg_size, rax, rdx))); + self.push(setcc(Condition::Equal, rdx)); + + // r12 = (divisor == -1) ? 1 : 0 + self.push(xor((RegSize::R32, r12, r12))); + match reg_size { + RegSize::R32 => self.push(cmp((TMP_REG, imm32(cast(-1_i32).to_unsigned())))), + RegSize::R64 => self.push(cmp((TMP_REG, imm64(-1_i32)))), + } + self.push(setcc(Condition::Equal, r12)); + + // r12 = (dividend == i32::MIN && divisor == -1) ? 1 : 0 + self.push(and((reg_size, r12, rdx))); + + // ZF is tainted, update it to reflect whether we have a division-by-zero or overflow. + self.push(mov(reg_size, rdx, rbx)); + self.push(or((reg_size, rdx, r12))); + + // safe_divisor = if mask != 0 then 1 else divisor + self.push(cmov(Condition::NotEqual, reg_size, TMP_REG, rdx)); + } else { + // safe_divisor = if mask != 0 then 1 else divisor + self.push(cmov(Condition::Equal, reg_size, TMP_REG, rbx)); + } + + // Actual division. + match kind { + Signedness::Unsigned => { + self.push(xor((RegSize::R32, rdx, rdx))); + self.push(div(reg_size, TMP_REG)) + } + Signedness::Signed => { + match reg_size { + RegSize::R32 => self.push(cdq()), + RegSize::R64 => self.push(cqo()), + } + self.push(idiv(reg_size, TMP_REG)) + } + } + + let result = match div_rem { + DivRem::Div => { + // select quotient: if overflow, return i32::MIN + if matches!(kind, Signedness::Signed) { + // r12 = (dividend == i32::MIN && divisor == -1) ? i32::MIN : 0 + match reg_size { + RegSize::R32 => self.push(shl_imm(reg_size, r12, 31)), + RegSize::R64 => self.push(shl_imm(reg_size, r12, 63)), + } + + self.push(cmov(Condition::NotEqual, reg_size, rax, r12)); + } + + // select quotient: if divisor was 0, return -1 + self.push(neg(reg_size, rbx)); + self.push(cmov(Condition::NotEqual, reg_size, rax, rbx)); + + rax + } + DivRem::Rem => { + // select remainder: if overflow, return 0 + if matches!(kind, Signedness::Signed) { + self.push(xor((r12, imm32(1)))); + self.push(cmov(Condition::Equal, reg_size, rdx, r12)); + } + + // select remainder: if divisor was 0, return dividend + self.push(test((reg_size, rbx, rbx))); + self.push(cmov(Condition::NotEqual, reg_size, rdx, dividend_address)); + + rdx + } + }; + + if reg_size == RegSize::R32 && B::BITNESS == Bitness::B64 { + self.push(movsxd_32_to_64(TMP_REG, result)); + } else { + self.push(mov(reg_size, TMP_REG, result)); + } + + // Restore stack and return. + if matches!(kind, Signedness::Signed) { + self.push(pop(r12)); + } + + self.push(pop(rbx)); + self.push(pop(rdx)); + self.push(pop(rax)); + self.push(ret()); + } + + pub(crate) fn emit_divrem_trampoline(&mut self) { + log::trace!("Emitting trampoline: divrem"); + + polkavm_common::static_assert!(TMP_REG as u32 != rdx as u32); + polkavm_common::static_assert!(TMP_REG as u32 != rax as u32); + + // Unsigned 32-bit division. + self.emit_divrem_trampoline_common(RegSize::R32, DivRem::Div, Signedness::Unsigned); + self.emit_divrem_trampoline_common(RegSize::R32, DivRem::Rem, Signedness::Unsigned); + + // Signed 32-bit division. + self.emit_divrem_trampoline_common(RegSize::R32, DivRem::Div, Signedness::Signed); + self.emit_divrem_trampoline_common(RegSize::R32, DivRem::Rem, Signedness::Signed); + + if B::BITNESS == Bitness::B64 { + // Unsigned 64-bit division. + self.emit_divrem_trampoline_common(RegSize::R64, DivRem::Div, Signedness::Unsigned); + self.emit_divrem_trampoline_common(RegSize::R64, DivRem::Rem, Signedness::Unsigned); + + // Signed 64-bit division. + self.emit_divrem_trampoline_common(RegSize::R64, DivRem::Div, Signedness::Signed); + self.emit_divrem_trampoline_common(RegSize::R64, DivRem::Rem, Signedness::Signed); + } + } + pub(crate) fn trace_execution(&mut self, code_offset: u32) { let step_label = self.step_label; let asm = self.asm.reserve::(); @@ -1781,48 +1840,70 @@ where } } + #[cfg_attr(not(debug_assertions), inline(always))] + fn divrem(&mut self, reg_size: RegSize, div_rem: DivRem, kind: Signedness, d: RawReg, s1: RawReg, s2: RawReg) { + let label = match (reg_size, div_rem, kind) { + (RegSize::R32, DivRem::Div, Signedness::Unsigned) => self.div32u_label, + (RegSize::R32, DivRem::Div, Signedness::Signed) => self.div32s_label, + (RegSize::R32, DivRem::Rem, Signedness::Unsigned) => self.rem32u_label, + (RegSize::R32, DivRem::Rem, Signedness::Signed) => self.rem32s_label, + (RegSize::R64, DivRem::Div, Signedness::Unsigned) => self.div64u_label, + (RegSize::R64, DivRem::Div, Signedness::Signed) => self.div64s_label, + (RegSize::R64, DivRem::Rem, Signedness::Unsigned) => self.rem64u_label, + (RegSize::R64, DivRem::Rem, Signedness::Signed) => self.rem64s_label, + }; + + let asm = self.asm.reserve::(); + let asm = asm.push(mov(reg_size, TMP_REG, conv_reg(s2))); + let asm = asm.push(push(conv_reg(s1))); + let asm = asm.push(call_label32(label)); + let asm = asm.push(pop(conv_reg(d))); + let asm = asm.push(mov(RegSize::R64, conv_reg(d), TMP_REG)); + asm.assert_reserved_exactly_as_needed(); + } + #[inline(always)] pub fn div_unsigned_32(&mut self, d: RawReg, s1: RawReg, s2: RawReg) { - self.div_rem(RegSize::R32, d, s1, s2, DivRem::Div, Signedness::Unsigned); + self.divrem(RegSize::R32, DivRem::Div, Signedness::Unsigned, d, s1, s2); } #[inline(always)] pub fn div_unsigned_64(&mut self, d: RawReg, s1: RawReg, s2: RawReg) { assert_eq!(B::BITNESS, Bitness::B64); - self.div_rem(RegSize::R64, d, s1, s2, DivRem::Div, Signedness::Unsigned); + self.divrem(RegSize::R64, DivRem::Div, Signedness::Unsigned, d, s1, s2); } #[inline(always)] pub fn div_signed_32(&mut self, d: RawReg, s1: RawReg, s2: RawReg) { - self.div_rem(RegSize::R32, d, s1, s2, DivRem::Div, Signedness::Signed); + self.divrem(RegSize::R32, DivRem::Div, Signedness::Signed, d, s1, s2); } #[inline(always)] pub fn div_signed_64(&mut self, d: RawReg, s1: RawReg, s2: RawReg) { assert_eq!(B::BITNESS, Bitness::B64); - self.div_rem(RegSize::R64, d, s1, s2, DivRem::Div, Signedness::Signed); + self.divrem(RegSize::R64, DivRem::Div, Signedness::Signed, d, s1, s2); } #[inline(always)] pub fn rem_unsigned_32(&mut self, d: RawReg, s1: RawReg, s2: RawReg) { - self.div_rem(RegSize::R32, d, s1, s2, DivRem::Rem, Signedness::Unsigned); + self.divrem(RegSize::R32, DivRem::Rem, Signedness::Unsigned, d, s1, s2); } #[inline(always)] pub fn rem_unsigned_64(&mut self, d: RawReg, s1: RawReg, s2: RawReg) { assert_eq!(B::BITNESS, Bitness::B64); - self.div_rem(RegSize::R64, d, s1, s2, DivRem::Rem, Signedness::Unsigned); + self.divrem(RegSize::R64, DivRem::Rem, Signedness::Unsigned, d, s1, s2); } #[inline(always)] pub fn rem_signed_32(&mut self, d: RawReg, s1: RawReg, s2: RawReg) { - self.div_rem(RegSize::R32, d, s1, s2, DivRem::Rem, Signedness::Signed); + self.divrem(RegSize::R32, DivRem::Rem, Signedness::Signed, d, s1, s2); } #[inline(always)] pub fn rem_signed_64(&mut self, d: RawReg, s1: RawReg, s2: RawReg) { assert_eq!(B::BITNESS, Bitness::B64); - self.div_rem(RegSize::R64, d, s1, s2, DivRem::Rem, Signedness::Signed); + self.divrem(RegSize::R64, DivRem::Rem, Signedness::Signed, d, s1, s2); } #[inline(always)] diff --git a/crates/polkavm/src/tests.rs b/crates/polkavm/src/tests.rs index e22084f7..0b53316f 100644 --- a/crates/polkavm/src/tests.rs +++ b/crates/polkavm/src/tests.rs @@ -4135,7 +4135,7 @@ fn run_riscv_test(engine_config: Config, elf: &[u8], testnum_reg: Reg, optimize: instance.set_next_program_counter(entry_point); let result = instance.run().unwrap(); if !matches!(result, InterruptKind::Finished) { - panic!("test {} failed: unexpecte result: {result:?}", instance.reg(testnum_reg)); + panic!("test {} failed: unexpected result: {result:?}", instance.reg(testnum_reg)); } } diff --git a/guest-programs/riscv-tests/output/rv64um/div.elf b/guest-programs/riscv-tests/output/rv64um/div.elf index 85e4d901db04236aaba2fe0a239cf0d0dd383180..8733421f5f2d7dd772ab6daee27ebe6d2f5e5b01 100755 GIT binary patch delta 754 zcmZ{i&r2Io5XWcU+a$Ydl!qG=4Tau!G=(1548t{^k$B|2U!FY2SM=h#CEa|1n|RRKy=&`a(HW9Mgr9KfJ?FY4Ib<4BXSVcjMtB~(9hd9sFg=Zai zXce{Aj%026Dr=cK)G`fNQqcV1-+q5zeYXfmzi>Zq+R(kUodk>9q7(5$*n;-u)yg-F znl)9Q^#ej$#Q7m}1^0o<$9Yq88D5q?X$3|6Tj2^-xvJ; z4}W;lp{c}CYWdC5^1{f->0M$1N1@$R_~W~I8M8Wg&OnZ7K~t$&%+qS>5=&II$DL2% zq_7of&wk;Wu9Y4#^6lc^XxJ$U&c1Y}@qo6SGRs`EZ_jz3z*UELDJMZ5Z;)!dNw!tet6482RGG7+ z-=N#RJF|o%Rt)c%Pi6pk%|0y+!F#V)c1p8K#^B%CAn^(T24G|4A=Iq-!mQHNxn?H{wKu Ai2wiq delta 592 zcmZ`$Jxc>Y5S`uo$lb-_!hIPOO&3wYR54(2v9t8>48 zhd3gaBy7;&cnI?+dBQsPvfm}~NRz?fie(js;~@PXzm0GXf_T{YIlN(qw~ikomWTH_ z_BqDqRlNmng9Z54)E^z8aifTNl5WS1tBf+)nMM~+6GtSCt;8c80bWRXImZ?ghJ>%u zjy&xpz?8f@Q#rEDHsrIrfF0SENAUtue%ZpDk|BraC^iw$S8U^ePAvOzPO{R%4`oZv Rv7ZRzCz2&I#6u^U{RKzTc{Bh3 diff --git a/guest-programs/riscv-tests/tests/rv64um/div.S b/guest-programs/riscv-tests/tests/rv64um/div.S index ee21f0c9..146e86fc 100644 --- a/guest-programs/riscv-tests/tests/rv64um/div.S +++ b/guest-programs/riscv-tests/tests/rv64um/div.S @@ -29,6 +29,8 @@ RVTEST_CODE_BEGIN TEST_RR_OP( 9, div, -1, 1, 0 ); TEST_RR_OP(10, div, -1, 0, 0 ); + TEST_RR_OP(11, div, -1<<62, -1<<63, 2 ); + TEST_PASSFAIL RVTEST_CODE_END diff --git a/guest-programs/riscv-tests/tests/rv64um/divw.S b/guest-programs/riscv-tests/tests/rv64um/divw.S index 4cffa1aa..47119814 100644 --- a/guest-programs/riscv-tests/tests/rv64um/divw.S +++ b/guest-programs/riscv-tests/tests/rv64um/divw.S @@ -29,6 +29,8 @@ RVTEST_CODE_BEGIN TEST_RR_OP( 9, divw, -1, 1, 0 ); TEST_RR_OP(10, divw, -1, 0, 0 ); + TEST_RR_OP(11, div, -1<<30, -1<<31, 2 ); + TEST_PASSFAIL RVTEST_CODE_END