Skip to content

Commit

Permalink
loongarch: sljit_emit_atomic_load/store implementation
Browse files Browse the repository at this point in the history
Assumes all atomics are at least 32-bit aligned, which is declared
by setting SLJIT_ATOMIC_WIDTH.

In that mode, only the bits that correspond to the operand
size are used and the rest are presumed to be 0.

Operations on addresses with a lower alignment than that are
undefined and will result in a SIGBUS.

If SLJIT_ATOMIC_EMULATION is defined, then additional code
is enabled that allows manipulating those narrower types at
a significant performance cost.

Original code by Mingtao, all bugs mine.
  • Loading branch information
carenas committed Jun 11, 2023
1 parent 818706a commit 8797280
Show file tree
Hide file tree
Showing 6 changed files with 246 additions and 79 deletions.
5 changes: 5 additions & 0 deletions API_CHANGES
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
This file is the short summary of the API changes:

10.06.2023 - Backward compatible
Emulation for atomic operations in narrow bit widths is
configurable at build time; SLJIT_HAS_ATOMIC_{8,16}BIT
report availability at runtime.

16.02.2022 - Non-backward compatible
The sljit_emit_cmov operation is replaced
by sljit_emit_select.
Expand Down
5 changes: 5 additions & 0 deletions sljit_src/sljitConfig.h
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,11 @@ extern "C" {
#define SLJIT_VERBOSE 1
#endif

/* Atomic emulation level.
   When set to a non-zero value, additional code is enabled that allows
   atomic operations on types narrower than SLJIT_ATOMIC_WIDTH, at a
   significant performance cost. Defaults to 0 (disabled) unless the
   build defines it. */
#ifndef SLJIT_ATOMIC_EMULATION
#define SLJIT_ATOMIC_EMULATION 0
#endif

/*
SLJIT_IS_FPU_AVAILABLE
The availability of the FPU can be controlled by SLJIT_IS_FPU_AVAILABLE.
Expand Down
3 changes: 2 additions & 1 deletion sljit_src/sljitConfigInternal.h
Original file line number Diff line number Diff line change
Expand Up @@ -779,13 +779,14 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_sw sljit_exec_offset(void* ptr);

#elif (defined SLJIT_CONFIG_LOONGARCH && SLJIT_CONFIG_LOONGARCH)

/* NOTE(review): SLJIT_NUMBER_OF_REGISTERS appears twice with different
   values; this looks like the before/after sides of a change (one register
   presumably given up for the new TMP_REG4) - confirm which one is live. */
#define SLJIT_NUMBER_OF_REGISTERS 23
#define SLJIT_NUMBER_OF_REGISTERS 22
#define SLJIT_NUMBER_OF_SAVED_REGISTERS 10
#define SLJIT_LOCALS_OFFSET_BASE 0
#define SLJIT_NUMBER_OF_FLOAT_REGISTERS 30
#define SLJIT_NUMBER_OF_SAVED_FLOAT_REGISTERS 12
#define SLJIT_MASKED_SHIFT 1
#define SLJIT_MASKED_SHIFT32 1
/* Minimum bit width supported by atomic operations: addresses passed to
   sljit_emit_atomic_load / sljit_emit_atomic_store must be aligned to it
   unless SLJIT_ATOMIC_EMULATION is enabled. */
#define SLJIT_ATOMIC_WIDTH 32

#elif (defined SLJIT_CONFIG_UNSUPPORTED && SLJIT_CONFIG_UNSUPPORTED)

Expand Down
10 changes: 9 additions & 1 deletion sljit_src/sljitLir.h
Original file line number Diff line number Diff line change
Expand Up @@ -658,6 +658,10 @@ static SLJIT_INLINE sljit_uw sljit_get_generated_code_size(struct sljit_compiler
#define SLJIT_HAS_COPY_F32 9
/* [Emulated] Copy from/to f64 operation is available (see sljit_emit_fcopy). */
#define SLJIT_HAS_COPY_F64 10
/* [Emulated] Loading/Storing 8bit atomics is available
   (see sljit_emit_atomic_load and sljit_emit_atomic_store). */
#define SLJIT_HAS_ATOMIC_8BIT 11
/* [Emulated] Loading/Storing 16bit atomics is available
   (see sljit_emit_atomic_load and sljit_emit_atomic_store). */
#define SLJIT_HAS_ATOMIC_16BIT 12

#if (defined SLJIT_CONFIG_X86 && SLJIT_CONFIG_X86)
/* [Not emulated] SSE2 support is available on x86. */
Expand Down Expand Up @@ -1804,7 +1808,11 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fmem_update(struct sljit_compiler
- the memory operation (op) and the base address (stored in mem_reg)
passed to the load/store operations must be the same (the mem_reg
can be a different register, only its value must be the same)
- a store must always follow a load for the same transaction.
- a store must always follow a load for the same transaction, but
loads might be abandoned
- if the CPU defines a minimum bit width supported (SLJIT_ATOMIC_WIDTH)
then the memory address must be aligned to it; alternatively smaller
atomic types can be supported by enabling SLJIT_ATOMIC_EMULATION
op must be between SLJIT_MOV and SLJIT_MOV_P, excluding all
signed loads such as SLJIT_MOV32_S16
Expand Down
133 changes: 117 additions & 16 deletions sljit_src/sljitNativeLOONGARCH_64.c
Original file line number Diff line number Diff line change
Expand Up @@ -36,19 +36,20 @@ typedef sljit_u32 sljit_ins;
/* Internal temporaries: indexes into reg_map placed after the registers
   that are counted by SLJIT_NUMBER_OF_REGISTERS. */
#define TMP_REG1 (SLJIT_NUMBER_OF_REGISTERS + 2)
#define TMP_REG2 (SLJIT_NUMBER_OF_REGISTERS + 3)
#define TMP_REG3 (SLJIT_NUMBER_OF_REGISTERS + 4)
/* Additional temporary; the emulated 8 bit atomic store uses it to hold
   the masked source value. */
#define TMP_REG4 (SLJIT_NUMBER_OF_REGISTERS + 5)
/* Index 0 maps to 0 in reg_map (presumably the hardwired zero register). */
#define TMP_ZERO 0

/* Flags are kept in volatile registers. */
/* NOTE(review): EQUAL_FLAG, OTHER_FLAG and reg_map each appear twice below;
   this looks like the before/after sides of a change in which the flag
   indexes were shifted by one to make room for TMP_REG4 - confirm that only
   the later definitions are live. */
#define EQUAL_FLAG (SLJIT_NUMBER_OF_REGISTERS + 5)
#define EQUAL_FLAG (SLJIT_NUMBER_OF_REGISTERS + 6)
/* The return address register is the same index as TMP_REG2. */
#define RETURN_ADDR_REG TMP_REG2
#define OTHER_FLAG (SLJIT_NUMBER_OF_REGISTERS + 6)
#define OTHER_FLAG (SLJIT_NUMBER_OF_REGISTERS + 7)

/* Internal floating point temporaries. */
#define TMP_FREG1 (SLJIT_NUMBER_OF_FLOAT_REGISTERS + 1)
#define TMP_FREG2 (SLJIT_NUMBER_OF_FLOAT_REGISTERS + 2)


/* reg_map[i] is the number placed into instruction register fields for
   register index i (presumably the hardware register number). The "+ 8"
   table lists 30 values; read with the TMP_REG / flag indexes above it
   gives TMP_REG1 -> 13, TMP_REG2 -> 1, TMP_REG3 -> 14, TMP_REG4 -> 16,
   EQUAL_FLAG -> 12 and OTHER_FLAG -> 15 when SLJIT_NUMBER_OF_REGISTERS
   is 22. */
static const sljit_u8 reg_map[SLJIT_NUMBER_OF_REGISTERS + 7] = {
0, 4, 5, 6, 7, 8, 9, 10, 11, 16, 17, 18, 19, 20, 22, 31, 30, 29, 28, 27, 26, 25, 24, 23, 3, 13, 1, 14, 12, 15
static const sljit_u8 reg_map[SLJIT_NUMBER_OF_REGISTERS + 8] = {
0, 4, 5, 6, 7, 8, 9, 10, 11, 17, 18, 19, 20, 22, 31, 30, 29, 28, 27, 26, 25, 24, 23, 3, 13, 1, 14, 16, 12, 15
};

static const sljit_u8 freg_map[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 3] = {
Expand All @@ -62,7 +63,7 @@ static const sljit_u8 freg_map[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 3] = {
/*
LoongArch instructions are 32 bits wide, belonging to 9 basic instruction formats (and variants of them):
| Format name | Composition |
| Format name | Composition |
| 2R | Opcode + Rj + Rd |
| 3R | Opcode + Rk + Rj + Rd |
| 4R | Opcode + Ra + Rk + Rj + Rd |
Expand Down Expand Up @@ -642,6 +643,10 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_has_cpu_feature(sljit_s32 feature_type)
case SLJIT_HAS_COPY_F64:
return 1;

case SLJIT_HAS_ATOMIC_8BIT:
case SLJIT_HAS_ATOMIC_16BIT:
return SLJIT_ATOMIC_EMULATION;

default:
return 0;
}
Expand Down Expand Up @@ -2449,8 +2454,10 @@ static sljit_ins get_jump_instruction(sljit_s32 type)
{
switch (type) {
case SLJIT_EQUAL:
case SLJIT_ATOMIC_NOT_STORED:
return BNE | RJ(EQUAL_FLAG) | RD(TMP_ZERO);
case SLJIT_NOT_EQUAL:
case SLJIT_ATOMIC_STORED:
return BEQ | RJ(EQUAL_FLAG) | RD(TMP_ZERO);
case SLJIT_LESS:
case SLJIT_GREATER:
Expand Down Expand Up @@ -2734,6 +2741,12 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_flags(struct sljit_compiler *co
FAIL_IF(push_inst(compiler, SLTUI | RD(dst_r) | RJ(EQUAL_FLAG) | IMM_I12(1)));
src_r = dst_r;
break;
case SLJIT_ATOMIC_STORED:
case SLJIT_ATOMIC_NOT_STORED:
FAIL_IF(push_inst(compiler, SLTUI | RD(dst_r) | RJ(EQUAL_FLAG) | IMM_I12(1)));
src_r = dst_r;
invert ^= 0x1;
break;
case SLJIT_OVERFLOW:
case SLJIT_NOT_OVERFLOW:
if (compiler->status_flags_state & (SLJIT_CURRENT_FLAGS_ADD | SLJIT_CURRENT_FLAGS_SUB)) {
Expand Down Expand Up @@ -2933,15 +2946,56 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_atomic_load(struct sljit_compiler
sljit_s32 dst_reg,
sljit_s32 mem_reg)
{
	/*
	 * Emits the load half of an atomic load / store pair.
	 *
	 *   compiler - compiler instance that receives the instructions
	 *   op       - between SLJIT_MOV and SLJIT_MOV_P; only GET_OPCODE(op)
	 *              is examined here
	 *   dst_reg  - register that receives the loaded value
	 *   mem_reg  - register that holds the memory address
	 *
	 * SLJIT_MOV / SLJIT_MOV_P emit LL_D directly into dst_reg. Every other
	 * opcode emits LL_W into TMP_REG2 and then narrows the result into
	 * dst_reg. With SLJIT_ATOMIC_EMULATION the 8 and 16 bit variants load
	 * the enclosing 32 bit word and shift the requested part down.
	 *
	 * Returns SLJIT_SUCCESS, or the error reported through CHECK_ERROR,
	 * CHECK or FAIL_IF.
	 *
	 * The former "unsupported" stub (SLJIT_UNUSED_ARG on every argument and
	 * an unconditional return of SLJIT_ERR_UNSUPPORTED) is intentionally
	 * gone: it made everything below unreachable.
	 */
	sljit_ins ins = LL_W;
	sljit_s32 dst = TMP_REG2;
	sljit_s32 mem = mem_reg;

	CHECK_ERROR();
	CHECK(check_sljit_emit_atomic_load(compiler, op, dst_reg, mem_reg));

	op = GET_OPCODE(op);
	switch (op) {
#if SLJIT_ATOMIC_EMULATION
	case SLJIT_MOV_U8:
	case SLJIT_MOV_U16:
		/* TMP_REG3 = mem_reg & 3 (byte offset inside the word),
		   TMP_REG1 = mem_reg ^ TMP_REG3 (address with the low two bits cleared). */
		mem = TMP_REG1;
		FAIL_IF(push_inst(compiler, ANDI | RD(TMP_REG3) | RJ(mem_reg) | IMM_I12(0x3)));
		FAIL_IF(push_inst(compiler, XOR | RD(mem) | RJ(TMP_REG3) | RK(mem_reg)));
		break;
#endif /* SLJIT_ATOMIC_EMULATION */
	case SLJIT_MOV_P:
	case SLJIT_MOV:
		/* Full width load goes straight into the destination. */
		ins = LL_D;
		dst = dst_reg;
		break;
	default:
		/* LL_W into TMP_REG2, narrowed below. */
		break;
	}

	FAIL_IF(push_inst(compiler, ins | RD(dst) | RJ(mem)));

#if SLJIT_ATOMIC_EMULATION
	/* Only the emulated narrow loads replaced mem: turn the byte offset
	   into a bit offset (offset << 3). */
	if (mem != mem_reg)
		FAIL_IF(push_inst(compiler, SLLI_W | RD(TMP_REG3) | RJ(TMP_REG3) | IMM_I12(0x3)));
#endif /* SLJIT_ATOMIC_EMULATION */

	switch (op) {
	case SLJIT_MOV_U8:
#if SLJIT_ATOMIC_EMULATION
		/* Move the addressed byte down to bits 0..7. */
		FAIL_IF(push_inst(compiler, SRL_W | RD(dst) | RJ(dst) | RK(TMP_REG3)));
#endif /* SLJIT_ATOMIC_EMULATION */
		FAIL_IF(push_inst(compiler, ANDI | RD(dst_reg) | RJ(dst) | IMM_I12(0xff)));
		break;
	case SLJIT_MOV_U16:
#if SLJIT_ATOMIC_EMULATION
		/* Bit offset 0: jump to the BSTRPICK_W below. Otherwise shift the
		   word right by the bit offset and jump over the BSTRPICK_W.
		   The branch distances are written as 1 + <instructions skipped>
		   (presumably counted in instructions - confirm against IMM_I21 /
		   IMM_I26). */
		FAIL_IF(push_inst(compiler, BEQZ | RJ(TMP_REG3) | IMM_I21(1 + 2)));
		FAIL_IF(push_inst(compiler, SRL_W | RD(dst_reg) | RJ(dst) | RK(TMP_REG3)));
		FAIL_IF(push_inst(compiler, B | IMM_I26(1 + 1)));
#endif /* SLJIT_ATOMIC_EMULATION */
		/* Keep bits 15..0 only. */
		return push_inst(compiler, BSTRPICK_W | RD(dst_reg) | RJ(dst) | (15 << 16));
	case SLJIT_MOV_U32:
	case SLJIT_MOV32:
		/* Keep bits 31..0 only. */
		return push_inst(compiler, BSTRPICK_D | RD(dst_reg) | RJ(dst) | (31 << 16));
	default:
		break;
	}
	return SLJIT_SUCCESS;
}

SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_atomic_store(struct sljit_compiler *compiler,
Expand All @@ -2950,16 +3004,63 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_atomic_store(struct sljit_compiler
sljit_s32 mem_reg,
sljit_s32 temp_reg)
{
	/*
	 * Emits the store half of an atomic load / store pair.
	 *
	 *   compiler - compiler instance that receives the instructions
	 *   op       - between SLJIT_MOV and SLJIT_MOV_P; only GET_OPCODE(op)
	 *              is examined here
	 *   src_reg  - register that holds the value to store
	 *   mem_reg  - register that holds the memory address
	 *   temp_reg - overwritten: first with the word that is stored, then
	 *              by the SC_W / SC_D instruction itself
	 *
	 * After the store instruction temp_reg is copied into EQUAL_FLAG, which
	 * is the register tested by SLJIT_ATOMIC_STORED and
	 * SLJIT_ATOMIC_NOT_STORED.
	 *
	 * Returns SLJIT_SUCCESS, or the error reported through CHECK_ERROR,
	 * CHECK or FAIL_IF.
	 *
	 * The former "unsupported" stub (SLJIT_UNUSED_ARG on every argument and
	 * an unconditional return of SLJIT_ERR_UNSUPPORTED) is intentionally
	 * gone: it made everything below unreachable.
	 */
	sljit_ins ins = SC_W;
	sljit_ins chk = ORI | RD(EQUAL_FLAG) | RJ(temp_reg) | RK(TMP_ZERO);
	sljit_s32 mem = mem_reg;

	CHECK_ERROR();
	CHECK(check_sljit_emit_atomic_store(compiler, op, src_reg, mem_reg, temp_reg));

	op = GET_OPCODE(op);

#if SLJIT_ATOMIC_EMULATION
	switch (op) {
	case SLJIT_MOV_U8:
	case SLJIT_MOV_U16:
		/* TMP_REG3 = mem_reg & 3 (byte offset inside the word),
		   TMP_REG1 = mem_reg ^ TMP_REG3 (address with the low two bits cleared). */
		mem = TMP_REG1;
		FAIL_IF(push_inst(compiler, ANDI | RD(TMP_REG3) | RJ(mem_reg) | IMM_I12(0x3)));
		FAIL_IF(push_inst(compiler, XOR | RD(mem) | RJ(TMP_REG3) | RK(mem_reg)));
		break;
	default:
		break;
	}
#endif /* SLJIT_ATOMIC_EMULATION */

	switch (op) {
	case SLJIT_MOV_U8:
#if SLJIT_ATOMIC_EMULATION
		/* NOTE(review): the containing word is re-read with a plain LD_WU
		   between the earlier LL and the SC below - confirm this is
		   acceptable for the reservation. */
		/* Byte offset -> bit offset. */
		FAIL_IF(push_inst(compiler, SLLI_W | RD(TMP_REG3) | RJ(TMP_REG3) | IMM_I12(3)));
		FAIL_IF(push_inst(compiler, LD_WU | RD(temp_reg) | RJ(mem)));
		/* TMP_REG2 = 0xff << bit offset; clear that byte in the loaded word. */
		FAIL_IF(push_inst(compiler, ADDI_W | RD(TMP_REG2) | RJ(TMP_ZERO) | IMM_I12(0xff)));
		FAIL_IF(push_inst(compiler, SLL_W | RD(TMP_REG2) | RJ(TMP_REG2) | RK(TMP_REG3)));
		FAIL_IF(push_inst(compiler, ANDN | RD(temp_reg) | RJ(temp_reg) | RK(TMP_REG2)));
		/* TMP_REG4 = (src_reg & 0xff) << bit offset. The shift must read
		   the masked TMP_REG4: shifting src_reg again would discard the
		   mask and let the upper bits of src_reg leak into the other
		   bytes of the word. */
		FAIL_IF(push_inst(compiler, ANDI | RD(TMP_REG4) | RJ(src_reg) | IMM_I12(0xff)));
		FAIL_IF(push_inst(compiler, SLL_W | RD(TMP_REG4) | RJ(TMP_REG4) | RK(TMP_REG3)));
		FAIL_IF(push_inst(compiler, OR | RD(temp_reg) | RJ(temp_reg) | RK(TMP_REG4)));
#else
		FAIL_IF(push_inst(compiler, ANDI | RD(temp_reg) | RJ(src_reg) | IMM_I12(0xff)));
#endif /* SLJIT_ATOMIC_EMULATION */
		break;
	case SLJIT_MOV_U16:
#if SLJIT_ATOMIC_EMULATION
		/* Load the containing word, then insert src_reg into bits 15..0
		   (offset 0) or bits 31..16 (non-zero offset). The branch
		   distances are written as 1 + <instructions skipped>
		   (presumably counted in instructions - confirm against IMM_I21 /
		   IMM_I26). */
		FAIL_IF(push_inst(compiler, LD_WU | RD(temp_reg) | RJ(mem)));
		FAIL_IF(push_inst(compiler, BNEZ | RJ(TMP_REG3) | IMM_I21(1 + 2)));
#endif /* SLJIT_ATOMIC_EMULATION */
		FAIL_IF(push_inst(compiler, BSTRINS_W | RD(temp_reg) | RJ(src_reg) | (15 << 16)));
#if SLJIT_ATOMIC_EMULATION
		FAIL_IF(push_inst(compiler, B | IMM_I26(1 + 1)));
		FAIL_IF(push_inst(compiler, BSTRINS_W | RD(temp_reg) | RJ(src_reg) | (31 << 16) | (16 << 10)));
#endif /* SLJIT_ATOMIC_EMULATION */
		break;
	case SLJIT_MOV_P:
	case SLJIT_MOV:
		ins = SC_D;
		/* FALLTHRU */
	default:
		/* Copy src_reg into temp_reg unchanged. */
		FAIL_IF(push_inst(compiler, ORI | RD(temp_reg) | RJ(src_reg) | RK(TMP_ZERO)));
		break;
	}

	FAIL_IF(push_inst(compiler, ins | RD(temp_reg) | RJ(mem)));

	/* NOTE(review): chk is built unconditionally above, so this test looks
	   always true - confirm before simplifying. */
	return chk ? push_inst(compiler, chk) : SLJIT_SUCCESS;
}

static SLJIT_INLINE sljit_s32 emit_const(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw init_value, sljit_ins last_ins)
Expand Down
Loading

0 comments on commit 8797280

Please sign in to comment.