Skip to content
This repository has been archived by the owner on Dec 1, 2021. It is now read-only.

Vectorized loop crashing? #24

Open
iwasaki-kenta opened this issue Dec 6, 2019 · 1 comment
Open

Vectorized loop crashing? #24

iwasaki-kenta opened this issue Dec 6, 2019 · 1 comment

Comments

@iwasaki-kenta
Copy link

I tried compile the following C code w/ clang 7.0.0 (trunk 338352) w/ the following command:

clang -O3 -masm=intel -mno-red-zone -mstackrealign -mllvm -inline-threshold=1000 -fno-asynchronous-unwind-tables -fno-exceptions -fno-rtti -S count.c
int CountFilledEntries(char* entries, int len) {
  int result = 0;
  for (int i = 0; i < len; i++) {
    if (entries[i] != 0) result++;
  }

  return result;
}

Given the stub file:

//go:noescape
func _CountFilledEntries(entries unsafe.Pointer, len uint64) (count uint64)

func CountFilledEntries(entries []byte) uint {
	return uint(_CountFilledEntries(
		unsafe.Pointer((*reflect.SliceHeader)(unsafe.Pointer(&entries)).Data),
		uint64(len(entries))),
	)
}

Running the test below causes the program to crash. Any possible insight as to why?

func TestCount(t *testing.T) {
	a := []byte{0, 1, 2, 9, 4, 0, 3, 0, 0}
	fmt.Println(CountFilledEntries(a))
}

Assembly:

//+build !noasm !appengine
// AUTO-GENERATED BY C2GOASM -- DO NOT EDIT

TEXT ·_CountFilledEntries(SB), $0-24

    MOVQ entries+0(FP), DI
    MOVQ len+8(FP), SI

    WORD $0xf685                 // test    esi, esi
	JLE LBB0_1
    WORD $0xf189                 // mov    ecx, esi
    WORD $0xfe83; BYTE $0x07     // cmp    esi, 7
	JA LBB0_4
    WORD $0xd231                 // xor    edx, edx
    WORD $0xc031                 // xor    eax, eax
	JMP LBB0_11
LBB0_1:
    WORD $0xc031                 // xor    eax, eax
    MOVQ AX, count+16(FP)
    RET
LBB0_4:
    WORD $0xca89                 // mov    edx, ecx
    WORD $0xe283; BYTE $0xf8     // and    edx, -8
    LONG $0xf8728d48             // lea    rsi, [rdx - 8]
    WORD $0x8948; BYTE $0xf0     // mov    rax, rsi
    LONG $0x03e8c148             // shr    rax, 3
    LONG $0x01c08348             // add    rax, 1
    WORD $0x8941; BYTE $0xc0     // mov    r8d, eax
    LONG $0x01e08341             // and    r8d, 1
    WORD $0x8548; BYTE $0xf6     // test    rsi, rsi
	JE LBB0_5
    LONG $0x000001be; BYTE $0x00 // mov    esi, 1
    WORD $0x2948; BYTE $0xc6     // sub    rsi, rax
    WORD $0x014c; BYTE $0xc6     // add    rsi, r8
    LONG $0xffc68348             // add    rsi, -1
    LONG $0xd2ef0f66             // pxor    xmm2, xmm2
    WORD $0xc031                 // xor    eax, eax
    LONG $0xdb760f66             // pcmpeqd    xmm3, xmm3
    LONG $0xc0ef0f66             // pxor    xmm0, xmm0
    LONG $0xc9ef0f66             // pxor    xmm1, xmm1
LBB0_7:
    LONG $0x246e0f66; BYTE $0x07 // movd    xmm4, dword [rdi + rax]
    LONG $0xe2600f66             // punpcklbw    xmm4, xmm2
    LONG $0xe2610f66             // punpcklwd    xmm4, xmm2
    LONG $0x6c6e0f66; WORD $0x0407 // movd    xmm5, dword [rdi + rax + 4]
    LONG $0xea600f66             // punpcklbw    xmm5, xmm2
    LONG $0xea610f66             // punpcklwd    xmm5, xmm2
    LONG $0xe2760f66             // pcmpeqd    xmm4, xmm2
    LONG $0xe3ef0f66             // pxor    xmm4, xmm3
    LONG $0xc4fa0f66             // psubd    xmm0, xmm4
    LONG $0xea760f66             // pcmpeqd    xmm5, xmm2
    LONG $0xebef0f66             // pxor    xmm5, xmm3
    LONG $0xcdfa0f66             // psubd    xmm1, xmm5
    LONG $0x646e0f66; WORD $0x0807 // movd    xmm4, dword [rdi + rax + 8]
    LONG $0xe2600f66             // punpcklbw    xmm4, xmm2
    LONG $0xe2610f66             // punpcklwd    xmm4, xmm2
    LONG $0x6c6e0f66; WORD $0x0c07 // movd    xmm5, dword [rdi + rax + 12]
    LONG $0xea600f66             // punpcklbw    xmm5, xmm2
    LONG $0xea610f66             // punpcklwd    xmm5, xmm2
    LONG $0xe2760f66             // pcmpeqd    xmm4, xmm2
    LONG $0xe3ef0f66             // pxor    xmm4, xmm3
    LONG $0xc4fa0f66             // psubd    xmm0, xmm4
    LONG $0xea760f66             // pcmpeqd    xmm5, xmm2
    LONG $0xebef0f66             // pxor    xmm5, xmm3
    LONG $0xcdfa0f66             // psubd    xmm1, xmm5
    LONG $0x10c08348             // add    rax, 16
    LONG $0x02c68348             // add    rsi, 2
	JNE LBB0_7
    WORD $0x854d; BYTE $0xc0     // test    r8, r8
	JE LBB0_10
LBB0_9:
    LONG $0x546e0f66; WORD $0x0407 // movd    xmm2, dword [rdi + rax + 4]
    LONG $0xdbef0f66             // pxor    xmm3, xmm3
    LONG $0xd3600f66             // punpcklbw    xmm2, xmm3
    LONG $0xd3610f66             // punpcklwd    xmm2, xmm3
    LONG $0xd3760f66             // pcmpeqd    xmm2, xmm3
    LONG $0xe4760f66             // pcmpeqd    xmm4, xmm4
    LONG $0xd4ef0f66             // pxor    xmm2, xmm4
    LONG $0xcafa0f66             // psubd    xmm1, xmm2
    LONG $0x146e0f66; BYTE $0x07 // movd    xmm2, dword [rdi + rax]
    LONG $0xd3600f66             // punpcklbw    xmm2, xmm3
    LONG $0xd3610f66             // punpcklwd    xmm2, xmm3
    LONG $0xd3760f66             // pcmpeqd    xmm2, xmm3
    LONG $0xd4ef0f66             // pxor    xmm2, xmm4
    LONG $0xc2fa0f66             // psubd    xmm0, xmm2
LBB0_10:
    LONG $0xc1fe0f66             // paddd    xmm0, xmm1
    LONG $0xc8700f66; BYTE $0x4e // pshufd    xmm1, xmm0, 78
    LONG $0xc8fe0f66             // paddd    xmm1, xmm0
    LONG $0xc1700f66; BYTE $0xe5 // pshufd    xmm0, xmm1, 229
    LONG $0xc1fe0f66             // paddd    xmm0, xmm1
    LONG $0xc07e0f66             // movd    eax, xmm0
    WORD $0x3948; BYTE $0xca     // cmp    rdx, rcx
	JE LBB0_12
LBB0_11:
    LONG $0x01173c80             // cmp    byte [rdi + rdx], 1
    WORD $0xd883; BYTE $0xff     // sbb    eax, -1
    LONG $0x01c28348             // add    rdx, 1
    WORD $0x3948; BYTE $0xd1     // cmp    rcx, rdx
	JNE LBB0_11
LBB0_12:
    WORD $0x8948; BYTE $0xec     // mov    rsp, rbp
    BYTE $0x5d                   // pop    rbp
    BYTE $0xc3                   // ret
LBB0_5:
    LONG $0xc0ef0f66             // pxor    xmm0, xmm0
    WORD $0xc031                 // xor    eax, eax
    LONG $0xc9ef0f66             // pxor    xmm1, xmm1
    WORD $0x854d; BYTE $0xc0     // test    r8, r8
	JNE LBB0_9
	JMP LBB0_10
@harshavardhana
Copy link
Member

Feel free to send a fix @iwasaki-kenta

Sign up for free to subscribe to this conversation on GitHub. Already have an account? Sign in.
Labels
None yet
Projects
None yet
Development

No branches or pull requests

2 participants