-
Notifications
You must be signed in to change notification settings - Fork 0
Open
Labels
bug: Something isn't working
Description
It seems that for whatever reason, enabling LTO triggers LLVM to undo target-specific specialisations. This may be a bug with LLVM or rustc caused by the #[target_feature] attribute, but I am just documenting it here as I haven't spent enough time to find the root cause.
Example assembly with and without LTO is provided below, which shows vector operations being used when LTO is disabled, but no vector operations being used with LTO enabled.
Naive dot product with avx2 and no LTO
; ---------------------------------------------------------------------------
; special_test::fast_dot_product_32::_x86_avx2   (built WITHOUT LTO)
;
; Compiler-generated listing, Intel operand order (op dst, src).
; Reads 256 bytes starting at rcx and 256 bytes starting at rdx as eight
; 32-byte vectors each, multiplies them lane-wise as 64-bit integers, sums
; all lanes, and returns the 64-bit sum in rax.
;
; AVX2 has no packed 64x64-bit multiply, so every 32-byte chunk uses the
; same recipe built on vpmuludq (unsigned 32x32 -> 64 per 64-bit lane):
;     cross = (x >> 32) * lo(y) + lo(x) * (y >> 32)
;     prod  = lo(x) * lo(y) + (cross << 32)        ; low 64 bits of x * y
;
; NOTE(review): pointer arguments in rcx/rdx and the save/restore of xmm6
; look like the Microsoft x64 calling convention -- confirm the target.
; ---------------------------------------------------------------------------
special_test::fast_dot_product_32::_x86_avx2:
; Prologue: reserve stack and spill xmm6, which is reloaded before returning.
; vmovaps needs a 16-byte-aligned address, so rsp is presumably 16-aligned
; after the adjustment (entry rsp % 16 == 8) -- TODO confirm.
sub rsp, 24
vmovaps xmmword ptr [rsp], xmm6
; Early loads of several [rcx] chunks that are consumed later.
vmovdqu ymm0, ymmword ptr [rcx + 192]
vmovdqu ymm1, ymmword ptr [rcx + 224]
vmovdqu ymm2, ymmword ptr [rcx + 32]
vmovdqu ymm3, ymmword ptr [rcx + 160]
vmovdqu ymm4, ymmword ptr [rdx + 160]
; --- chunk at offset 160: ymm3 = [rcx+160] * [rdx+160] ---
vpsrlq ymm5, ymm4, 32
vpmuludq ymm5, ymm5, ymm3
vpsrlq ymm6, ymm3, 32
vpmuludq ymm6, ymm4, ymm6
vpaddq ymm5, ymm6, ymm5
vmovdqu ymm6, ymmword ptr [rdx + 32]
vpsllq ymm5, ymm5, 32
vpmuludq ymm3, ymm4, ymm3
vpaddq ymm3, ymm3, ymm5
; --- chunk at offset 32: ymm2 = [rcx+32] * [rdx+32] ---
vpsrlq ymm4, ymm6, 32
vpmuludq ymm4, ymm4, ymm2
vpsrlq ymm5, ymm2, 32
vpmuludq ymm5, ymm6, ymm5
vpaddq ymm4, ymm5, ymm4
vmovdqu ymm5, ymmword ptr [rcx + 96]
vpmuludq ymm2, ymm6, ymm2
vmovdqu ymm6, ymmword ptr [rdx + 224]
vpsllq ymm4, ymm4, 32
vpaddq ymm2, ymm2, ymm4
; ymm2 = products of chunks 32 + 160
vpaddq ymm2, ymm2, ymm3
; --- chunk at offset 224: ymm1 = [rcx+224] * [rdx+224] ---
vpsrlq ymm3, ymm6, 32
vpmuludq ymm3, ymm3, ymm1
vpsrlq ymm4, ymm1, 32
vpmuludq ymm4, ymm6, ymm4
vpaddq ymm3, ymm4, ymm3
vmovdqu ymm4, ymmword ptr [rdx + 96]
vpsllq ymm3, ymm3, 32
vpmuludq ymm1, ymm6, ymm1
vpaddq ymm1, ymm1, ymm3
; --- chunk at offset 96: ymm3 = [rcx+96] * [rdx+96] ---
vpsrlq ymm3, ymm4, 32
vpmuludq ymm3, ymm3, ymm5
vpsrlq ymm6, ymm5, 32
vpmuludq ymm6, ymm4, ymm6
vpaddq ymm3, ymm6, ymm3
vmovdqu ymm6, ymmword ptr [rcx + 128]
vpmuludq ymm4, ymm4, ymm5
vmovdqu ymm5, ymmword ptr [rcx]
vpsllq ymm3, ymm3, 32
vpaddq ymm3, ymm4, ymm3
vmovdqu ymm4, ymmword ptr [rdx + 128]
; ymm1 = products of chunks 32 + 96 + 160 + 224 (first partial sum)
vpaddq ymm1, ymm3, ymm1
vpaddq ymm1, ymm2, ymm1
; --- chunk at offset 128: ymm2 = [rcx+128] * [rdx+128] ---
vpsrlq ymm2, ymm4, 32
vpmuludq ymm2, ymm2, ymm6
vpsrlq ymm3, ymm6, 32
vpmuludq ymm3, ymm4, ymm3
vpaddq ymm2, ymm3, ymm2
vmovdqu ymm3, ymmword ptr [rdx]
vpsllq ymm2, ymm2, 32
vpmuludq ymm4, ymm4, ymm6
vpaddq ymm2, ymm4, ymm2
; --- chunk at offset 0: ymm3 = [rcx] * [rdx] ---
vpsrlq ymm4, ymm3, 32
vpmuludq ymm4, ymm4, ymm5
vpsrlq ymm6, ymm5, 32
vpmuludq ymm6, ymm3, ymm6
vpaddq ymm4, ymm6, ymm4
vmovdqu ymm6, ymmword ptr [rcx + 64]
vpmuludq ymm3, ymm3, ymm5
vmovdqu ymm5, ymmword ptr [rdx + 192]
vpsllq ymm4, ymm4, 32
vpaddq ymm3, ymm3, ymm4
; ymm2 = products of chunks 0 + 128
vpaddq ymm2, ymm3, ymm2
; --- chunk at offset 192: ymm0 = [rcx+192] * [rdx+192] ---
vpsrlq ymm3, ymm5, 32
vpmuludq ymm3, ymm3, ymm0
vpsrlq ymm4, ymm0, 32
vpmuludq ymm4, ymm5, ymm4
vpaddq ymm3, ymm4, ymm3
vmovdqu ymm4, ymmword ptr [rdx + 64]
vpsllq ymm3, ymm3, 32
vpmuludq ymm0, ymm5, ymm0
vpaddq ymm0, ymm0, ymm3
; --- chunk at offset 64: ymm3 = [rcx+64] * [rdx+64] ---
vpsrlq ymm3, ymm4, 32
vpmuludq ymm3, ymm3, ymm6
vpsrlq ymm5, ymm6, 32
vpmuludq ymm5, ymm4, ymm5
vpaddq ymm3, ymm5, ymm3
vpmuludq ymm4, ymm4, ymm6
vpsllq ymm3, ymm3, 32
vpaddq ymm3, ymm4, ymm3
; Combine: ymm0 = chunks 0 + 64 + 128 + 192, then add the first partial sum.
vpaddq ymm0, ymm3, ymm0
vpaddq ymm0, ymm2, ymm0
vpaddq ymm0, ymm0, ymm1
; Horizontal reduction of the four 64-bit lanes of ymm0 into rax:
; fold the upper 128 bits onto the lower, then the upper qword onto the
; lower (vpshufd 238 = 0xEE copies dwords 2,3 into positions 0,1).
vextracti128 xmm1, ymm0, 1
vpaddq xmm0, xmm0, xmm1
vpshufd xmm1, xmm0, 238
vpaddq xmm0, xmm0, xmm1
vmovq rax, xmm0
; Epilogue: restore xmm6, release the stack, clear upper ymm state.
vmovaps xmm6, xmmword ptr [rsp]
add rsp, 24
vzeroupper
ret
Naive dot product with avx2 and LTO
; ---------------------------------------------------------------------------
; special_test::fast_dot_product_32::_x86_avx2   (built WITH LTO)
;
; Compiler-generated listing, Intel operand order (op dst, src).
; Same symbol as the non-LTO build, but fully scalar: no xmm/ymm register is
; used. It loads 32 qwords from [rdx + 0 .. 248], multiplies each by the
; qword at the same offset from rcx (imul keeps the low 64 bits), and sums
; the 32 products into rax through interleaved partial sums.
;
; Scratch registers used: rax, r8, r9, r10, r11. No stack adjustment.
; ---------------------------------------------------------------------------
special_test::fast_dot_product_32::_x86_avx2:
; --- elements at offsets 0..56 ---
mov r8, qword ptr [rdx]
mov r9, qword ptr [rdx + 8]
imul r8, qword ptr [rcx]
imul r9, qword ptr [rcx + 8]
mov r10, qword ptr [rdx + 16]
imul r10, qword ptr [rcx + 16]
mov rax, qword ptr [rdx + 24]
imul rax, qword ptr [rcx + 24]
add r9, r8
mov r11, qword ptr [rdx + 32]
imul r11, qword ptr [rcx + 32]
add rax, r10
mov r10, qword ptr [rdx + 40]
imul r10, qword ptr [rcx + 40]
add rax, r9
mov r8, qword ptr [rdx + 48]
imul r8, qword ptr [rcx + 48]
add r10, r11
mov r9, qword ptr [rdx + 56]
imul r9, qword ptr [rcx + 56]
add r8, r10
; --- elements at offsets 64..120 ---
mov r10, qword ptr [rdx + 64]
imul r10, qword ptr [rcx + 64]
add r8, rax
mov r11, qword ptr [rdx + 72]
imul r11, qword ptr [rcx + 72]
add r10, r9
mov rax, qword ptr [rdx + 80]
imul rax, qword ptr [rcx + 80]
add r11, r10
mov r9, qword ptr [rdx + 88]
imul r9, qword ptr [rcx + 88]
add rax, r11
mov r10, qword ptr [rdx + 96]
imul r10, qword ptr [rcx + 96]
add rax, r8
mov r11, qword ptr [rdx + 104]
imul r11, qword ptr [rcx + 104]
add r10, r9
mov r9, qword ptr [rdx + 112]
imul r9, qword ptr [rcx + 112]
add r11, r10
mov r8, qword ptr [rdx + 120]
imul r8, qword ptr [rcx + 120]
add r9, r11
; --- elements at offsets 128..184 ---
mov r10, qword ptr [rdx + 128]
imul r10, qword ptr [rcx + 128]
add r8, r9
mov r9, qword ptr [rdx + 136]
imul r9, qword ptr [rcx + 136]
add r8, rax
mov rax, qword ptr [rdx + 144]
imul rax, qword ptr [rcx + 144]
add r9, r10
mov r10, qword ptr [rdx + 152]
imul r10, qword ptr [rcx + 152]
add rax, r9
mov r9, qword ptr [rdx + 160]
imul r9, qword ptr [rcx + 160]
add r10, rax
mov rax, qword ptr [rdx + 168]
imul rax, qword ptr [rcx + 168]
add r9, r10
mov r10, qword ptr [rdx + 176]
imul r10, qword ptr [rcx + 176]
add rax, r9
mov r9, qword ptr [rdx + 184]
imul r9, qword ptr [rcx + 184]
add rax, r8
; --- elements at offsets 192..248 ---
mov r8, qword ptr [rdx + 192]
imul r8, qword ptr [rcx + 192]
add r9, r10
mov r10, qword ptr [rdx + 200]
imul r10, qword ptr [rcx + 200]
add r8, r9
mov r9, qword ptr [rdx + 208]
imul r9, qword ptr [rcx + 208]
add r10, r8
mov r8, qword ptr [rdx + 216]
imul r8, qword ptr [rcx + 216]
add r9, r10
mov r10, qword ptr [rdx + 224]
imul r10, qword ptr [rcx + 224]
add r8, r9
mov r9, qword ptr [rdx + 232]
imul r9, qword ptr [rcx + 232]
add r10, r8
mov r8, qword ptr [rdx + 240]
imul r8, qword ptr [rcx + 240]
add r10, rax
mov rax, qword ptr [rdx + 248]
imul rax, qword ptr [rcx + 248]
; Final merge of the remaining partial sums into the return register rax.
add r8, r9
add rax, r8
add rax, r10
ret
Reactions are currently unavailable
Metadata
Metadata
Assignees
Labels
bug: Something isn't working