Skip to content

LTO seems to undo specialisations #4

@PyPylia

Description

@PyPylia

It seems that for whatever reason, enabling LTO triggers LLVM to undo target-specific specialisations. This may be a bug with LLVM or rustc caused by the #[target_feature] attribute, but I am just documenting it here as I haven't spent enough time to find the root cause.

Example assembly for the same function, compiled without and with LTO, is provided below: the non-LTO build uses AVX2 vector instructions, whereas the LTO build contains only scalar instructions.

Naive dot product with avx2 and no LTO

# AVX2 specialisation as compiled WITHOUT LTO (compiler output, instructions unmodified).
#
# Reads 256 bytes from each of [rcx] and [rdx] (eight 32-byte ymm loads per
# input, offsets 0..224), i.e. 32 64-bit lanes per input, multiplies them
# lane-wise, and sums every product into a single 64-bit value left in rax.
#
# Each 64-bit lane product is assembled from 32-bit partial products:
#   lo(a)*lo(b) + ((hi(a)*lo(b) + lo(a)*hi(b)) << 32)
# using vpsrlq (bring the high half down), vpmuludq (32x32->64 multiply),
# vpsllq and vpaddq. The eight product vectors are interleaved by the
# scheduler, so the sequences below overlap.
#
# xmm6 is spilled and restored around the body, and the inputs arrive in
# rcx/rdx; this is consistent with the Microsoft x64 calling convention
# (NOTE(review): the target triple is not stated in the issue - confirm).
special_test::fast_dot_product_32::_x86_avx2:
        # Reserve stack space and spill xmm6 so ymm6 can be used as scratch.
        sub rsp, 24
        vmovaps xmmword ptr [rsp], xmm6
        vmovdqu ymm0, ymmword ptr [rcx + 192]
        vmovdqu ymm1, ymmword ptr [rcx + 224]
        vmovdqu ymm2, ymmword ptr [rcx + 32]
        vmovdqu ymm3, ymmword ptr [rcx + 160]
        vmovdqu ymm4, ymmword ptr [rdx + 160]
        # Product of the lanes at offset 160: cross terms go to ymm5, then
        # ymm3 = lo*lo + (cross << 32). The same pattern repeats for every offset.
        vpsrlq ymm5, ymm4, 32
        vpmuludq ymm5, ymm5, ymm3
        vpsrlq ymm6, ymm3, 32
        vpmuludq ymm6, ymm4, ymm6
        vpaddq ymm5, ymm6, ymm5
        vmovdqu ymm6, ymmword ptr [rdx + 32]
        vpsllq ymm5, ymm5, 32
        vpmuludq ymm3, ymm4, ymm3
        vpaddq ymm3, ymm3, ymm5
        vpsrlq ymm4, ymm6, 32
        vpmuludq ymm4, ymm4, ymm2
        vpsrlq ymm5, ymm2, 32
        vpmuludq ymm5, ymm6, ymm5
        vpaddq ymm4, ymm5, ymm4
        vmovdqu ymm5, ymmword ptr [rcx + 96]
        vpmuludq ymm2, ymm6, ymm2
        vmovdqu ymm6, ymmword ptr [rdx + 224]
        vpsllq ymm4, ymm4, 32
        vpaddq ymm2, ymm2, ymm4
        # ymm2 = products at offsets 32 and 160.
        vpaddq ymm2, ymm2, ymm3
        vpsrlq ymm3, ymm6, 32
        vpmuludq ymm3, ymm3, ymm1
        vpsrlq ymm4, ymm1, 32
        vpmuludq ymm4, ymm6, ymm4
        vpaddq ymm3, ymm4, ymm3
        vmovdqu ymm4, ymmword ptr [rdx + 96]
        vpsllq ymm3, ymm3, 32
        vpmuludq ymm1, ymm6, ymm1
        vpaddq ymm1, ymm1, ymm3
        vpsrlq ymm3, ymm4, 32
        vpmuludq ymm3, ymm3, ymm5
        vpsrlq ymm6, ymm5, 32
        vpmuludq ymm6, ymm4, ymm6
        vpaddq ymm3, ymm6, ymm3
        vmovdqu ymm6, ymmword ptr [rcx + 128]
        vpmuludq ymm4, ymm4, ymm5
        vmovdqu ymm5, ymmword ptr [rcx]
        vpsllq ymm3, ymm3, 32
        vpaddq ymm3, ymm4, ymm3
        vmovdqu ymm4, ymmword ptr [rdx + 128]
        # ymm1 = products at offsets 32, 96, 160 and 224.
        vpaddq ymm1, ymm3, ymm1
        vpaddq ymm1, ymm2, ymm1
        vpsrlq ymm2, ymm4, 32
        vpmuludq ymm2, ymm2, ymm6
        vpsrlq ymm3, ymm6, 32
        vpmuludq ymm3, ymm4, ymm3
        vpaddq ymm2, ymm3, ymm2
        vmovdqu ymm3, ymmword ptr [rdx]
        vpsllq ymm2, ymm2, 32
        vpmuludq ymm4, ymm4, ymm6
        vpaddq ymm2, ymm4, ymm2
        vpsrlq ymm4, ymm3, 32
        vpmuludq ymm4, ymm4, ymm5
        vpsrlq ymm6, ymm5, 32
        vpmuludq ymm6, ymm3, ymm6
        vpaddq ymm4, ymm6, ymm4
        vmovdqu ymm6, ymmword ptr [rcx + 64]
        vpmuludq ymm3, ymm3, ymm5
        vmovdqu ymm5, ymmword ptr [rdx + 192]
        vpsllq ymm4, ymm4, 32
        vpaddq ymm3, ymm3, ymm4
        # ymm2 = products at offsets 0 and 128.
        vpaddq ymm2, ymm3, ymm2
        vpsrlq ymm3, ymm5, 32
        vpmuludq ymm3, ymm3, ymm0
        vpsrlq ymm4, ymm0, 32
        vpmuludq ymm4, ymm5, ymm4
        vpaddq ymm3, ymm4, ymm3
        vmovdqu ymm4, ymmword ptr [rdx + 64]
        vpsllq ymm3, ymm3, 32
        vpmuludq ymm0, ymm5, ymm0
        vpaddq ymm0, ymm0, ymm3
        vpsrlq ymm3, ymm4, 32
        vpmuludq ymm3, ymm3, ymm6
        vpsrlq ymm5, ymm6, 32
        vpmuludq ymm5, ymm4, ymm5
        vpaddq ymm3, ymm5, ymm3
        vpmuludq ymm4, ymm4, ymm6
        vpsllq ymm3, ymm3, 32
        vpaddq ymm3, ymm4, ymm3
        # ymm0 = sum of all eight product vectors (four 64-bit partial sums).
        vpaddq ymm0, ymm3, ymm0
        vpaddq ymm0, ymm2, ymm0
        vpaddq ymm0, ymm0, ymm1
        # Horizontal reduction: fold the upper 128 bits onto the lower, then
        # the high qword onto the low qword (vpshufd 238 selects dwords 2,3,2,3).
        vextracti128 xmm1, ymm0, 1
        vpaddq xmm0, xmm0, xmm1
        vpshufd xmm1, xmm0, 238
        vpaddq xmm0, xmm0, xmm1
        vmovq rax, xmm0
        # Restore xmm6, release the stack space, and clear the upper ymm
        # halves before returning.
        vmovaps xmm6, xmmword ptr [rsp]
        add rsp, 24
        vzeroupper
        ret

Naive dot product with avx2 and LTO

# AVX2 specialisation as compiled WITH LTO (compiler output, instructions unmodified).
#
# Computes the same quantity as the non-LTO listing - the sum of 32 products
# of 64-bit values read from [rcx] and [rdx] (offsets 0..248 in steps of 8) -
# but entirely with scalar code: each element pair is a 64-bit mov from
# [rdx + k] followed by an imul with [rcx + k], and the products are
# accumulated with add across r8, r9, r10, r11 and rax.
#
# No xmm/ymm register or vector instruction appears anywhere in the body,
# despite the _x86_avx2 suffix; this is the regression the issue reports.
# The total is left in rax.
special_test::fast_dot_product_32::_x86_avx2:
        # Elements 0..3: four independent products, then pairwise adds.
        mov r8, qword ptr [rdx]
        mov r9, qword ptr [rdx + 8]
        imul r8, qword ptr [rcx]
        imul r9, qword ptr [rcx + 8]
        mov r10, qword ptr [rdx + 16]
        imul r10, qword ptr [rcx + 16]
        mov rax, qword ptr [rdx + 24]
        imul rax, qword ptr [rcx + 24]
        add r9, r8
        mov r11, qword ptr [rdx + 32]
        imul r11, qword ptr [rcx + 32]
        add rax, r10
        mov r10, qword ptr [rdx + 40]
        imul r10, qword ptr [rcx + 40]
        add rax, r9
        mov r8, qword ptr [rdx + 48]
        imul r8, qword ptr [rcx + 48]
        add r10, r11
        mov r9, qword ptr [rdx + 56]
        imul r9, qword ptr [rcx + 56]
        add r8, r10
        mov r10, qword ptr [rdx + 64]
        imul r10, qword ptr [rcx + 64]
        add r8, rax
        mov r11, qword ptr [rdx + 72]
        imul r11, qword ptr [rcx + 72]
        add r10, r9
        mov rax, qword ptr [rdx + 80]
        imul rax, qword ptr [rcx + 80]
        add r11, r10
        mov r9, qword ptr [rdx + 88]
        imul r9, qword ptr [rcx + 88]
        add rax, r11
        mov r10, qword ptr [rdx + 96]
        imul r10, qword ptr [rcx + 96]
        add rax, r8
        mov r11, qword ptr [rdx + 104]
        imul r11, qword ptr [rcx + 104]
        add r10, r9
        mov r9, qword ptr [rdx + 112]
        imul r9, qword ptr [rcx + 112]
        add r11, r10
        mov r8, qword ptr [rdx + 120]
        imul r8, qword ptr [rcx + 120]
        add r9, r11
        mov r10, qword ptr [rdx + 128]
        imul r10, qword ptr [rcx + 128]
        add r8, r9
        mov r9, qword ptr [rdx + 136]
        imul r9, qword ptr [rcx + 136]
        # r8 now holds the sum of the products for offsets 0..120.
        add r8, rax
        mov rax, qword ptr [rdx + 144]
        imul rax, qword ptr [rcx + 144]
        add r9, r10
        mov r10, qword ptr [rdx + 152]
        imul r10, qword ptr [rcx + 152]
        add rax, r9
        mov r9, qword ptr [rdx + 160]
        imul r9, qword ptr [rcx + 160]
        add r10, rax
        mov rax, qword ptr [rdx + 168]
        imul rax, qword ptr [rcx + 168]
        add r9, r10
        mov r10, qword ptr [rdx + 176]
        imul r10, qword ptr [rcx + 176]
        add rax, r9
        mov r9, qword ptr [rdx + 184]
        imul r9, qword ptr [rcx + 184]
        add rax, r8
        mov r8, qword ptr [rdx + 192]
        imul r8, qword ptr [rcx + 192]
        add r9, r10
        mov r10, qword ptr [rdx + 200]
        imul r10, qword ptr [rcx + 200]
        add r8, r9
        mov r9, qword ptr [rdx + 208]
        imul r9, qword ptr [rcx + 208]
        add r10, r8
        mov r8, qword ptr [rdx + 216]
        imul r8, qword ptr [rcx + 216]
        add r9, r10
        mov r10, qword ptr [rdx + 224]
        imul r10, qword ptr [rcx + 224]
        add r8, r9
        mov r9, qword ptr [rdx + 232]
        imul r9, qword ptr [rcx + 232]
        add r10, r8
        mov r8, qword ptr [rdx + 240]
        imul r8, qword ptr [rcx + 240]
        # r10 now holds the sum of the products for offsets 0..224.
        add r10, rax
        mov rax, qword ptr [rdx + 248]
        imul rax, qword ptr [rcx + 248]
        # Fold the last three products (232, 240, 248) into the running total.
        add r8, r9
        add rax, r8
        add rax, r10
        ret

Metadata

Metadata

Assignees

No one assigned

    Labels

    bug: Something isn't working

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions