Skip to content

std.mem.swap is slow #25334

@matklad

Description

@matklad

Zig Version

0.15.1

Steps to Reproduce and Observed Behavior

Zig

const std = @import("std");

/// Deliberately large payload for the reproducer: 128 `u128` values,
/// i.e. 128 * 16 = 2048 bytes of data.
const Huge = struct {
    data: [128]u128,
};

/// Exported wrapper that swaps the contents of `x` and `y` via `std.mem.swap`,
/// so the machine code generated for the swap can be inspected in isolation.
export fn f(x: *Huge, y: *Huge) void {
    std.mem.swap(Huge, x, y);
}
f:
        push    rbp
        mov     rbp, rsp
        push    r15
        push    r14
        push    rbx
        sub     rsp, 2056
        lea     r15, [rbp - 2080]
        mov     edx, 2048
        mov     r14, rdi
        mov     rbx, rsi
        mov     rsi, r14
        mov     rdi, r15
        call    memcpy@PLT
        mov     edx, 2048
        mov     rdi, r14
        mov     rsi, rbx
        call    memcpy@PLT
        mov     edx, 2048
        mov     rdi, rbx
        mov     rsi, r15
        call    memcpy@PLT
        add     rsp, 2056
        pop     rbx
        pop     r14
        pop     r15
        pop     rbp
        ret

Rust

https://godbolt.org/z/a7dv8jx7a

/// Tuple struct wrapping 128 `u128` values (128 * 16 = 2048 bytes),
/// the Rust counterpart of the Zig `Huge` struct above.
struct Huge([u128; 128]);

/// Swaps the two `Huge` values in place with `std::mem::swap`.
///
/// Both arguments are `&mut` references, so Rust guarantees they do not
/// refer to overlapping memory.
pub extern fn f(x: &mut Huge, y: &mut Huge) {
    std::mem::swap(x, y);
}
example::f::h4bf74e253b89ab56:
        xor     eax, eax
.LBB0_1:
        movaps  xmm0, xmmword ptr [rdi + 8*rax]
        movaps  xmm1, xmmword ptr [rdi + 8*rax + 16]
        movaps  xmm2, xmmword ptr [rsi + 8*rax]
        movaps  xmm3, xmmword ptr [rsi + 8*rax + 16]
        movaps  xmmword ptr [rdi + 8*rax], xmm2
        movaps  xmmword ptr [rdi + 8*rax + 16], xmm3
        movaps  xmmword ptr [rsi + 8*rax], xmm0
        movaps  xmmword ptr [rsi + 8*rax + 16], xmm1
        movaps  xmm0, xmmword ptr [rdi + 8*rax + 32]
        movaps  xmm1, xmmword ptr [rdi + 8*rax + 48]
        movaps  xmm2, xmmword ptr [rsi + 8*rax + 32]
        movaps  xmm3, xmmword ptr [rsi + 8*rax + 48]
        movaps  xmmword ptr [rdi + 8*rax + 32], xmm2
        movaps  xmmword ptr [rdi + 8*rax + 48], xmm3
        movaps  xmmword ptr [rsi + 8*rax + 32], xmm0
        movaps  xmmword ptr [rsi + 8*rax + 48], xmm1
        add     rax, 8
        cmp     rax, 256
        jne     .LBB0_1
        ret

Zig uses three memcpys, while Rust just shuffles some bytes through registers. I think the two implementations actually differ in semantics: they give different results if the two Huge structs overlap! Zig assumes the arguments may alias and therefore generates suboptimal code; it should instead assert that no aliasing happens (adding noalias annotations doesn't change anything).

Expected Behavior

Zig should use O(1) extra memory (rather than an N-byte temporary) to swap two structs of size N, with a no-alias safety check in std.mem.swap.

Metadata

Metadata

Assignees

No one assigned

    Labels

    contributor friendly: This issue is limited in scope and/or knowledge of Zig internals.
    enhancement: Solving this issue will likely involve adding new logic or components to the codebase.
    optimization
    standard library: This issue involves writing Zig code for the standard library.

    Type

    No type

    Projects

    No projects

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions