diff --git a/clang/test/Driver/clang-linker-wrapper.cpp b/clang/test/Driver/clang-linker-wrapper.cpp
index f2acf035db317..a667fc9492dc5 100644
--- a/clang/test/Driver/clang-linker-wrapper.cpp
+++ b/clang/test/Driver/clang-linker-wrapper.cpp
@@ -121,7 +121,7 @@
 // CHK-CMDS-AOT-NV-NEXT: sycl-post-link{{.*}} SYCL_POST_LINK_OPTIONS -o [[SYCLPOSTLINKOUT:.*]].table [[SECONDLLVMLINKOUT]].bc
 // CHK-CMDS-AOT-NV-NEXT: clang{{.*}} -o [[CLANGOUT:.*]] -dumpdir a.out.nvptx64.sm_50.img. --target=nvptx64-nvidia-cuda -march={{.*}}
 // CHK-CMDS-AOT-NV-NEXT: ptxas{{.*}} --output-file [[PTXASOUT:.*]] [[CLANGOUT]]
-// CHK-CMDS-AOT-NV-NEXT: fatbinary{{.*}} --create [[FATBINOUT:.*]] --image=profile={{.*}},file=[[CLANGOUT]] --image=profile={{.*}},file=[[PTXASOUT]]
+// CHK-CMDS-AOT-NV-NEXT: fatbinary{{.*}} --create [[FATBINOUT:.*]] --image3=kind=ptx,sm={{(compute_)?50|pute_50}},file=[[CLANGOUT]] --image3=kind=elf,sm=50,file=[[PTXASOUT]]
 // CHK-CMDS-AOT-NV-NEXT: offload-wrapper: output: [[WRAPPEROUT:.*]].bc, input: [[FATBINOUT]]
 // CHK-CMDS-AOT-NV-NEXT: clang{{.*}} -c -o [[LLCOUT:.*]] [[WRAPPEROUT]]
 // CHK-CMDS-AOT-NV-NEXT: "{{.*}}/ld" -- HOST_LINKER_FLAGS -dynamic-linker HOST_DYN_LIB -o a.out [[LLCOUT]] HOST_LIB_PATH HOST_STAT_LIB {{.*}}.o
@@ -166,7 +166,7 @@
 // CHK-CMDS-AOT-NV-EMBED-IR-NEXT: clang{{.*}} -c -o [[LLCOUT1:.*]] [[WRAPPEROUT1]]
 // CHK-CMDS-AOT-NV-EMBED-IR-NEXT: clang{{.*}} -o [[CLANGOUT:.*]] -dumpdir a.out.nvptx64.sm_50.img. --target=nvptx64-nvidia-cuda -march={{.*}}
 // CHK-CMDS-AOT-NV-EMBED-IR-NEXT: ptxas{{.*}} --output-file [[PTXASOUT:.*]] [[CLANGOUT]]
-// CHK-CMDS-AOT-NV-EMBED-IR-NEXT: fatbinary{{.*}} --create [[FATBINOUT:.*]] --image=profile={{.*}},file=[[CLANGOUT]] --image=profile={{.*}},file=[[PTXASOUT]]
+// CHK-CMDS-AOT-NV-EMBED-IR-NEXT: fatbinary{{.*}} --create [[FATBINOUT:.*]] --image3=kind=ptx,sm={{(compute_)?50|pute_50}},file=[[CLANGOUT]] --image3=kind=elf,sm=50,file=[[PTXASOUT]]
 // CHK-CMDS-AOT-NV-EMBED-IR-NEXT: offload-wrapper: output: [[WRAPPEROUT:.*]].bc, input: [[FATBINOUT]]
 // CHK-CMDS-AOT-NV-EMBED-IR-NEXT: clang{{.*}} -c -o [[LLCOUT2:.*]] [[WRAPPEROUT]]
 // CHK-CMDS-AOT-NV-EMBED-IR-NEXT: "{{.*}}/ld" -- HOST_LINKER_FLAGS -dynamic-linker HOST_DYN_LIB -o a.out [[LLCOUT1]] [[LLCOUT2]] HOST_LIB_PATH HOST_STAT_LIB {{.*}}.o
diff --git a/clang/test/Driver/linker-wrapper.c b/clang/test/Driver/linker-wrapper.c
index 07daac39cc7e9..a27391dbff56c 100644
--- a/clang/test/Driver/linker-wrapper.c
+++ b/clang/test/Driver/linker-wrapper.c
@@ -106,7 +106,7 @@ __attribute__((visibility("protected"), used)) int x;
 // CUDA: clang{{.*}} -o [[IMG_SM70:.+]] -dumpdir a.out.nvptx64.sm_70.img. --target=nvptx64-nvidia-cuda -march=sm_70
 // CUDA: clang{{.*}} -o [[IMG_SM52:.+]] -dumpdir a.out.nvptx64.sm_52.img. --target=nvptx64-nvidia-cuda -march=sm_52
-// CUDA: fatbinary{{.*}}-64 --create {{.*}}.fatbin --image=profile=sm_70,file=[[IMG_SM70]] --image=profile=sm_52,file=[[IMG_SM52]]
+// CUDA: fatbinary{{.*}}-64 --create {{.*}}.fatbin --image3=kind=elf,sm=70,file=[[IMG_SM70]] --image3=kind=elf,sm=52,file=[[IMG_SM52]]
 // CUDA: usr/bin/ld{{.*}} {{.*}}.openmp.image.{{.*}}.o {{.*}}.cuda.image.{{.*}}.o
 
 // RUN: llvm-offload-binary -o %t.out \
@@ -240,7 +240,7 @@ __attribute__((visibility("protected"), used)) int x;
 // RUN: %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=RELOCATABLE-LINK-CUDA
 
 // RELOCATABLE-LINK-CUDA: clang{{.*}} -o {{.*}}.img -dumpdir a.out.nvptx64.sm_89.img. --target=nvptx64-nvidia-cuda
-// RELOCATABLE-LINK-CUDA: fatbinary{{.*}} -64 --create {{.*}}.fatbin --image=profile=sm_89,file={{.*}}.img
+// RELOCATABLE-LINK-CUDA: fatbinary{{.*}} -64 --create {{.*}}.fatbin --image3=kind=elf,sm=89,file={{.*}}.img
 // RELOCATABLE-LINK-CUDA: /usr/bin/ld.lld{{.*}}-r
 // RELOCATABLE-LINK-CUDA: llvm-objcopy{{.*}}a.out --remove-section .llvm.offloading
diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
index 2a16c6ae4f40a..603539c8baa41 100644
--- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
+++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
@@ -481,9 +481,22 @@ fatbinary(ArrayRef<std::pair<StringRef, StringRef>> InputFiles,
   CmdArgs.push_back(Triple.isArch64Bit() ? "-64" : "-32");
   CmdArgs.push_back("--create");
   CmdArgs.push_back(*TempFileOrErr);
-  for (const auto &[File, Arch] : InputFiles)
-    CmdArgs.push_back(
-        Args.MakeArgString("--image=profile=" + Arch + ",file=" + File));
+
+  for (const auto &[File, Arch] : InputFiles) {
+    // When bundling SYCL NVPTX, we may get both a cubin (sm_*) and a PTX
+    // assembly image (compute_*). `fatbinary` needs the right kind for each.
+    StringRef Kind = "elf";
+    StringRef ArchId = Arch;
+    if (Arch.starts_with("sm_")) {
+      ArchId = Arch.drop_front(3);
+    } else if (Arch.starts_with("compute_")) {
+      Kind = "ptx";
+      ArchId = Arch.drop_front(8);
+    }
+
+    CmdArgs.push_back(Args.MakeArgString("--image3=kind=" + Kind +
+                                         ",sm=" + ArchId + ",file=" + File));
+  }
 
   if (Error Err = executeCommands(*FatBinaryPath, CmdArgs))
     return std::move(Err);
diff --git a/sycl/test-e2e/AtomicRef/atomic_memory_order_acq_rel.cpp b/sycl/test-e2e/AtomicRef/atomic_memory_order_acq_rel.cpp
index bac155b16b3c4..9245e57abde85 100644
--- a/sycl/test-e2e/AtomicRef/atomic_memory_order_acq_rel.cpp
+++ b/sycl/test-e2e/AtomicRef/atomic_memory_order_acq_rel.cpp
@@ -1,4 +1,4 @@
-// RUN: %{build} -O3 -o %t.out %if target-nvidia %{ -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_70 %}
+// RUN: %{build} -O3 -o %t.out %if target-nvidia %{ -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_75 %}
 // RUN: %{run} %t.out
 
 // NOTE: Tests fetch_add for acquire and release memory ordering.
diff --git a/sycl/test-e2e/AtomicRef/atomic_memory_order_seq_cst.cpp b/sycl/test-e2e/AtomicRef/atomic_memory_order_seq_cst.cpp
index 462834453129e..900f840ee6acc 100644
--- a/sycl/test-e2e/AtomicRef/atomic_memory_order_seq_cst.cpp
+++ b/sycl/test-e2e/AtomicRef/atomic_memory_order_seq_cst.cpp
@@ -1,4 +1,4 @@
-// RUN: %{build} -O3 -o %t.out %if target-nvidia %{ -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_70 %}
+// RUN: %{build} -O3 -o %t.out %if target-nvidia %{ -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_75 %}
 // RUN: %{run} %t.out
 
 #include "atomic_memory_order.h"
diff --git a/sycl/test-e2e/GroupAlgorithm/root_group.cpp b/sycl/test-e2e/GroupAlgorithm/root_group.cpp
index 5afc29fb0b61d..4ee5c96d6dd01 100644
--- a/sycl/test-e2e/GroupAlgorithm/root_group.cpp
+++ b/sycl/test-e2e/GroupAlgorithm/root_group.cpp
@@ -2,7 +2,7 @@
 // XFAIL: (opencl && !cpu)
 // XFAIL-TRACKER: https://github.com/intel/llvm/issues/14641
 
-// RUN: %{build} -I . -o %t.out %if target-nvidia %{ -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_70 %}
+// RUN: %{build} -I . -o %t.out %if target-nvidia %{ -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_75 %}
 // RUN: %{run} %t.out
 
 // Disabled temporarily while investigation into the failure is ongoing.
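The heart of the patch is the `fatbinary()` loop above: each input's arch string now selects both the image `kind` and the bare SM number. Below is a minimal standalone sketch of that mapping, not part of the patch, with a hypothetical helper `makeImage3Arg` and plain `std::string` standing in for LLVM's `StringRef` and `Args.MakeArgString` so it compiles without LLVM headers:

```cpp
// Sketch (hypothetical helper, not part of the patch) of the arch-string
// mapping the new fatbinary() loop performs:
//   "sm_70"      -> "--image3=kind=elf,sm=70,file=<file>"   (cubin)
//   "compute_70" -> "--image3=kind=ptx,sm=70,file=<file>"   (PTX assembly)
#include <cassert>
#include <string>

static std::string makeImage3Arg(const std::string &Arch,
                                 const std::string &File) {
  std::string Kind = "elf";
  std::string ArchId = Arch;
  if (Arch.rfind("sm_", 0) == 0) {             // Arch starts with "sm_"
    ArchId = Arch.substr(3);
  } else if (Arch.rfind("compute_", 0) == 0) { // Arch starts with "compute_"
    Kind = "ptx";
    ArchId = Arch.substr(8);
  }
  return "--image3=kind=" + Kind + ",sm=" + ArchId + ",file=" + File;
}

int main() {
  assert(makeImage3Arg("sm_70", "img.cubin") ==
         "--image3=kind=elf,sm=70,file=img.cubin");
  assert(makeImage3Arg("compute_70", "img.s") ==
         "--image3=kind=ptx,sm=70,file=img.s");
  return 0;
}
```

Note the fallback, which mirrors the patch: an arch string starting with neither `sm_` nor `compute_` passes through unchanged and is tagged `kind=elf`.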
diff --git a/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm70.cpp b/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm70.cpp
index bfa1156bc0e6b..71125772f6f91 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm70.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm70.cpp
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 // REQUIRES: target-nvidia
 
-// RUN: %{build} -Xsycl-target-backend --cuda-gpu-arch=sm_70 -o %t.out
+// RUN: %{build} -Xsycl-target-backend --cuda-gpu-arch=sm_75 -o %t.out
 // RUN: %{run} %t.out
 //
 // This tests the unified matrix extension interfaces for the cuda backend.
diff --git a/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm72.cpp b/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm72.cpp
index a8dbb59b46e0e..7e8d8fcb9f918 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm72.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm72.cpp
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 // REQUIRES: target-nvidia
 
-// RUN: %{build} -Xsycl-target-backend --cuda-gpu-arch=sm_72 -o %t.out
+// RUN: %{build} -Xsycl-target-backend --cuda-gpu-arch=sm_75 -o %t.out
 // RUN: %{run} %t.out
 //
 // This tests the unified matrix extension interfaces for the cuda backend.
diff --git a/sycl/test-e2e/Reduction/reduction_range_1d_dw.cpp b/sycl/test-e2e/Reduction/reduction_range_1d_dw.cpp
index 58121639594a9..779b315ac6d43 100644
--- a/sycl/test-e2e/Reduction/reduction_range_1d_dw.cpp
+++ b/sycl/test-e2e/Reduction/reduction_range_1d_dw.cpp
@@ -1,4 +1,4 @@
-// RUN: %{build} -DENABLE_64_BIT=false -o %t.out %if target-nvidia %{ -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_60 %}
+// RUN: %{build} -DENABLE_64_BIT=false -o %t.out %if target-nvidia %{ -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_75 %}
 // RUN: %{run} %t.out
 
 #include "reduction_utils.hpp"
diff --git a/sycl/test-e2e/Reduction/reduction_range_1d_dw_64bit.cpp b/sycl/test-e2e/Reduction/reduction_range_1d_dw_64bit.cpp
index 23c83be59b56c..f7756eaedceda 100644
--- a/sycl/test-e2e/Reduction/reduction_range_1d_dw_64bit.cpp
+++ b/sycl/test-e2e/Reduction/reduction_range_1d_dw_64bit.cpp
@@ -1,4 +1,4 @@
-// RUN: %{build} -DENABLE_64_BIT=true -o %t.out %if target-nvidia %{ -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_60 %}
+// RUN: %{build} -DENABLE_64_BIT=true -o %t.out %if target-nvidia %{ -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_75 %}
 // RUN: %{run} %t.out
 
 #include "reduction_range_1d_dw.cpp"
diff --git a/sycl/test-e2e/Reduction/reduction_range_1d_reducer_skip.cpp b/sycl/test-e2e/Reduction/reduction_range_1d_reducer_skip.cpp
index 83e13dc07102d..2f5077c41f03f 100644
--- a/sycl/test-e2e/Reduction/reduction_range_1d_reducer_skip.cpp
+++ b/sycl/test-e2e/Reduction/reduction_range_1d_reducer_skip.cpp
@@ -1,4 +1,4 @@
-// RUN: %{build} -o %t.out %if target-nvidia %{ -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_60 %}
+// RUN: %{build} -o %t.out %if target-nvidia %{ -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_75 %}
 // RUN: %{run} %t.out
 
 // This test performs basic checks of parallel_for(range<1>, reduction, func)
diff --git a/sycl/test-e2e/Reduction/reduction_range_1d_rw.cpp b/sycl/test-e2e/Reduction/reduction_range_1d_rw.cpp
index ee72e6e516814..9df0bb8faa974 100644
--- a/sycl/test-e2e/Reduction/reduction_range_1d_rw.cpp
+++ b/sycl/test-e2e/Reduction/reduction_range_1d_rw.cpp
@@ -1,4 +1,4 @@
-// RUN: %{build} -o %t.out %if target-nvidia %{ -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_60 %}
+// RUN: %{build} -o %t.out %if target-nvidia %{ -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_75 %}
 // RUN: %{run} %t.out
 
 // This test performs basic checks of parallel_for(range<1>, reduction, func)
diff --git a/sycl/test-e2e/Reduction/reduction_range_2d_dw.cpp b/sycl/test-e2e/Reduction/reduction_range_2d_dw.cpp
index 604861ece2123..d8c7ee5fe037b 100644
--- a/sycl/test-e2e/Reduction/reduction_range_2d_dw.cpp
+++ b/sycl/test-e2e/Reduction/reduction_range_2d_dw.cpp
@@ -1,4 +1,4 @@
-// RUN: %{build} -o %t.out %if target-nvidia %{ -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_60 %}
+// RUN: %{build} -o %t.out %if target-nvidia %{ -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_75 %}
 // RUN: %{run} %t.out
 
 // This test performs basic checks of parallel_for(range<2>, reduction, func)
diff --git a/sycl/test-e2e/Reduction/reduction_range_2d_dw_reducer_skip.cpp b/sycl/test-e2e/Reduction/reduction_range_2d_dw_reducer_skip.cpp
index b15ed6e6e3b6a..5d9b382dd1546 100644
--- a/sycl/test-e2e/Reduction/reduction_range_2d_dw_reducer_skip.cpp
+++ b/sycl/test-e2e/Reduction/reduction_range_2d_dw_reducer_skip.cpp
@@ -1,4 +1,4 @@
-// RUN: %{build} -o %t.out %if target-nvidia %{ -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_60 %}
+// RUN: %{build} -o %t.out %if target-nvidia %{ -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_75 %}
 // RUN: %{run} %t.out
 
 // This test performs basic checks of parallel_for(range<2>, reduction, func)
diff --git a/sycl/test-e2e/Reduction/reduction_range_2d_rw.cpp b/sycl/test-e2e/Reduction/reduction_range_2d_rw.cpp
index bb19278df3281..a264e8a10a007 100644
--- a/sycl/test-e2e/Reduction/reduction_range_2d_rw.cpp
+++ b/sycl/test-e2e/Reduction/reduction_range_2d_rw.cpp
@@ -1,4 +1,4 @@
-// RUN: %{build} -o %t.out %if target-nvidia %{ -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_60 %}
+// RUN: %{build} -o %t.out %if target-nvidia %{ -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_75 %}
 // RUN: %{run} %t.out
 
 // This test performs basic checks of parallel_for(range<2>, reduction, func)
diff --git a/sycl/test-e2e/Reduction/reduction_range_3d_dw.cpp b/sycl/test-e2e/Reduction/reduction_range_3d_dw.cpp
index 0a0a2f187f870..63462f8e34d2c 100644
--- a/sycl/test-e2e/Reduction/reduction_range_3d_dw.cpp
+++ b/sycl/test-e2e/Reduction/reduction_range_3d_dw.cpp
@@ -1,4 +1,4 @@
-// RUN: %{build} -o %t.out %if target-nvidia %{ -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_60 %}
+// RUN: %{build} -o %t.out %if target-nvidia %{ -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_75 %}
 // RUN: %{run} %t.out
 
 // This test performs basic checks of parallel_for(range<3>, reduction, func)
diff --git a/sycl/test-e2e/Reduction/reduction_range_3d_rw.cpp b/sycl/test-e2e/Reduction/reduction_range_3d_rw.cpp
index e49b55156485a..8141b22138b07 100644
--- a/sycl/test-e2e/Reduction/reduction_range_3d_rw.cpp
+++ b/sycl/test-e2e/Reduction/reduction_range_3d_rw.cpp
@@ -1,4 +1,4 @@
-// RUN: %{build} -o %t.out %if target-nvidia %{ -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_60 %}
+// RUN: %{build} -o %t.out %if target-nvidia %{ -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_75 %}
 // RUN: %{run} %t.out
 
 // This test performs basic checks of parallel_for(range<3>, reduction, func)
diff --git a/sycl/test-e2e/Reduction/reduction_range_3d_rw_reducer_skip.cpp b/sycl/test-e2e/Reduction/reduction_range_3d_rw_reducer_skip.cpp
index 017d6a9427dca..0488a93cfc822 100644
--- a/sycl/test-e2e/Reduction/reduction_range_3d_rw_reducer_skip.cpp
+++ b/sycl/test-e2e/Reduction/reduction_range_3d_rw_reducer_skip.cpp
@@ -1,4 +1,4 @@
-// RUN: %{build} -o %t.out %if target-nvidia %{ -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_60 %}
+// RUN: %{build} -o %t.out %if target-nvidia %{ -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_75 %}
 // RUN: %{run} %t.out
 
 // This test performs basic checks of parallel_for(range<3>, reduction, func)
diff --git a/sycl/test-e2e/Reduction/reduction_range_usm_dw.cpp b/sycl/test-e2e/Reduction/reduction_range_usm_dw.cpp
index b99f3bdc276f7..b1e290a9348c6 100644
--- a/sycl/test-e2e/Reduction/reduction_range_usm_dw.cpp
+++ b/sycl/test-e2e/Reduction/reduction_range_usm_dw.cpp
@@ -1,4 +1,4 @@
-// RUN: %{build} -o %t.out %if target-nvidia %{ -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_60 %}
+// RUN: %{build} -o %t.out %if target-nvidia %{ -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_75 %}
 // RUN: %{run} %t.out
 
 #include "reduction_utils.hpp"
diff --git a/sycl/test-e2e/USM/P2P/p2p_atomics.cpp b/sycl/test-e2e/USM/P2P/p2p_atomics.cpp
index c36b331ca46c4..71bf0ac6b88c8 100644
--- a/sycl/test-e2e/USM/P2P/p2p_atomics.cpp
+++ b/sycl/test-e2e/USM/P2P/p2p_atomics.cpp
@@ -1,5 +1,5 @@
 // REQUIRES: cuda || hip || level_zero
-// RUN: %{build} %if target-nvidia %{ -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_61 %} -o %t.out
+// RUN: %{build} %if target-nvidia %{ -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_75 %} -o %t.out
 // RUN: %{run} %t.out
 
 #include
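For reference, the net effect on the generated `fatbinary` invocation, as encoded by the updated CHK-CMDS-AOT-NV and CUDA FileCheck lines above: each image argument changes from `--image=profile=<arch>,file=<file>` to `--image3=kind=<elf|ptx>,sm=<number>,file=<file>`, so an sm_70 cubin is now described as `--image3=kind=elf,sm=70,...` and a PTX assembly image as `--image3=kind=ptx,sm=70,...` (the `<...>` parts are placeholders for illustration, not literal flag text).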