diff --git a/compiler/rustc_codegen_llvm/src/builder.rs b/compiler/rustc_codegen_llvm/src/builder.rs
index 9379faf1156fc..35bf629ae81ac 100644
--- a/compiler/rustc_codegen_llvm/src/builder.rs
+++ b/compiler/rustc_codegen_llvm/src/builder.rs
@@ -188,19 +188,6 @@ impl<'a, 'll, CX: Borrow<SCx<'ll>>> GenericBuilder<'a, 'll, CX> {
             load
         }
     }
-
-    fn memset(&mut self, ptr: &'ll Value, fill_byte: &'ll Value, size: &'ll Value, align: Align) {
-        unsafe {
-            llvm::LLVMRustBuildMemSet(
-                self.llbuilder,
-                ptr,
-                align.bytes() as c_uint,
-                fill_byte,
-                size,
-                false,
-            );
-        }
-    }
 }
 
 /// Empty string, to be used where LLVM expects an instruction name, indicating
diff --git a/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs b/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs
index f1735b9a0f586..49682a64e8c46 100644
--- a/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs
+++ b/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs
@@ -19,8 +19,6 @@ pub(crate) struct OffloadGlobals<'ll> {
     pub launcher_fn: &'ll llvm::Value,
     pub launcher_ty: &'ll llvm::Type,
-    pub bin_desc: &'ll llvm::Type,
-
     pub kernel_args_ty: &'ll llvm::Type,
     pub offload_entry_ty: &'ll llvm::Type,
@@ -31,8 +29,8 @@ pub(crate) struct OffloadGlobals<'ll> {
     pub ident_t_global: &'ll llvm::Value,
-    pub register_lib: &'ll llvm::Value,
-    pub unregister_lib: &'ll llvm::Value,
+    // FIXME(offload): Drop this once we have fully automated our offload compilation pipeline,
+    // since LLVM will initialize the runtimes for us once it sees gpu kernels being registered.
     pub init_rtls: &'ll llvm::Value,
 }
@@ -44,15 +42,6 @@ impl<'ll> OffloadGlobals<'ll> {
         let (begin_mapper, _, end_mapper, mapper_fn_ty) = gen_tgt_data_mappers(cx);
         let ident_t_global = generate_at_one(cx);
 
-        let tptr = cx.type_ptr();
-        let ti32 = cx.type_i32();
-        let tgt_bin_desc_ty = vec![ti32, tptr, tptr, tptr];
-        let bin_desc = cx.type_named_struct("struct.__tgt_bin_desc");
-        cx.set_struct_body(bin_desc, &tgt_bin_desc_ty, false);
-
-        let reg_lib_decl = cx.type_func(&[cx.type_ptr()], cx.type_void());
-        let register_lib = declare_offload_fn(&cx, "__tgt_register_lib", reg_lib_decl);
-        let unregister_lib = declare_offload_fn(&cx, "__tgt_unregister_lib", reg_lib_decl);
 
         let init_ty = cx.type_func(&[], cx.type_void());
         let init_rtls = declare_offload_fn(cx, "__tgt_init_all_rtls", init_ty);
@@ -63,20 +52,83 @@ impl<'ll> OffloadGlobals<'ll> {
         OffloadGlobals {
             launcher_fn,
             launcher_ty,
-            bin_desc,
             kernel_args_ty,
             offload_entry_ty,
             begin_mapper,
             end_mapper,
             mapper_fn_ty,
             ident_t_global,
-            register_lib,
-            unregister_lib,
             init_rtls,
         }
     }
 }
 
+// We need to register offload before using it, and we should unregister it once we are done, for
+// good measure. Previously we did so before and after each individual offload intrinsic call, but
+// that comes at a performance cost. The repeated (un)register calls might also confuse LLVM's
+// OpenMPOpt pass, which tries to move operations to a better location. The easiest solution, which
+// we copy from clang, is to emit those two calls exactly once, in the global ctor/dtor section of
+// the final binary.
+pub(crate) fn register_offload<'ll>(cx: &CodegenCx<'ll, '_>) {
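+    // Roughly, the machinery emitted below behaves like the following C-style sketch
+    // (illustrative names only, not code we generate verbatim; clang emits the same pattern
+    // for its OpenMP offload registration):
+    //
+    //   static void descriptor_unreg(void) { __tgt_unregister_lib(&descriptor); }
+    //
+    //   __attribute__((constructor(101)))  // global ctor, runs early at program startup
+    //   static void descriptor_reg(void) {
+    //       __tgt_register_lib(&descriptor);
+    //       atexit(descriptor_unreg);      // unregister exactly once, at program exit
+    //   }
+    //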
+    // First, check quickly whether we have already done our setup, in which case we return
+    // early. This shouldn't be needed for correctness.
+    if cx.get_function("__tgt_register_lib").is_some() {
+        return;
+    }
+
+    let reg_lib_decl = cx.type_func(&[cx.type_ptr()], cx.type_void());
+    let register_lib = declare_offload_fn(&cx, "__tgt_register_lib", reg_lib_decl);
+    let unregister_lib = declare_offload_fn(&cx, "__tgt_unregister_lib", reg_lib_decl);
+
+    let ptr_null = cx.const_null(cx.type_ptr());
+    let const_struct = cx.const_struct(&[cx.get_const_i32(0), ptr_null, ptr_null, ptr_null], false);
+    let omp_descriptor =
+        add_global(cx, ".omp_offloading.descriptor", const_struct, InternalLinkage);
+    // A fully automated pipeline (like clang's) would fill the descriptor in:
+    // @.omp_offloading.descriptor = internal constant %__tgt_bin_desc { i32 1, ptr @.omp_offloading.device_images, ptr @__start_llvm_offload_entries, ptr @__stop_llvm_offload_entries }
+    // For now we emit an empty descriptor instead:
+    // @.omp_offloading.descriptor = internal constant %__tgt_bin_desc { i32 0, ptr null, ptr null, ptr null }
+
+    let atexit = cx.type_func(&[cx.type_ptr()], cx.type_i32());
+    let atexit_fn = declare_offload_fn(cx, "atexit", atexit);
+
+    let desc_ty = cx.type_func(&[], cx.type_void());
+    let reg_name = ".omp_offloading.descriptor_reg";
+    let unreg_name = ".omp_offloading.descriptor_unreg";
+    let desc_reg_fn = declare_offload_fn(cx, reg_name, desc_ty);
+    let desc_unreg_fn = declare_offload_fn(cx, unreg_name, desc_ty);
+    llvm::set_linkage(desc_reg_fn, InternalLinkage);
+    llvm::set_linkage(desc_unreg_fn, InternalLinkage);
+    llvm::set_section(desc_reg_fn, c".text.startup");
+    llvm::set_section(desc_unreg_fn, c".text.startup");
+
+    // define internal void @.omp_offloading.descriptor_reg() section ".text.startup" {
+    // entry:
+    //   call void @__tgt_register_lib(ptr @.omp_offloading.descriptor)
+    //   %0 = call i32 @atexit(ptr @.omp_offloading.descriptor_unreg)
+    //   ret void
+    // }
+    let bb = Builder::append_block(cx, desc_reg_fn, "entry");
+    let mut a = Builder::build(cx, bb);
+    a.call(reg_lib_decl, None, None, register_lib, &[omp_descriptor], None, None);
+    a.call(atexit, None, None, atexit_fn, &[desc_unreg_fn], None, None);
+    a.ret_void();
+
+    // define internal void @.omp_offloading.descriptor_unreg() section ".text.startup" {
+    // entry:
+    //   call void @__tgt_unregister_lib(ptr @.omp_offloading.descriptor)
+    //   ret void
+    // }
+    let bb = Builder::append_block(cx, desc_unreg_fn, "entry");
+    let mut a = Builder::build(cx, bb);
+    a.call(reg_lib_decl, None, None, unregister_lib, &[omp_descriptor], None, None);
+    a.ret_void();
+
+    // @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 101, ptr @.omp_offloading.descriptor_reg, ptr null }]
+    let args = vec![cx.get_const_i32(101), desc_reg_fn, ptr_null];
+    let const_struct = cx.const_struct(&args, false);
+    let arr = cx.const_array(cx.val_ty(const_struct), &[const_struct]);
+    add_global(cx, "llvm.global_ctors", arr, AppendingLinkage);
+}
+
 pub(crate) struct OffloadKernelDims<'ll> {
     num_workgroups: &'ll Value,
     threads_per_block: &'ll Value,
@@ -487,9 +539,6 @@ pub(crate) fn gen_call_handling<'ll, 'tcx>(
     let tgt_decl = offload_globals.launcher_fn;
     let tgt_target_kernel_ty = offload_globals.launcher_ty;
 
-    // %struct.__tgt_bin_desc = type { i32, ptr, ptr, ptr }
-    let tgt_bin_desc = offload_globals.bin_desc;
-
     let tgt_kernel_decl = offload_globals.kernel_args_ty;
     let begin_mapper_decl = offload_globals.begin_mapper;
     let end_mapper_decl = offload_globals.end_mapper;
@@ -513,12 +562,9 @@ pub(crate) fn gen_call_handling<'ll, 'tcx>(
     }
 
     // Step 0)
-    // %struct.__tgt_bin_desc = type { i32, ptr, ptr, ptr }
-    // %6 = alloca %struct.__tgt_bin_desc, align 8
     unsafe {
        llvm::LLVMRustPositionBuilderPastAllocas(&builder.llbuilder, builder.llfn());
     }
-    let tgt_bin_desc_alloca = builder.direct_alloca(tgt_bin_desc, Align::EIGHT, "EmptyDesc");
 
     let ty = cx.type_array(cx.type_ptr(), num_args);
     // Baseptr are just the input pointer to the kernel, stored in a local alloca
@@ -536,7 +582,6 @@ pub(crate) fn gen_call_handling<'ll, 'tcx>(
     unsafe {
         llvm::LLVMPositionBuilderAtEnd(&builder.llbuilder, bb);
     }
-    builder.memset(tgt_bin_desc_alloca, cx.get_const_i8(0), cx.get_const_i64(32), Align::EIGHT);
 
     // Now we allocate once per function param, a copy to be passed to one of our maps.
     let mut vals = vec![];
@@ -574,15 +619,9 @@ pub(crate) fn gen_call_handling<'ll, 'tcx>(
         geps.push(gep);
     }
 
-    let mapper_fn_ty = cx.type_func(&[cx.type_ptr()], cx.type_void());
-    let register_lib_decl = offload_globals.register_lib;
-    let unregister_lib_decl = offload_globals.unregister_lib;
     let init_ty = cx.type_func(&[], cx.type_void());
     let init_rtls_decl = offload_globals.init_rtls;
 
-    // FIXME(offload): Later we want to add them to the wrapper code, rather than our main function.
-    // call void @__tgt_register_lib(ptr noundef %6)
-    builder.call(mapper_fn_ty, None, None, register_lib_decl, &[tgt_bin_desc_alloca], None, None);
     // call void @__tgt_init_all_rtls()
     builder.call(init_ty, None, None, init_rtls_decl, &[], None, None);
@@ -679,6 +718,4 @@ pub(crate) fn gen_call_handling<'ll, 'tcx>(
         num_args,
         s_ident_t,
     );
-
-    builder.call(mapper_fn_ty, None, None, unregister_lib_decl, &[tgt_bin_desc_alloca], None, None);
 }
diff --git a/compiler/rustc_codegen_llvm/src/common.rs b/compiler/rustc_codegen_llvm/src/common.rs
index b0cf9925019d2..f2261ab79340f 100644
--- a/compiler/rustc_codegen_llvm/src/common.rs
+++ b/compiler/rustc_codegen_llvm/src/common.rs
@@ -124,6 +124,10 @@ impl<'ll, CX: Borrow<SCx<'ll>>> GenericCx<'ll, CX> {
     pub(crate) fn const_null(&self, t: &'ll Type) -> &'ll Value {
         unsafe { llvm::LLVMConstNull(t) }
     }
+
+    pub(crate) fn const_struct(&self, elts: &[&'ll Value], packed: bool) -> &'ll Value {
+        struct_in_context(self.llcx(), elts, packed)
+    }
 }
 
 impl<'ll, 'tcx> ConstCodegenMethods for CodegenCx<'ll, 'tcx> {
diff --git a/compiler/rustc_codegen_llvm/src/intrinsic.rs b/compiler/rustc_codegen_llvm/src/intrinsic.rs
index 20eac4cf92c20..97bc929dff32a 100644
--- a/compiler/rustc_codegen_llvm/src/intrinsic.rs
+++ b/compiler/rustc_codegen_llvm/src/intrinsic.rs
@@ -30,7 +30,9 @@ use tracing::debug;
 use crate::abi::FnAbiLlvmExt;
 use crate::builder::Builder;
 use crate::builder::autodiff::{adjust_activity_to_abi, generate_enzyme_call};
-use crate::builder::gpu_offload::{OffloadKernelDims, gen_call_handling, gen_define_handling};
+use crate::builder::gpu_offload::{
+    OffloadKernelDims, gen_call_handling, gen_define_handling, register_offload,
+};
 use crate::context::CodegenCx;
 use crate::declare::declare_raw_fn;
 use crate::errors::{
@@ -1410,6 +1412,7 @@ fn codegen_offload<'ll, 'tcx>(
             return;
         }
     };
+    register_offload(cx);
     let offload_data = gen_define_handling(&cx, &metadata, target_symbol, offload_globals);
     gen_call_handling(bx, &offload_data, &args, &types, &metadata, offload_globals, &offload_dims);
 }
diff --git a/tests/codegen-llvm/gpu_offload/control_flow.rs b/tests/codegen-llvm/gpu_offload/control_flow.rs
index 28ee9c85b0edc..1a3d3cd7a7789 100644
--- a/tests/codegen-llvm/gpu_offload/control_flow.rs
+++ b/tests/codegen-llvm/gpu_offload/control_flow.rs
@@ -12,8 +12,7 @@
 
 // CHECK: define{{( dso_local)?}} void @main()
 // CHECK-NOT: define
-// CHECK: %EmptyDesc = alloca %struct.__tgt_bin_desc, align 8
-// CHECK-NEXT: %.offload_baseptrs = alloca [1 x ptr], align 8
+// CHECK: %.offload_baseptrs = alloca [1 x ptr], align 8
 // CHECK-NEXT: %.offload_ptrs = alloca [1 x ptr], align 8
 // CHECK-NEXT: %.offload_sizes = alloca [1 x i64], align 8
 // CHECK-NEXT: %kernel_args = alloca %struct.__tgt_kernel_arguments, align 8
diff --git a/tests/codegen-llvm/gpu_offload/gpu_host.rs b/tests/codegen-llvm/gpu_offload/gpu_host.rs
index 27ff6f325aa0f..d0bc34ec66b20 100644
--- a/tests/codegen-llvm/gpu_offload/gpu_host.rs
+++ b/tests/codegen-llvm/gpu_offload/gpu_host.rs
@@ -2,9 +2,10 @@
 //@ no-prefer-dynamic
 //@ needs-offload
 
-// This test is verifying that we generate __tgt_target_data_*_mapper before and after a call to the
-// kernel_1. Better documentation to what each global or variable means is available in the gpu
-// offload code, or the LLVM offload documentation.
+// This test verifies that we generate __tgt_target_data_*_mapper calls before and after the call
+// to __tgt_target_kernel, and that we initialize all needed variables. It also verifies some
+// related globals. Better documentation on what each global or variable means is available in the
+// gpu offload code, or the LLVM offload documentation.
 
 #![feature(rustc_attrs)]
 #![feature(core_intrinsics)]
@@ -17,10 +18,8 @@ fn main() {
     core::hint::black_box(&x);
 }
 
-#[unsafe(no_mangle)]
-#[inline(never)]
 pub fn kernel_1(x: &mut [f32; 256]) {
-    core::intrinsics::offload(_kernel_1, [256, 1, 1], [32, 1, 1], (x,))
+    core::intrinsics::offload(kernel_1, [256, 1, 1], [32, 1, 1], (x,))
 }
 
 #[unsafe(no_mangle)]
 pub fn _kernel_1(x: &mut [f32; 256]) {
@@ -33,75 +32,75 @@ pub fn _kernel_1(x: &mut [f32; 256]) {
 
 // CHECK: %struct.ident_t = type { i32, i32, i32, i32, ptr }
 // CHECK: %struct.__tgt_offload_entry = type { i64, i16, i16, i32, ptr, ptr, i64, i64, ptr }
-// CHECK: %struct.__tgt_bin_desc = type { i32, ptr, ptr, ptr }
 // CHECK: %struct.__tgt_kernel_arguments = type { i32, i32, ptr, ptr, ptr, ptr, ptr, ptr, i64, i64, [3 x i32], [3 x i32], i32 }
 
-// CHECK: @anon.{{.*}}.0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1
-// CHECK: @anon.{{.*}}.1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr @anon.{{.*}}.0 }, align 8
+// CHECK: @anon.[[ID:.*]].0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1
+// CHECK: @anon.{{.*}}.1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr @anon.[[ID]].0 }, align 8
 
-// CHECK: @.offload_sizes._kernel_1 = private unnamed_addr constant [1 x i64] [i64 1024]
-// CHECK: @.offload_maptypes._kernel_1 = private unnamed_addr constant [1 x i64] [i64 35]
-// CHECK: @._kernel_1.region_id = internal constant i8 0
-// CHECK: @.offloading.entry_name._kernel_1 = internal unnamed_addr constant [10 x i8] c"_kernel_1\00", section ".llvm.rodata.offloading", align 1
-// CHECK: @.offloading.entry._kernel_1 = internal constant %struct.__tgt_offload_entry { i64 0, i16 1, i16 1, i32 0, ptr @._kernel_1.region_id, ptr @.offloading.entry_name._kernel_1, i64 0, i64 0, ptr null }, section "llvm_offload_entries", align 8
+// CHECK-DAG: @.omp_offloading.descriptor = internal constant { i32, ptr, ptr, ptr } zeroinitializer
+// CHECK-DAG: @llvm.global_ctors = appending constant [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 101, ptr @.omp_offloading.descriptor_reg, ptr null }]
+// CHECK-DAG: @.offload_sizes.[[K:[^ ]*kernel_1]] = private unnamed_addr constant [1 x i64] [i64 1024]
+// CHECK-DAG: @.offload_maptypes.[[K]] = private unnamed_addr constant [1 x i64] [i64 35]
+// CHECK-DAG: @.[[K]].region_id = internal constant i8 0
+// CHECK-DAG: @.offloading.entry_name.[[K]] = internal unnamed_addr constant [{{[0-9]+}} x i8] c"[[K]]{{\\00}}", section ".llvm.rodata.offloading", align 1
+// CHECK-DAG: @.offloading.entry.[[K]] = internal constant %struct.__tgt_offload_entry { i64 0, i16 1, i16 1, i32 0, ptr @.[[K]].region_id, ptr @.offloading.entry_name.[[K]], i64 0, i64 0, ptr null }, section "llvm_offload_entries", align 8
 
 // CHECK: declare i32 @__tgt_target_kernel(ptr, i64, i32, i32, ptr, ptr)
-// CHECK: declare void @__tgt_register_lib(ptr) local_unnamed_addr
-// CHECK: declare void @__tgt_unregister_lib(ptr) local_unnamed_addr
-
-// CHECK: define{{( dso_local)?}} void @main()
-// CHECK-NEXT: start:
-// CHECK-NEXT: %0 = alloca [8 x i8], align 8
-// CHECK-NEXT: %x = alloca [1024 x i8], align 16
-// CHECK: call void @kernel_1(ptr noalias noundef nonnull align 4 dereferenceable(1024) %x)
-// CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %0)
-// CHECK-NEXT: store ptr %x, ptr %0, align 8
-// CHECK-NEXT: call void asm sideeffect "", "r,~{memory}"(ptr nonnull %0)
-// CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %0)
-// CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 1024, ptr nonnull %x)
-// CHECK-NEXT: ret void
-// CHECK-NEXT: }
-// CHECK: define{{( dso_local)?}} void @kernel_1(ptr noalias noundef align 4 dereferenceable(1024) %x)
+// CHECK-LABEL: define{{( dso_local)?}} void @main()
 // CHECK-NEXT: start:
-// CHECK-NEXT: %EmptyDesc = alloca %struct.__tgt_bin_desc, align 8
+// CHECK-NEXT: %0 = alloca [8 x i8], align 8
+// CHECK-NEXT: %x = alloca [1024 x i8], align 16
 // CHECK-NEXT: %.offload_baseptrs = alloca [1 x ptr], align 8
 // CHECK-NEXT: %.offload_ptrs = alloca [1 x ptr], align 8
 // CHECK-NEXT: %.offload_sizes = alloca [1 x i64], align 8
 // CHECK-NEXT: %kernel_args = alloca %struct.__tgt_kernel_arguments, align 8
-// CHECK-NEXT: %dummy = load volatile ptr, ptr @.offload_sizes._kernel_1, align 8
-// CHECK-NEXT: %dummy1 = load volatile ptr, ptr @.offloading.entry._kernel_1, align 8
-// CHECK-NEXT: call void @llvm.memset.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) %EmptyDesc, i8 0, i64 32, i1 false)
-// CHECK-NEXT: call void @__tgt_register_lib(ptr nonnull %EmptyDesc)
+// CHECK: %dummy = load volatile ptr, ptr @.offload_sizes.[[K]], align 8
+// CHECK-NEXT: %dummy1 = load volatile ptr, ptr @.offloading.entry.[[K]], align 8
 // CHECK-NEXT: call void @__tgt_init_all_rtls()
 // CHECK-NEXT: store ptr %x, ptr %.offload_baseptrs, align 8
 // CHECK-NEXT: store ptr %x, ptr %.offload_ptrs, align 8
 // CHECK-NEXT: store i64 1024, ptr %.offload_sizes, align 8
-// CHECK-NEXT: call void @__tgt_target_data_begin_mapper(ptr nonnull @anon.{{.*}}.1, i64 -1, i32 1, ptr nonnull %.offload_baseptrs, ptr nonnull %.offload_ptrs, ptr nonnull %.offload_sizes, ptr nonnull @.offload_maptypes._kernel_1, ptr null, ptr null)
+// CHECK-NEXT: call void @__tgt_target_data_begin_mapper(ptr nonnull @anon.{{.*}}.1, i64 -1, i32 1, ptr nonnull %.offload_baseptrs, ptr nonnull %.offload_ptrs, ptr nonnull %.offload_sizes, ptr nonnull @.offload_maptypes.[[K]], ptr null, ptr null)
 // CHECK-NEXT: store i32 3, ptr %kernel_args, align 8
-// CHECK-NEXT: %0 = getelementptr inbounds nuw i8, ptr %kernel_args, i64 4
-// CHECK-NEXT: store i32 1, ptr %0, align 4
-// CHECK-NEXT: %1 = getelementptr inbounds nuw i8, ptr %kernel_args, i64 8
-// CHECK-NEXT: store ptr %.offload_baseptrs, ptr %1, align 8
-// CHECK-NEXT: %2 = getelementptr inbounds nuw i8, ptr %kernel_args, i64 16
-// CHECK-NEXT: store ptr %.offload_ptrs, ptr %2, align 8
-// CHECK-NEXT: %3 = getelementptr inbounds nuw i8, ptr %kernel_args, i64 24
-// CHECK-NEXT: store ptr %.offload_sizes, ptr %3, align 8
-// CHECK-NEXT: %4 = getelementptr inbounds nuw i8, ptr %kernel_args, i64 32
-// CHECK-NEXT: store ptr @.offload_maptypes._kernel_1, ptr %4, align 8
-// CHECK-NEXT: %5 = getelementptr inbounds nuw i8, ptr %kernel_args, i64 40
-// CHECK-NEXT: %6 = getelementptr inbounds nuw i8, ptr %kernel_args, i64 72
-// CHECK-NEXT: call void @llvm.memset.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) %5, i8 0, i64 32, i1 false)
-// CHECK-NEXT: store <4 x i32> <i32 256, i32 1, i32 1, i32 32>, ptr %6, align 8
-// CHECK-NEXT: %.fca.1.gep5 = getelementptr inbounds nuw i8, ptr %kernel_args, i64 88
-// CHECK-NEXT: store i32 1, ptr %.fca.1.gep5, align 8
-// CHECK-NEXT: %.fca.2.gep7 = getelementptr inbounds nuw i8, ptr %kernel_args, i64 92
-// CHECK-NEXT: store i32 1, ptr %.fca.2.gep7, align 4
-// CHECK-NEXT: %7 = getelementptr inbounds nuw i8, ptr %kernel_args, i64 96
-// CHECK-NEXT: store i32 0, ptr %7, align 8
-// CHECK-NEXT: %8 = call i32 @__tgt_target_kernel(ptr nonnull @anon.{{.*}}.1, i64 -1, i32 256, i32 32, ptr nonnull @._kernel_1.region_id, ptr nonnull %kernel_args)
-// CHECK-NEXT: call void @__tgt_target_data_end_mapper(ptr nonnull @anon.{{.*}}.1, i64 -1, i32 1, ptr nonnull %.offload_baseptrs, ptr nonnull %.offload_ptrs, ptr nonnull %.offload_sizes, ptr nonnull @.offload_maptypes._kernel_1, ptr null, ptr null)
-// CHECK-NEXT: call void @__tgt_unregister_lib(ptr nonnull %EmptyDesc)
+// CHECK-NEXT: [[P4:%[^ ]+]] = getelementptr inbounds nuw i8, ptr %kernel_args, i64 4
+// CHECK-NEXT: store i32 1, ptr [[P4]], align 4
+// CHECK-NEXT: [[P8:%[^ ]+]] = getelementptr inbounds nuw i8, ptr %kernel_args, i64 8
+// CHECK-NEXT: store ptr %.offload_baseptrs, ptr [[P8]], align 8
+// CHECK-NEXT: [[P16:%[^ ]+]] = getelementptr inbounds nuw i8, ptr %kernel_args, i64 16
+// CHECK-NEXT: store ptr %.offload_ptrs, ptr [[P16]], align 8
+// CHECK-NEXT: [[P24:%[^ ]+]] = getelementptr inbounds nuw i8, ptr %kernel_args, i64 24
+// CHECK-NEXT: store ptr %.offload_sizes, ptr [[P24]], align 8
+// CHECK-NEXT: [[P32:%[^ ]+]] = getelementptr inbounds nuw i8, ptr %kernel_args, i64 32
+// CHECK-NEXT: store ptr @.offload_maptypes.[[K]], ptr [[P32]], align 8
+// CHECK-NEXT: [[P40:%[^ ]+]] = getelementptr inbounds nuw i8, ptr %kernel_args, i64 40
+// CHECK-NEXT: [[P72:%[^ ]+]] = getelementptr inbounds nuw i8, ptr %kernel_args, i64 72
+// CHECK-NEXT: call void @llvm.memset.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[P40]], i8 0, i64 32, i1 false)
+// CHECK-NEXT: store <4 x i32> <i32 256, i32 1, i32 1, i32 32>, ptr [[P72]], align 8
+// CHECK-NEXT: [[P88:%[^ ]+]] = getelementptr inbounds nuw i8, ptr %kernel_args, i64 88
+// CHECK-NEXT: store i32 1, ptr [[P88]], align 8
+// CHECK-NEXT: [[P92:%[^ ]+]] = getelementptr inbounds nuw i8, ptr %kernel_args, i64 92
+// CHECK-NEXT: store i32 1, ptr [[P92]], align 4
+// CHECK-NEXT: [[P96:%[^ ]+]] = getelementptr inbounds nuw i8, ptr %kernel_args, i64 96
+// CHECK-NEXT: store i32 0, ptr [[P96]], align 8
+// CHECK-NEXT: {{%[^ ]+}} = call i32 @__tgt_target_kernel(ptr nonnull @anon.{{.*}}.1, i64 -1, i32 256, i32 32, ptr nonnull @.[[K]].region_id, ptr nonnull %kernel_args)
+// CHECK-NEXT: call void @__tgt_target_data_end_mapper(ptr nonnull @anon.{{.*}}.1, i64 -1, i32 1, ptr nonnull %.offload_baseptrs, ptr nonnull %.offload_ptrs, ptr nonnull %.offload_sizes, ptr nonnull @.offload_maptypes.[[K]], ptr null, ptr null)
+// CHECK: ret void
+// CHECK-NEXT: }
+
+// CHECK: declare void @__tgt_register_lib(ptr) local_unnamed_addr
+// CHECK: declare void @__tgt_unregister_lib(ptr) local_unnamed_addr
+
+// CHECK-LABEL: define internal void @.omp_offloading.descriptor_reg() section ".text.startup" {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: call void @__tgt_register_lib(ptr nonnull @.omp_offloading.descriptor)
+// CHECK-NEXT: %0 = {{(tail )?}}call i32 @atexit(ptr nonnull @.omp_offloading.descriptor_unreg)
+// CHECK-NEXT: ret void
+// CHECK-NEXT: }
+
+// CHECK-LABEL: define internal void @.omp_offloading.descriptor_unreg() section ".text.startup" {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: call void @__tgt_unregister_lib(ptr nonnull @.omp_offloading.descriptor)
 // CHECK-NEXT: ret void
 // CHECK-NEXT: }