Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 0 additions & 13 deletions compiler/rustc_codegen_llvm/src/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -188,19 +188,6 @@ impl<'a, 'll, CX: Borrow<SCx<'ll>>> GenericBuilder<'a, 'll, CX> {
load
}
}

fn memset(&mut self, ptr: &'ll Value, fill_byte: &'ll Value, size: &'ll Value, align: Align) {
unsafe {
llvm::LLVMRustBuildMemSet(
self.llbuilder,
ptr,
align.bytes() as c_uint,
fill_byte,
size,
false,
);
}
}
}

/// Empty string, to be used where LLVM expects an instruction name, indicating
Expand Down
99 changes: 68 additions & 31 deletions compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,6 @@ pub(crate) struct OffloadGlobals<'ll> {
pub launcher_fn: &'ll llvm::Value,
pub launcher_ty: &'ll llvm::Type,

pub bin_desc: &'ll llvm::Type,

pub kernel_args_ty: &'ll llvm::Type,

pub offload_entry_ty: &'ll llvm::Type,
Expand All @@ -31,8 +29,8 @@ pub(crate) struct OffloadGlobals<'ll> {

pub ident_t_global: &'ll llvm::Value,

pub register_lib: &'ll llvm::Value,
pub unregister_lib: &'ll llvm::Value,
// FIXME(offload): Drop this, once we fully automated our offload compilation pipeline, since
// LLVM will initialize them for us if it sees gpu kernels being registered.
pub init_rtls: &'ll llvm::Value,
}

Expand All @@ -44,15 +42,6 @@ impl<'ll> OffloadGlobals<'ll> {
let (begin_mapper, _, end_mapper, mapper_fn_ty) = gen_tgt_data_mappers(cx);
let ident_t_global = generate_at_one(cx);

let tptr = cx.type_ptr();
let ti32 = cx.type_i32();
let tgt_bin_desc_ty = vec![ti32, tptr, tptr, tptr];
let bin_desc = cx.type_named_struct("struct.__tgt_bin_desc");
cx.set_struct_body(bin_desc, &tgt_bin_desc_ty, false);

let reg_lib_decl = cx.type_func(&[cx.type_ptr()], cx.type_void());
let register_lib = declare_offload_fn(&cx, "__tgt_register_lib", reg_lib_decl);
let unregister_lib = declare_offload_fn(&cx, "__tgt_unregister_lib", reg_lib_decl);
let init_ty = cx.type_func(&[], cx.type_void());
let init_rtls = declare_offload_fn(cx, "__tgt_init_all_rtls", init_ty);

Expand All @@ -63,20 +52,83 @@ impl<'ll> OffloadGlobals<'ll> {
OffloadGlobals {
launcher_fn,
launcher_ty,
bin_desc,
kernel_args_ty,
offload_entry_ty,
begin_mapper,
end_mapper,
mapper_fn_ty,
ident_t_global,
register_lib,
unregister_lib,
init_rtls,
}
}
}

// We need to register offload before using it. We also should unregister it once we are done, for
// good measures. Previously we have done so before and after each individual offload intrinsic
// call, but that comes at a performance cost. The repeated (un)register calls might also confuse
// the LLVM ompOpt pass, which tries to move operations to a better location. The easiest solution,
// which we copy from clang, is to just have those two calls once, in the global ctor/dtor section
// of the final binary.
pub(crate) fn register_offload<'ll>(cx: &CodegenCx<'ll, '_>) {
// First we check quickly whether we already have done our setup, in which case we return early.
// Shouldn't be needed for correctness.
if cx.get_function("__tgt_register_lib").is_some() {
return;
}

let reg_lib_decl = cx.type_func(&[cx.type_ptr()], cx.type_void());
let register_lib = declare_offload_fn(&cx, "__tgt_register_lib", reg_lib_decl);
let unregister_lib = declare_offload_fn(&cx, "__tgt_unregister_lib", reg_lib_decl);

let ptr_null = cx.const_null(cx.type_ptr());
let const_struct = cx.const_struct(&[cx.get_const_i32(0), ptr_null, ptr_null, ptr_null], false);
let omp_descriptor =
add_global(cx, ".omp_offloading.descriptor", const_struct, InternalLinkage);
// @.omp_offloading.descriptor = internal constant %__tgt_bin_desc { i32 1, ptr @.omp_offloading.device_images, ptr @__start_llvm_offload_entries, ptr @__stop_llvm_offload_entries }
// @.omp_offloading.descriptor = internal constant %__tgt_bin_desc { i32 0, ptr null, ptr null, ptr null }

let atexit = cx.type_func(&[cx.type_ptr()], cx.type_i32());
let atexit_fn = declare_offload_fn(cx, "atexit", atexit);

let desc_ty = cx.type_func(&[], cx.type_void());
let reg_name = ".omp_offloading.descriptor_reg";
let unreg_name = ".omp_offloading.descriptor_unreg";
let desc_reg_fn = declare_offload_fn(cx, reg_name, desc_ty);
let desc_unreg_fn = declare_offload_fn(cx, unreg_name, desc_ty);
llvm::set_linkage(desc_reg_fn, InternalLinkage);
llvm::set_linkage(desc_unreg_fn, InternalLinkage);
llvm::set_section(desc_reg_fn, c".text.startup");
llvm::set_section(desc_unreg_fn, c".text.startup");

// define internal void @.omp_offloading.descriptor_reg() section ".text.startup" {
// entry:
// call void @__tgt_register_lib(ptr @.omp_offloading.descriptor)
// %0 = call i32 @atexit(ptr @.omp_offloading.descriptor_unreg)
// ret void
// }
let bb = Builder::append_block(cx, desc_reg_fn, "entry");
let mut a = Builder::build(cx, bb);
a.call(reg_lib_decl, None, None, register_lib, &[omp_descriptor], None, None);
a.call(atexit, None, None, atexit_fn, &[desc_unreg_fn], None, None);
a.ret_void();

// define internal void @.omp_offloading.descriptor_unreg() section ".text.startup" {
// entry:
// call void @__tgt_unregister_lib(ptr @.omp_offloading.descriptor)
// ret void
// }
let bb = Builder::append_block(cx, desc_unreg_fn, "entry");
let mut a = Builder::build(cx, bb);
a.call(reg_lib_decl, None, None, unregister_lib, &[omp_descriptor], None, None);
a.ret_void();

// @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 101, ptr @.omp_offloading.descriptor_reg, ptr null }]
let args = vec![cx.get_const_i32(101), desc_reg_fn, ptr_null];
let const_struct = cx.const_struct(&args, false);
let arr = cx.const_array(cx.val_ty(const_struct), &[const_struct]);
add_global(cx, "llvm.global_ctors", arr, AppendingLinkage);
}

pub(crate) struct OffloadKernelDims<'ll> {
num_workgroups: &'ll Value,
threads_per_block: &'ll Value,
Expand Down Expand Up @@ -487,9 +539,6 @@ pub(crate) fn gen_call_handling<'ll, 'tcx>(
let tgt_decl = offload_globals.launcher_fn;
let tgt_target_kernel_ty = offload_globals.launcher_ty;

// %struct.__tgt_bin_desc = type { i32, ptr, ptr, ptr }
let tgt_bin_desc = offload_globals.bin_desc;

let tgt_kernel_decl = offload_globals.kernel_args_ty;
let begin_mapper_decl = offload_globals.begin_mapper;
let end_mapper_decl = offload_globals.end_mapper;
Expand All @@ -513,12 +562,9 @@ pub(crate) fn gen_call_handling<'ll, 'tcx>(
}

// Step 0)
// %struct.__tgt_bin_desc = type { i32, ptr, ptr, ptr }
// %6 = alloca %struct.__tgt_bin_desc, align 8
unsafe {
llvm::LLVMRustPositionBuilderPastAllocas(&builder.llbuilder, builder.llfn());
}
let tgt_bin_desc_alloca = builder.direct_alloca(tgt_bin_desc, Align::EIGHT, "EmptyDesc");

let ty = cx.type_array(cx.type_ptr(), num_args);
// Baseptr are just the input pointer to the kernel, stored in a local alloca
Expand All @@ -536,7 +582,6 @@ pub(crate) fn gen_call_handling<'ll, 'tcx>(
unsafe {
llvm::LLVMPositionBuilderAtEnd(&builder.llbuilder, bb);
}
builder.memset(tgt_bin_desc_alloca, cx.get_const_i8(0), cx.get_const_i64(32), Align::EIGHT);

// Now we allocate once per function param, a copy to be passed to one of our maps.
let mut vals = vec![];
Expand Down Expand Up @@ -574,15 +619,9 @@ pub(crate) fn gen_call_handling<'ll, 'tcx>(
geps.push(gep);
}

let mapper_fn_ty = cx.type_func(&[cx.type_ptr()], cx.type_void());
let register_lib_decl = offload_globals.register_lib;
let unregister_lib_decl = offload_globals.unregister_lib;
let init_ty = cx.type_func(&[], cx.type_void());
let init_rtls_decl = offload_globals.init_rtls;

// FIXME(offload): Later we want to add them to the wrapper code, rather than our main function.
// call void @__tgt_register_lib(ptr noundef %6)
builder.call(mapper_fn_ty, None, None, register_lib_decl, &[tgt_bin_desc_alloca], None, None);
// call void @__tgt_init_all_rtls()
builder.call(init_ty, None, None, init_rtls_decl, &[], None, None);

Expand Down Expand Up @@ -679,6 +718,4 @@ pub(crate) fn gen_call_handling<'ll, 'tcx>(
num_args,
s_ident_t,
);

builder.call(mapper_fn_ty, None, None, unregister_lib_decl, &[tgt_bin_desc_alloca], None, None);
}
4 changes: 4 additions & 0 deletions compiler/rustc_codegen_llvm/src/common.rs
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,10 @@ impl<'ll, CX: Borrow<SCx<'ll>>> GenericCx<'ll, CX> {
pub(crate) fn const_null(&self, t: &'ll Type) -> &'ll Value {
unsafe { llvm::LLVMConstNull(t) }
}

pub(crate) fn const_struct(&self, elts: &[&'ll Value], packed: bool) -> &'ll Value {
struct_in_context(self.llcx(), elts, packed)
}
}

impl<'ll, 'tcx> ConstCodegenMethods for CodegenCx<'ll, 'tcx> {
Expand Down
5 changes: 4 additions & 1 deletion compiler/rustc_codegen_llvm/src/intrinsic.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,9 @@ use tracing::debug;
use crate::abi::FnAbiLlvmExt;
use crate::builder::Builder;
use crate::builder::autodiff::{adjust_activity_to_abi, generate_enzyme_call};
use crate::builder::gpu_offload::{OffloadKernelDims, gen_call_handling, gen_define_handling};
use crate::builder::gpu_offload::{
OffloadKernelDims, gen_call_handling, gen_define_handling, register_offload,
};
use crate::context::CodegenCx;
use crate::declare::declare_raw_fn;
use crate::errors::{
Expand Down Expand Up @@ -1410,6 +1412,7 @@ fn codegen_offload<'ll, 'tcx>(
return;
}
};
register_offload(cx);
let offload_data = gen_define_handling(&cx, &metadata, target_symbol, offload_globals);
gen_call_handling(bx, &offload_data, &args, &types, &metadata, offload_globals, &offload_dims);
}
Expand Down
3 changes: 1 addition & 2 deletions tests/codegen-llvm/gpu_offload/control_flow.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,7 @@

// CHECK: define{{( dso_local)?}} void @main()
// CHECK-NOT: define
// CHECK: %EmptyDesc = alloca %struct.__tgt_bin_desc, align 8
// CHECK-NEXT: %.offload_baseptrs = alloca [1 x ptr], align 8
// CHECK: %.offload_baseptrs = alloca [1 x ptr], align 8
// CHECK-NEXT: %.offload_ptrs = alloca [1 x ptr], align 8
// CHECK-NEXT: %.offload_sizes = alloca [1 x i64], align 8
// CHECK-NEXT: %kernel_args = alloca %struct.__tgt_kernel_arguments, align 8
Expand Down
Loading
Loading