12 changes: 12 additions & 0 deletions compiler/rustc_codegen_llvm/src/attributes.rs
@@ -30,6 +30,14 @@ pub(crate) fn apply_to_callsite(callsite: &Value, idx: AttributePlace, attrs: &[
}
}

pub(crate) fn has_string_attr(llfn: &Value, name: &str) -> bool {
llvm::HasStringAttribute(llfn, name)
}

pub(crate) fn remove_string_attr_from_llfn(llfn: &Value, name: &str) {
llvm::RemoveStringAttrFromFn(llfn, name);
}

/// Get LLVM attribute for the provided inline heuristic.
pub(crate) fn inline_attr<'ll, 'tcx>(
cx: &SimpleCx<'ll>,
@@ -408,6 +416,10 @@ pub(crate) fn llfn_attrs_from_instance<'ll, 'tcx>(
to_add.push(llvm::CreateAttrString(cx.llcx, "no-builtins"));
}

if codegen_fn_attrs.flags.contains(CodegenFnAttrFlags::OFFLOAD_KERNEL) {
to_add.push(llvm::CreateAttrString(cx.llcx, "offload-kernel"));
}

if codegen_fn_attrs.flags.contains(CodegenFnAttrFlags::COLD) {
to_add.push(AttributeKind::Cold.create_attr(cx.llcx));
}
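
Taken together, the two hunks above let the backend tag offload kernels with a plain LLVM string attribute and rediscover them later without relying on symbol names. A minimal sketch of how a later pass consumes the marker (this mirrors the loop added in back/write.rs below; `llfn` is assumed to be some function of the module being optimized):

```rust
let marker = "offload-kernel";
if attributes::has_string_attr(llfn, marker) {
    // The function was flagged via CodegenFnAttrFlags::OFFLOAD_KERNEL; in textual IR the
    // marker shows up in its attribute group, e.g. `attributes #0 = { ... "offload-kernel" }`.
    // ... emit the offload handling for `llfn` ...
}
// The marker is only an internal handshake, so back/write.rs strips it again after use.
attributes::remove_string_attr_from_llfn(llfn, marker);
```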
10 changes: 1 addition & 9 deletions compiler/rustc_codegen_llvm/src/back/lto.rs
Expand Up @@ -26,7 +26,7 @@ use crate::back::write::{
};
use crate::errors::{LlvmError, LtoBitcodeFromRlib};
use crate::llvm::{self, build_string};
use crate::{LlvmCodegenBackend, ModuleLlvm, SimpleCx};
use crate::{LlvmCodegenBackend, ModuleLlvm};

/// We keep track of the computed LTO cache keys from the previous
/// session to determine which CGUs we can reuse.
@@ -601,7 +600,6 @@ pub(crate) fn run_pass_manager(
// We then run the llvm_optimize function a second time, to optimize the code which we generated
// in the enzyme differentiation pass.
let enable_ad = config.autodiff.contains(&config::AutoDiff::Enable);
let enable_gpu = config.offload.contains(&config::Offload::Enable);
let stage = if thin {
write::AutodiffStage::PreAD
} else {
@@ -616,13 +615,6 @@
write::llvm_optimize(cgcx, dcx, module, None, config, opt_level, opt_stage, stage);
}

// Here we only handle the GPU host (=cpu) code.
if enable_gpu && !thin && !cgcx.target_is_like_gpu {
let cx =
SimpleCx::new(module.module_llvm.llmod(), &module.module_llvm.llcx, cgcx.pointer_size);
crate::builder::gpu_offload::handle_gpu_code(cgcx, &cx);
}

if cfg!(feature = "llvm_enzyme") && enable_ad && !thin {
let opt_stage = llvm::OptStage::FatLTO;
let stage = write::AutodiffStage::PostAD;
11 changes: 6 additions & 5 deletions compiler/rustc_codegen_llvm/src/back/write.rs
@@ -43,7 +43,7 @@ use crate::errors::{
use crate::llvm::diagnostic::OptimizationDiagnosticKind::*;
use crate::llvm::{self, DiagnosticInfo};
use crate::type_::llvm_type_ptr;
use crate::{LlvmCodegenBackend, ModuleLlvm, SimpleCx, base, common, llvm_util};
use crate::{LlvmCodegenBackend, ModuleLlvm, SimpleCx, attributes, base, common, llvm_util};

pub(crate) fn llvm_err<'a>(dcx: DiagCtxtHandle<'_>, err: LlvmError<'a>) -> ! {
match llvm::last_error() {
@@ -706,11 +706,12 @@ pub(crate) unsafe fn llvm_optimize(
SimpleCx::new(module.module_llvm.llmod(), module.module_llvm.llcx, cgcx.pointer_size);
// Offload kernels are identified by the "offload-kernel" string attribute, which
// attributes.rs attaches to functions flagged with CodegenFnAttrFlags::OFFLOAD_KERNEL.
for num in 0..9 {
let name = format!("kernel_{num}");
if let Some(kernel) = cx.get_function(&name) {
handle_offload(&cx, kernel);
for func in cx.get_functions() {
let offload_kernel = "offload-kernel";
if attributes::has_string_attr(func, offload_kernel) {
handle_offload(&cx, func);
}
attributes::remove_string_attr_from_llfn(func, offload_kernel);
}
}

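
The `handle_offload` helper invoked above lives elsewhere in this file and is not part of the visible hunks. A rough sketch of how such glue could tie a flagged kernel to the reworked gpu_offload API further down; the `bb`, `args`, `metadata` and `symbol` parameters are assumptions about what that glue has available:

```rust
use rustc_middle::ty::offload_meta::OffloadMetadata;

use crate::SimpleCx;
use crate::builder::gpu_offload::{self, TgtOffloadEntry};
use crate::llvm::{BasicBlock, Value};

// Hypothetical glue, shown only to illustrate how the new helpers fit together.
fn handle_offload_sketch<'ll>(
    cx: &SimpleCx<'ll>,
    kernel: &'ll Value,
    bb: &'ll BasicBlock,
    args: &[&'ll Value],
    metadata: &[OffloadMetadata],
    symbol: &str,
) {
    // Declaration of the %struct.__tgt_offload_entry type used by the offload runtime.
    let offload_entry_ty = TgtOffloadEntry::new_decl(cx);
    // Parameter types of the kernel, needed to pick out the pointer arguments.
    let types = cx.func_params_types(cx.get_type_of_global(kernel));
    // Emit the per-kernel globals (sizes, map types, region id, offload entry) ...
    let (sizes, maptypes, region_id, entry) =
        gpu_offload::gen_define_handling(cx, offload_entry_ty, metadata, &types, symbol);
    // ... and then the launch sequence in the given basic block.
    gpu_offload::gen_call_handling(
        cx, bb, sizes, entry, maptypes, region_id, args, &types, metadata,
    );
}
```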
173 changes: 70 additions & 103 deletions compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs
@@ -2,37 +2,13 @@ use std::ffi::CString;

use llvm::Linkage::*;
use rustc_abi::Align;
use rustc_codegen_ssa::back::write::CodegenContext;
use rustc_codegen_ssa::traits::BaseTypeCodegenMethods;
use rustc_middle::ty::offload_meta::OffloadMetadata;

use crate::builder::SBuilder;
use crate::common::AsCCharPtr;
use crate::llvm::AttributePlace::Function;
use crate::llvm::{self, Linkage, Type, Value};
use crate::{LlvmCodegenBackend, SimpleCx, attributes};

pub(crate) fn handle_gpu_code<'ll>(
_cgcx: &CodegenContext<LlvmCodegenBackend>,
cx: &'ll SimpleCx<'_>,
) {
// The offload memory transfer type for each kernel
let mut memtransfer_types = vec![];
let mut region_ids = vec![];
let offload_entry_ty = TgtOffloadEntry::new_decl(&cx);
// This is a temporary hack, we only search for kernel_0 to kernel_9 functions.
// There is a draft PR in progress which will introduce a proper offload intrinsic to remove
// this limitation.
for num in 0..9 {
let kernel = cx.get_function(&format!("kernel_{num}"));
if let Some(kernel) = kernel {
let (o, k) = gen_define_handling(&cx, kernel, offload_entry_ty, num);
memtransfer_types.push(o);
region_ids.push(k);
}
}

gen_call_handling(&cx, &memtransfer_types, &region_ids);
}
use crate::llvm::{self, BasicBlock, Linkage, Type, Value};
use crate::{SimpleCx, attributes};

// ; Function Attrs: nounwind
// declare i32 @__tgt_target_kernel(ptr, i64, i32, i32, ptr, ptr) #2
@@ -79,7 +55,7 @@ fn generate_at_one<'ll>(cx: &'ll SimpleCx<'_>) -> &'ll llvm::Value {
at_one
}

struct TgtOffloadEntry {
pub(crate) struct TgtOffloadEntry {
// uint64_t Reserved;
// uint16_t Version;
// uint16_t Kind;
@@ -167,7 +143,7 @@ impl KernelArgsTy {
fn new<'ll>(
cx: &'ll SimpleCx<'_>,
num_args: u64,
memtransfer_types: &[&'ll Value],
memtransfer_types: &'ll Value,
geps: [&'ll Value; 3],
) -> [(Align, &'ll Value); 13] {
let four = Align::from_bytes(4).expect("4 Byte alignment should work");
@@ -181,7 +157,7 @@ impl KernelArgsTy {
(eight, geps[0]),
(eight, geps[1]),
(eight, geps[2]),
(eight, memtransfer_types[0]),
(eight, memtransfer_types),
// The next two are debug infos. FIXME(offload): set them
(eight, cx.const_null(cx.type_ptr())), // dbg
(eight, cx.const_null(cx.type_ptr())), // dbg
@@ -256,68 +232,68 @@ pub(crate) fn add_global<'ll>(
// For a given kernel, this function emits the globals the offload runtime needs: the
// per-argument size and map-type (memtransfer) arrays, a region_id global named after the
// kernel, and its __tgt_offload_entry. All four values are returned so that the call-handling
// code below can reference them.
fn gen_define_handling<'ll>(
cx: &'ll SimpleCx<'_>,
kernel: &'ll llvm::Value,
pub(crate) fn gen_define_handling<'ll>(
cx: &SimpleCx<'ll>,
offload_entry_ty: &'ll llvm::Type,
num: i64,
) -> (&'ll llvm::Value, &'ll llvm::Value) {
let types = cx.func_params_types(cx.get_type_of_global(kernel));
metadata: &[OffloadMetadata],
types: &[&Type],
symbol: &str,
) -> (&'ll llvm::Value, &'ll llvm::Value, &'ll llvm::Value, &'ll llvm::Value) {
// It seems like non-pointer values are automatically mapped. So here, we focus on pointer (or
// reference) types.
let num_ptr_types = types
.iter()
.filter(|&x| matches!(cx.type_kind(x), rustc_codegen_ssa::common::TypeKind::Pointer))
.count();

// We do not know their size anymore at this level, so hardcode a placeholder.
// A follow-up pr will track these from the frontend, where we still have Rust types.
// Then, we will be able to figure out that e.g. `&[f32;256]` will result in 4*256 bytes.
// I decided that 1024 bytes is a great placeholder value for now.
add_priv_unnamed_arr(&cx, &format!(".offload_sizes.{num}"), &vec![1024; num_ptr_types]);
let ptr_meta = types.iter().zip(metadata).filter_map(|(&x, meta)| match cx.type_kind(x) {
rustc_codegen_ssa::common::TypeKind::Pointer => Some(meta),
_ => None,
});

// FIXME(Sa4dUs): add `OMP_MAP_TARGET_PARAM = 0x20` only if necessary
let (ptr_sizes, ptr_transfer): (Vec<_>, Vec<_>) =
ptr_meta.map(|m| (m.payload_size, m.mode.bits() | 0x20)).unzip();

let offload_sizes = add_priv_unnamed_arr(&cx, &format!(".offload_sizes.{symbol}"), &ptr_sizes);
// Here we figure out whether something needs to be copied to the gpu (=1), from the gpu (=2),
// or both to and from the gpu (=3). Other values shouldn't affect us for now.
// A non-mutable reference or pointer will be 1, an array that's not read, but fully overwritten
// will be 2. These mode bits now come from the frontend via OffloadMetadata; OMP_MAP_TARGET_PARAM
// (0x20) is OR'd in above to mark each pointer as an explicit kernel argument.
let memtransfer_types = add_priv_unnamed_arr(
&cx,
&format!(".offload_maptypes.{num}"),
&vec![1 + 2 + 32; num_ptr_types],
);
let memtransfer_types =
add_priv_unnamed_arr(&cx, &format!(".offload_maptypes.{symbol}"), &ptr_transfer);

// Next: For each function, generate these three entries: a weak constant,
// the llvm.rodata entry name, and the llvm_offload_entries value.

let name = format!(".kernel_{num}.region_id");
let name = format!(".{symbol}.region_id");
let initializer = cx.get_const_i8(0);
let region_id = add_unnamed_global(&cx, &name, initializer, WeakAnyLinkage);

let c_entry_name = CString::new(format!("kernel_{num}")).unwrap();
let c_entry_name = CString::new(symbol).unwrap();
let c_val = c_entry_name.as_bytes_with_nul();
let offload_entry_name = format!(".offloading.entry_name.{num}");
let offload_entry_name = format!(".offloading.entry_name.{symbol}");

let initializer = crate::common::bytes_in_context(cx.llcx, c_val);
let llglobal = add_unnamed_global(&cx, &offload_entry_name, initializer, InternalLinkage);
llvm::set_alignment(llglobal, Align::ONE);
llvm::set_section(llglobal, c".llvm.rodata.offloading");
let name = format!(".offloading.entry.kernel_{num}");

let name = format!(".offloading.entry.{symbol}");

// See the __tgt_offload_entry documentation above.
let elems = TgtOffloadEntry::new(&cx, region_id, llglobal);

let initializer = crate::common::named_struct(offload_entry_ty, &elems);
let c_name = CString::new(name).unwrap();
let llglobal = llvm::add_global(cx.llmod, offload_entry_ty, &c_name);
llvm::set_global_constant(llglobal, true);
llvm::set_linkage(llglobal, WeakAnyLinkage);
llvm::set_initializer(llglobal, initializer);
llvm::set_alignment(llglobal, Align::EIGHT);
let offload_entry = llvm::add_global(cx.llmod, offload_entry_ty, &c_name);
llvm::set_global_constant(offload_entry, true);
llvm::set_linkage(offload_entry, WeakAnyLinkage);
llvm::set_initializer(offload_entry, initializer);
llvm::set_alignment(offload_entry, Align::EIGHT);
let c_section_name = CString::new("llvm_offload_entries").unwrap();
llvm::set_section(llglobal, &c_section_name);
(memtransfer_types, region_id)
llvm::set_section(offload_entry, &c_section_name);

(offload_sizes, memtransfer_types, region_id, offload_entry)
}
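
For a kernel exported under the symbol `foo` with a single mapped pointer argument (the 1024-byte example from above), the globals emitted by this function would look roughly as follows; the exact constant and unnamed_addr attributes depend on the helpers used and are an assumption:

```rust
// @.offload_sizes.foo         = private constant [1 x i64] [i64 1024]
// @.offload_maptypes.foo      = private constant [1 x i64] [i64 35]
// @.foo.region_id             = weak constant i8 0
// @.offloading.entry_name.foo = internal constant [4 x i8] c"foo\00",
//                               section ".llvm.rodata.offloading", align 1
// @.offloading.entry.foo      = weak constant %struct.__tgt_offload_entry { ... },
//                               section "llvm_offload_entries", align 8
```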

pub(crate) fn declare_offload_fn<'ll>(
fn declare_offload_fn<'ll>(
cx: &'ll SimpleCx<'_>,
name: &str,
ty: &'ll llvm::Type,
Expand All @@ -333,8 +309,7 @@ pub(crate) fn declare_offload_fn<'ll>(
}

// For each kernel *call*, we now use some of our previously declared globals to move data to and from
// the gpu. We don't have a proper frontend yet, so we assume that every call to a kernel function
// from main is intended to run on the GPU. For now, we only handle the data transfer part of it.
// the gpu. For now, we only handle the data transfer part of it.
// If two consecutive kernels use the same memory, we still move it to the host and back to the gpu.
// Since in our frontend users (by default) don't have to specify data transfer, this is something
// we should optimize in the future! We also assume that everything should be copied back and forth,
@@ -352,10 +327,16 @@ pub(crate) fn declare_offload_fn<'ll>(
// 4. set insert point after kernel call.
// 5. generate all the GEPS and stores, to be used in 6)
// 6. generate __tgt_target_data_end calls to move data from the GPU
fn gen_call_handling<'ll>(
cx: &'ll SimpleCx<'_>,
memtransfer_types: &[&'ll llvm::Value],
region_ids: &[&'ll llvm::Value],
pub(crate) fn gen_call_handling<'ll>(
cx: &SimpleCx<'ll>,
bb: &BasicBlock,
offload_sizes: &'ll llvm::Value,
offload_entry: &'ll llvm::Value,
memtransfer_types: &'ll llvm::Value,
region_id: &'ll llvm::Value,
args: &[&'ll Value],
types: &[&Type],
metadata: &[OffloadMetadata],
) {
let (tgt_decl, tgt_target_kernel_ty) = generate_launcher(&cx);
// %struct.__tgt_bin_desc = type { i32, ptr, ptr, ptr }
Expand All @@ -368,27 +349,26 @@ fn gen_call_handling<'ll>(
let tgt_kernel_decl = KernelArgsTy::new_decl(&cx);
let (begin_mapper_decl, _, end_mapper_decl, fn_ty) = gen_tgt_data_mappers(&cx);

let main_fn = cx.get_function("main");
let Some(main_fn) = main_fn else { return };
let kernel_name = "kernel_1";
let call = unsafe {
llvm::LLVMRustGetFunctionCall(main_fn, kernel_name.as_c_char_ptr(), kernel_name.len())
};
let Some(kernel_call) = call else {
return;
};
let kernel_call_bb = unsafe { llvm::LLVMGetInstructionParent(kernel_call) };
let called = unsafe { llvm::LLVMGetCalledValue(kernel_call).unwrap() };
let mut builder = SBuilder::build(cx, kernel_call_bb);

let types = cx.func_params_types(cx.get_type_of_global(called));
let mut builder = SBuilder::build(cx, bb);

// prevent these globals from being optimized away
for val in [offload_sizes, offload_entry] {
unsafe {
let dummy = llvm::LLVMBuildLoad2(
&builder.llbuilder,
llvm::LLVMTypeOf(val),
val,
b"dummy\0".as_ptr() as *const _,
);
llvm::LLVMSetVolatile(dummy, llvm::TRUE);
}
}
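// Note: an alternative to volatile dummy loads would be to append these globals to
// @llvm.compiler.used, which also keeps them alive without emitting loads at runtime.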

let num_args = types.len() as u64;

// Step 0)
// %struct.__tgt_bin_desc = type { i32, ptr, ptr, ptr }
// %6 = alloca %struct.__tgt_bin_desc, align 8
unsafe { llvm::LLVMRustPositionBuilderPastAllocas(builder.llbuilder, main_fn) };

let tgt_bin_desc_alloca = builder.direct_alloca(tgt_bin_desc, Align::EIGHT, "EmptyDesc");

let ty = cx.type_array(cx.type_ptr(), num_args);
@@ -404,15 +384,13 @@ fn gen_call_handling<'ll>(
let a5 = builder.direct_alloca(tgt_kernel_decl, Align::EIGHT, "kernel_args");

// Step 1)
unsafe { llvm::LLVMRustPositionBefore(builder.llbuilder, kernel_call) };
builder.memset(tgt_bin_desc_alloca, cx.get_const_i8(0), cx.get_const_i64(32), Align::EIGHT);

// Now we allocate once per function param, a copy to be passed to one of our maps.
let mut vals = vec![];
let mut geps = vec![];
let i32_0 = cx.get_const_i32(0);
for index in 0..types.len() {
let v = unsafe { llvm::LLVMGetOperand(kernel_call, index as u32).unwrap() };
for &v in args {
let gep = builder.inbounds_gep(cx.type_f32(), v, &[i32_0]);
vals.push(v);
geps.push(gep);
@@ -437,10 +415,8 @@ fn gen_call_handling<'ll>(
let gep2 = builder.inbounds_gep(ty, a2, &[i32_0, idx]);
builder.store(geps[i as usize], gep2, Align::EIGHT);
let gep3 = builder.inbounds_gep(ty2, a4, &[i32_0, idx]);
// As mentioned above, we don't use Rust type information yet. So for now we will just
// assume that we have 1024 bytes, 256 f32 values.
// FIXME(offload): write an offload frontend and handle arbitrary types.
builder.store(cx.get_const_i64(1024), gep3, Align::EIGHT);
builder.store(cx.get_const_i64(metadata[i as usize].payload_size), gep3, Align::EIGHT);
}

// For now we have a very simplistic indexing scheme into our
@@ -482,7 +458,7 @@ fn gen_call_handling<'ll>(

// Step 2)
let s_ident_t = generate_at_one(&cx);
let o = memtransfer_types[0];
let o = memtransfer_types;
let geps = get_geps(&mut builder, &cx, ty, ty2, a1, a2, a4);
generate_mapper_call(&mut builder, &cx, geps, o, begin_mapper_decl, fn_ty, num_args, s_ident_t);
let values = KernelArgsTy::new(&cx, num_args, memtransfer_types, geps);
@@ -501,16 +477,11 @@ fn gen_call_handling<'ll>(
// FIXME(offload): Don't hardcode the numbers of threads in the future.
cx.get_const_i32(2097152),
cx.get_const_i32(256),
region_ids[0],
region_id,
a5,
];
let offload_success = builder.call(tgt_target_kernel_ty, tgt_decl, &args, None);
builder.call(tgt_target_kernel_ty, tgt_decl, &args, None);
// %41 = call i32 @__tgt_target_kernel(ptr @1, i64 -1, i32 2097152, i32 256, ptr @.kernel_1.region_id, ptr %kernel_args)
unsafe {
let next = llvm::LLVMGetNextInstruction(offload_success).unwrap();
llvm::LLVMRustPositionAfter(builder.llbuilder, next);
llvm::LLVMInstructionEraseFromParent(next);
}

// Step 4)
let geps = get_geps(&mut builder, &cx, ty, ty2, a1, a2, a4);
Expand All @@ -519,8 +490,4 @@ fn gen_call_handling<'ll>(
builder.call(mapper_fn_ty, unregister_lib_decl, &[tgt_bin_desc_alloca], None);

drop(builder);
// FIXME(offload) The issue is that we right now add a call to the gpu version of the function,
// and then delete the call to the CPU version. In the future, we should use an intrinsic which
// directly resolves to a call to the GPU version.
unsafe { llvm::LLVMDeleteFunction(called) };
}
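
Taken as a whole, the launch sequence that gen_call_handling now emits into the caller-supplied basic block boils down to the following libomptarget calls (a sketch; argument details and the data_end mapper call are only partially visible in the hunks above, and `<symbol>` stands for the kernel's exported name):

```rust
// call void @__tgt_target_data_begin_mapper(...)   ; step 2: map/copy pointer args to the device
// %r = call i32 @__tgt_target_kernel(ptr @1, i64 -1, i32 2097152, i32 256,
//                                    ptr @.<symbol>.region_id, ptr %kernel_args)
// call void @__tgt_target_data_end_mapper(...)     ; step 4: copy mapped data back to the host
// call void @__tgt_unregister_lib(ptr %EmptyDesc)
```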