Skip to content

[clang][AMDGPU] Enable module splitting by default #128509

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Mar 24, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions clang/include/clang/Driver/Options.td
Original file line number Diff line number Diff line change
Expand Up @@ -1393,6 +1393,8 @@ def fhip_emit_relocatable : Flag<["-"], "fhip-emit-relocatable">,
HelpText<"Compile HIP source to relocatable">;
def fno_hip_emit_relocatable : Flag<["-"], "fno-hip-emit-relocatable">,
HelpText<"Do not override toolchain to compile HIP source to relocatable">;
// Driver option "--flto-partitions=<N>": requested number of partitions for
// parallel full-LTO codegen. Placed in hip_Group; the value is a joined string
// that the AMDGPU toolchain code reads through OPT_flto_partitions_EQ.
def flto_partitions_EQ : Joined<["--"], "flto-partitions=">, Group<hip_Group>,
HelpText<"Number of partitions to use for parallel full LTO codegen. Use 1 to disable partitioning.">;
Comment on lines +1396 to +1397
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we added a top level option for this it probably should've been made to work for non-AMDGPU uses.

}

// Clang specific/exclusive options for OpenACC.
Expand Down
34 changes: 32 additions & 2 deletions clang/lib/Driver/ToolChains/AMDGPU.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -630,8 +630,11 @@ void amdgpu::Linker::ConstructJob(Compilation &C, const JobAction &JA,
getToolChain().AddFilePathLibArgs(Args, CmdArgs);
AddLinkerInputs(getToolChain(), Inputs, Args, CmdArgs, JA);
if (C.getDriver().isUsingLTO()) {
addLTOOptions(getToolChain(), Args, CmdArgs, Output, Inputs[0],
C.getDriver().getLTOMode() == LTOK_Thin);
const bool ThinLTO = (C.getDriver().getLTOMode() == LTOK_Thin);
addLTOOptions(getToolChain(), Args, CmdArgs, Output, Inputs[0], ThinLTO);

if (!ThinLTO)
addFullLTOPartitionOption(C.getDriver(), Args, CmdArgs);
} else if (Args.hasArg(options::OPT_mcpu_EQ)) {
CmdArgs.push_back(Args.MakeArgString(
"-plugin-opt=mcpu=" +
Expand Down Expand Up @@ -708,6 +711,33 @@ void amdgpu::getAMDGPUTargetFeatures(const Driver &D,
options::OPT_m_amdgpu_Features_Group);
}

/// Resolve the number of full-LTO partitions requested on the command line.
///
/// Looks at the last occurrence of the flto-partitions option in \p Args.
/// \param D    Driver used to report an invalid option value.
/// \param Args Parsed driver arguments.
/// \returns 8 when the option is absent; the parsed value when it is an
///          integer >= 1; otherwise 1, after emitting
///          err_drv_invalid_int_value through \p D.
static unsigned getFullLTOPartitions(const Driver &D, const ArgList &Args) {
const Arg *A = Args.getLastArg(options::OPT_flto_partitions_EQ);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should be getLastArgValue with a default of "8".

// In the absence of an option, use 8 as the default.
if (!A)
return 8;
int Value = 0;
// A true result from getAsInteger is handled as a parse failure; values
// below 1 are rejected through the same diagnostic path.
if (StringRef(A->getValue()).getAsInteger(10, Value) || (Value < 1)) {
D.Diag(diag::err_drv_invalid_int_value)
<< A->getAsString(Args) << A->getValue();
// Fall back to a single partition once the error has been reported.
return 1;
}

// Value is known to be >= 1 here, so the conversion to unsigned is lossless.
return Value;
}

/// Forward the full-LTO partition count to the linker command line.
///
/// Appends "--lto-partitions=<N>" to \p CmdArgs when the partition count
/// resolved from \p Args is greater than 1; otherwise \p CmdArgs is left
/// untouched.
void amdgpu::addFullLTOPartitionOption(const Driver &D,
                                       const llvm::opt::ArgList &Args,
                                       llvm::opt::ArgStringList &CmdArgs) {
  // TODO: Should this be restricted to -fgpu-rdc only? Currently it is also
  // applied to non-gpu-rdc LTO.
  const unsigned PartitionCount = getFullLTOPartitions(D, Args);

  // A count of 1 means no partitioning, so there is nothing to pass along.
  if (PartitionCount <= 1)
    return;

  const Twine Flag = "--lto-partitions=" + Twine(PartitionCount);
  CmdArgs.push_back(Args.MakeArgString(Flag));
}

/// AMDGPU Toolchain
AMDGPUToolChain::AMDGPUToolChain(const Driver &D, const llvm::Triple &Triple,
const ArgList &Args)
Expand Down
2 changes: 2 additions & 0 deletions clang/lib/Driver/ToolChains/AMDGPU.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@ void getAMDGPUTargetFeatures(const Driver &D, const llvm::Triple &Triple,
const llvm::opt::ArgList &Args,
std::vector<StringRef> &Features);

/// Append the "--lto-partitions=<N>" linker argument to \p CmdArgs when the
/// partition count derived from \p Args is greater than 1. \p D is used to
/// report an invalid option value.
void addFullLTOPartitionOption(const Driver &D, const llvm::opt::ArgList &Args,
llvm::opt::ArgStringList &CmdArgs);
} // end namespace amdgpu
} // end namespace tools

Expand Down
2 changes: 2 additions & 0 deletions clang/lib/Driver/ToolChains/HIPAMD.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,8 @@ void AMDGCN::Linker::constructLldCommand(Compilation &C, const JobAction &JA,

addLinkerCompressDebugSectionsOption(TC, Args, LldArgs);

amdgpu::addFullLTOPartitionOption(D, Args, LldArgs);

// Given that host and device linking happen in separate processes, the device
// linker doesn't always have the visibility as to which device symbols are
// needed by a program, especially for the device symbol dependencies that are
Expand Down
20 changes: 18 additions & 2 deletions clang/test/Driver/amdgpu-toolchain.c
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,12 @@
// AS_LINK_UR: ld.lld{{.*}} "--no-undefined"{{.*}} "--unresolved-symbols=ignore-all"

// RUN: %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx90a:xnack+:sramecc- -nogpulib \
// RUN: -L. -flto -fconvergent-functions %s 2>&1 | FileCheck -check-prefixes=LTO,MCPU %s
// RUN: -L. -flto -fconvergent-functions %s 2>&1 | FileCheck -check-prefix=LTO %s
// LTO: clang{{.*}} "-flto=full"{{.*}}"-fconvergent-functions"
// LTO: ld.lld{{.*}}"-L."{{.*}}"-plugin-opt=mcpu=gfx90a"{{.*}}"--lto-partitions={{[0-9]+}}"{{.*}}"-plugin-opt=-mattr=-sramecc,+xnack"

// RUN: %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx90a:xnack+:sramecc- -nogpulib \
// RUN: -L. -fconvergent-functions %s 2>&1 | FileCheck -check-prefix=MCPU %s
// LTO: clang{{.*}} "-flto=full"{{.*}}"-fconvergent-functions"
// MCPU: ld.lld{{.*}}"-L."{{.*}}"-plugin-opt=mcpu=gfx90a"{{.*}}"-plugin-opt=-mattr=-sramecc,+xnack"

// RUN: %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx906 -nogpulib \
Expand All @@ -36,3 +38,17 @@
// RUN: %clang -target amdgcn-amd-amdhsa -march=gfx90a -stdlib -startfiles \
// RUN: -nogpulib -nogpuinc -### %s 2>&1 | FileCheck -check-prefix=STARTUP %s
// STARTUP: ld.lld{{.*}}"-lc" "-lm" "{{.*}}crt1.o"

// Check --flto-partitions

// RUN: %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx90a -nogpulib \
// RUN: -L. -flto --flto-partitions=42 %s 2>&1 | FileCheck -check-prefix=LTO_PARTS %s
// LTO_PARTS: ld.lld{{.*}}"-L."{{.*}}"-plugin-opt=mcpu=gfx90a"{{.*}}"--lto-partitions=42"

// RUN: not %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx90a -nogpulib \
// RUN: -L. -flto --flto-partitions=a %s 2>&1 | FileCheck -check-prefix=LTO_PARTS_INV0 %s
// LTO_PARTS_INV0: clang: error: invalid integral value 'a' in '--flto-partitions=a'

// RUN: not %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx90a -nogpulib \
// RUN: -L. -flto --flto-partitions=0 %s 2>&1 | FileCheck -check-prefix=LTO_PARTS_INV1 %s
// LTO_PARTS_INV1: clang: error: invalid integral value '0' in '--flto-partitions=0'
35 changes: 35 additions & 0 deletions clang/test/Driver/hip-toolchain-rdc-flto-partitions.hip
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
// RUN: %clang -### --target=x86_64-linux-gnu \
// RUN: -x hip --cuda-gpu-arch=gfx803 --flto-partitions=42 \
// RUN: --no-offload-new-driver --emit-static-lib -nogpulib \
// RUN: -fuse-ld=lld -B%S/Inputs/lld -fgpu-rdc -nogpuinc \
// RUN: %S/Inputs/hip_multiple_inputs/a.cu \
// RUN: %S/Inputs/hip_multiple_inputs/b.hip \
// RUN: 2>&1 | FileCheck %s --check-prefix=FIXED-PARTS

// FIXED-PARTS-NOT: "*.llvm-link"
// FIXED-PARTS-NOT: ".*opt"
// FIXED-PARTS-NOT: ".*llc"
// FIXED-PARTS: [[LLD: ".*lld.*"]] {{.*}} "-plugin-opt=-amdgpu-internalize-symbols"
// FIXED-PARTS-SAME: "-plugin-opt=mcpu=gfx803"
// FIXED-PARTS-SAME: "--lto-partitions=42"
// FIXED-PARTS-SAME: "-o" "{{.*out}}" "{{.*bc}}"

// RUN: not %clang -### --target=x86_64-linux-gnu \
// RUN: -x hip --cuda-gpu-arch=gfx803 --flto-partitions=a \
// RUN: --no-offload-new-driver --emit-static-lib -nogpulib \
// RUN: -fuse-ld=lld -B%S/Inputs/lld -fgpu-rdc -nogpuinc \
// RUN: %S/Inputs/hip_multiple_inputs/a.cu \
// RUN: %S/Inputs/hip_multiple_inputs/b.hip \
// RUN: 2>&1 | FileCheck %s --check-prefix=LTO_PARTS_INV0

// LTO_PARTS_INV0: clang: error: invalid integral value 'a' in '--flto-partitions=a'

// RUN: not %clang -### --target=x86_64-linux-gnu \
// RUN: -x hip --cuda-gpu-arch=gfx803 --flto-partitions=0 \
// RUN: --no-offload-new-driver --emit-static-lib -nogpulib \
// RUN: -fuse-ld=lld -B%S/Inputs/lld -fgpu-rdc -nogpuinc \
// RUN: %S/Inputs/hip_multiple_inputs/a.cu \
// RUN: %S/Inputs/hip_multiple_inputs/b.hip \
// RUN: 2>&1 | FileCheck %s --check-prefix=LTO_PARTS_INV1

// LTO_PARTS_INV1: clang: error: invalid integral value '0' in '--flto-partitions=0'
2 changes: 2 additions & 0 deletions clang/test/Driver/hip-toolchain-rdc-static-lib.hip
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@
// CHECK-NOT: ".*llc"
// CHECK: [[LLD: ".*lld.*"]] {{.*}} "-plugin-opt=-amdgpu-internalize-symbols"
// CHECK-SAME: "-plugin-opt=mcpu=gfx803"
// CHECK-SAME: "--lto-partitions={{[0-9]+}}"
// CHECK-SAME: "-o" "[[IMG_DEV1:.*out]]" [[A_BC1]] [[B_BC1]]

// generate image for device side path on gfx900
Expand Down Expand Up @@ -77,6 +78,7 @@
// CHECK-NOT: ".*llc"
// CHECK: [[LLD]] {{.*}} "-plugin-opt=-amdgpu-internalize-symbols"
// CHECK-SAME: "-plugin-opt=mcpu=gfx900"
// CHECK-SAME: "--lto-partitions={{[0-9]+}}"
// CHECK-SAME: "--whole-archive"
// CHECK-SAME: "-o" "[[IMG_DEV2:.*out]]" [[A_BC2]] [[B_BC2]]
// CHECK-SAME: "--no-whole-archive"
Expand Down
1 change: 1 addition & 0 deletions clang/test/Driver/hip-toolchain-rdc.hip
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,7 @@
// CHECK-NOT: ".*llc"
// CHECK: {{".*lld.*"}} {{.*}} "-plugin-opt=-amdgpu-internalize-symbols"
// CHECK-SAME: "-plugin-opt=mcpu=gfx900"
// CHECK-SAME: "--lto-partitions={{[0-9]+}}"
// CHECK-SAME: "-o" "[[IMG_DEV2:.*.out]]" [[A_BC2]] [[B_BC2]]

// combine images generated into hip fat binary object
Expand Down
Loading