lld hangs targeting amdgpu and so does opt using amdgpu-attributor pass #58639

pozulp · 2022-10-26T21:45:44Z

The following command hangs for me: opt --amdgpu-attributor hang.ll -o foo.bc, where hang.ll is:

; ModuleID = 'reduced.ll'
source_filename = "reduced.ll"
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
target triple = "amdgcn-amd-amdhsa"

%0 = type { %1, i32, i32, i32, %2, i32, [4 x i8], %3, i32, i32, double, i8, i8, %4, [3 x [3 x double]], %5*, %22* }
%1 = type { i32 (...)** }
%2 = type { %1, double, double, double, double, double, double, double, double }
%3 = type <{ i8*, i64, i64, i32, [4 x i8] }>
%4 = type { double, double, double }
%5 = type { %3, i8, i32, %6, %7, i32, i32, i32, i32, double, i8, i8, i8, %4, [3 x [3 x double]], double, double, double, i32, [4 x i8], %3, i32, i32, i8, i32, %8, i32, %0*, %3, i8*, double (double, double, double, double, double, double, double, double, double)*, %10* }
%6 = type { i32, double, double, double }
%7 = type { i32, double* }
%8 = type { %1, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, %3, %3, %3, %3, %3, %3, %3, %3, %3, %3, %3, %3, %3, %3, %3, %3, %3, %3, %3, %3, %3, %3, %3, %3, %3, %9, %9, %9, %9, %9, %9, %9, %9, %9, %9 }
%9 = type <{ double*, i64, i64, i32, [4 x i8] }>
%10 = type { i64, %11* }
%11 = type { %12, i8*, i64, i64, void (%10*)*, i64, %10* (%10*, i8*)*, i32 (%10*, i8*, %10*)*, %13*, %10* (%10*)*, %14*, %15*, %16*, i64 (%10*)*, %10* (%10*, %10*, %10*)*, %10* (%10*)*, %10* (%10*, %10*)*, i32 (%10*, %10*, %10*)*, %17*, i64, i8*, i32 (%10*, i32 (%10*, i8*)*, i8*)*, i32 (%10*)*, %10* (%10*, %10*, i32)*, i64, %10* (%10*)*, %10* (%10*)*, %19*, %20*, %21*, %11*, %10*, %10* (%10*, %10*, %10*)*, i32 (%10*, %10*, %10*)*, i64, i32 (%10*, %10*, %10*)*, %10* (%11*, i64)*, %10* (%11*, %10*, %10*)*, void (i8*)*, i32 (%10*)*, %10*, %10*, %10*, %10*, %10*, void (%10*)*, i32, void (%10*)*, %10* (%10*, %10**, i64, %10*)* }
%12 = type { %10, i64 }
%13 = type { %10* (%10*)*, %10* (%10*)*, %10* (%10*)* }
%14 = type { %10* (%10*, %10*)*, %10* (%10*, %10*)*, %10* (%10*, %10*)*, %10* (%10*, %10*)*, %10* (%10*, %10*)*, %10* (%10*, %10*, %10*)*, %10* (%10*)*, %10* (%10*)*, %10* (%10*)*, i32 (%10*)*, %10* (%10*)*, %10* (%10*, %10*)*, %10* (%10*, %10*)*, %10* (%10*, %10*)*, %10* (%10*, %10*)*, %10* (%10*, %10*)*, %10* (%10*)*, i8*, %10* (%10*)*, %10* (%10*, %10*)*, %10* (%10*, %10*)*, %10* (%10*, %10*)*, %10* (%10*, %10*)*, %10* (%10*, %10*, %10*)*, %10* (%10*, %10*)*, %10* (%10*, %10*)*, %10* (%10*, %10*)*, %10* (%10*, %10*)*, %10* (%10*, %10*)*, %10* (%10*, %10*)*, %10* (%10*, %10*)*, %10* (%10*, %10*)*, %10* (%10*, %10*)*, %10* (%10*)*, %10* (%10*, %10*)*, %10* (%10*, %10*)* }
%15 = type { i64 (%10*)*, %10* (%10*, %10*)*, %10* (%10*, i64)*, %10* (%10*, i64)*, i8*, i32 (%10*, i64, %10*)*, i8*, i32 (%10*, %10*)*, %10* (%10*, %10*)*, %10* (%10*, i64)* }
%16 = type { i64 (%10*)*, %10* (%10*, %10*)*, i32 (%10*, %10*, %10*)* }
%17 = type { i32 (%10*, %18*, i32)*, void (%10*, %18*)* }
%18 = type { i8*, %10*, i64, i64, i32, i32, i8*, i64*, i64*, i64*, i8* }
%19 = type { i8*, %10* (%10*, %10*)*, i32, i8* }
%20 = type { i8*, i32, i64, i32, i8* }
%21 = type { i8*, %10* (%10*, i8*)*, i32 (%10*, %10*, i8*)*, i8*, i8* }
%22 = type { %3, i32, %23, %24, i32, i32, %3 }
%23 = type { i32, i64, i64, i64 }
%24 = type { i64, i64* }

; Loads the %5* stored in field 15 of the %0 object that %arg points to and
; passes it to @baz. The result of @baz is unused; the function always
; returns false.
define internal fastcc i1 @widget(%0* %arg) {
bb:
  %tmp = getelementptr inbounds %0, %0* %arg, i64 0, i32 15
  %tmp1 = load %5*, %5** %tmp, align 8
  ; @baz in turn calls @widget, so the two functions are mutually recursive.
  %tmp2 = call fastcc double @baz(%5* %tmp1)
  ret i1 false
}

; Loads a function pointer from field 30 of the %5 object at %arg and calls it
; indirectly, then enters a loop with no exit (bb5 branches only to itself)
; that repeatedly loads the %0* in field 27 and calls @widget on it. No path
; through this function reaches a return.
define internal fastcc double @baz(%5* %arg) {
bb:
  %tmp = getelementptr inbounds %5, %5* %arg, i64 0, i32 30
  %tmp1 = load double (double, double, double, double, double, double, double, double, double)*, double (double, double, double, double, double, double, double, double, double)** %tmp, align 8
  ; Indirect call through the loaded pointer; the callee is not known statically.
  %tmp2 = tail call double %tmp1(double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00)
  br label %bb3

bb3:                                              ; preds = %bb
  %tmp4 = getelementptr inbounds %5, %5* %arg, i64 0, i32 27
  br label %bb5

bb5:                                              ; preds = %bb5, %bb3
  ; The pointer is reloaded from the same address on every iteration.
  %tmp6 = load %0*, %0** %tmp4, align 8
  %tmp7 = call fastcc i1 @widget(%0* %tmp6)
  br label %bb5
}

The text was updated successfully, but these errors were encountered:

llvmbot · 2022-10-26T21:45:59Z

@llvm/issue-subscribers-backend-amdgpu

jdoerfert · 2022-10-26T23:58:47Z

Are you sure llvm-reduce didn't break it? This is all dead code, it's just deleted: https://godbolt.org/z/4n56Gvf36

arsenm · 2022-10-27T17:45:54Z

It's still broken whether or not the code is functional, but the deadness is a red herring. If there is a real use, I still observe the hang / stack overflow. I cut down the test slightly, but I don't see this reproduce with tip of tree. Probably need to bisect this to see if it was deliberately fixed

target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
target triple = "amdgcn-amd-amdhsa"

%0 = type { double()*, %0* }

; Loads the %0* stored in field 1 of the %0 object at %arg (the type's
; self-referential pointer) and passes it to @baz. The result of @baz is
; unused; the function always returns false.
define internal fastcc i1 @widget(%0* %arg) {
bb:
  %tmp = getelementptr inbounds %0, %0* %arg, i64 0, i32 1
  %tmp1 = load %0*, %0** %tmp, align 8
  ; @baz in turn calls @widget, so the two functions are mutually recursive.
  %tmp2 = call fastcc double @baz(%0* %tmp1)
  ret i1 false
}

; Loads the function pointer in field 0 of the %0 object at %arg and calls it
; indirectly, then enters a loop with no exit (bb5 branches only to itself)
; that repeatedly loads the %0* in field 1 and calls @widget on it. No path
; through this function reaches a return.
define internal fastcc double @baz(%0* %arg) {
bb:
  %tmp = getelementptr inbounds %0, %0* %arg, i64 0, i32 0
  %tmp1 = load double ()*, double ()** %tmp, align 8
  ; Indirect call through the loaded pointer; the callee is not known statically.
  %tmp2 = tail call double %tmp1()
  br label %bb3

bb3:                                              ; preds = %bb
  %tmp4 = getelementptr inbounds %0, %0* %arg, i64 0, i32 1
  br label %bb5

bb5:                                              ; preds = %bb5, %bb3
  ; The pointer is reloaded from the same address on every iteration.
  %tmp6 = load %0*, %0** %tmp4, align 8
  %tmp7 = call fastcc i1 @widget(%0* %tmp6)
  br label %bb5
}

; Kernel entry point that gives the internal function @baz a caller: it
; allocates a %0 in address space 5 (the alloca address space, per `A5` in the
; datalayout), casts the pointer to the default address space, and calls @baz
; on it. The allocated object is never initialized before the call.
define amdgpu_kernel void @entry() {
  %alloca = alloca %0, align 8, addrspace(5)
  %cast = addrspacecast %0 addrspace(5)* %alloca to %0*
  %arst = call double @baz(%0* %cast)
  ret void
}

pozulp · 2022-10-31T19:33:13Z

I bisected with @arsenm and we found that @jdoerfert fixed the hang in bf789b1. Below I describe how I created this reproducer and bisected.

My llvm installation

I am using rocm to target mi250x on linux. My code is written in c++ and uses hip. I can build with rocm 5.2.3, but when I try 5.3.0 my lld process hangs.

Making a reproducer

I attached gdb to the lld process. I cut off the stacktrace at 1 million frames. Many were llvm::AA::getAssumedUnderlyingObjects. I re-ran with --save-temps. The last bitcode file emitted was precodegen.bc. I ran opt --amdgpu-attributor precodegen.bc which reproduced the hang.

Minimizing the reproducer

My reproducer was almost 400,000 lines long (llvm-dis < precodegen.bc|wc -l). I wrote this "test" script [1] which I called repro.sh

#!/bin/bash -x
# Interestingness test for llvm-reduce.
#
# Runs `opt --amdgpu-attributor` on the given input under a 10-second timeout.
# Exits 0 only when opt was killed by the timeout (the input still reproduces
# the hang); exits 1 when opt finished normally or failed in any other way.
#
# Arguments: passed through to opt unchanged (the input file to test).

timeout 10 opt --amdgpu-attributor "$@" -o foo.bc
# Save the status right away: every later command, including the `[` test of
# the `if` below, overwrites $?. Re-reading $? in the `elif` would yield the
# status of the failed `[ ... -eq 124 ]` test instead of opt's status.
status=$?
if [ "$status" -eq 124 ]; then
  # timeout(1) exits with 124 when the command ran out of time.
  echo "TIMED OUT"
  exit 0
elif [ "$status" -eq 0 ]; then
  echo "SUCCESS"
  exit 1
else
  echo "INVALID"
  exit 1
fi

and ran llvm-reduce --test=./repro.sh precodegen.bc --write-tmp-files-as-bitcode. (I first ran without --write-tmp-files-as-bitcode and the process terminated in 2 seconds. This was suspicious because I set a timeout of 10 seconds. In the output I saw opt failed with the error 'wrong number of indexes'. Switching from text to bitcode for intermediate files fixed the problem.)

After about 1 hour, llvm-reduce terminated. It output a reduced.ll that was only 51 lines, an 8000x improvement! I ran opt -strip -metarenamer -instnamer reduced.ll -S -o hang.ll to create the minimal reproducer that I posted when I opened this issue.

Running git bisect

Thus far I had only run tools in my rocm install's llvm/bin directory. As @jdoerfert noted, upstream opt does not hang. I built the default branch (amd-stg-open) of RadeonOpenCompute's llvm-project fork and opt did not hang. I built the rocm-5.3.x branch and opt hung. Thus, there is a commit on amd-stg-open which fixes the hang. Here is how I ran git bisect to determine said commit

I created a build space (because each step of the bisection needs to build and run opt).

# Clone the RadeonOpenCompute fork and configure a Debug build (assertions
# enabled, X86 and AMDGPU targets) in a separate build_llvm directory.
git clone https://github.com/RadeonOpenCompute/llvm-project.git amd-llvm-project
# -p: do not fail when the build directory is left over from an earlier run.
mkdir -p build_llvm
pushd build_llvm
targets='X86;AMDGPU'
# "$targets" is quoted so the ';'-separated list always reaches cmake as a
# single argument.
time \
cmake \
    -G Ninja \
    -DCMAKE_BUILD_TYPE=Debug \
    -DCMAKE_C_COMPILER=gcc \
    -DCMAKE_CXX_COMPILER=g++ \
    -DLLVM_USE_LINKER=gold \
    -DLLVM_TARGETS_TO_BUILD="$targets" \
    -DLLVM_PARALLEL_LINK_JOBS=18 \
    -DLLVM_ENABLE_ASSERTIONS=ON \
    ../amd-llvm-project/llvm 2>&1 | tee cmake_amdgpubackend.out
popd

I started the bisection with

# `git bisect start <bad> <good>`: the roles are deliberately inverted because
# the goal is to find the fix, not the bug. origin/amd-stg-open (opt does not
# hang) is marked "bad" and origin/rocm-5.3.x (opt hangs) is marked "good".
git -C amd-llvm-project bisect start origin/amd-stg-open origin/rocm-5.3.x
# Submit the Slurm job that runs the automated bisection.
sbatch sbatch1.sh

where sbatch1.sh is the slurm batch script

#!/bin/bash
#SBATCH -t 4:0:0
#SBATCH -N 1 --exclusive
#SBATCH -J bisect
#SBATCH -e /path/to/rundir/sbatch1-%A.e
#SBATCH -o /path/to/rundir/sbatch1-%A.o

# Slurm batch job that drives the automated bisection: runs `git bisect run`
# in the amd-llvm-project checkout with bisect.sh as the per-commit test.

# Abort instead of running git from an unexpected directory.
cd /path/to/rundir/ || exit 1
# Stay in the run directory: `git -C amd-llvm-project` already switches into
# the checkout. Changing into amd-llvm-project first would make -C look for a
# nested amd-llvm-project/amd-llvm-project directory and fail.
git -C amd-llvm-project bisect run ./bisect.sh

and bisect.sh builds and runs opt

#!/bin/bash
# Per-commit test for `git bisect run`: builds opt at the currently checked
# out commit and runs it on hang.ll under a 10-second timeout.
#
# Exit status (as interpreted by git bisect run):
#   0   "good" - opt timed out, i.e. the hang is still present
#   1   "bad"  - opt finished (hang fixed) or failed in some other way
#   125 skip   - opt could not be built at this commit
#   255 abort  - the build directory is missing

# A status above 127 makes git bisect run stop instead of recording a verdict.
cd /path/to/rundir || exit 255
cd build_llvm || exit 255
# If the build fails, ./bin/opt is stale or missing and the result would be
# meaningless, so ask git bisect to skip this commit.
time ninja opt || exit 125
timeout 10 ./bin/opt --amdgpu-attributor ../hang.ll -o foo.bc
# Save the status right away: every later command, including the `[` test of
# the `if` below, overwrites $?.
status=$?
# Timing out is "good" because I am looking for the fix not the bug
# good: 0
if [ "$status" -eq 124 ]; then
  echo "TIMED OUT, good, exit 0"
  exit 0
elif [ "$status" -eq 0 ]; then
  echo "NO TIME OUT, bad, exit 1"
  exit 1
else
  echo "INVALID, bad, exit 1"
  exit 1
fi

I ran the job on a system with 2 18-core broadwell sockets and 128 GB of DRAM per node. The job finished after about 2 hours. I ran git -C amd-llvm-project bisect log and saw

# bad: [092984937f2778f7e5440e69788a1d9542f7f0fc] merge main into amd-stg-open
# good: [3cf23f77f8208174a2ee7c616f4be23674d7b081] [SROA] Try harder to find a vector promotion viable type when rewriting
git bisect start 'origin/amd-stg-open' 'origin/rocm-5.3.x'
# good: [34b6327ae8a19989a631f1842b5ce6dd3cab7c92] merge main into amd-stg-open
git bisect good 34b6327ae8a19989a631f1842b5ce6dd3cab7c92
# bad: [86bc4587e1fdb7b1b90eadc138619f5e3f2dd6fd] Use std::clamp (NFC)
git bisect bad 86bc4587e1fdb7b1b90eadc138619f5e3f2dd6fd
# bad: [4d9251bd780d20eebbcb124608b36a69787d5575] [C++20] [Modules] Merge same concept decls in global module fragment
git bisect bad 4d9251bd780d20eebbcb124608b36a69787d5575
# good: [3139cc766c86b09426893a7349763c347639cbdc] [mlir][Linalg] Add a pattern to decompose `linalg.generic` ops.
git bisect good 3139cc766c86b09426893a7349763c347639cbdc
# bad: [44f81dfba407c82589abbb5867714ad030d1b80c] Remove references to old mailing lists that have moved to discourse. Replace with links to discourse.
git bisect bad 44f81dfba407c82589abbb5867714ad030d1b80c
# bad: [3f73c5793515867935d59ff8c511c61ace848e79] Argument name support for function pointer signature hints
git bisect bad 3f73c5793515867935d59ff8c511c61ace848e79
# good: [8aad330eebc0b9cfd8dd00e8ed692cb89e7577df] [libc] Fix API for remove_{prefix, suffix}
git bisect good 8aad330eebc0b9cfd8dd00e8ed692cb89e7577df
# good: [4baf8f092b47f4f31bda96a7acb7169d389c96fd] [AMDGPU] Pre-commit tests for D129759
git bisect good 4baf8f092b47f4f31bda96a7acb7169d389c96fd
# good: [f6017abb602780d81be928c93ec6afe74752f613] [lld-macho] Support folding of functions with identical LSDAs
git bisect good f6017abb602780d81be928c93ec6afe74752f613
# good: [374db8fc2e49f7d627e8942681d467422641f4b2] [gn build] (manually) port c91ce941448 (HTMLForestResources.inc)
git bisect good 374db8fc2e49f7d627e8942681d467422641f4b2
# bad: [1cf6b93df168fea81e3ca7c6c3c9fcaaf82c7785] Revert "[Local] Allow creating callbr with duplicate successors"
git bisect bad 1cf6b93df168fea81e3ca7c6c3c9fcaaf82c7785
# good: [f1243fa1933fdbcf292f134e0628604c4b9e5487] [LV] Autogen a partially autogened test for ease of update
git bisect good f1243fa1933fdbcf292f134e0628604c4b9e5487
# bad: [d2c0572b2efef6c71d13bb579ac50f2d3dd8e76e] [mlir] Flip LinAlg dialect to _Both
git bisect bad d2c0572b2efef6c71d13bb579ac50f2d3dd8e76e
# bad: [95401b015393b350f826d097cc5b45b6a604dfa5] Revert "[x86] use zero-extending load of a byte outside of loops too"
git bisect bad 95401b015393b350f826d097cc5b45b6a604dfa5
# bad: [bf789b1957efd2482e1dbd164d91a6612a450fe3] [Attributor] Replace AAValueSimplify with AAPotentialValues
git bisect bad bf789b1957efd2482e1dbd164d91a6612a450fe3
# first bad commit: [bf789b1957efd2482e1dbd164d91a6612a450fe3] [Attributor] Replace AAValueSimplify with AAPotentialValues

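Because the bisection was inverted ("good" means opt still hangs, "bad" means it no longer does), the "first bad commit" is the first commit at which opt stops hanging. Thus, bf789b1 fixed the hang.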

Action items

@arsenm will ensure that bf789b1 makes it into the next rocm release
@arsenm will close this issue

(I tried to cherry-pick bf789b1 on the rocm-5.3.x branch of RadeonOpenCompute's llvm-project fork but I got a conflict.)

[1] rust-lang/rust#66036

arsenm · 2022-11-01T07:43:31Z

Pushed testcase in bcedeef

pozulp added the backend:AMDGPU label Oct 26, 2022

arsenm closed this as completed Nov 1, 2022

EugeneZelenko added the llvm:optimizations label Nov 1, 2022

pozulp mentioned this issue Jan 8, 2024

[AMDGPU] Creating relocatable object (-r) from rdc objects (-fgpu-rdc) fails with lld error attempted static link of dynamic object in /opt/rocm-6.0.0/lib #77018

Closed

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

lld hangs targeting amdgpu and so does opt using amdgpu-attributor pass #58639

lld hangs targeting amdgpu and so does opt using amdgpu-attributor pass #58639

pozulp commented Oct 26, 2022 •

edited by VoltrexKeyva

Loading

llvmbot commented Oct 26, 2022

jdoerfert commented Oct 26, 2022

arsenm commented Oct 27, 2022

pozulp commented Oct 31, 2022

arsenm commented Nov 1, 2022

lld hangs targeting amdgpu and so does opt using amdgpu-attributor pass #58639

lld hangs targeting amdgpu and so does opt using amdgpu-attributor pass #58639

Comments

pozulp commented Oct 26, 2022 • edited by VoltrexKeyva Loading

llvmbot commented Oct 26, 2022

jdoerfert commented Oct 26, 2022

arsenm commented Oct 27, 2022

pozulp commented Oct 31, 2022

My llvm installation

Making a reproducer

Minimizing the reproducer

Running git bisect

Action items

arsenm commented Nov 1, 2022

pozulp commented Oct 26, 2022 •

edited by VoltrexKeyva

Loading