-
-
Notifications
You must be signed in to change notification settings - Fork 5.5k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
codegen: explicitly handle Float16 intrinsics #45249
Conversation
f8b2c46
to
73777a7
Compare
N.B. this should be valid now:
diff --git a/base/float.jl b/base/float.jl
index 60850b7e02..42693ac46e 100644
--- a/base/float.jl
+++ b/base/float.jl
@@ -224,9 +224,8 @@ function Float32(x::Int128)
reinterpret(Float32, s | d + y)
end
-# TODO: optimize
-Float16(x::UInt128) = convert(Float16, Float32(x))
-Float16(x::Int128) = convert(Float16, Float32(x))
+Float16(x::UInt128) = sitofp(Float16, x)
+Float16(x::Int128) = sitofp(Float16, x)
Float16(x::Float32) = fptrunc(Float16, x)
Float16(x::Float64) = fptrunc(Float16, x)
@@ -344,8 +343,8 @@ function unsafe_trunc(::Type{Int128}, x::Float32)
copysign(unsafe_trunc(UInt128,x) % Int128, x)
end
-unsafe_trunc(::Type{UInt128}, x::Float16) = unsafe_trunc(UInt128, Float32(x))
-unsafe_trunc(::Type{Int128}, x::Float16) = unsafe_trunc(Int128, Float32(x))
+unsafe_trunc(::Type{UInt128}, x::Float16) = fptoui(UInt128, x)
+unsafe_trunc(::Type{Int128}, x::Float16) = fptosi(Int128, x)
# matches convert methods
# also determines floor, ceil, round
But on 32-bit platforms, we will see LLVM blow up with:
So we do not yet do this.
Fixes #44829, until llvm fixes the support for these intrinsics itself
Also need to handle vectors, since the vectorizer may have introduced them. Also change our runtime emulation versions to f32 for consistency.
This probably needs to be backported to 1.7 as well (see #44829 (comment); will there be a new 1.7 release?) and maybe also to 1.6, if relevant?
It is unlikely that there will be a 1.7.4.
Is this good to go? |
I believe it was still waiting for review |
Name = "julia__gnu_h2f_ieee";
RetTy = Type::getFloatTy(ctx);
break;
case Instruction::FPTrunc:
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Was it not possible to register our variants of these functions with the TLI as @vchuravy suggested, and avoid having to rewrite code early like this?
Looks like we should probably merge this. |
@KristofferC I think this might merit a small v1.7.4, simply because it seems to affect the ability to continue running those binaries after a system update (though it is only Float16 support, so not hugely impactful) |
I think it is fine to suggest upgrading to 1.8 in those rare cases. |
I tried to backport this PR to 1.6.6 and 1.7.3 (patches below), but ran into the following error:
JIT session error: Symbols not found: [ __gnu_f2h_ieee, __gnu_h2f_ieee ]
Failure value returned from cantFail wrapped call
Failed to materialize symbols: { (JuliaOJIT, { jfptr_mapreduce_impl_2317, julia_mapreduce_impl_2316 }) }
UNREACHABLE executed at [...]/usr/include/llvm/Support/Error.h:749!
signal (6): Aborted
in expression starting at [...]/share/julia/test/reduce.jl:111
__pthread_kill_implementation at [...]/nptl/pthread_kill.c:44
raise at [...]/signal/../sysdeps/posix/raise.c:26
abort at [...]/stdlib/abort.c:79
_ZN4llvm25llvm_unreachable_internalEPKcS1_j at [...]/lib/julia/libLLVM-11jl.so (unknown line)
unknown function (ip: 0x7fc5559d489f)
jl_add_to_ee at [...]/src/jitlayers.cpp:1068
unknown function (ip: 0x7fc5559d8425)
jl_generate_fptr at [...]/src/jitlayers.cpp:353
jl_compile_method_internal at [...]/src/gf.c:1970
jl_apply_generic at [...]/src/gf.c:2237
macro expansion at [...]/share/julia/test/reduce.jl:120 [inlined]
1.6.6 patch
--- src/llvm-demote-float16.cpp 2022-05-24 16:46:47.343006997 +0000
+++ src/llvm-demote-float16.cpp 2022-05-24 19:30:21.399797121 +0000
@@ -27,6 +27,166 @@
namespace {
+
+inline AttributeSet getFnAttrs(const AttributeList &Attrs)
+{
+#if JL_LLVM_VERSION >= 140000
+ return Attrs.getFnAttrs();
+#else
+ return Attrs.getFnAttributes();
+#endif
+}
+
+inline AttributeSet getRetAttrs(const AttributeList &Attrs)
+{
+#if JL_LLVM_VERSION >= 140000
+ return Attrs.getRetAttrs();
+#else
+ return Attrs.getRetAttributes();
+#endif
+}
+
+static Instruction *replaceIntrinsicWith(IntrinsicInst *call, Type *RetTy, ArrayRef<Value*> args)
+{
+ Intrinsic::ID ID = call->getIntrinsicID();
+ assert(ID);
+ auto oldfType = call->getFunctionType();
+ auto nargs = oldfType->getNumParams();
+ assert(args.size() > nargs);
+ SmallVector<Type*, 8> argTys(nargs);
+ for (unsigned i = 0; i < nargs; i++)
+ argTys[i] = args[i]->getType();
+ auto newfType = FunctionType::get(RetTy, argTys, oldfType->isVarArg());
+
+ // Accumulate an array of overloaded types for the given intrinsic
+ // and compute the new name mangling schema
+ SmallVector<Type*, 4> overloadTys;
+ {
+ SmallVector<Intrinsic::IITDescriptor, 8> Table;
+ getIntrinsicInfoTableEntries(ID, Table);
+ ArrayRef<Intrinsic::IITDescriptor> TableRef = Table;
+ auto res = Intrinsic::matchIntrinsicSignature(newfType, TableRef, overloadTys);
+ assert(res == Intrinsic::MatchIntrinsicTypes_Match);
+ (void)res;
+ bool matchvararg = !Intrinsic::matchIntrinsicVarArg(newfType->isVarArg(), TableRef);
+ assert(matchvararg);
+ (void)matchvararg;
+ }
+ auto newF = Intrinsic::getDeclaration(call->getModule(), ID, overloadTys);
+ assert(newF->getFunctionType() == newfType);
+ newF->setCallingConv(call->getCallingConv());
+ assert(args.back() == call->getCalledFunction());
+ auto newCall = CallInst::Create(newF, args.drop_back(), "", call);
+ newCall->setTailCallKind(call->getTailCallKind());
+ auto old_attrs = call->getAttributes();
+ newCall->setAttributes(AttributeList::get(call->getContext(), getFnAttrs(old_attrs),
+ getRetAttrs(old_attrs), {})); // drop parameter attributes
+ return newCall;
+}
+
+
+static Value* CreateFPCast(Instruction::CastOps opcode, Value *V, Type *DestTy, IRBuilder<> &builder)
+{
+ Type *SrcTy = V->getType();
+ Type *RetTy = DestTy;
+ if (auto *VC = dyn_cast<Constant>(V)) {
+ // The input IR often has things of the form
+ // fcmp olt half %0, 0xH7C00
+ // and we would like to avoid turning that constant into a call here
+ // if we can simply constant fold it to the new type.
+ VC = ConstantExpr::getCast(opcode, VC, DestTy, true);
+ if (VC)
+ return VC;
+ }
+ assert(SrcTy->isVectorTy() == DestTy->isVectorTy());
+ if (SrcTy->isVectorTy()) {
+ unsigned NumElems = cast<FixedVectorType>(SrcTy)->getNumElements();
+ assert(cast<FixedVectorType>(DestTy)->getNumElements() == NumElems && "Mismatched cast");
+ Value *NewV = UndefValue::get(DestTy);
+ RetTy = RetTy->getScalarType();
+ for (unsigned i = 0; i < NumElems; ++i) {
+ Value *I = builder.getInt32(i);
+ Value *Vi = builder.CreateExtractElement(V, I);
+ Vi = CreateFPCast(opcode, Vi, RetTy, builder);
+ NewV = builder.CreateInsertElement(NewV, Vi, I);
+ }
+ return NewV;
+ }
+ auto &M = *builder.GetInsertBlock()->getModule();
+ auto &ctx = M.getContext();
+ // Pick the Function to call in the Julia runtime
+ StringRef Name;
+ switch (opcode) {
+ case Instruction::FPExt:
+ // this is exact, so we only need one conversion
+ assert(SrcTy->isHalfTy());
+ Name = "julia__gnu_h2f_ieee";
+ RetTy = Type::getFloatTy(ctx);
+ break;
+ case Instruction::FPTrunc:
+ assert(DestTy->isHalfTy());
+ if (SrcTy->isFloatTy())
+ Name = "julia__gnu_f2h_ieee";
+ else if (SrcTy->isDoubleTy())
+ Name = "julia__truncdfhf2";
+ break;
+ // All F16 fit exactly in Int32 (-65504 to 65504)
+ case Instruction::FPToSI: JL_FALLTHROUGH;
+ case Instruction::FPToUI:
+ assert(SrcTy->isHalfTy());
+ Name = "julia__gnu_h2f_ieee";
+ RetTy = Type::getFloatTy(ctx);
+ break;
+ case Instruction::SIToFP: JL_FALLTHROUGH;
+ case Instruction::UIToFP:
+ assert(DestTy->isHalfTy());
+ Name = "julia__gnu_f2h_ieee";
+ SrcTy = Type::getFloatTy(ctx);
+ break;
+ default:
+ errs() << Instruction::getOpcodeName(opcode) << ' ';
+ V->getType()->print(errs());
+ errs() << " to ";
+ DestTy->print(errs());
+ errs() << " is an ";
+ llvm_unreachable("invalid cast");
+ }
+ if (Name.empty()) {
+ errs() << Instruction::getOpcodeName(opcode) << ' ';
+ V->getType()->print(errs());
+ errs() << " to ";
+ DestTy->print(errs());
+ errs() << " is an ";
+ llvm_unreachable("illegal cast");
+ }
+ // Coerce the source to the required size and type
+ auto T_int16 = Type::getInt16Ty(ctx);
+ if (SrcTy->isHalfTy())
+ SrcTy = T_int16;
+ if (opcode == Instruction::SIToFP)
+ V = builder.CreateSIToFP(V, SrcTy);
+ else if (opcode == Instruction::UIToFP)
+ V = builder.CreateUIToFP(V, SrcTy);
+ else
+ V = builder.CreateBitCast(V, SrcTy);
+ // Call our intrinsic
+ if (RetTy->isHalfTy())
+ RetTy = T_int16;
+ auto FT = FunctionType::get(RetTy, {SrcTy}, false);
+ FunctionCallee F = M.getOrInsertFunction(Name, FT);
+ Value *I = builder.CreateCall(F, {V});
+ // Coerce the result to the expected type
+ if (opcode == Instruction::FPToSI)
+ I = builder.CreateFPToSI(I, DestTy);
+ else if (opcode == Instruction::FPToUI)
+ I = builder.CreateFPToUI(I, DestTy);
+ else if (opcode == Instruction::FPExt)
+ I = builder.CreateFPCast(I, DestTy);
+ else
+ I = builder.CreateBitCast(I, DestTy);
+ return I;
+}
+
struct DemoteFloat16Pass : public FunctionPass {
static char ID;
DemoteFloat16Pass() : FunctionPass(ID){};
@@ -35,15 +195,40 @@
bool runOnFunction(Function &F) override;
};
+Type *_getWithNewType(Type *VTyp, Type *EltTy) {
+ if (auto *VTy = dyn_cast<VectorType>(VTyp))
+ return VectorType::get(EltTy, VTy->getElementCount());
+ return EltTy;
+}
+
bool DemoteFloat16Pass::runOnFunction(Function &F)
{
auto &ctx = F.getContext();
- auto T_float16 = Type::getHalfTy(ctx);
auto T_float32 = Type::getFloatTy(ctx);
SmallVector<Instruction *, 0> erase;
for (auto &BB : F) {
for (auto &I : BB) {
+ // extend Float16 operands to Float32
+ bool Float16 = I.getType()->getScalarType()->isHalfTy();
+ for (size_t i = 0; !Float16 && i < I.getNumOperands(); i++) {
+ Value *Op = I.getOperand(i);
+ if (Op->getType()->getScalarType()->isHalfTy())
+ Float16 = true;
+ }
+ if (!Float16)
+ continue;
+
+ if (auto CI = dyn_cast<CastInst>(&I)) {
+ if (CI->getOpcode() != Instruction::BitCast) { // aka !CI->isNoopCast(DL)
+ IRBuilder<> builder(&I);
+ Value *NewI = CreateFPCast(CI->getOpcode(), I.getOperand(0), I.getType(), builder);
+ I.replaceAllUsesWith(NewI);
+ erase.push_back(&I);
+ }
+ continue;
+ }
+
switch (I.getOpcode()) {
case Instruction::FNeg:
case Instruction::FAdd:
@@ -54,6 +239,9 @@
case Instruction::FCmp:
break;
default:
+ if (auto intrinsic = dyn_cast<IntrinsicInst>(&I))
+ if (intrinsic->getIntrinsicID())
+ break;
continue;
}
@@ -65,61 +253,68 @@
IRBuilder<> builder(&I);
// extend Float16 operands to Float32
- bool OperandsChanged = false;
+ // XXX: Calls to llvm.fma.f16 may need to go to f64 to be correct?
SmallVector<Value *, 2> Operands(I.getNumOperands());
for (size_t i = 0; i < I.getNumOperands(); i++) {
Value *Op = I.getOperand(i);
- if (Op->getType() == T_float16) {
- Op = builder.CreateFPExt(Op, T_float32);
- OperandsChanged = true;
+ if (Op->getType()->getScalarType()->isHalfTy()) {
+ Op = CreateFPCast(Instruction::FPExt, Op, _getWithNewType(Op->getType(), T_float32), builder);
}
Operands[i] = (Op);
}
// recreate the instruction if any operands changed,
// truncating the result back to Float16
- if (OperandsChanged) {
- Value *NewI;
- switch (I.getOpcode()) {
- case Instruction::FNeg:
- assert(Operands.size() == 1);
- NewI = builder.CreateFNeg(Operands[0]);
- break;
- case Instruction::FAdd:
- assert(Operands.size() == 2);
- NewI = builder.CreateFAdd(Operands[0], Operands[1]);
- break;
- case Instruction::FSub:
- assert(Operands.size() == 2);
- NewI = builder.CreateFSub(Operands[0], Operands[1]);
- break;
- case Instruction::FMul:
- assert(Operands.size() == 2);
- NewI = builder.CreateFMul(Operands[0], Operands[1]);
- break;
- case Instruction::FDiv:
- assert(Operands.size() == 2);
- NewI = builder.CreateFDiv(Operands[0], Operands[1]);
- break;
- case Instruction::FRem:
- assert(Operands.size() == 2);
- NewI = builder.CreateFRem(Operands[0], Operands[1]);
- break;
- case Instruction::FCmp:
- assert(Operands.size() == 2);
- NewI = builder.CreateFCmp(cast<FCmpInst>(&I)->getPredicate(),
- Operands[0], Operands[1]);
+ Value *NewI;
+ switch (I.getOpcode()) {
+ case Instruction::FNeg:
+ assert(Operands.size() == 1);
+ NewI = builder.CreateFNeg(Operands[0]);
+ break;
+ case Instruction::FAdd:
+ assert(Operands.size() == 2);
+ NewI = builder.CreateFAdd(Operands[0], Operands[1]);
+ break;
+ case Instruction::FSub:
+ assert(Operands.size() == 2);
+ NewI = builder.CreateFSub(Operands[0], Operands[1]);
+ break;
+ case Instruction::FMul:
+ assert(Operands.size() == 2);
+ NewI = builder.CreateFMul(Operands[0], Operands[1]);
+ break;
+ case Instruction::FDiv:
+ assert(Operands.size() == 2);
+ NewI = builder.CreateFDiv(Operands[0], Operands[1]);
+ break;
+ case Instruction::FRem:
+ assert(Operands.size() == 2);
+ NewI = builder.CreateFRem(Operands[0], Operands[1]);
+ break;
+ case Instruction::FCmp:
+ assert(Operands.size() == 2);
+ NewI = builder.CreateFCmp(cast<FCmpInst>(&I)->getPredicate(),
+ Operands[0], Operands[1]);
+ break;
+ default:
+ if (auto intrinsic = dyn_cast<IntrinsicInst>(&I)) {
+ // XXX: this is not correct in general
+ // some obvious failures include llvm.convert.to.fp16.*, llvm.vp.*to*, llvm.experimental.constrained.*to*, llvm.masked.*
+ Type *RetTy = I.getType();
+ if (RetTy->getScalarType()->isHalfTy())
+ RetTy = _getWithNewType(RetTy, T_float32);
+ NewI = replaceIntrinsicWith(intrinsic, RetTy, Operands);
break;
- default:
- abort();
}
- cast<Instruction>(NewI)->copyMetadata(I);
- cast<Instruction>(NewI)->copyFastMathFlags(&I);
- if (NewI->getType() != I.getType())
- NewI = builder.CreateFPTrunc(NewI, I.getType());
- I.replaceAllUsesWith(NewI);
- erase.push_back(&I);
+ abort();
+ }
+ cast<Instruction>(NewI)->copyMetadata(I);
+ cast<Instruction>(NewI)->copyFastMathFlags(&I);
+ if (NewI->getType() != I.getType()) {
+ NewI = CreateFPCast(Instruction::FPTrunc, NewI, I.getType(), builder);
}
+ I.replaceAllUsesWith(NewI);
+ erase.push_back(&I);
}
}
--- src/runtime_intrinsics.c 2022-05-24 16:46:35.248322940 +0000
+++ src/runtime_intrinsics.c 2022-05-24 16:59:37.052604180 +0000
@@ -169,9 +169,9 @@
}
#define fp_select(a, func) \
- sizeof(a) == sizeof(float) ? func##f((float)a) : func(a)
+ sizeof(a) <= sizeof(float) ? func##f((float)a) : func(a)
#define fp_select2(a, b, func) \
- sizeof(a) == sizeof(float) ? func##f(a, b) : func(a, b)
+ sizeof(a) <= sizeof(float) ? func##f(a, b) : func(a, b)
// fast-function generators //
@@ -215,11 +215,11 @@
static inline void name(unsigned osize, void *pa, void *pr) JL_NOTSAFEPOINT \
{ \
uint16_t a = *(uint16_t*)pa; \
- float A = __gnu_h2f_ieee(a); \
+ float A = julia__gnu_h2f_ieee(a); \
if (osize == 16) { \
float R; \
OP(&R, A); \
- *(uint16_t*)pr = __gnu_f2h_ieee(R); \
+ *(uint16_t*)pr = julia__gnu_f2h_ieee(R); \
} else { \
OP((uint16_t*)pr, A); \
} \
@@ -243,11 +243,11 @@
{ \
uint16_t a = *(uint16_t*)pa; \
uint16_t b = *(uint16_t*)pb; \
- float A = __gnu_h2f_ieee(a); \
- float B = __gnu_h2f_ieee(b); \
+ float A = julia__gnu_h2f_ieee(a); \
+ float B = julia__gnu_h2f_ieee(b); \
runtime_nbits = 16; \
float R = OP(A, B); \
- *(uint16_t*)pr = __gnu_f2h_ieee(R); \
+ *(uint16_t*)pr = julia__gnu_f2h_ieee(R); \
}
// float or integer inputs, bool output
@@ -268,8 +268,8 @@
{ \
uint16_t a = *(uint16_t*)pa; \
uint16_t b = *(uint16_t*)pb; \
- float A = __gnu_h2f_ieee(a); \
- float B = __gnu_h2f_ieee(b); \
+ float A = julia__gnu_h2f_ieee(a); \
+ float B = julia__gnu_h2f_ieee(b); \
runtime_nbits = 16; \
return OP(A, B); \
}
@@ -309,12 +309,12 @@
uint16_t a = *(uint16_t*)pa; \
uint16_t b = *(uint16_t*)pb; \
uint16_t c = *(uint16_t*)pc; \
- float A = __gnu_h2f_ieee(a); \
- float B = __gnu_h2f_ieee(b); \
- float C = __gnu_h2f_ieee(c); \
+ float A = julia__gnu_h2f_ieee(a); \
+ float B = julia__gnu_h2f_ieee(b); \
+ float C = julia__gnu_h2f_ieee(c); \
runtime_nbits = 16; \
float R = OP(A, B, C); \
- *(uint16_t*)pr = __gnu_f2h_ieee(R); \
+ *(uint16_t*)pr = julia__gnu_f2h_ieee(R); \
}
@@ -832,7 +832,7 @@
fpiseq_n(float, 32)
fpiseq_n(double, 64)
#define fpiseq(a,b) \
- sizeof(a) == sizeof(float) ? fpiseq32(a, b) : fpiseq64(a, b)
+ sizeof(a) <= sizeof(float) ? fpiseq32(a, b) : fpiseq64(a, b)
#define fpislt_n(c_type, nbits) \
static inline int fpislt##nbits(c_type a, c_type b) JL_NOTSAFEPOINT \
@@ -903,7 +903,7 @@
if (!(osize < 8 * sizeof(a))) \
jl_error("fptrunc: output bitsize must be < input bitsize"); \
else if (osize == 16) \
- *(uint16_t*)pr = __gnu_f2h_ieee(a); \
+ *(uint16_t*)pr = julia__gnu_f2h_ieee(a); \
else if (osize == 32) \
*(float*)pr = a; \
else if (osize == 64) \
--- src/julia_internal.h 2022-05-24 16:47:00.597607360 +0000
+++ src/julia_internal.h 2022-05-24 16:49:04.486479125 +0000
@@ -1363,8 +1363,9 @@
#define JL_GC_ASSERT_LIVE(x) (void)(x)
#endif
-float __gnu_h2f_ieee(uint16_t param) JL_NOTSAFEPOINT;
-uint16_t __gnu_f2h_ieee(float param) JL_NOTSAFEPOINT;
+JL_DLLEXPORT float julia__gnu_h2f_ieee(uint16_t param) JL_NOTSAFEPOINT;
+JL_DLLEXPORT uint16_t julia__gnu_f2h_ieee(float param) JL_NOTSAFEPOINT;
+JL_DLLEXPORT uint16_t julia__truncdfhf2(double param) JL_NOTSAFEPOINT;
#ifdef __cplusplus
}
--- src/intrinsics.cpp 2022-05-24 17:20:25.770968733 +0000
+++ src/intrinsics.cpp 2022-05-24 19:56:18.247690470 +0000
@@ -1474,26 +1474,17 @@
return h;
}
-#if !defined(_OS_DARWIN_) // xcode already links compiler-rt
-
-extern "C" JL_DLLEXPORT float __gnu_h2f_ieee(uint16_t param)
-{
- return half_to_float(param);
-}
-
-extern "C" JL_DLLEXPORT float __extendhfsf2(uint16_t param)
+extern "C" JL_DLLEXPORT float julia__gnu_h2f_ieee(uint16_t param)
{
return half_to_float(param);
}
-extern "C" JL_DLLEXPORT uint16_t __gnu_f2h_ieee(float param)
+extern "C" JL_DLLEXPORT uint16_t julia__gnu_f2h_ieee(float param)
{
return float_to_half(param);
}
-extern "C" JL_DLLEXPORT uint16_t __truncdfhf2(double param)
+extern "C" JL_DLLEXPORT uint16_t julia__truncdfhf2(double param)
{
return float_to_half((float)param);
}
-
-#endif
--- src/julia.expmap 2022-05-24 16:47:11.372501561 +0000
+++ src/julia.expmap 2022-05-24 16:48:44.460056440 +0000
@@ -42,12 +42,6 @@
environ;
__progname;
- /* compiler run-time intrinsics */
- __gnu_h2f_ieee;
- __extendhfsf2;
- __gnu_f2h_ieee;
- __truncdfhf2;
-
local:
*;
};
--- src/APInt-C.cpp 2022-05-24 16:47:26.674979181 +0000
+++ src/APInt-C.cpp 2022-05-24 16:48:33.268972838 +0000
@@ -316,7 +316,7 @@
void LLVMFPtoInt(unsigned numbits, void *pa, unsigned onumbits, integerPart *pr, bool isSigned, bool *isExact) {
double Val;
if (numbits == 16)
- Val = __gnu_h2f_ieee(*(uint16_t*)pa);
+ Val = julia__gnu_h2f_ieee(*(uint16_t*)pa);
else if (numbits == 32)
Val = *(float*)pa;
else if (numbits == 64)
@@ -391,7 +391,7 @@
val = a.roundToDouble(true);
}
if (onumbits == 16)
- *(uint16_t*)pr = __gnu_f2h_ieee(val);
+ *(uint16_t*)pr = julia__gnu_f2h_ieee(val);
else if (onumbits == 32)
*(float*)pr = val;
else if (onumbits == 64)
@@ -408,7 +408,7 @@
val = a.roundToDouble(false);
}
if (onumbits == 16)
- *(uint16_t*)pr = __gnu_f2h_ieee(val);
+ *(uint16_t*)pr = julia__gnu_f2h_ieee(val);
else if (onumbits == 32)
*(float*)pr = val;
else if (onumbits == 64)
1.7.3 patch
--- src/llvm-demote-float16.cpp 2022-05-24 16:26:36.982535055 +0000
+++ src/llvm-demote-float16.cpp 2022-05-24 19:30:17.191673155 +0000
@@ -27,6 +27,166 @@
namespace {
+inline AttributeSet getFnAttrs(const AttributeList &Attrs)
+{
+#if JL_LLVM_VERSION >= 140000
+ return Attrs.getFnAttrs();
+#else
+ return Attrs.getFnAttributes();
+#endif
+}
+
+inline AttributeSet getRetAttrs(const AttributeList &Attrs)
+{
+#if JL_LLVM_VERSION >= 140000
+ return Attrs.getRetAttrs();
+#else
+ return Attrs.getRetAttributes();
+#endif
+}
+
+static Instruction *replaceIntrinsicWith(IntrinsicInst *call, Type *RetTy, ArrayRef<Value*> args)
+{
+ Intrinsic::ID ID = call->getIntrinsicID();
+ assert(ID);
+ auto oldfType = call->getFunctionType();
+ auto nargs = oldfType->getNumParams();
+ assert(args.size() > nargs);
+ SmallVector<Type*, 8> argTys(nargs);
+ for (unsigned i = 0; i < nargs; i++)
+ argTys[i] = args[i]->getType();
+ auto newfType = FunctionType::get(RetTy, argTys, oldfType->isVarArg());
+
+ // Accumulate an array of overloaded types for the given intrinsic
+ // and compute the new name mangling schema
+ SmallVector<Type*, 4> overloadTys;
+ {
+ SmallVector<Intrinsic::IITDescriptor, 8> Table;
+ getIntrinsicInfoTableEntries(ID, Table);
+ ArrayRef<Intrinsic::IITDescriptor> TableRef = Table;
+ auto res = Intrinsic::matchIntrinsicSignature(newfType, TableRef, overloadTys);
+ assert(res == Intrinsic::MatchIntrinsicTypes_Match);
+ (void)res;
+ bool matchvararg = !Intrinsic::matchIntrinsicVarArg(newfType->isVarArg(), TableRef);
+ assert(matchvararg);
+ (void)matchvararg;
+ }
+ auto newF = Intrinsic::getDeclaration(call->getModule(), ID, overloadTys);
+ assert(newF->getFunctionType() == newfType);
+ newF->setCallingConv(call->getCallingConv());
+ assert(args.back() == call->getCalledFunction());
+ auto newCall = CallInst::Create(newF, args.drop_back(), "", call);
+ newCall->setTailCallKind(call->getTailCallKind());
+ auto old_attrs = call->getAttributes();
+ newCall->setAttributes(AttributeList::get(call->getContext(), getFnAttrs(old_attrs),
+ getRetAttrs(old_attrs), {})); // drop parameter attributes
+ return newCall;
+}
+
+
+static Value* CreateFPCast(Instruction::CastOps opcode, Value *V, Type *DestTy, IRBuilder<> &builder)
+{
+
+ Type *SrcTy = V->getType();
+ Type *RetTy = DestTy;
+ if (auto *VC = dyn_cast<Constant>(V)) {
+ // The input IR often has things of the form
+ // fcmp olt half %0, 0xH7C00
+ // and we would like to avoid turning that constant into a call here
+ // if we can simply constant fold it to the new type.
+ VC = ConstantExpr::getCast(opcode, VC, DestTy, true);
+ if (VC)
+ return VC;
+ }
+ assert(SrcTy->isVectorTy() == DestTy->isVectorTy());
+ if (SrcTy->isVectorTy()) {
+ unsigned NumElems = cast<FixedVectorType>(SrcTy)->getNumElements();
+ assert(cast<FixedVectorType>(DestTy)->getNumElements() == NumElems && "Mismatched cast");
+ Value *NewV = UndefValue::get(DestTy);
+ RetTy = RetTy->getScalarType();
+ for (unsigned i = 0; i < NumElems; ++i) {
+ Value *I = builder.getInt32(i);
+ Value *Vi = builder.CreateExtractElement(V, I);
+ Vi = CreateFPCast(opcode, Vi, RetTy, builder);
+ NewV = builder.CreateInsertElement(NewV, Vi, I);
+ }
+ return NewV;
+ }
+ auto &M = *builder.GetInsertBlock()->getModule();
+ auto &ctx = M.getContext();
+ // Pick the Function to call in the Julia runtime
+ StringRef Name;
+ switch (opcode) {
+ case Instruction::FPExt:
+ // this is exact, so we only need one conversion
+ assert(SrcTy->isHalfTy());
+ Name = "julia__gnu_h2f_ieee";
+ RetTy = Type::getFloatTy(ctx);
+ break;
+ case Instruction::FPTrunc:
+ assert(DestTy->isHalfTy());
+ if (SrcTy->isFloatTy())
+ Name = "julia__gnu_f2h_ieee";
+ else if (SrcTy->isDoubleTy())
+ Name = "julia__truncdfhf2";
+ break;
+ // All F16 fit exactly in Int32 (-65504 to 65504)
+ case Instruction::FPToSI: JL_FALLTHROUGH;
+ case Instruction::FPToUI:
+ assert(SrcTy->isHalfTy());
+ Name = "julia__gnu_h2f_ieee";
+ RetTy = Type::getFloatTy(ctx);
+ break;
+ case Instruction::SIToFP: JL_FALLTHROUGH;
+ case Instruction::UIToFP:
+ assert(DestTy->isHalfTy());
+ Name = "julia__gnu_f2h_ieee";
+ SrcTy = Type::getFloatTy(ctx);
+ break;
+ default:
+ errs() << Instruction::getOpcodeName(opcode) << ' ';
+ V->getType()->print(errs());
+ errs() << " to ";
+ DestTy->print(errs());
+ errs() << " is an ";
+ llvm_unreachable("invalid cast");
+ }
+ if (Name.empty()) {
+ errs() << Instruction::getOpcodeName(opcode) << ' ';
+ V->getType()->print(errs());
+ errs() << " to ";
+ DestTy->print(errs());
+ errs() << " is an ";
+ llvm_unreachable("illegal cast");
+ }
+ // Coerce the source to the required size and type
+ auto T_int16 = Type::getInt16Ty(ctx);
+ if (SrcTy->isHalfTy())
+ SrcTy = T_int16;
+ if (opcode == Instruction::SIToFP)
+ V = builder.CreateSIToFP(V, SrcTy);
+ else if (opcode == Instruction::UIToFP)
+ V = builder.CreateUIToFP(V, SrcTy);
+ else
+ V = builder.CreateBitCast(V, SrcTy);
+ // Call our intrinsic
+ if (RetTy->isHalfTy())
+ RetTy = T_int16;
+ auto FT = FunctionType::get(RetTy, {SrcTy}, false);
+ FunctionCallee F = M.getOrInsertFunction(Name, FT);
+ Value *I = builder.CreateCall(F, {V});
+ // Coerce the result to the expected type
+ if (opcode == Instruction::FPToSI)
+ I = builder.CreateFPToSI(I, DestTy);
+ else if (opcode == Instruction::FPToUI)
+ I = builder.CreateFPToUI(I, DestTy);
+ else if (opcode == Instruction::FPExt)
+ I = builder.CreateFPCast(I, DestTy);
+ else
+ I = builder.CreateBitCast(I, DestTy);
+ return I;
+}
+
struct DemoteFloat16Pass : public FunctionPass {
static char ID;
DemoteFloat16Pass() : FunctionPass(ID){};
@@ -35,15 +195,40 @@
bool runOnFunction(Function &F) override;
};
+Type *_getWithNewType(Type *VTyp, Type *EltTy) {
+ if (auto *VTy = dyn_cast<VectorType>(VTyp))
+ return VectorType::get(EltTy, VTy->getElementCount());
+ return EltTy;
+}
+
+
bool DemoteFloat16Pass::runOnFunction(Function &F)
{
auto &ctx = F.getContext();
- auto T_float16 = Type::getHalfTy(ctx);
auto T_float32 = Type::getFloatTy(ctx);
SmallVector<Instruction *, 0> erase;
for (auto &BB : F) {
for (auto &I : BB) {
+ // extend Float16 operands to Float32
+ bool Float16 = I.getType()->getScalarType()->isHalfTy();
+ for (size_t i = 0; !Float16 && i < I.getNumOperands(); i++) {
+ Value *Op = I.getOperand(i);
+ if (Op->getType()->getScalarType()->isHalfTy())
+ Float16 = true;
+ }
+ if (!Float16)
+ continue;
+
+ if (auto CI = dyn_cast<CastInst>(&I)) {
+ if (CI->getOpcode() != Instruction::BitCast) { // aka !CI->isNoopCast(DL)
+ IRBuilder<> builder(&I);
+ Value *NewI = CreateFPCast(CI->getOpcode(), I.getOperand(0), I.getType(), builder);
+ I.replaceAllUsesWith(NewI);
+ erase.push_back(&I);
+ }
+ continue;
+ }
switch (I.getOpcode()) {
case Instruction::FNeg:
case Instruction::FAdd:
@@ -54,6 +239,9 @@
case Instruction::FCmp:
break;
default:
+ if (auto intrinsic = dyn_cast<IntrinsicInst>(&I))
+ if (intrinsic->getIntrinsicID())
+ break;
continue;
}
@@ -65,61 +253,68 @@
IRBuilder<> builder(&I);
// extend Float16 operands to Float32
- bool OperandsChanged = false;
+ // XXX: Calls to llvm.fma.f16 may need to go to f64 to be correct?
SmallVector<Value *, 2> Operands(I.getNumOperands());
for (size_t i = 0; i < I.getNumOperands(); i++) {
Value *Op = I.getOperand(i);
- if (Op->getType() == T_float16) {
- Op = builder.CreateFPExt(Op, T_float32);
- OperandsChanged = true;
+ if (Op->getType()->getScalarType()->isHalfTy()) {
+ Op = CreateFPCast(Instruction::FPExt, Op, _getWithNewType(Op->getType(), T_float32), builder);
}
Operands[i] = (Op);
}
// recreate the instruction if any operands changed,
// truncating the result back to Float16
- if (OperandsChanged) {
- Value *NewI;
- switch (I.getOpcode()) {
- case Instruction::FNeg:
- assert(Operands.size() == 1);
- NewI = builder.CreateFNeg(Operands[0]);
- break;
- case Instruction::FAdd:
- assert(Operands.size() == 2);
- NewI = builder.CreateFAdd(Operands[0], Operands[1]);
- break;
- case Instruction::FSub:
- assert(Operands.size() == 2);
- NewI = builder.CreateFSub(Operands[0], Operands[1]);
- break;
- case Instruction::FMul:
- assert(Operands.size() == 2);
- NewI = builder.CreateFMul(Operands[0], Operands[1]);
- break;
- case Instruction::FDiv:
- assert(Operands.size() == 2);
- NewI = builder.CreateFDiv(Operands[0], Operands[1]);
- break;
- case Instruction::FRem:
- assert(Operands.size() == 2);
- NewI = builder.CreateFRem(Operands[0], Operands[1]);
- break;
- case Instruction::FCmp:
- assert(Operands.size() == 2);
- NewI = builder.CreateFCmp(cast<FCmpInst>(&I)->getPredicate(),
- Operands[0], Operands[1]);
+ Value *NewI;
+ switch (I.getOpcode()) {
+ case Instruction::FNeg:
+ assert(Operands.size() == 1);
+ NewI = builder.CreateFNeg(Operands[0]);
+ break;
+ case Instruction::FAdd:
+ assert(Operands.size() == 2);
+ NewI = builder.CreateFAdd(Operands[0], Operands[1]);
+ break;
+ case Instruction::FSub:
+ assert(Operands.size() == 2);
+ NewI = builder.CreateFSub(Operands[0], Operands[1]);
+ break;
+ case Instruction::FMul:
+ assert(Operands.size() == 2);
+ NewI = builder.CreateFMul(Operands[0], Operands[1]);
+ break;
+ case Instruction::FDiv:
+ assert(Operands.size() == 2);
+ NewI = builder.CreateFDiv(Operands[0], Operands[1]);
+ break;
+ case Instruction::FRem:
+ assert(Operands.size() == 2);
+ NewI = builder.CreateFRem(Operands[0], Operands[1]);
+ break;
+ case Instruction::FCmp:
+ assert(Operands.size() == 2);
+ NewI = builder.CreateFCmp(cast<FCmpInst>(&I)->getPredicate(),
+ Operands[0], Operands[1]);
+ break;
+ default:
+ if (auto intrinsic = dyn_cast<IntrinsicInst>(&I)) {
+ // XXX: this is not correct in general
+ // some obvious failures include llvm.convert.to.fp16.*, llvm.vp.*to*, llvm.experimental.constrained.*to*, llvm.masked.*
+ Type *RetTy = I.getType();
+ if (RetTy->getScalarType()->isHalfTy())
+ RetTy = _getWithNewType(RetTy, T_float32);
+ NewI = replaceIntrinsicWith(intrinsic, RetTy, Operands);
break;
- default:
- abort();
}
- cast<Instruction>(NewI)->copyMetadata(I);
- cast<Instruction>(NewI)->copyFastMathFlags(&I);
- if (NewI->getType() != I.getType())
- NewI = builder.CreateFPTrunc(NewI, I.getType());
- I.replaceAllUsesWith(NewI);
- erase.push_back(&I);
+ abort();
+ }
+ cast<Instruction>(NewI)->copyMetadata(I);
+ cast<Instruction>(NewI)->copyFastMathFlags(&I);
+ if (NewI->getType() != I.getType()) {
+ NewI = CreateFPCast(Instruction::FPTrunc, NewI, I.getType(), builder);
}
+ I.replaceAllUsesWith(NewI);
+ erase.push_back(&I);
}
}
--- src/runtime_intrinsics.c 2022-05-24 16:27:19.292819527 +0000
+++ src/runtime_intrinsics.c 2022-05-24 16:44:16.062717753 +0000
@@ -338,9 +338,9 @@
}
#define fp_select(a, func) \
- sizeof(a) == sizeof(float) ? func##f((float)a) : func(a)
+ sizeof(a) <= sizeof(float) ? func##f((float)a) : func(a)
#define fp_select2(a, b, func) \
- sizeof(a) == sizeof(float) ? func##f(a, b) : func(a, b)
+ sizeof(a) <= sizeof(float) ? func##f(a, b) : func(a, b)
// fast-function generators //
@@ -384,11 +384,11 @@
static inline void name(unsigned osize, void *pa, void *pr) JL_NOTSAFEPOINT \
{ \
uint16_t a = *(uint16_t*)pa; \
- float A = __gnu_h2f_ieee(a); \
+ float A = julia__gnu_h2f_ieee(a); \
if (osize == 16) { \
float R; \
OP(&R, A); \
- *(uint16_t*)pr = __gnu_f2h_ieee(R); \
+ *(uint16_t*)pr = julia__gnu_f2h_ieee(R); \
} else { \
OP((uint16_t*)pr, A); \
} \
@@ -412,11 +412,11 @@
{ \
uint16_t a = *(uint16_t*)pa; \
uint16_t b = *(uint16_t*)pb; \
- float A = __gnu_h2f_ieee(a); \
- float B = __gnu_h2f_ieee(b); \
+ float A = julia__gnu_h2f_ieee(a); \
+ float B = julia__gnu_h2f_ieee(b); \
runtime_nbits = 16; \
float R = OP(A, B); \
- *(uint16_t*)pr = __gnu_f2h_ieee(R); \
+ *(uint16_t*)pr = julia__gnu_f2h_ieee(R); \
}
// float or integer inputs, bool output
@@ -437,8 +437,8 @@
{ \
uint16_t a = *(uint16_t*)pa; \
uint16_t b = *(uint16_t*)pb; \
- float A = __gnu_h2f_ieee(a); \
- float B = __gnu_h2f_ieee(b); \
+ float A = julia__gnu_h2f_ieee(a); \
+ float B = julia__gnu_h2f_ieee(b); \
runtime_nbits = 16; \
return OP(A, B); \
}
@@ -478,12 +478,12 @@
uint16_t a = *(uint16_t*)pa; \
uint16_t b = *(uint16_t*)pb; \
uint16_t c = *(uint16_t*)pc; \
- float A = __gnu_h2f_ieee(a); \
- float B = __gnu_h2f_ieee(b); \
- float C = __gnu_h2f_ieee(c); \
+ float A = julia__gnu_h2f_ieee(a); \
+ float B = julia__gnu_h2f_ieee(b); \
+ float C = julia__gnu_h2f_ieee(c); \
runtime_nbits = 16; \
float R = OP(A, B, C); \
- *(uint16_t*)pr = __gnu_f2h_ieee(R); \
+ *(uint16_t*)pr = julia__gnu_f2h_ieee(R); \
}
@@ -1001,7 +1001,7 @@
fpiseq_n(float, 32)
fpiseq_n(double, 64)
#define fpiseq(a,b) \
- sizeof(a) == sizeof(float) ? fpiseq32(a, b) : fpiseq64(a, b)
+ sizeof(a) <= sizeof(float) ? fpiseq32(a, b) : fpiseq64(a, b)
bool_fintrinsic(eq,eq_float)
bool_fintrinsic(ne,ne_float)
@@ -1050,7 +1050,7 @@
if (!(osize < 8 * sizeof(a))) \
jl_error("fptrunc: output bitsize must be < input bitsize"); \
else if (osize == 16) \
- *(uint16_t*)pr = __gnu_f2h_ieee(a); \
+ *(uint16_t*)pr = julia__gnu_f2h_ieee(a); \
else if (osize == 32) \
*(float*)pr = a; \
else if (osize == 64) \
--- src/julia_internal.h 2022-05-24 16:26:09.953046957 +0000
+++ src/julia_internal.h 2022-05-24 16:28:59.610039708 +0000
@@ -1427,8 +1427,9 @@
#define JL_GC_ASSERT_LIVE(x) (void)(x)
#endif
-float __gnu_h2f_ieee(uint16_t param) JL_NOTSAFEPOINT;
-uint16_t __gnu_f2h_ieee(float param) JL_NOTSAFEPOINT;
+JL_DLLEXPORT float julia__gnu_h2f_ieee(uint16_t param) JL_NOTSAFEPOINT;
+JL_DLLEXPORT uint16_t julia__gnu_f2h_ieee(float param) JL_NOTSAFEPOINT;
+JL_DLLEXPORT uint16_t julia__truncdfhf2(double param) JL_NOTSAFEPOINT;
#ifdef __cplusplus
}
--- src/intrinsics.cpp 2022-05-24 17:18:23.008583654 +0000
+++ src/intrinsics.cpp 2022-05-24 19:56:09.991480454 +0000
@@ -1633,24 +1633,17 @@
return h;
}
-#if !defined(_OS_DARWIN_) // xcode already links compiler-rt
-
-extern "C" JL_DLLEXPORT float __gnu_h2f_ieee(uint16_t param)
-{
- return half_to_float(param);
-}
-
-extern "C" JL_DLLEXPORT float __extendhfsf2(uint16_t param)
+extern "C" JL_DLLEXPORT float julia__gnu_h2f_ieee(uint16_t param)
{
return half_to_float(param);
}
-extern "C" JL_DLLEXPORT uint16_t __gnu_f2h_ieee(float param)
+extern "C" JL_DLLEXPORT uint16_t julia__gnu_f2h_ieee(float param)
{
return float_to_half(param);
}
-extern "C" JL_DLLEXPORT uint16_t __truncdfhf2(double param)
+extern "C" JL_DLLEXPORT uint16_t julia__truncdfhf2(double param)
{
float res = (float)param;
uint32_t resi;
@@ -1671,5 +1664,3 @@
}
return float_to_half(res);
}
-
-#endif
--- src/julia.expmap 2022-05-24 16:25:48.787865159 +0000
+++ src/julia.expmap 2022-05-24 16:28:39.517014539 +0000
@@ -42,12 +42,6 @@
environ;
__progname;
- /* compiler run-time intrinsics */
- __gnu_h2f_ieee;
- __extendhfsf2;
- __gnu_f2h_ieee;
- __truncdfhf2;
-
local:
*;
};
--- src/APInt-C.cpp 2022-05-24 16:25:05.957426445 +0000
+++ src/APInt-C.cpp 2022-05-24 16:28:27.792411828 +0000
@@ -316,7 +316,7 @@
void LLVMFPtoInt(unsigned numbits, void *pa, unsigned onumbits, integerPart *pr, bool isSigned, bool *isExact) {
double Val;
if (numbits == 16)
- Val = __gnu_h2f_ieee(*(uint16_t*)pa);
+ Val = julia__gnu_h2f_ieee(*(uint16_t*)pa);
else if (numbits == 32)
Val = *(float*)pa;
else if (numbits == 64)
@@ -391,7 +391,7 @@
val = a.roundToDouble(true);
}
if (onumbits == 16)
- *(uint16_t*)pr = __gnu_f2h_ieee(val);
+ *(uint16_t*)pr = julia__gnu_f2h_ieee(val);
else if (onumbits == 32)
*(float*)pr = val;
else if (onumbits == 64)
@@ -408,7 +408,7 @@
val = a.roundToDouble(false);
}
if (onumbits == 16)
- *(uint16_t*)pr = __gnu_f2h_ieee(val);
+ *(uint16_t*)pr = julia__gnu_f2h_ieee(val);
else if (onumbits == 32)
*(float*)pr = val;
else if (onumbits == 64) |
You might need to dump out the code_llvm (with Cthulhu) to figure out what f16 operation is still being attempted |
I am not very familiar with Julia's JIT, to say the least, sorry. It boils down to:
@code_llvm
$ julia -e 'using InteractiveUtils; @code_llvm sum(Float16[])'
; @ reducedim.jl:873 within `sum'
define half @julia_sum_168({}* nonnull align 16 dereferenceable(40) %0) {
top:
; ┌ @ reducedim.jl:873 within `#sum#682'
; │┌ @ reducedim.jl:877 within `_sum'
; ││┌ @ reducedim.jl:877 within `#_sum#684'
; │││┌ @ reducedim.jl:878 within `_sum'
; ││││┌ @ reducedim.jl:878 within `#_sum#685'
; │││││┌ @ reducedim.jl:310 within `mapreduce'
; ││││││┌ @ reducedim.jl:310 within `#mapreduce#675'
; │││││││┌ @ reducedim.jl:318 within `_mapreduce_dim'
; ││││││││┌ @ reduce.jl:397 within `_mapreduce'
; │││││││││┌ @ indices.jl:459 within `LinearIndices'
; ││││││││││┌ @ abstractarray.jl:89 within `axes'
; │││││││││││┌ @ array.jl:133 within `size'
%1 = bitcast {}* %0 to {}**
%2 = getelementptr inbounds {}*, {}** %1, i64 3
%3 = bitcast {}** %2 to i64*
%4 = load i64, i64* %3, align 8
; ││││││││└└└└
; ││││││││┌ @ reduce.jl:399 within `_mapreduce'
switch i64 %4, label %L14 [
i64 0, label %L43
i64 1, label %L12
]
L12: ; preds = %top
; ││││││││└
; ││││││││┌ @ reduce.jl:402 within `_mapreduce'
; │││││││││┌ @ array.jl:805 within `getindex'
%5 = bitcast {}* %0 to half**
%6 = load half*, half** %5, align 8
%7 = load half, half* %6, align 2
; ││││││││└└
; ││││││││┌ @ reduce.jl:403 within `_mapreduce'
br label %L43
L14: ; preds = %top
; ││││││││└
; ││││││││┌ @ reduce.jl:404 within `_mapreduce'
; │││││││││┌ @ int.jl:83 within `<'
%8 = icmp ugt i64 %4, 15
; │││││││││└
br i1 %8, label %L30, label %L16
L16: ; preds = %L14
; ││││││││└
; ││││││││┌ @ reduce.jl:406 within `_mapreduce'
; │││││││││┌ @ array.jl:805 within `getindex'
%9 = bitcast {}* %0 to half**
%10 = load half*, half** %9, align 8
%11 = load half, half* %10, align 2
; ││││││││└└
; ││││││││┌ @ reduce.jl:407 within `_mapreduce'
; │││││││││┌ @ array.jl:805 within `getindex'
%12 = getelementptr inbounds half, half* %10, i64 1
%13 = load half, half* %12, align 2
; ││││││││└└
; ││││││││┌ @ reduce.jl:408 within `_mapreduce'
; │││││││││┌ @ reduce.jl:27 within `add_sum'
; ││││││││││┌ @ float.jl:324 within `+'
%14 = bitcast half %11 to i16
%15 = call float @julia__gnu_h2f_ieee(i16 %14)
%16 = bitcast half %13 to i16
%17 = call float @julia__gnu_h2f_ieee(i16 %16)
%18 = fadd float %15, %17
%19 = call i16 @julia__gnu_f2h_ieee(float %18)
%20 = bitcast i16 %19 to half
; ││││││││└└└
; ││││││││┌ @ reduce.jl:409 within `_mapreduce'
; │││││││││┌ @ int.jl:83 within `<'
%.not910 = icmp ugt i64 %4, 2
; │││││││││└
br i1 %.not910, label %L25, label %L43
L25: ; preds = %L25, %L16
%value_phi212 = phi i64 [ %21, %L25 ], [ 2, %L16 ]
%value_phi111 = phi half [ %30, %L25 ], [ %20, %L16 ]
; ││││││││└
; ││││││││┌ @ reduce.jl:410 within `_mapreduce'
; │││││││││┌ @ int.jl:87 within `+'
%21 = add nuw nsw i64 %value_phi212, 1
; │││││││││└
; │││││││││┌ @ array.jl:805 within `getindex'
%22 = getelementptr inbounds half, half* %10, i64 %value_phi212
%23 = load half, half* %22, align 2
; ││││││││└└
; ││││││││┌ @ reduce.jl:411 within `_mapreduce'
; │││││││││┌ @ reduce.jl:27 within `add_sum'
; ││││││││││┌ @ float.jl:324 within `+'
%24 = bitcast half %value_phi111 to i16
%25 = call float @julia__gnu_h2f_ieee(i16 %24)
%26 = bitcast half %23 to i16
%27 = call float @julia__gnu_h2f_ieee(i16 %26)
%28 = fadd float %25, %27
%29 = call i16 @julia__gnu_f2h_ieee(float %28)
%30 = bitcast i16 %29 to half
; ││││││││└└└
; ││││││││┌ @ reduce.jl:409 within `_mapreduce'
; │││││││││┌ @ int.jl:83 within `<'
%exitcond.not = icmp eq i64 %21, %4
; │││││││││└
br i1 %exitcond.not, label %L43, label %L25
L30: ; preds = %L14
; ││││││││└
; ││││││││┌ @ reduce.jl:415 within `_mapreduce'
; │││││││││┌ @ reduce.jl:257 within `mapreduce_impl'
%31 = call half @j_mapreduce_impl_170({}* nonnull %0, i64 signext 1, i64 signext %4, i64 signext 1024)
; │││││││││└
br label %L43
L43: ; preds = %L30, %L25, %L16, %L12, %top
%value_phi = phi half [ %7, %L12 ], [ %31, %L30 ], [ 0xH0000, %top ], [ %20, %L16 ], [ %30, %L25 ]
; └└└└└└└└└
ret half %value_phi
}
@code_native
$ julia -e 'using InteractiveUtils; @code_native sum(Float16[])'
JIT session error: Symbols not found: [ __gnu_f2h_ieee, __gnu_h2f_ieee ]
Failure value returned from cantFail wrapped call
Failed to materialize symbols: { (JuliaOJIT, { jfptr_mapreduce_impl_92, julia_mapreduce_impl_91 }) }
UNREACHABLE executed at [...]/usr/include/llvm/Support/Error.h:749!
signal (6): Aborted
in expression starting at none:1
pthread_kill at /lib/x86_64-linux-gnu/libc.so.6 (unknown line)
raise at /lib/x86_64-linux-gnu/libc.so.6 (unknown line)
abort at /lib/x86_64-linux-gnu/libc.so.6 (unknown line)
_ZN4llvm25llvm_unreachable_internalEPKcS1_j at [...]/bin/../lib/julia/libLLVM-11jl.so (unknown line)
unknown function (ip: 0x7ff68aa137e7)
unknown function (ip: 0x7ff68aa14b98)
_jl_compile_codeinst at [...]/src/jitlayers.cpp:1133
jl_generate_fptr at [...]/src/jitlayers.cpp:352
jl_dump_method_asm at [...]/src/jitlayers.cpp:415
_dump_function_linfo_native at [...]/usr/share/julia/stdlib/v1.6/InteractiveUtils/src/codeview.jl:132
_dump_function at [...]/usr/share/julia/stdlib/v1.6/InteractiveUtils/src/codeview.jl:114
_dump_function at [...]/usr/share/julia/stdlib/v1.6/InteractiveUtils/src/codeview.jl:100 [inlined]
#code_native#30 at [...]/usr/share/julia/stdlib/v1.6/InteractiveUtils/src/codeview.jl:192
code_native##kw at [...]/usr/share/julia/stdlib/v1.6/InteractiveUtils/src/codeview.jl:192
unknown function (ip: 0x7ff65c48edc5)
#code_native#31 at [...]/usr/share/julia/stdlib/v1.6/InteractiveUtils/src/codeview.jl:199
code_native at [...]/usr/share/julia/stdlib/v1.6/InteractiveUtils/src/codeview.jl:199
jl_apply at [...]/src/julia.h:1703 [inlined]
do_call at [...]/src/interpreter.c:115
eval_value at [...]/src/interpreter.c:204
eval_stmt_value at [...]/src/interpreter.c:155 [inlined]
eval_body at [...]/src/interpreter.c:576
jl_interpret_toplevel_thunk at [...]/src/interpreter.c:670
jl_toplevel_eval_flex at [...]/src/toplevel.c:877
jl_toplevel_eval_flex at [...]/src/toplevel.c:825
jl_toplevel_eval_flex at [...]/src/toplevel.c:825
jl_toplevel_eval_in at [...]/src/toplevel.c:929
eval at ./boot.jl:360 [inlined]
exec_options at ./client.jl:261
_start at ./client.jl:485
jfptr__start_39518 at [...]/lib/julia/sys.so (unknown line)
true_main at [...]/src/julia.h:1703
repl_entrypoint at [...]/src/jlapi.c:702
main at [...]/cli/loader_exe.c:51
unknown function (ip: 0x7ff68af84d8f)
__libc_start_main at /lib/x86_64-linux-gnu/libc.so.6 (unknown line)
_start at [...]/bin/julia (unknown line)
Allocations: 1511101 (Pool: 1510607; Big: 494); GC: 2
Aborted (core dumped) |
I don't think you looked at the right function, since your error says the symbols that failed to materialize belong to `julia_mapreduce_impl_91`, not to `julia_sum_168` |
I'm afraid the problem is not related to the attempt to apply the mentioned patches. Building unpatched
$ readelf -sV $(gcc --print-file-name=libgcc_s.so.1) | sed -n 's/.*@@GCC_//p' | sort -uV | tail -1
12.0.0
$ gcc --version
gcc (Ubuntu 11.2.0-19ubuntu1) 11.2.0
[...] $ [...]/julia
julia> VERSION
1.9.0-DEV.649
julia> sum(Float16[])
JIT session error: Symbols not found: [ __gnu_f2h_ieee, __gnu_h2f_ieee ]
Failure value returned from cantFail wrapped call
Failed to materialize symbols: { (JuliaOJIT, { julia_mapreduce_impl_68, jfptr_mapreduce_impl_69 }) }
UNREACHABLE executed at [...]/usr/include/llvm/Support/Error.h:782!
[...]
stacktrace
signal (6): Aborted
in expression starting at REPL[4]:1
pthread_kill at /lib/x86_64-linux-gnu/libc.so.6 (unknown line)
raise at /lib/x86_64-linux-gnu/libc.so.6 (unknown line)
abort at /lib/x86_64-linux-gnu/libc.so.6 (unknown line)
_ZN4llvm25llvm_unreachable_internalEPKcS1_j at [...]/bin/../lib/julia/libLLVM-13jl.so (unknown line)
cantFail<llvm::JITEvaluatedSymbol> at [...]/usr/include/llvm/Support/Error.h:782 [inlined]
addModule at [...]/src/jitlayers.cpp:1147
jl_add_to_ee at [...]/src/jitlayers.cpp:1518
jl_add_to_ee at [...]/src/jitlayers.cpp:1540 [inlined]
_jl_compile_codeinst at [...]/src/jitlayers.cpp:162
jl_generate_fptr_impl at [...]/src/jitlayers.cpp:357
jl_compile_method_internal at [...]/src/gf.c:2106 [inlined]
jl_compile_method_internal at [...]/src/gf.c:2047
_jl_invoke at [...]/src/gf.c:2384 [inlined]
ijl_apply_generic at [...]/src/gf.c:2574
jl_apply at [...]/src/julia.h:1841 [inlined]
do_call at [...]/src/interpreter.c:126
eval_value at [...]/src/interpreter.c:215
eval_stmt_value at [...]/src/interpreter.c:166 [inlined]
eval_body at [...]/src/interpreter.c:612
jl_interpret_toplevel_thunk at [...]/src/interpreter.c:750
jl_toplevel_eval_flex at [...]/src/toplevel.c:912
jl_toplevel_eval_flex at [...]/src/toplevel.c:856
ijl_toplevel_eval_in at [...]/src/toplevel.c:971
eval at ./boot.jl:370 [inlined]
eval_user_input at [...]/usr/share/julia/stdlib/v1.9/REPL/src/REPL.jl:152
repl_backend_loop at [...]/usr/share/julia/stdlib/v1.9/REPL/src/REPL.jl:247
start_repl_backend at [...]/usr/share/julia/stdlib/v1.9/REPL/src/REPL.jl:232
#run_repl#47 at [...]/usr/share/julia/stdlib/v1.9/REPL/src/REPL.jl:369
run_repl at [...]/usr/share/julia/stdlib/v1.9/REPL/src/REPL.jl:355
jfptr_run_repl_52325 at [...]/lib/julia/sys.so (unknown line)
#965 at ./client.jl:415
jfptr_YY.965_40969 at [...]/lib/julia/sys.so (unknown line)
jl_apply at [...]/src/julia.h:1841 [inlined]
jl_f__call_latest at [...]/src/builtins.c:774
run_main_repl at ./client.jl:400
exec_options at ./client.jl:314
_start at ./client.jl:516
jfptr__start_28292 at [...]/lib/julia/sys.so (unknown line)
jl_apply at [...]/src/julia.h:1841 [inlined]
true_main at [...]/src/jlapi.c:566
jl_repl_entrypoint at [...]/src/jlapi.c:710
main at [...]/cli/loader_exe.c:59
unknown function (ip: 0x7f4747f51d8f)
__libc_start_main at /lib/x86_64-linux-gnu/libc.so.6 (unknown line)
_start at [...]/bin/julia (unknown line)
Allocations: 2943 (Pool: 2930; Big: 13); GC: 0
Aborted (core dumped) |
yes, it is likely just indicating it is incomplete for that version, so I need to know what you see for the IR so that it can be extended to cover that case |
Here is the IR:
define half @julia_sum_214({}* nonnull align 16 dereferenceable(40) %0) #0 {
top:
%1 = bitcast {}* %0 to { i8*, i64, i16, i16, i32 }*
%2 = getelementptr inbounds { i8*, i64, i16, i16, i32 }, { i8*, i64, i16, i16, i32 }* %1, i64 0, i32 1
%3 = load i64, i64* %2, align 8
switch i64 %3, label %L14 [
i64 0, label %L45
i64 1, label %L12
]
L12: ; preds = %top
%4 = bitcast {}* %0 to half**
%5 = load half*, half** %4, align 8
%6 = load half, half* %5, align 2
br label %L45
L14: ; preds = %top
%7 = icmp ugt i64 %3, 15
br i1 %7, label %L31, label %L16
L16: ; preds = %L14
%8 = bitcast {}* %0 to half**
%9 = load half*, half** %8, align 8
%10 = load half, half* %9, align 2
%11 = getelementptr inbounds half, half* %9, i64 1
%12 = load half, half* %11, align 2
%13 = bitcast half %10 to i16
%14 = call float @julia__gnu_h2f_ieee(i16 %13)
%15 = bitcast half %12 to i16
%16 = call float @julia__gnu_h2f_ieee(i16 %15)
%17 = fadd float %14, %16
%18 = call i16 @julia__gnu_f2h_ieee(float %17)
%19 = bitcast i16 %18 to half
%.not910 = icmp ugt i64 %3, 2
br i1 %.not910, label %L26, label %L45
L26: ; preds = %L26, %L16
%value_phi212 = phi i64 [ %20, %L26 ], [ 2, %L16 ]
%value_phi111 = phi half [ %29, %L26 ], [ %19, %L16 ]
%20 = add nuw nsw i64 %value_phi212, 1
%21 = getelementptr inbounds half, half* %9, i64 %value_phi212
%22 = load half, half* %21, align 2
%23 = bitcast half %value_phi111 to i16
%24 = call float @julia__gnu_h2f_ieee(i16 %23)
%25 = bitcast half %22 to i16
%26 = call float @julia__gnu_h2f_ieee(i16 %25)
%27 = fadd float %24, %26
%28 = call i16 @julia__gnu_f2h_ieee(float %27)
%29 = bitcast i16 %28 to half
%exitcond.not = icmp eq i64 %20, %3
br i1 %exitcond.not, label %L45, label %L26
L31: ; preds = %L14
%30 = call half @j_mapreduce_impl_216({}* nonnull %0, i64 signext 1, i64 signext %3, i64 signext 1024) #0
br label %L45
L45: ; preds = %L31, %L26, %L16, %L12, %top
%value_phi = phi half [ %6, %L12 ], [ %30, %L31 ], [ 0xH0000, %top ], [ %19, %L16 ], [ %29, %L26 ]
ret half %value_phi
} |
Also broke PPC: #45556 (comment) |
This reverts commit eb82f18.
This reverts commit eb82f18.
Fixes #44829, until llvm fixes the support for these intrinsics itself
Replaces #44975