Skip to content

Commit

Permalink
Add FMA multiversioning.
Browse files Browse the repository at this point in the history
  • Loading branch information
maleadt committed Nov 16, 2021
1 parent 73f46aa commit cdb2c3c
Show file tree
Hide file tree
Showing 9 changed files with 132 additions and 16 deletions.
19 changes: 5 additions & 14 deletions base/floatfuncs.jl
Original file line number Diff line number Diff line change
Expand Up @@ -406,22 +406,13 @@ function fma_emulated(a::Float64, b::Float64,c::Float64)
end
# fma_llvm forwards directly to `fma_float`; one method per supported float width.
function fma_llvm(x::Float32, y::Float32, z::Float32)
    return fma_float(x, y, z)
end
function fma_llvm(x::Float64, y::Float64, z::Float64)
    return fma_float(x, y, z)
end

# Disable LLVM's fma if it is incorrect, e.g. because LLVM falls back
# onto a broken system libm; if so, use a software emulated fma.
# The check below calls fma_llvm on two inputs and compares against the
# expected results:
# 1.0000305f0 = 1 + 1/2^15
# 1.0000000009313226 = 1 + 1/2^30
# If fma_llvm() clobbers the rounding mode, the result of 0.1 + 0.2 will be 0.3
# instead of the properly-rounded 0.30000000000000004; check after calling fma
# fma_llvm is never selected when Sys.ARCH is :i686.
# TODO actually detect fma in hardware and switch on that.
if (Sys.ARCH !== :i686 && fma_llvm(1.0000305f0, 1.0000305f0, -1.0f0) == 6.103609f-5 &&
(fma_llvm(1.0000000009313226, 1.0000000009313226, -1.0) ==
1.8626451500983188e-9) && 0.1 + 0.2 == 0.30000000000000004)
# All checks passed: use LLVM's fma.
fma(x::Float32, y::Float32, z::Float32) = fma_llvm(x,y,z)
fma(x::Float64, y::Float64, z::Float64) = fma_llvm(x,y,z)
else
# At least one check failed: fall back to the software emulation.
fma(x::Float32, y::Float32, z::Float32) = fma_emulated(x,y,z)
fma(x::Float64, y::Float64, z::Float64) = fma_emulated(x,y,z)
end
# have_fma() is true when the `julia.cpu.have_fma` llvmcall yields 1.
function have_fma()
    return ccall("extern julia.cpu.have_fma", llvmcall, Int, ()) == 1
end

# Use LLVM's fma when have_fma() reports support, the emulated fma otherwise.
function fma(x::Float32, y::Float32, z::Float32)
    if have_fma()
        return fma_llvm(x, y, z)
    else
        return fma_emulated(x, y, z)
    end
end
function fma(x::Float64, y::Float64, z::Float64)
    if have_fma()
        return fma_llvm(x, y, z)
    else
        return fma_emulated(x, y, z)
    end
end

function fma(a::Float16, b::Float16, c::Float16)
    # Widen to Float32 and use muladd rather than fma, so that a fused
    # operation is not forced when the hardware doesn't have it.
    widened = muladd(Float32(a), Float32(b), Float32(c))
    return Float16(widened)
end
Expand Down
2 changes: 1 addition & 1 deletion src/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ RUNTIME_CODEGEN_SRCS := jitlayers aotcompile debuginfo disasm llvm-simdloop llvm
llvm-final-gc-lowering llvm-pass-helpers llvm-late-gc-lowering \
llvm-lower-handlers llvm-gc-invariant-verifier llvm-propagate-addrspaces \
llvm-multiversioning llvm-alloc-opt cgmemmgr llvm-remove-addrspaces \
llvm-remove-ni llvm-julia-licm llvm-demote-float16
llvm-remove-ni llvm-julia-licm llvm-demote-float16 llvm-cpufeatures
FLAGS += -I$(shell $(LLVM_CONFIG_HOST) --includedir)
CG_LLVM_LIBS := all
ifeq ($(USE_POLLY),1)
Expand Down
5 changes: 5 additions & 0 deletions src/aotcompile.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -645,6 +645,10 @@ void addOptimizationPasses(legacy::PassManagerBase *PM, int opt_level,
PM->add(createLowerSimdLoopPass()); // Annotate loop marked with "loopinfo" as LLVM parallel loop
if (dump_native)
PM->add(createMultiVersioningPass());
PM->add(createCPUFeaturesPass());
// minimal clean-up to get rid of CPU feature checks
PM->add(createInstSimplifyLegacyPass());
PM->add(createCFGSimplificationPass(simplifyCFGOptions));
#if defined(_COMPILER_ASAN_ENABLED_)
PM->add(createAddressSanitizerFunctionPass());
#endif
Expand Down Expand Up @@ -680,6 +684,7 @@ void addOptimizationPasses(legacy::PassManagerBase *PM, int opt_level,
PM->add(createCFGSimplificationPass(simplifyCFGOptions));
if (dump_native)
PM->add(createMultiVersioningPass());
PM->add(createCPUFeaturesPass());
PM->add(createSROAPass());
PM->add(createInstSimplifyLegacyPass());
PM->add(createJumpThreadingPass());
Expand Down
1 change: 1 addition & 0 deletions src/jitlayers.h
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,7 @@ Pass *createJuliaLICMPass();
Pass *createMultiVersioningPass();
Pass *createAllocOptPass();
Pass *createDemoteFloat16Pass();
Pass *createCPUFeaturesPass();
// Whether the Function is an llvm or julia intrinsic.
static inline bool isIntrinsicFunction(Function *F)
{
Expand Down
110 changes: 110 additions & 0 deletions src/llvm-cpufeatures.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
// This file is a part of Julia. License is MIT: https://julialang.org/license
//
// Lower intrinsics that expose subtarget information to the language. This makes it
// possible to write code that changes behavior based on, e.g., the availability of
// specific CPU features.
//
// The following intrinsics are supported:
//   - julia.cpu.have_fma: returns 1 if the target supports hardware FMA and 0 otherwise
//     (currently only reported for x86_64 targets whose feature string contains "+fma")
//
// XXX: can / do we want to make this a codegen pass to enable querying TargetPassConfig?

#include "llvm-version.h"

#include <llvm/IR/Module.h>
#include <llvm/IR/Constants.h>
#include <llvm/IR/PassManager.h>
#include <llvm/IR/LegacyPassManager.h>
#include <llvm/Target/TargetMachine.h>
#include <llvm/Support/Debug.h>

#include "julia.h"

#define DEBUG_TYPE "cpufeatures"

using namespace llvm;

extern TargetMachine *jl_TargetMachine;

namespace {

// Lower a single use of the `julia.cpu.have_fma` intrinsic.
//
// `F` is the function that contains `I`. Every use of `I` is replaced by an
// integer constant of `I`'s type: 1 when the target architecture is x86_64 and
// the effective feature string contains "+fma", 0 otherwise. The feature string
// comes from `F`'s "target-features" attribute when that attribute is set, and
// from the global `jl_TargetMachine` otherwise.
//
// `I` itself is not erased here; the caller is responsible for removing it.
static void lowerHaveFMA(Function &F, Instruction *I) {
    Triple TheTriple = Triple(jl_TargetMachine->getTargetTriple());

    Attribute FSAttr = F.getFnAttribute("target-features");
    StringRef FS =
        FSAttr.isValid() ? FSAttr.getValueAsString() : jl_TargetMachine->getTargetFeatureString();

    // NOTE(review): this is a substring search, so a feature such as "+fma4"
    // also matches, and a later "-fma" is not taken into account -- confirm
    // that this is the intended behavior.
    const bool HaveFMA =
        TheTriple.getArch() == Triple::x86_64 && FS.find("+fma") != StringRef::npos;

    I->replaceAllUsesWith(ConstantInt::get(I->getType(), HaveFMA ? 1 : 0));
}

// Lower every use of the `julia.cpu.have_fma` function found in `M` and erase
// the lowered instructions. Returns true when at least one instruction was
// lowered (i.e. the module was modified), false otherwise.
static bool lowerCPUFeatures(Module &M)
{
    SmallVector<Instruction*,6> Lowered;

    Function *HaveFMA = M.getFunction("julia.cpu.have_fma");
    if (HaveFMA != nullptr) {
        for (Use &IntrinsicUse : HaveFMA->uses()) {
            // Every user of the intrinsic is expected to be an instruction.
            Instruction *Inst = cast<Instruction>(IntrinsicUse.getUser());
            lowerHaveFMA(*Inst->getParent()->getParent(), Inst);
            // Erasure is deferred until the walk over the use list is finished.
            Lowered.push_back(Inst);
        }
    }

    for (Instruction *Inst : Lowered)
        Inst->eraseFromParent();

    return !Lowered.empty();
}
}

// New pass manager entry point for the CPU feature lowering.
struct CPUFeatures : PassInfoMixin<CPUFeatures> {
    PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
};

// Runs lowerCPUFeatures() on `M`. Reports all analyses as preserved only when
// the module was left untouched; when instructions were replaced and erased,
// conservatively reports that nothing is preserved.
PreservedAnalyses CPUFeatures::run(Module &M, ModuleAnalysisManager &AM)
{
    if (lowerCPUFeatures(M))
        return PreservedAnalyses::none();
    return PreservedAnalyses::all();
}

namespace {
struct CPUFeaturesLegacy : public ModulePass {
static char ID;
CPUFeaturesLegacy() : ModulePass(ID) {};

bool runOnModule(Module &M)
{
return lowerCPUFeatures(M);
}
};

// Storage for the legacy pass identifier.
char CPUFeaturesLegacy::ID = 0;

// Registers the legacy pass under the argument name "CPUFeatures".
static RegisterPass<CPUFeaturesLegacy> Y(
    "CPUFeatures", "Lower calls to CPU feature testing intrinsics.",
    /*CFGOnly=*/false, /*is_analysis=*/false);
}

// Factory for the legacy CPU feature lowering pass; returns a newly allocated
// CPUFeaturesLegacy instance.
Pass *createCPUFeaturesPass()
{
    Pass *LegacyPass = new CPUFeaturesLegacy();
    return LegacyPass;
}

// C entry point: adds a newly created CPU feature lowering pass to the pass
// manager referred to by `PM`.
extern "C" JL_DLLEXPORT void LLVMExtraAddCPUFeaturesPass_impl(LLVMPassManagerRef PM)
{
    auto *Manager = unwrap(PM);
    Manager->add(createCPUFeaturesPass());
}
5 changes: 4 additions & 1 deletion src/llvm-multiversioning.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ namespace {

// These are valid detail cloning conditions in the target flags.
constexpr uint32_t clone_mask =
JL_TARGET_CLONE_LOOP | JL_TARGET_CLONE_SIMD | JL_TARGET_CLONE_MATH;
JL_TARGET_CLONE_LOOP | JL_TARGET_CLONE_SIMD | JL_TARGET_CLONE_MATH | JL_TARGET_CLONE_CPU;

struct MultiVersioning;

Expand Down Expand Up @@ -469,6 +469,9 @@ uint32_t CloneCtx::collect_func_info(Function &F)
if (name.startswith("llvm.muladd.") || name.startswith("llvm.fma.")) {
flag |= JL_TARGET_CLONE_MATH;
}
else if (name.startswith("julia.cpu.")) {
flag |= JL_TARGET_CLONE_CPU;
}
}
}
else if (auto store = dyn_cast<StoreInst>(&I)) {
Expand Down
2 changes: 2 additions & 0 deletions src/processor.h
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,8 @@ enum {
JL_TARGET_OPTSIZE = 1 << 6,
// Only optimize for size for this target
JL_TARGET_MINSIZE = 1 << 7,
// Clone when the function queries CPU features
JL_TARGET_CLONE_CPU = 1 << 8,
};

#define JL_FEATURE_DEF_NAME(name, bit, llvmver, str) JL_FEATURE_DEF(name, bit, llvmver)
Expand Down
2 changes: 2 additions & 0 deletions src/processor_arm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1562,6 +1562,8 @@ static void ensure_jit_target(bool imaging)
auto &t = jit_targets[i];
if (t.en.flags & JL_TARGET_CLONE_ALL)
continue;
// Always clone when code checks CPU features
t.en.flags |= JL_TARGET_CLONE_CPU;
// The most useful one in general...
t.en.flags |= JL_TARGET_CLONE_LOOP;
#ifdef _CPU_ARM_
Expand Down
2 changes: 2 additions & 0 deletions src/processor_x86.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -877,6 +877,8 @@ static void ensure_jit_target(bool imaging)
auto &t = jit_targets[i];
if (t.en.flags & JL_TARGET_CLONE_ALL)
continue;
// Always clone when code checks CPU features
t.en.flags |= JL_TARGET_CLONE_CPU;
// The most useful one in general...
t.en.flags |= JL_TARGET_CLONE_LOOP;
auto &features0 = jit_targets[t.base].en.features;
Expand Down

0 comments on commit cdb2c3c

Please sign in to comment.