Fix fundamental confusion about target/tune CPU (#6765)

* Fix fundamental confusion about target/tune CPU Sooo. Uh, remember when in #6655 we've agreed that we want to add support to precisely specify the CPU for which the code should be *tuned* for, but not *targeted* for. Aka, similar to clang's `-mtune=` option, that does not affect the ISA set selection? So guess what, that's not what we did, apparently. `CodeGen_LLVM::mcpu()` / `halide_mcpu` actually do specify the *target* CPU. It was obvious in retrospect, because e.g. `CodeGen_X86::mattrs()` does not, in fact, ever specify `+avx2`, yet we get AVX2 :) So we've unintentionally added `-march=` support. Oops. While i'd like to add `-march=` support, that was not the goal here. Fixing this is complicated by the fact that `llvm::Target::createTargetMachine()` only takes `CPU Target` string, you can't specify `CPU Tune`. But this is actually a blessing in disguise, because it allows us to fix another bug at the same time: There is a problem with halide "compile to llvm ir assembly", a lot of information from Halide Target is not //really// lowered into LLVM Module, but is embedded as a metadata, that is then extracted by halide `make_target_machine()`. While that is not a problem in itself, it makes it *impossible* to dump the LLVM IR, and manually play with it, because e.g. the CPU [Target] and Attributes (ISA set) are not actually lowered into the form LLVM understands, but are in some halide-specific metadata. So, to fix the first bug, we must lower the CPU Tune into per-function `"tune-cpu"` metadata, and while there we might as well lower `"target-cpu"` and `"target-features"` similarly. * Address review notes * Hopefully silence bogus issue reported by ancient GCC * Call `set_function_attributes_from_halide_target_options()` when JIT compiling * Fix grammar
halide · May 19, 2022 · b5f024f · b5f024f
1 parent 61f6af7
commit b5f024f
Show file tree

Hide file tree

Showing 13 changed files with 137 additions and 72 deletions.
diff --git a/src/CodeGen_ARM.cpp b/src/CodeGen_ARM.cpp
@@ -72,7 +72,8 @@ class CodeGen_ARM : public CodeGen_Posix {
  };
  vector<Pattern> casts, calls, averagings, negations;
 
- string mcpu() const override;
+ string mcpu_target() const override;
+ string mcpu_tune() const override;
  string mattrs() const override;
  bool use_soft_float_abi() const override;
  int native_vector_bits() const override;
@@ -1392,7 +1393,7 @@ Type CodeGen_ARM::upgrade_type_for_storage(const Type &t) const {
  return CodeGen_Posix::upgrade_type_for_storage(t);
 }
 
-string CodeGen_ARM::mcpu() const {
+string CodeGen_ARM::mcpu_target() const {
  if (target.bits == 32) {
  if (target.has_feature(Target::ARMv7s)) {
  return "swift";
@@ -1410,6 +1411,10 @@ string CodeGen_ARM::mcpu() const {
  }
 }
 
+string CodeGen_ARM::mcpu_tune() const {
+ return mcpu_target();
+}
+
 string CodeGen_ARM::mattrs() const {
  if (target.bits == 32) {
  if (target.has_feature(Target::ARMv7s)) {

diff --git a/src/CodeGen_Hexagon.cpp b/src/CodeGen_Hexagon.cpp
@@ -42,7 +42,8 @@ class CodeGen_Hexagon : public CodeGen_Posix {
 
  void init_module() override;
 
- std::string mcpu() const override;
+ std::string mcpu_target() const override;
+ std::string mcpu_tune() const override;
  std::string mattrs() const override;
  int isa_version;
  bool use_soft_float_abi() const override;
@@ -1788,7 +1789,7 @@ Value *CodeGen_Hexagon::call_intrin(llvm::Type *result_type, const string &name,
  fn, std::move(args));
 }
 
-string CodeGen_Hexagon::mcpu() const {
+string CodeGen_Hexagon::mcpu_target() const {
  if (target.has_feature(Halide::Target::HVX_v66)) {
  return "hexagonv66";
  } else if (target.has_feature(Halide::Target::HVX_v65)) {
@@ -1798,6 +1799,10 @@ string CodeGen_Hexagon::mcpu() const {
  }
 }
 
+string CodeGen_Hexagon::mcpu_tune() const {
+ return mcpu_target();
+}
+
 string CodeGen_Hexagon::mattrs() const {
  std::stringstream attrs;
  attrs << "+hvx-length128b";

diff --git a/src/CodeGen_Internal.cpp b/src/CodeGen_Internal.cpp
@@ -590,16 +590,15 @@ bool get_md_string(llvm::Metadata *value, std::string &result) {
  return false;
 }
 
-void get_target_options(const llvm::Module &module, llvm::TargetOptions &options, std::string &mcpu, std::string &mattrs) {
+void get_target_options(const llvm::Module &module, llvm::TargetOptions &options) {
  bool use_soft_float_abi = false;
  get_md_bool(module.getModuleFlag("halide_use_soft_float_abi"), use_soft_float_abi);
- get_md_string(module.getModuleFlag("halide_mcpu"), mcpu);
- get_md_string(module.getModuleFlag("halide_mattrs"), mattrs);
  std::string mabi;
  get_md_string(module.getModuleFlag("halide_mabi"), mabi);
  bool use_pic = true;
  get_md_bool(module.getModuleFlag("halide_use_pic"), use_pic);
 
+ // FIXME: can this be migrated into `set_function_attributes_from_halide_target_options()`?
  bool per_instruction_fast_math_flags = false;
  get_md_bool(module.getModuleFlag("halide_per_instruction_fast_math_flags"), per_instruction_fast_math_flags);
 
@@ -629,9 +628,14 @@ void clone_target_options(const llvm::Module &from, llvm::Module &to) {
  to.addModuleFlag(llvm::Module::Warning, "halide_use_soft_float_abi", use_soft_float_abi ? 1 : 0);
  }
 
- std::string mcpu;
- if (get_md_string(from.getModuleFlag("halide_mcpu"), mcpu)) {
- to.addModuleFlag(llvm::Module::Warning, "halide_mcpu", llvm::MDString::get(context, mcpu));
+ std::string mcpu_target;
+ if (get_md_string(from.getModuleFlag("halide_mcpu_target"), mcpu_target)) {
+ to.addModuleFlag(llvm::Module::Warning, "halide_mcpu_target", llvm::MDString::get(context, mcpu_target));
+ }
+
+ std::string mcpu_tune;
+ if (get_md_string(from.getModuleFlag("halide_mcpu_tune"), mcpu_tune)) {
+ to.addModuleFlag(llvm::Module::Warning, "halide_mcpu_tune", llvm::MDString::get(context, mcpu_tune));
  }
 
  std::string mattrs;
@@ -657,9 +661,7 @@ std::unique_ptr<llvm::TargetMachine> make_target_machine(const llvm::Module &mod
  internal_assert(llvm_target) << "Could not create LLVM target for " << triple.str() << "\n";
 
  llvm::TargetOptions options;
- std::string mcpu = "";
- std::string mattrs = "";
- get_target_options(module, options, mcpu, mattrs);
+ get_target_options(module, options);
 
  bool use_pic = true;
  get_md_bool(module.getModuleFlag("halide_use_pic"), use_pic);
@@ -668,18 +670,29 @@ std::unique_ptr<llvm::TargetMachine> make_target_machine(const llvm::Module &mod
  get_md_bool(module.getModuleFlag("halide_use_large_code_model"), use_large_code_model);
 
  auto *tm = llvm_target->createTargetMachine(module.getTargetTriple(),
- mcpu, mattrs,
+ /*CPU target=*/"", /*Features=*/"",
  options,
  use_pic ? llvm::Reloc::PIC_ : llvm::Reloc::Static,
  use_large_code_model ? llvm::CodeModel::Large : llvm::CodeModel::Small,
  llvm::CodeGenOpt::Aggressive);
  return std::unique_ptr<llvm::TargetMachine>(tm);
 }
 
-void set_function_attributes_for_target(llvm::Function *fn, const Target &t) {
+void set_function_attributes_from_halide_target_options(llvm::Function &fn) {
+ llvm::Module &module = *fn.getParent();
+
+ std::string mcpu_target, mcpu_tune, mattrs;
+ get_md_string(module.getModuleFlag("halide_mcpu_target"), mcpu_target);
+ get_md_string(module.getModuleFlag("halide_mcpu_tune"), mcpu_tune);
+ get_md_string(module.getModuleFlag("halide_mattrs"), mattrs);
+
+ fn.addFnAttr("target-cpu", mcpu_target);
+ fn.addFnAttr("tune-cpu", mcpu_tune);
+ fn.addFnAttr("target-features", mattrs);
+
  // Turn off approximate reciprocals for division. It's too
  // inaccurate even for us.
- fn->addFnAttr("reciprocal-estimates", "none");
+ fn.addFnAttr("reciprocal-estimates", "none");
 }
 
 void embed_bitcode(llvm::Module *M, const string &halide_command) {

diff --git a/src/CodeGen_Internal.h b/src/CodeGen_Internal.h
@@ -92,17 +92,17 @@ Expr lower_signed_shift_right(const Expr &a, const Expr &b);
 /** Reduce a mux intrinsic to a select tree */
 Expr lower_mux(const Call *mux);
 
-/** Given an llvm::Module, set llvm:TargetOptions, cpu and attr information */
-void get_target_options(const llvm::Module &module, llvm::TargetOptions &options, std::string &mcpu, std::string &mattrs);
+/** Given an llvm::Module, set llvm:TargetOptions information */
+void get_target_options(const llvm::Module &module, llvm::TargetOptions &options);
 
 /** Given two llvm::Modules, clone target options from one to the other */
 void clone_target_options(const llvm::Module &from, llvm::Module &to);
 
 /** Given an llvm::Module, get or create an llvm:TargetMachine */
 std::unique_ptr<llvm::TargetMachine> make_target_machine(const llvm::Module &module);
 
-/** Set the appropriate llvm Function attributes given a Target. */
-void set_function_attributes_for_target(llvm::Function *, const Target &);
+/** Set the appropriate llvm Function attributes given the Halide Target. */
+void set_function_attributes_from_halide_target_options(llvm::Function &);
 
 /** Save a copy of the llvm IR currently represented by the module as
  * data in the __LLVM,__bitcode section. Emulates clang's

diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp
@@ -455,7 +455,8 @@ void CodeGen_LLVM::init_codegen(const std::string &name, bool any_strict_float)
 
  // Add some target specific info to the module as metadata.
  module->addModuleFlag(llvm::Module::Warning, "halide_use_soft_float_abi", use_soft_float_abi() ? 1 : 0);
- module->addModuleFlag(llvm::Module::Warning, "halide_mcpu", MDString::get(*context, mcpu()));
+ module->addModuleFlag(llvm::Module::Warning, "halide_mcpu_target", MDString::get(*context, mcpu_target()));
+ module->addModuleFlag(llvm::Module::Warning, "halide_mcpu_tune", MDString::get(*context, mcpu_tune()));
  module->addModuleFlag(llvm::Module::Warning, "halide_mattrs", MDString::get(*context, mattrs()));
  module->addModuleFlag(llvm::Module::Warning, "halide_mabi", MDString::get(*context, mabi()));
  module->addModuleFlag(llvm::Module::Warning, "halide_use_pic", use_pic() ? 1 : 0);
@@ -523,7 +524,7 @@ std::unique_ptr<llvm::Module> CodeGen_LLVM::compile(const Module &input) {
  }
  FunctionType *func_t = FunctionType::get(i32_t, arg_types, false);
  function = llvm::Function::Create(func_t, llvm_linkage(f.linkage), names.extern_name, module.get());
- set_function_attributes_for_target(function, target);
+ set_function_attributes_from_halide_target_options(*function);
 
  // Mark the buffer args as no alias and save indication for add_argv_wrapper if needed
  std::vector<bool> buffer_args(f.args.size());
@@ -564,6 +565,8 @@ std::unique_ptr<llvm::Module> CodeGen_LLVM::compile(const Module &input) {
 }
 
 std::unique_ptr<llvm::Module> CodeGen_LLVM::finish_codegen() {
+ llvm::for_each(*module, set_function_attributes_from_halide_target_options);
+
  // Verify the module is ok
  internal_assert(!verifyModule(*module, &llvm::errs()));
  debug(2) << "Done generating llvm bitcode\n";

diff --git a/src/CodeGen_LLVM.h b/src/CodeGen_LLVM.h
@@ -106,11 +106,21 @@ class CodeGen_LLVM : public IRVisitor {
  virtual void end_func(const std::vector<LoweredArgument> &args);
  // @}
 
- /** What should be passed as -mcpu, -mattrs, and related for
- * compilation. The architecture-specific code generator should
- * define these. */
+ /** What should be passed as -mcpu (warning: implies attrs!), -mattrs,
+ * and related for compilation. The architecture-specific code generator
+ * should define these.
+ *
+ * `mcpu_target()` - target this specific CPU, in the sense of the allowed
+ * ISA sets *and* the CPU-specific tuning/assembly instruction scheduling.
+ *
+ * `mcpu_tune()` - expect that we will be running on this specific CPU,
+ * so perform CPU-specific tuning/assembly instruction scheduling, *but*
+ * DON'T sacrifice the portability, support running on other CPUs, only
+ * make use of the ISAs that are enabled by `mcpu_target()`+`mattrs()`.
+ */
  // @{
- virtual std::string mcpu() const = 0;
+ virtual std::string mcpu_target() const = 0;
+ virtual std::string mcpu_tune() const = 0;
  virtual std::string mattrs() const = 0;
  virtual std::string mabi() const;
  virtual bool use_soft_float_abi() const = 0;

diff --git a/src/CodeGen_MIPS.cpp b/src/CodeGen_MIPS.cpp
@@ -19,7 +19,8 @@ class CodeGen_MIPS : public CodeGen_Posix {
 protected:
  using CodeGen_Posix::visit;
 
- string mcpu() const override;
+ string mcpu_target() const override;
+ string mcpu_tune() const override;
  string mattrs() const override;
  bool use_soft_float_abi() const override;
  int native_vector_bits() const override;
@@ -29,14 +30,18 @@ CodeGen_MIPS::CodeGen_MIPS(const Target &t)
  : CodeGen_Posix(t) {
 }
 
-string CodeGen_MIPS::mcpu() const {
+string CodeGen_MIPS::mcpu_target() const {
  if (target.bits == 32) {
  return "";
  } else {
  return "";
  }
 }
 
+string CodeGen_MIPS::mcpu_tune() const {
+ return mcpu_target();
+}
+
 string CodeGen_MIPS::mattrs() const {
  if (target.bits == 32) {
  return "";

diff --git a/src/CodeGen_PTX_Dev.cpp b/src/CodeGen_PTX_Dev.cpp
@@ -91,7 +91,8 @@ class CodeGen_PTX_Dev : public CodeGen_LLVM, public CodeGen_GPU_Dev {
  // @}
 
  std::string march() const;
- std::string mcpu() const override;
+ std::string mcpu_target() const override;
+ std::string mcpu_tune() const override;
  std::string mattrs() const override;
  bool use_soft_float_abi() const override;
  int native_vector_bits() const override;
@@ -153,7 +154,7 @@ void CodeGen_PTX_Dev::add_kernel(Stmt stmt,
  // Make our function
  FunctionType *func_t = FunctionType::get(void_t, arg_types, false);
  function = llvm::Function::Create(func_t, llvm::Function::ExternalLinkage, name, module.get());
- set_function_attributes_for_target(function, target);
+ set_function_attributes_from_halide_target_options(*function);
 
  // Mark the buffer args as no alias
  for (size_t i = 0; i < args.size(); i++) {
@@ -542,7 +543,7 @@ string CodeGen_PTX_Dev::march() const {
  return "nvptx64";
 }
 
-string CodeGen_PTX_Dev::mcpu() const {
+string CodeGen_PTX_Dev::mcpu_target() const {
  if (target.has_feature(Target::CUDACapability86)) {
  return "sm_86";
  } else if (target.has_feature(Target::CUDACapability80)) {
@@ -566,6 +567,10 @@ string CodeGen_PTX_Dev::mcpu() const {
  }
 }
 
+string CodeGen_PTX_Dev::mcpu_tune() const {
+ return mcpu_target();
+}
+
 string CodeGen_PTX_Dev::mattrs() const {
  if (target.has_feature(Target::CUDACapability86)) {
  return "+ptx71";
@@ -617,7 +622,7 @@ vector<char> CodeGen_PTX_Dev::compile_to_src() {
 
  std::unique_ptr<TargetMachine>
  target_machine(llvm_target->createTargetMachine(triple.str(),
- mcpu(), mattrs(), options,
+ mcpu_target(), mattrs(), options,
  llvm::Reloc::PIC_,
  llvm::CodeModel::Small,
  CodeGenOpt::Aggressive));
@@ -758,7 +763,7 @@ vector<char> CodeGen_PTX_Dev::compile_to_src() {
  f.write(buffer.data(), buffer.size());
  f.close();
 
- string cmd = "ptxas --gpu-name " + mcpu() + " " + ptx.pathname() + " -o " + sass.pathname();
+ string cmd = "ptxas --gpu-name " + mcpu_target() + " " + ptx.pathname() + " -o " + sass.pathname();
  if (system(cmd.c_str()) == 0) {
  cmd = "nvdisasm " + sass.pathname();
  int ret = system(cmd.c_str());

diff --git a/src/CodeGen_PowerPC.cpp b/src/CodeGen_PowerPC.cpp
@@ -22,7 +22,8 @@ class CodeGen_PowerPC : public CodeGen_Posix {
 protected:
  void init_module() override;
 
- string mcpu() const override;
+ string mcpu_target() const override;
+ string mcpu_tune() const override;
  string mattrs() const override;
  bool use_soft_float_abi() const override;
  int native_vector_bits() const override;
@@ -141,7 +142,7 @@ void CodeGen_PowerPC::visit(const Max *op) {
  return CodeGen_Posix::visit(op);
 }
 
-string CodeGen_PowerPC::mcpu() const {
+string CodeGen_PowerPC::mcpu_target() const {
  if (target.bits == 32) {
  return "ppc32";
  } else {
@@ -155,6 +156,10 @@ string CodeGen_PowerPC::mcpu() const {
  }
 }
 
+string CodeGen_PowerPC::mcpu_tune() const {
+ return mcpu_target();
+}
+
 string CodeGen_PowerPC::mattrs() const {
  string features;
  string separator;

diff --git a/src/CodeGen_RISCV.cpp b/src/CodeGen_RISCV.cpp
@@ -19,7 +19,8 @@ class CodeGen_RISCV : public CodeGen_Posix {
 protected:
  using CodeGen_Posix::visit;
 
- string mcpu() const override;
+ string mcpu_target() const override;
+ string mcpu_tune() const override;
  string mattrs() const override;
  string mabi() const override;
  bool use_soft_float_abi() const override;
@@ -30,10 +31,14 @@ CodeGen_RISCV::CodeGen_RISCV(const Target &t)
  : CodeGen_Posix(t) {
 }
 
-string CodeGen_RISCV::mcpu() const {
+string CodeGen_RISCV::mcpu_target() const {
  return "";
 }
 
+string CodeGen_RISCV::mcpu_tune() const {
+ return mcpu_target();
+}
+
 string CodeGen_RISCV::mattrs() const {
  // Note: the default march is "rv[32|64]imafdc",
  // which includes standard extensions:

diff --git a/src/CodeGen_WebAssembly.cpp b/src/CodeGen_WebAssembly.cpp
@@ -29,7 +29,8 @@ class CodeGen_WebAssembly : public CodeGen_Posix {
 
  void init_module() override;
 
- string mcpu() const override;
+ string mcpu_target() const override;
+ string mcpu_tune() const override;
  string mattrs() const override;
  bool use_soft_float_abi() const override;
  int native_vector_bits() const override;
@@ -256,10 +257,14 @@ void CodeGen_WebAssembly::codegen_vector_reduce(const VectorReduce *op, const Ex
  CodeGen_Posix::codegen_vector_reduce(op, init);
 }
 
-string CodeGen_WebAssembly::mcpu() const {
+string CodeGen_WebAssembly::mcpu_target() const {
  return "";
 }
 
+string CodeGen_WebAssembly::mcpu_tune() const {
+ return mcpu_target();
+}
+
 string CodeGen_WebAssembly::mattrs() const {
  std::ostringstream s;
  string sep;