From b5f024fa83b6f1cfe5e83a459c9378b7c5bf096d Mon Sep 17 00:00:00 2001
From: Roman Lebedev <lebedev.ri@gmail.com>
Date: Thu, 19 May 2022 20:10:53 +0300
Subject: [PATCH] Fix fundamental confusion about target/tune CPU (#6765)

* Fix fundamental confusion about target/tune CPU

Sooo. Uh, remember when in https://github.com/halide/Halide/pull/6655
we agreed that we want to add support for precisely specifying
the CPU for which the code should be *tuned*,
but not *targeted*? I.e., similar to clang's `-mtune=` option,
which does not affect the ISA set selection.

So guess what, that's not what we did, apparently.
`CodeGen_LLVM::mcpu()` / `halide_mcpu` actually do specify
the *target* CPU. It was obvious in retrospect, because e.g.
`CodeGen_X86::mattrs()` does not, in fact, ever specify `+avx2`,
yet we get AVX2 :) So we've unintentionally added `-march=` support.
Oops.

While I'd like to add `-march=` support, that was not the goal here.

Fixing this is complicated by the fact that
`llvm::Target::createTargetMachine()` only takes `CPU Target` string,
you can't specify `CPU Tune`.

But this is actually a blessing in disguise,
because it allows us to fix another bug at the same time:

There is a problem with Halide's "compile to LLVM IR assembly":
a lot of information from the Halide Target is not //really// lowered
into the LLVM Module, but is embedded as metadata
that is then extracted by Halide's `make_target_machine()`.

While that is not a problem in itself, it makes it *impossible*
to dump the LLVM IR, and manually play with it,
because e.g. the CPU [Target] and Attributes (ISA set)
are not actually lowered into the form LLVM understands,
but are in some halide-specific metadata.

So, to fix the first bug, we must lower the CPU Tune
into a per-function `"tune-cpu"` attribute,
and while there we might as well lower `"target-cpu"`
and `"target-features"` similarly.

* Address review notes

* Hopefully silence bogus issue reported by ancient GCC

* Call `set_function_attributes_from_halide_target_options()` when JIT compiling

* Fix grammar
---
 src/CodeGen_ARM.cpp         |  9 ++++--
 src/CodeGen_Hexagon.cpp     |  9 ++++--
 src/CodeGen_Internal.cpp    | 37 +++++++++++++++--------
 src/CodeGen_Internal.h      |  8 ++---
 src/CodeGen_LLVM.cpp        |  7 +++--
 src/CodeGen_LLVM.h          | 18 ++++++++---
 src/CodeGen_MIPS.cpp        |  9 ++++--
 src/CodeGen_PTX_Dev.cpp     | 15 ++++++----
 src/CodeGen_PowerPC.cpp     |  9 ++++--
 src/CodeGen_RISCV.cpp       |  9 ++++--
 src/CodeGen_WebAssembly.cpp |  9 ++++--
 src/CodeGen_X86.cpp         | 59 +++++++++++++++++++++----------------
 src/JITModule.cpp           | 11 ++-----
 13 files changed, 137 insertions(+), 72 deletions(-)
diff --git a/src/CodeGen_ARM.cpp b/src/CodeGen_ARM.cpp
index c9962d530d65..7fa4bd35b84f 100644
--- a/src/CodeGen_ARM.cpp
+++ b/src/CodeGen_ARM.cpp
@@ -72,7 +72,8 @@ class CodeGen_ARM : public CodeGen_Posix {
     };
     vector<Pattern> casts, calls, averagings, negations;
 
-    string mcpu() const override;
+    string mcpu_target() const override;
+    string mcpu_tune() const override;
     string mattrs() const override;
     bool use_soft_float_abi() const override;
     int native_vector_bits() const override;
@@ -1392,7 +1393,7 @@ Type CodeGen_ARM::upgrade_type_for_storage(const Type &t) const {
     return CodeGen_Posix::upgrade_type_for_storage(t);
 }
 
-string CodeGen_ARM::mcpu() const {
+string CodeGen_ARM::mcpu_target() const {
     if (target.bits == 32) {
         if (target.has_feature(Target::ARMv7s)) {
             return "swift";
@@ -1410,6 +1411,10 @@ string CodeGen_ARM::mcpu() const {
     }
 }
 
+string CodeGen_ARM::mcpu_tune() const {
+    return mcpu_target();
+}
+
 string CodeGen_ARM::mattrs() const {
     if (target.bits == 32) {
         if (target.has_feature(Target::ARMv7s)) {
diff --git a/src/CodeGen_Hexagon.cpp b/src/CodeGen_Hexagon.cpp
index a32bca98ff7d..9f7ce30de473 100644
--- a/src/CodeGen_Hexagon.cpp
+++ b/src/CodeGen_Hexagon.cpp
@@ -42,7 +42,8 @@ class CodeGen_Hexagon : public CodeGen_Posix {
 
     void init_module() override;
 
-    std::string mcpu() const override;
+    std::string mcpu_target() const override;
+    std::string mcpu_tune() const override;
     std::string mattrs() const override;
     int isa_version;
     bool use_soft_float_abi() const override;
@@ -1788,7 +1789,7 @@ Value *CodeGen_Hexagon::call_intrin(llvm::Type *result_type, const string &name,
                                       fn, std::move(args));
 }
 
-string CodeGen_Hexagon::mcpu() const {
+string CodeGen_Hexagon::mcpu_target() const {
     if (target.has_feature(Halide::Target::HVX_v66)) {
         return "hexagonv66";
     } else if (target.has_feature(Halide::Target::HVX_v65)) {
@@ -1798,6 +1799,10 @@ string CodeGen_Hexagon::mcpu() const {
     }
 }
 
+string CodeGen_Hexagon::mcpu_tune() const {
+    return mcpu_target();
+}
+
 string CodeGen_Hexagon::mattrs() const {
     std::stringstream attrs;
     attrs << "+hvx-length128b";
diff --git a/src/CodeGen_Internal.cpp b/src/CodeGen_Internal.cpp
index cf2b25cc0a7e..f880fc86f1eb 100644
--- a/src/CodeGen_Internal.cpp
+++ b/src/CodeGen_Internal.cpp
@@ -590,16 +590,15 @@ bool get_md_string(llvm::Metadata *value, std::string &result) {
     return false;
 }
 
-void get_target_options(const llvm::Module &module, llvm::TargetOptions &options, std::string &mcpu, std::string &mattrs) {
+void get_target_options(const llvm::Module &module, llvm::TargetOptions &options) {
     bool use_soft_float_abi = false;
     get_md_bool(module.getModuleFlag("halide_use_soft_float_abi"), use_soft_float_abi);
-    get_md_string(module.getModuleFlag("halide_mcpu"), mcpu);
-    get_md_string(module.getModuleFlag("halide_mattrs"), mattrs);
     std::string mabi;
     get_md_string(module.getModuleFlag("halide_mabi"), mabi);
     bool use_pic = true;
     get_md_bool(module.getModuleFlag("halide_use_pic"), use_pic);
 
+    // FIXME: can this be migrated into `set_function_attributes_from_halide_target_options()`?
     bool per_instruction_fast_math_flags = false;
     get_md_bool(module.getModuleFlag("halide_per_instruction_fast_math_flags"), per_instruction_fast_math_flags);
 
@@ -629,9 +628,14 @@ void clone_target_options(const llvm::Module &from, llvm::Module &to) {
         to.addModuleFlag(llvm::Module::Warning, "halide_use_soft_float_abi", use_soft_float_abi ? 1 : 0);
     }
 
-    std::string mcpu;
-    if (get_md_string(from.getModuleFlag("halide_mcpu"), mcpu)) {
-        to.addModuleFlag(llvm::Module::Warning, "halide_mcpu", llvm::MDString::get(context, mcpu));
+    std::string mcpu_target;
+    if (get_md_string(from.getModuleFlag("halide_mcpu_target"), mcpu_target)) {
+        to.addModuleFlag(llvm::Module::Warning, "halide_mcpu_target", llvm::MDString::get(context, mcpu_target));
+    }
+
+    std::string mcpu_tune;
+    if (get_md_string(from.getModuleFlag("halide_mcpu_tune"), mcpu_tune)) {
+        to.addModuleFlag(llvm::Module::Warning, "halide_mcpu_tune", llvm::MDString::get(context, mcpu_tune));
     }
 
     std::string mattrs;
@@ -657,9 +661,7 @@ std::unique_ptr<llvm::TargetMachine> make_target_machine(const llvm::Module &mod
     internal_assert(llvm_target) << "Could not create LLVM target for " << triple.str() << "\n";
 
     llvm::TargetOptions options;
-    std::string mcpu = "";
-    std::string mattrs = "";
-    get_target_options(module, options, mcpu, mattrs);
+    get_target_options(module, options);
 
     bool use_pic = true;
     get_md_bool(module.getModuleFlag("halide_use_pic"), use_pic);
@@ -668,7 +670,7 @@ std::unique_ptr<llvm::TargetMachine> make_target_machine(const llvm::Module &mod
     get_md_bool(module.getModuleFlag("halide_use_large_code_model"), use_large_code_model);
 
     auto *tm = llvm_target->createTargetMachine(module.getTargetTriple(),
-                                                mcpu, mattrs,
+                                                /*CPU target=*/"", /*Features=*/"",
                                                 options,
                                                 use_pic ? llvm::Reloc::PIC_ : llvm::Reloc::Static,
                                                 use_large_code_model ? llvm::CodeModel::Large : llvm::CodeModel::Small,
@@ -676,10 +678,21 @@ std::unique_ptr<llvm::TargetMachine> make_target_machine(const llvm::Module &mod
     return std::unique_ptr<llvm::TargetMachine>(tm);
 }
 
-void set_function_attributes_for_target(llvm::Function *fn, const Target &t) {
+void set_function_attributes_from_halide_target_options(llvm::Function &fn) {
+    llvm::Module &module = *fn.getParent();
+
+    std::string mcpu_target, mcpu_tune, mattrs;
+    get_md_string(module.getModuleFlag("halide_mcpu_target"), mcpu_target);
+    get_md_string(module.getModuleFlag("halide_mcpu_tune"), mcpu_tune);
+    get_md_string(module.getModuleFlag("halide_mattrs"), mattrs);
+
+    fn.addFnAttr("target-cpu", mcpu_target);
+    fn.addFnAttr("tune-cpu", mcpu_tune);
+    fn.addFnAttr("target-features", mattrs);
+
     // Turn off approximate reciprocals for division. It's too
     // inaccurate even for us.
-    fn->addFnAttr("reciprocal-estimates", "none");
+    fn.addFnAttr("reciprocal-estimates", "none");
 }
 
 void embed_bitcode(llvm::Module *M, const string &halide_command) {
diff --git a/src/CodeGen_Internal.h b/src/CodeGen_Internal.h
index 3fe1b8b696f5..b48a630e11a7 100644
--- a/src/CodeGen_Internal.h
+++ b/src/CodeGen_Internal.h
@@ -92,8 +92,8 @@ Expr lower_signed_shift_right(const Expr &a, const Expr &b);
 /** Reduce a mux intrinsic to a select tree */
 Expr lower_mux(const Call *mux);
 
-/** Given an llvm::Module, set llvm:TargetOptions, cpu and attr information */
-void get_target_options(const llvm::Module &module, llvm::TargetOptions &options, std::string &mcpu, std::string &mattrs);
+/** Given an llvm::Module, set llvm::TargetOptions information */
+void get_target_options(const llvm::Module &module, llvm::TargetOptions &options);
 
 /** Given two llvm::Modules, clone target options from one to the other */
 void clone_target_options(const llvm::Module &from, llvm::Module &to);
@@ -101,8 +101,8 @@ void clone_target_options(const llvm::Module &from, llvm::Module &to);
 /** Given an llvm::Module, get or create an llvm:TargetMachine */
 std::unique_ptr<llvm::TargetMachine> make_target_machine(const llvm::Module &module);
 
-/** Set the appropriate llvm Function attributes given a Target. */
-void set_function_attributes_for_target(llvm::Function *, const Target &);
+/** Set the appropriate llvm Function attributes given the Halide Target. */
+void set_function_attributes_from_halide_target_options(llvm::Function &);
 
 /** Save a copy of the llvm IR currently represented by the module as
  * data in the __LLVM,__bitcode section. Emulates clang's
diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp
index a0f84d6cb6f4..a1fa954412cc 100644
--- a/src/CodeGen_LLVM.cpp
+++ b/src/CodeGen_LLVM.cpp
@@ -455,7 +455,8 @@ void CodeGen_LLVM::init_codegen(const std::string &name, bool any_strict_float)
 
     // Add some target specific info to the module as metadata.
     module->addModuleFlag(llvm::Module::Warning, "halide_use_soft_float_abi", use_soft_float_abi() ? 1 : 0);
-    module->addModuleFlag(llvm::Module::Warning, "halide_mcpu", MDString::get(*context, mcpu()));
+    module->addModuleFlag(llvm::Module::Warning, "halide_mcpu_target", MDString::get(*context, mcpu_target()));
+    module->addModuleFlag(llvm::Module::Warning, "halide_mcpu_tune", MDString::get(*context, mcpu_tune()));
     module->addModuleFlag(llvm::Module::Warning, "halide_mattrs", MDString::get(*context, mattrs()));
     module->addModuleFlag(llvm::Module::Warning, "halide_mabi", MDString::get(*context, mabi()));
     module->addModuleFlag(llvm::Module::Warning, "halide_use_pic", use_pic() ? 1 : 0);
@@ -523,7 +524,7 @@ std::unique_ptr<llvm::Module> CodeGen_LLVM::compile(const Module &input) {
         }
         FunctionType *func_t = FunctionType::get(i32_t, arg_types, false);
         function = llvm::Function::Create(func_t, llvm_linkage(f.linkage), names.extern_name, module.get());
-        set_function_attributes_for_target(function, target);
+        set_function_attributes_from_halide_target_options(*function);
 
         // Mark the buffer args as no alias and save indication for add_argv_wrapper if needed
         std::vector<bool> buffer_args(f.args.size());
@@ -564,6 +565,8 @@ std::unique_ptr<llvm::Module> CodeGen_LLVM::compile(const Module &input) {
 }
 
 std::unique_ptr<llvm::Module> CodeGen_LLVM::finish_codegen() {
+    llvm::for_each(*module, set_function_attributes_from_halide_target_options);
+
     // Verify the module is ok
     internal_assert(!verifyModule(*module, &llvm::errs()));
     debug(2) << "Done generating llvm bitcode\n";
diff --git a/src/CodeGen_LLVM.h b/src/CodeGen_LLVM.h
index dcdc6eee07e1..606840a679ec 100644
--- a/src/CodeGen_LLVM.h
+++ b/src/CodeGen_LLVM.h
@@ -106,11 +106,21 @@ class CodeGen_LLVM : public IRVisitor {
     virtual void end_func(const std::vector<LoweredArgument> &args);
     // @}
 
-    /** What should be passed as -mcpu, -mattrs, and related for
-     * compilation. The architecture-specific code generator should
-     * define these. */
+    /** What should be passed as -mcpu (warning: implies attrs!), -mattrs,
+     *  and related for compilation. The architecture-specific code generator
+     *  should define these.
+     *
+     *  `mcpu_target()` - target this specific CPU, in the sense of the allowed
+     *  ISA sets *and* the CPU-specific tuning/assembly instruction scheduling.
+     *
+     *  `mcpu_tune()` - expect that we will be running on this specific CPU,
+     *  so perform CPU-specific tuning/assembly instruction scheduling, *but*
+     *  DON'T sacrifice the portability, support running on other CPUs, only
+     *  make use of the ISAs that are enabled by `mcpu_target()`+`mattrs()`.
+     */
     // @{
-    virtual std::string mcpu() const = 0;
+    virtual std::string mcpu_target() const = 0;
+    virtual std::string mcpu_tune() const = 0;
     virtual std::string mattrs() const = 0;
     virtual std::string mabi() const;
     virtual bool use_soft_float_abi() const = 0;
diff --git a/src/CodeGen_MIPS.cpp b/src/CodeGen_MIPS.cpp
index 4118a12b684f..26bd3a502146 100644
--- a/src/CodeGen_MIPS.cpp
+++ b/src/CodeGen_MIPS.cpp
@@ -19,7 +19,8 @@ class CodeGen_MIPS : public CodeGen_Posix {
 protected:
     using CodeGen_Posix::visit;
 
-    string mcpu() const override;
+    string mcpu_target() const override;
+    string mcpu_tune() const override;
     string mattrs() const override;
     bool use_soft_float_abi() const override;
     int native_vector_bits() const override;
@@ -29,7 +30,7 @@ CodeGen_MIPS::CodeGen_MIPS(const Target &t)
     : CodeGen_Posix(t) {
 }
 
-string CodeGen_MIPS::mcpu() const {
+string CodeGen_MIPS::mcpu_target() const {
     if (target.bits == 32) {
         return "";
     } else {
@@ -37,6 +38,10 @@ string CodeGen_MIPS::mcpu() const {
     }
 }
 
+string CodeGen_MIPS::mcpu_tune() const {
+    return mcpu_target();
+}
+
 string CodeGen_MIPS::mattrs() const {
     if (target.bits == 32) {
         return "";
diff --git a/src/CodeGen_PTX_Dev.cpp b/src/CodeGen_PTX_Dev.cpp
index 779512dc7348..711040f54afd 100644
--- a/src/CodeGen_PTX_Dev.cpp
+++ b/src/CodeGen_PTX_Dev.cpp
@@ -91,7 +91,8 @@ class CodeGen_PTX_Dev : public CodeGen_LLVM, public CodeGen_GPU_Dev {
     // @}
 
     std::string march() const;
-    std::string mcpu() const override;
+    std::string mcpu_target() const override;
+    std::string mcpu_tune() const override;
     std::string mattrs() const override;
     bool use_soft_float_abi() const override;
     int native_vector_bits() const override;
@@ -153,7 +154,7 @@ void CodeGen_PTX_Dev::add_kernel(Stmt stmt,
     // Make our function
     FunctionType *func_t = FunctionType::get(void_t, arg_types, false);
     function = llvm::Function::Create(func_t, llvm::Function::ExternalLinkage, name, module.get());
-    set_function_attributes_for_target(function, target);
+    set_function_attributes_from_halide_target_options(*function);
 
     // Mark the buffer args as no alias
     for (size_t i = 0; i < args.size(); i++) {
@@ -542,7 +543,7 @@ string CodeGen_PTX_Dev::march() const {
     return "nvptx64";
 }
 
-string CodeGen_PTX_Dev::mcpu() const {
+string CodeGen_PTX_Dev::mcpu_target() const {
     if (target.has_feature(Target::CUDACapability86)) {
         return "sm_86";
     } else if (target.has_feature(Target::CUDACapability80)) {
@@ -566,6 +567,10 @@ string CodeGen_PTX_Dev::mcpu() const {
     }
 }
 
+string CodeGen_PTX_Dev::mcpu_tune() const {
+    return mcpu_target();
+}
+
 string CodeGen_PTX_Dev::mattrs() const {
     if (target.has_feature(Target::CUDACapability86)) {
         return "+ptx71";
@@ -617,7 +622,7 @@ vector<char> CodeGen_PTX_Dev::compile_to_src() {
 
     std::unique_ptr<TargetMachine>
         target_machine(llvm_target->createTargetMachine(triple.str(),
-                                                        mcpu(), mattrs(), options,
+                                                        mcpu_target(), mattrs(), options,
                                                         llvm::Reloc::PIC_,
                                                         llvm::CodeModel::Small,
                                                         CodeGenOpt::Aggressive));
@@ -758,7 +763,7 @@ vector<char> CodeGen_PTX_Dev::compile_to_src() {
         f.write(buffer.data(), buffer.size());
         f.close();
 
-        string cmd = "ptxas --gpu-name " + mcpu() + " " + ptx.pathname() + " -o " + sass.pathname();
+        string cmd = "ptxas --gpu-name " + mcpu_target() + " " + ptx.pathname() + " -o " + sass.pathname();
         if (system(cmd.c_str()) == 0) {
             cmd = "nvdisasm " + sass.pathname();
             int ret = system(cmd.c_str());
diff --git a/src/CodeGen_PowerPC.cpp b/src/CodeGen_PowerPC.cpp
index 42dec77fd75d..7f1e7252e941 100644
--- a/src/CodeGen_PowerPC.cpp
+++ b/src/CodeGen_PowerPC.cpp
@@ -22,7 +22,8 @@ class CodeGen_PowerPC : public CodeGen_Posix {
 protected:
     void init_module() override;
 
-    string mcpu() const override;
+    string mcpu_target() const override;
+    string mcpu_tune() const override;
     string mattrs() const override;
     bool use_soft_float_abi() const override;
     int native_vector_bits() const override;
@@ -141,7 +142,7 @@ void CodeGen_PowerPC::visit(const Max *op) {
     return CodeGen_Posix::visit(op);
 }
 
-string CodeGen_PowerPC::mcpu() const {
+string CodeGen_PowerPC::mcpu_target() const {
     if (target.bits == 32) {
         return "ppc32";
     } else {
@@ -155,6 +156,10 @@ string CodeGen_PowerPC::mcpu() const {
     }
 }
 
+string CodeGen_PowerPC::mcpu_tune() const {
+    return mcpu_target();
+}
+
 string CodeGen_PowerPC::mattrs() const {
     string features;
     string separator;
diff --git a/src/CodeGen_RISCV.cpp b/src/CodeGen_RISCV.cpp
index 01395f596b91..434105724c3a 100644
--- a/src/CodeGen_RISCV.cpp
+++ b/src/CodeGen_RISCV.cpp
@@ -19,7 +19,8 @@ class CodeGen_RISCV : public CodeGen_Posix {
 protected:
     using CodeGen_Posix::visit;
 
-    string mcpu() const override;
+    string mcpu_target() const override;
+    string mcpu_tune() const override;
     string mattrs() const override;
     string mabi() const override;
     bool use_soft_float_abi() const override;
@@ -30,10 +31,14 @@ CodeGen_RISCV::CodeGen_RISCV(const Target &t)
     : CodeGen_Posix(t) {
 }
 
-string CodeGen_RISCV::mcpu() const {
+string CodeGen_RISCV::mcpu_target() const {
     return "";
 }
 
+string CodeGen_RISCV::mcpu_tune() const {
+    return mcpu_target();
+}
+
 string CodeGen_RISCV::mattrs() const {
     // Note: the default march is "rv[32|64]imafdc",
     // which includes standard extensions:
diff --git a/src/CodeGen_WebAssembly.cpp b/src/CodeGen_WebAssembly.cpp
index 83dc6775fc5f..2a63b8df2f36 100644
--- a/src/CodeGen_WebAssembly.cpp
+++ b/src/CodeGen_WebAssembly.cpp
@@ -29,7 +29,8 @@ class CodeGen_WebAssembly : public CodeGen_Posix {
 
     void init_module() override;
 
-    string mcpu() const override;
+    string mcpu_target() const override;
+    string mcpu_tune() const override;
     string mattrs() const override;
     bool use_soft_float_abi() const override;
     int native_vector_bits() const override;
@@ -256,10 +257,14 @@ void CodeGen_WebAssembly::codegen_vector_reduce(const VectorReduce *op, const Ex
     CodeGen_Posix::codegen_vector_reduce(op, init);
 }
 
-string CodeGen_WebAssembly::mcpu() const {
+string CodeGen_WebAssembly::mcpu_target() const {
     return "";
 }
 
+string CodeGen_WebAssembly::mcpu_tune() const {
+    return mcpu_target();
+}
+
 string CodeGen_WebAssembly::mattrs() const {
     std::ostringstream s;
     string sep;
diff --git a/src/CodeGen_X86.cpp b/src/CodeGen_X86.cpp
index c14a6a0f0671..38fc2321d919 100644
--- a/src/CodeGen_X86.cpp
+++ b/src/CodeGen_X86.cpp
@@ -53,7 +53,8 @@ class CodeGen_X86 : public CodeGen_Posix {
     CodeGen_X86(Target);
 
 protected:
-    string mcpu() const override;
+    string mcpu_target() const override;
+    string mcpu_tune() const override;
     string mattrs() const override;
     bool use_soft_float_abi() const override;
     int native_vector_bits() const override;
@@ -689,8 +690,33 @@ void CodeGen_X86::visit(const Store *op) {
     CodeGen_Posix::visit(op);
 }
 
-string CodeGen_X86::mcpu() const {
-    // First, check if any explicit request for tuning exists.
+string CodeGen_X86::mcpu_target() const {
+    // Perform an ad-hoc guess for the -mcpu given features.
+    // WARNING: this is used to drive -mcpu, *NOT* -mtune!
+    //          The CPU choice here *WILL* affect -mattrs!
+    if (target.has_feature(Target::AVX512_SapphireRapids)) {
+        return "sapphirerapids";
+    } else if (target.has_feature(Target::AVX512_Cannonlake)) {
+        return "cannonlake";
+    } else if (target.has_feature(Target::AVX512_Skylake)) {
+        return "skylake-avx512";
+    } else if (target.has_feature(Target::AVX512_KNL)) {
+        return "knl";
+    } else if (target.has_feature(Target::AVX2)) {
+        return "haswell";
+    } else if (target.has_feature(Target::AVX)) {
+        return "corei7-avx";
+    } else if (target.has_feature(Target::SSE41)) {
+        // We want SSE4.1 but not SSE4.2, hence "penryn" rather than "corei7"
+        return "penryn";
+    } else {
+        // Default should not include SSSE3, hence "k8" rather than "core2"
+        return "k8";
+    }
+}
+
+string CodeGen_X86::mcpu_tune() const {
+    // Check if any explicit request for tuning exists.
     switch (target.processor_tune) {  // Please keep sorted.
     case Target::Processor::AMDFam10:
         return "amdfam10";
@@ -718,31 +744,14 @@ string CodeGen_X86::mcpu() const {
         return "znver3";
 
     case Target::Processor::ProcessorGeneric:
-        break;  // Detect "best" CPU from the enabled ISA's.
-    }
-
-    // And only after that, perform an ad-hoc guess for the tune given features.
-    if (target.has_feature(Target::AVX512_SapphireRapids)) {
-        return "sapphirerapids";
-    } else if (target.has_feature(Target::AVX512_Cannonlake)) {
-        return "cannonlake";
-    } else if (target.has_feature(Target::AVX512_Skylake)) {
-        return "skylake-avx512";
-    } else if (target.has_feature(Target::AVX512_KNL)) {
-        return "knl";
-    } else if (target.has_feature(Target::AVX2)) {
-        return "haswell";
-    } else if (target.has_feature(Target::AVX)) {
-        return "corei7-avx";
-    } else if (target.has_feature(Target::SSE41)) {
-        // We want SSE4.1 but not SSE4.2, hence "penryn" rather than "corei7"
-        return "penryn";
-    } else {
-        // Default should not include SSSE3, hence "k8" rather than "core2"
-        return "k8";
+        break;
     }
+    internal_assert(target.processor_tune == Target::Processor::ProcessorGeneric && "The switch should be exhaustive.");
+    return mcpu_target();  // Detect "best" CPU from the enabled ISA's.
 }
 
+// FIXME: we should lower everything here, instead of relying
+//        on -mcpu= (`mcpu_target()`) to imply/set features for us.
 string CodeGen_X86::mattrs() const {
     string features;
     string separator;
diff --git a/src/JITModule.cpp b/src/JITModule.cpp
index acb8be5da8c7..444b355ba039 100644
--- a/src/JITModule.cpp
+++ b/src/JITModule.cpp
@@ -253,10 +253,10 @@ void JITModule::compile_module(std::unique_ptr<llvm::Module> m, const string &fu
     debug(2) << "Target triple: " << m->getTargetTriple() << "\n";
     string error_string;
 
-    string mcpu;
-    string mattrs;
+    llvm::for_each(*m, set_function_attributes_from_halide_target_options);
+
     llvm::TargetOptions options;
-    get_target_options(*m, options, mcpu, mattrs);
+    get_target_options(*m, options);
 
     DataLayout initial_module_data_layout = m->getDataLayout();
     string module_name = m->getModuleIdentifier();
@@ -269,11 +269,6 @@ void JITModule::compile_module(std::unique_ptr<llvm::Module> m, const string &fu
     engine_builder.setMCJITMemoryManager(std::unique_ptr<RTDyldMemoryManager>(memory_manager));
 
     engine_builder.setOptLevel(CodeGenOpt::Aggressive);
-    if (!mcpu.empty()) {
-        engine_builder.setMCPU(mcpu);
-    }
-    std::vector<string> mattrs_array = {mattrs};
-    engine_builder.setMAttrs(mattrs_array);
 
     TargetMachine *tm = engine_builder.selectTarget();
     internal_assert(tm) << error_string << "\n";