From a22f836713fe489cd74c12827f25d5fa604b5dd0 Mon Sep 17 00:00:00 2001 From: Roman Lebedev Date: Tue, 15 Mar 2022 14:25:38 +0300 Subject: [PATCH 01/12] `-mtune=`/`-mcpu=` support for x86 AMD CPUs --- python_bindings/src/PyEnums.cpp | 12 ++++++++++++ src/CodeGen_X86.cpp | 28 ++++++++++++++++++++++++++++ src/Target.cpp | 23 +++++++++++++++++++++-- src/Target.h | 12 ++++++++++++ src/runtime/HalideRuntime.h | 12 ++++++++++++ 5 files changed, 85 insertions(+), 2 deletions(-) diff --git a/python_bindings/src/PyEnums.cpp b/python_bindings/src/PyEnums.cpp index b7e0d2518368..eef6c2cbd7c9 100644 --- a/python_bindings/src/PyEnums.cpp +++ b/python_bindings/src/PyEnums.cpp @@ -156,6 +156,18 @@ void define_enums(py::module &m) { .value("ARMv81a", Target::Feature::ARMv81a) .value("SanitizerCoverage", Target::Feature::SanitizerCoverage) .value("ProfileByTimer", Target::Feature::ProfileByTimer) + .value("TuneK8", Target::Feature::TuneK8) + .value("TuneK8_SSE3", Target::Feature::TuneK8_SSE3) + .value("TuneAMDFam10", Target::Feature::TuneAMDFam10) + .value("TuneBtVer1", Target::Feature::TuneBtVer1) + .value("TuneBdVer1", Target::Feature::TuneBdVer1) + .value("TuneBdVer2", Target::Feature::TuneBdVer2) + .value("TuneBdVer3", Target::Feature::TuneBdVer3) + .value("TuneBdVer4", Target::Feature::TuneBdVer4) + .value("TuneBtVer2", Target::Feature::TuneBtVer2) + .value("TuneZnVer1", Target::Feature::TuneZnVer1) + .value("TuneZnVer2", Target::Feature::TuneZnVer2) + .value("TuneZnVer3", Target::Feature::TuneZnVer3) .value("FeatureEnd", Target::Feature::FeatureEnd); py::enum_(m, "TypeCode") diff --git a/src/CodeGen_X86.cpp b/src/CodeGen_X86.cpp index b75500ee2684..390b061e9507 100644 --- a/src/CodeGen_X86.cpp +++ b/src/CodeGen_X86.cpp @@ -686,6 +686,34 @@ void CodeGen_X86::visit(const Store *op) { } string CodeGen_X86::mcpu() const { + // First, check if any explicit request for tuning exists. 
+ if (target.has_feature(Target::TuneK8)) { + return "k8"; + } else if (target.has_feature(Target::TuneK8_SSE3)) { + return "k8-sse3"; + } else if (target.has_feature(Target::TuneAMDFam10)) { + return "amdfam10"; + } else if (target.has_feature(Target::TuneBtVer1)) { + return "btver1"; + } else if (target.has_feature(Target::TuneBdVer1)) { + return "bdver1"; + } else if (target.has_feature(Target::TuneBdVer2)) { + return "bdver2"; + } else if (target.has_feature(Target::TuneBdVer3)) { + return "bdver3"; + } else if (target.has_feature(Target::TuneBdVer4)) { + return "bdver4"; + } else if (target.has_feature(Target::TuneBtVer2)) { + return "btver2"; + } else if (target.has_feature(Target::TuneZnVer1)) { + return "znver1"; + } else if (target.has_feature(Target::TuneZnVer2)) { + return "znver2"; + } else if (target.has_feature(Target::TuneZnVer3)) { + return "znver3"; + } + + // And only after that, perform an ad-hoc guess for the tune given features. if (target.has_feature(Target::AVX512_SapphireRapids)) { return "sapphirerapids"; } else if (target.has_feature(Target::AVX512_Cannonlake)) { diff --git a/src/Target.cpp b/src/Target.cpp index cd9c08fe9448..fc5f468b3c04 100644 --- a/src/Target.cpp +++ b/src/Target.cpp @@ -386,6 +386,18 @@ const std::map feature_name_map = { {"armv81a", Target::ARMv81a}, {"sanitizer_coverage", Target::SanitizerCoverage}, {"profile_by_timer", Target::ProfileByTimer}, + {"tune_k8", Target::TuneK8}, + {"tune_k8_sse3", Target::TuneK8_SSE3}, + {"tune_amdfam10", Target::TuneAMDFam10}, + {"tune_btver1", Target::TuneBtVer1}, + {"tune_bdver1", Target::TuneBdVer1}, + {"tune_bdver2", Target::TuneBdVer2}, + {"tune_bdver3", Target::TuneBdVer3}, + {"tune_bdver4", Target::TuneBdVer4}, + {"tune_btver2", Target::TuneBtVer2}, + {"tune_znver1", Target::TuneZnVer1}, + {"tune_znver2", Target::TuneZnVer2}, + {"tune_znver3", Target::TuneZnVer3}, // NOTE: When adding features to this map, be sure to update PyEnums.cpp as well. 
}; @@ -454,7 +466,7 @@ bool merge_string(Target &t, const std::string &target) { } tokens.push_back(rest); - bool os_specified = false, arch_specified = false, bits_specified = false, features_specified = false; + bool os_specified = false, arch_specified = false, bits_specified = false, tune_specified = false, features_specified = false; bool is_host = false; for (size_t i = 0; i < tokens.size(); i++) { @@ -485,6 +497,13 @@ bool merge_string(Target &t, const std::string &target) { } os_specified = true; } else if (lookup_feature(tok, feature)) { + if (tok.substr(0, std::strlen("tune_")) == "tune_") { + if (tune_specified) { + // Only a single tune makes sense. + return false; + } + tune_specified = true; + } t.set_feature(feature); features_specified = true; } else if (tok == "trace_all") { @@ -980,7 +999,7 @@ bool Target::get_runtime_compatible_target(const Target &other, Target &result) // clang-format on // clang-format off - const std::array intersection_features = {{ + const std::array intersection_features = {{ ARMv7s, ARMv81a, AVX, diff --git a/src/Target.h b/src/Target.h index 5b9588ab60d9..6c22c3651d5e 100644 --- a/src/Target.h +++ b/src/Target.h @@ -133,6 +133,18 @@ struct Target { ARMv81a = halide_target_feature_armv81a, SanitizerCoverage = halide_target_feature_sanitizer_coverage, ProfileByTimer = halide_target_feature_profile_by_timer, + TuneK8 = halide_target_feature_tune_k8, + TuneK8_SSE3 = halide_target_feature_tune_k8_sse3, + TuneAMDFam10 = halide_target_feature_tune_amdfam10, + TuneBtVer1 = halide_target_feature_tune_btver1, + TuneBdVer1 = halide_target_feature_tune_bdver1, + TuneBdVer2 = halide_target_feature_tune_bdver2, + TuneBdVer3 = halide_target_feature_tune_bdver3, + TuneBdVer4 = halide_target_feature_tune_bdver4, + TuneBtVer2 = halide_target_feature_tune_btver2, + TuneZnVer1 = halide_target_feature_tune_znver1, + TuneZnVer2 = halide_target_feature_tune_znver2, + TuneZnVer3 = halide_target_feature_tune_znver3, FeatureEnd = 
halide_target_feature_end }; Target() = default; diff --git a/src/runtime/HalideRuntime.h b/src/runtime/HalideRuntime.h index 6089110420a7..8df5c326ae59 100644 --- a/src/runtime/HalideRuntime.h +++ b/src/runtime/HalideRuntime.h @@ -1348,6 +1348,18 @@ typedef enum halide_target_feature_t { halide_target_feature_armv81a, ///< Enable ARMv8.1-a instructions halide_target_feature_sanitizer_coverage, ///< Enable hooks for SanitizerCoverage support. halide_target_feature_profile_by_timer, ///< Alternative to halide_target_feature_profile using timer interrupt for systems without threads or applicartions that need to avoid them. + halide_target_feature_tune_k8, ///< Tune specifically for the AMD K8 CPU. + halide_target_feature_tune_k8_sse3, ///< Tune specifically for the AMD K8 w/SSE3 CPU. + halide_target_feature_tune_amdfam10, ///< Tune specifically for the AMD FAM10 CPU. + halide_target_feature_tune_btver1, ///< Tune specifically for the AMD BtVer1 CPU. + halide_target_feature_tune_bdver1, ///< Tune specifically for the AMD BdVer1 CPU. + halide_target_feature_tune_bdver2, ///< Tune specifically for the AMD BdVer2 CPU. + halide_target_feature_tune_bdver3, ///< Tune specifically for the AMD BdVer3 CPU. + halide_target_feature_tune_bdver4, ///< Tune specifically for the AMD BdVer4 CPU. + halide_target_feature_tune_btver2, ///< Tune specifically for the AMD BtVer2 CPU. + halide_target_feature_tune_znver1, ///< Tune specifically for the AMD ZnVer1 CPU. + halide_target_feature_tune_znver2, ///< Tune specifically for the AMD ZnVer2 CPU. + halide_target_feature_tune_znver3, ///< Tune specifically for the AMD ZnVer3 CPU. halide_target_feature_end ///< A sentinel. Every target is considered to have this feature, and setting this feature does nothing. 
} halide_target_feature_t; From 8c0c07b3fbf02358a1f1a551a409c624c6b6e471 Mon Sep 17 00:00:00 2001 From: Roman Lebedev Date: Sat, 26 Mar 2022 17:50:54 +0300 Subject: [PATCH 02/12] Move processor tune into its own enum, out of features --- python_bindings/src/PyEnums.cpp | 27 +++++----- python_bindings/src/PyTarget.cpp | 7 +-- src/CPlusPlusMangle.cpp | 16 +++--- src/CodeGen_X86.cpp | 27 +++++----- src/HexagonOffload.cpp | 2 +- src/Module.cpp | 4 +- src/Target.cpp | 68 +++++++++++++++++++------- src/Target.h | 37 ++++++++------ src/runtime/HalideRuntime.h | 12 ----- test/correctness/simd_op_check_hvx.cpp | 2 +- test/correctness/target.cpp | 16 +++--- 11 files changed, 126 insertions(+), 92 deletions(-) diff --git a/python_bindings/src/PyEnums.cpp b/python_bindings/src/PyEnums.cpp index eef6c2cbd7c9..7e96b60f02a2 100644 --- a/python_bindings/src/PyEnums.cpp +++ b/python_bindings/src/PyEnums.cpp @@ -82,6 +82,21 @@ void define_enums(py::module &m) { .value("RISCV", Target::Arch::RISCV) .value("WebAssembly", Target::Arch::WebAssembly); + py::enum_(m, "TargetProcessor") + .value("TuneGeneric", Target::Processor::ProcessorGeneric) + .value("TuneK8", Target::Processor::K8) + .value("TuneK8_SSE3", Target::Processor::K8_SSE3) + .value("TuneAMDFam10", Target::Processor::AMDFam10) + .value("TuneBtVer1", Target::Processor::BtVer1) + .value("TuneBdVer1", Target::Processor::BdVer1) + .value("TuneBdVer2", Target::Processor::BdVer2) + .value("TuneBdVer3", Target::Processor::BdVer3) + .value("TuneBdVer4", Target::Processor::BdVer4) + .value("TuneBtVer2", Target::Processor::BtVer2) + .value("TuneZnVer1", Target::Processor::ZnVer1) + .value("TuneZnVer2", Target::Processor::ZnVer2) + .value("TuneZnVer3", Target::Processor::ZnVer3); + py::enum_(m, "TargetFeature") .value("JIT", Target::Feature::JIT) .value("Debug", Target::Feature::Debug) @@ -156,18 +171,6 @@ void define_enums(py::module &m) { .value("ARMv81a", Target::Feature::ARMv81a) .value("SanitizerCoverage", 
Target::Feature::SanitizerCoverage) .value("ProfileByTimer", Target::Feature::ProfileByTimer) - .value("TuneK8", Target::Feature::TuneK8) - .value("TuneK8_SSE3", Target::Feature::TuneK8_SSE3) - .value("TuneAMDFam10", Target::Feature::TuneAMDFam10) - .value("TuneBtVer1", Target::Feature::TuneBtVer1) - .value("TuneBdVer1", Target::Feature::TuneBdVer1) - .value("TuneBdVer2", Target::Feature::TuneBdVer2) - .value("TuneBdVer3", Target::Feature::TuneBdVer3) - .value("TuneBdVer4", Target::Feature::TuneBdVer4) - .value("TuneBtVer2", Target::Feature::TuneBtVer2) - .value("TuneZnVer1", Target::Feature::TuneZnVer1) - .value("TuneZnVer2", Target::Feature::TuneZnVer2) - .value("TuneZnVer3", Target::Feature::TuneZnVer3) .value("FeatureEnd", Target::Feature::FeatureEnd); py::enum_(m, "TypeCode") diff --git a/python_bindings/src/PyTarget.cpp b/python_bindings/src/PyTarget.cpp index 718936332ea9..21f87038bd15 100644 --- a/python_bindings/src/PyTarget.cpp +++ b/python_bindings/src/PyTarget.cpp @@ -23,8 +23,8 @@ void define_target(py::module &m) { py::class_(m, "Target") .def(py::init<>()) .def(py::init()) - .def(py::init()) - .def(py::init>()) + .def(py::init()) + .def(py::init>()) .def("__eq__", [](const Target &value, Target *value2) { return value2 && value == *value2; }) .def("__ne__", [](const Target &value, Target *value2) { return !value2 || value != *value2; }) @@ -32,12 +32,13 @@ void define_target(py::module &m) { .def_readwrite("os", &Target::os) .def_readwrite("arch", &Target::arch) .def_readwrite("bits", &Target::bits) + .def_readwrite("processor", &Target::processor) .def("__repr__", &target_repr) .def("__str__", &Target::to_string) .def("to_string", &Target::to_string) - .def("has_feature", (bool (Target::*)(Target::Feature) const) & Target::has_feature) + .def("has_feature", (bool(Target::*)(Target::Feature) const) & Target::has_feature) .def("features_any_of", &Target::features_any_of, py::arg("features")) .def("features_all_of", &Target::features_all_of, 
py::arg("features")) diff --git a/src/CPlusPlusMangle.cpp b/src/CPlusPlusMangle.cpp index 05c9d552e68f..37efa6f904b8 100644 --- a/src/CPlusPlusMangle.cpp +++ b/src/CPlusPlusMangle.cpp @@ -942,14 +942,14 @@ void main_tests(const MangleResult *expecteds, const Target &target) { void cplusplus_mangle_test() { Target targets[kTestTargetCount]{ - Target(Target::Linux, Target::X86, 32), - Target(Target::Linux, Target::X86, 64), - Target(Target::OSX, Target::X86, 32), - Target(Target::OSX, Target::X86, 64), - Target(Target::IOS, Target::ARM, 32), - Target(Target::IOS, Target::ARM, 64), - Target(Target::Windows, Target::X86, 32), - Target(Target::Windows, Target::X86, 64)}; + Target(Target::Linux, Target::X86, 32, Target::Processor::ProcessorGeneric), + Target(Target::Linux, Target::X86, 64, Target::Processor::ProcessorGeneric), + Target(Target::OSX, Target::X86, 32, Target::Processor::ProcessorGeneric), + Target(Target::OSX, Target::X86, 64, Target::Processor::ProcessorGeneric), + Target(Target::IOS, Target::ARM, 32, Target::Processor::ProcessorGeneric), + Target(Target::IOS, Target::ARM, 64, Target::Processor::ProcessorGeneric), + Target(Target::Windows, Target::X86, 32, Target::Processor::ProcessorGeneric), + Target(Target::Windows, Target::X86, 64, Target::Processor::ProcessorGeneric)}; MangleResult *expecteds[kTestTargetCount]{ ItaniumABIMangling_main, ItaniumABIMangling_main, ItaniumABIMangling_main, ItaniumABIMangling_main, diff --git a/src/CodeGen_X86.cpp b/src/CodeGen_X86.cpp index 390b061e9507..f91f12045d72 100644 --- a/src/CodeGen_X86.cpp +++ b/src/CodeGen_X86.cpp @@ -687,30 +687,33 @@ void CodeGen_X86::visit(const Store *op) { string CodeGen_X86::mcpu() const { // First, check if any explicit request for tuning exists. 
- if (target.has_feature(Target::TuneK8)) { + switch (target.processor) { + case Target::Processor::K8: return "k8"; - } else if (target.has_feature(Target::TuneK8_SSE3)) { + case Target::Processor::K8_SSE3: return "k8-sse3"; - } else if (target.has_feature(Target::TuneAMDFam10)) { + case Target::Processor::AMDFam10: return "amdfam10"; - } else if (target.has_feature(Target::TuneBtVer1)) { + case Target::Processor::BtVer1: return "btver1"; - } else if (target.has_feature(Target::TuneBdVer1)) { + case Target::Processor::BdVer1: return "bdver1"; - } else if (target.has_feature(Target::TuneBdVer2)) { + case Target::Processor::BdVer2: return "bdver2"; - } else if (target.has_feature(Target::TuneBdVer3)) { + case Target::Processor::BdVer3: return "bdver3"; - } else if (target.has_feature(Target::TuneBdVer4)) { + case Target::Processor::BdVer4: return "bdver4"; - } else if (target.has_feature(Target::TuneBtVer2)) { + case Target::Processor::BtVer2: return "btver2"; - } else if (target.has_feature(Target::TuneZnVer1)) { + case Target::Processor::ZnVer1: return "znver1"; - } else if (target.has_feature(Target::TuneZnVer2)) { + case Target::Processor::ZnVer2: return "znver2"; - } else if (target.has_feature(Target::TuneZnVer3)) { + case Target::Processor::ZnVer3: return "znver3"; + case Target::Processor::ProcessorGeneric: + break; // Detect "best" CPU from the enabled ISA's. } // And only after that, perform an ad-hoc guess for the tune given features. diff --git a/src/HexagonOffload.cpp b/src/HexagonOffload.cpp index 8ffd1d0c2e4d..05b16a953ec6 100644 --- a/src/HexagonOffload.cpp +++ b/src/HexagonOffload.cpp @@ -967,7 +967,7 @@ class InjectHexagonRpc : public IRMutator { Stmt inject_hexagon_rpc(Stmt s, const Target &host_target, Module &containing_module) { // Make a new target for the device module. 
- Target target(Target::NoOS, Target::Hexagon, 32); + Target target(Target::NoOS, Target::Hexagon, 32, Target::ProcessorGeneric); // There are two ways of offloading, on device and on host. // In the former we have true QuRT available, while on the // latter we simulate the Hexagon side code with a barebones diff --git a/src/Module.cpp b/src/Module.cpp index ec05676a0ecd..50e7a9787c22 100644 --- a/src/Module.cpp +++ b/src/Module.cpp @@ -615,7 +615,7 @@ void Module::compile(const std::map &output_files) } } debug(1) << "Module.compile(): static_library " << output_files.at(OutputFileType::static_library) << "\n"; - Target base_target(target().os, target().arch, target().bits); + Target base_target(target().os, target().arch, target().bits, target().processor); create_static_library(temp_dir.files(), base_target, output_files.at(OutputFileType::static_library)); } if (contains(output_files, OutputFileType::assembly)) { @@ -923,7 +923,7 @@ void compile_multitarget(const std::string &fn_name, // and add that to the result. if (!base_target.has_feature(Target::NoRuntime)) { // Start with a bare Target, set only the features we know are common to all. - Target runtime_target(base_target.os, base_target.arch, base_target.bits); + Target runtime_target(base_target.os, base_target.arch, base_target.bits, base_target.processor); for (int i = 0; i < Target::FeatureEnd; ++i) { // We never want NoRuntime set here. if (i == Target::NoRuntime) { diff --git a/src/Target.cpp b/src/Target.cpp index fc5f468b3c04..9f2619231bff 100644 --- a/src/Target.cpp +++ b/src/Target.cpp @@ -76,6 +76,7 @@ Target calculate_host_target() { bool use_64_bits = (sizeof(size_t) == 8); int bits = use_64_bits ? 
64 : 32; + Target::Processor processor = Target::Processor::ProcessorGeneric; std::vector initial_features; #if __riscv @@ -189,7 +190,7 @@ Target calculate_host_target() { #endif #endif - return {os, arch, bits, initial_features}; + return {os, arch, bits, processor, initial_features}; } bool is_using_hexagon(const Target &t) { @@ -307,6 +308,31 @@ bool lookup_arch(const std::string &tok, Target::Arch &result) { return false; } +const std::map processor_name_map = { + {"tune_generic", Target::Processor::ProcessorGeneric}, + {"tune_k8", Target::Processor::K8}, + {"tune_k8_sse3", Target::Processor::K8_SSE3}, + {"tune_amdfam10", Target::Processor::AMDFam10}, + {"tune_btver1", Target::Processor::BtVer1}, + {"tune_bdver1", Target::Processor::BdVer1}, + {"tune_bdver2", Target::Processor::BdVer2}, + {"tune_bdver3", Target::Processor::BdVer3}, + {"tune_bdver4", Target::Processor::BdVer4}, + {"tune_btver2", Target::Processor::BtVer2}, + {"tune_znver1", Target::Processor::ZnVer1}, + {"tune_znver2", Target::Processor::ZnVer2}, + {"tune_znver3", Target::Processor::ZnVer3}, +}; + +bool lookup_processor(const std::string &tok, Target::Processor &result) { + auto processor_iter = processor_name_map.find(tok); + if (processor_iter != processor_name_map.end()) { + result = processor_iter->second; + return true; + } + return false; +} + const std::map feature_name_map = { {"jit", Target::JIT}, {"debug", Target::Debug}, @@ -386,18 +412,6 @@ const std::map feature_name_map = { {"armv81a", Target::ARMv81a}, {"sanitizer_coverage", Target::SanitizerCoverage}, {"profile_by_timer", Target::ProfileByTimer}, - {"tune_k8", Target::TuneK8}, - {"tune_k8_sse3", Target::TuneK8_SSE3}, - {"tune_amdfam10", Target::TuneAMDFam10}, - {"tune_btver1", Target::TuneBtVer1}, - {"tune_bdver1", Target::TuneBdVer1}, - {"tune_bdver2", Target::TuneBdVer2}, - {"tune_bdver3", Target::TuneBdVer3}, - {"tune_bdver4", Target::TuneBdVer4}, - {"tune_btver2", Target::TuneBtVer2}, - {"tune_znver1", Target::TuneZnVer1}, - 
{"tune_znver2", Target::TuneZnVer2}, - {"tune_znver3", Target::TuneZnVer3}, // NOTE: When adding features to this map, be sure to update PyEnums.cpp as well. }; @@ -466,7 +480,7 @@ bool merge_string(Target &t, const std::string &target) { } tokens.push_back(rest); - bool os_specified = false, arch_specified = false, bits_specified = false, tune_specified = false, features_specified = false; + bool os_specified = false, arch_specified = false, bits_specified = false, processor_specified = false, features_specified = false; bool is_host = false; for (size_t i = 0; i < tokens.size(); i++) { @@ -496,13 +510,18 @@ bool merge_string(Target &t, const std::string &target) { return false; } os_specified = true; + } else if (lookup_processor(tok, t.processor)) { + if (processor_specified) { + return false; + } + processor_specified = true; } else if (lookup_feature(tok, feature)) { if (tok.substr(0, std::strlen("tune_")) == "tune_") { - if (tune_specified) { + if (processor_specified) { // Only a single tune makes sense. return false; } - tune_specified = true; + processor_specified = true; } t.set_feature(feature); features_specified = true; @@ -560,6 +579,12 @@ void bad_target_string(const std::string &target) { separator = ", "; } separator = ""; + std::string processors; + for (const auto &processor_entry : processor_name_map) { + processors += separator + processor_entry.first; + separator = ", "; + } + separator = ""; // Format the features to go one feature over 70 characters per line, // assume the first line starts with "Features are ". 
int line_char_start = -(int)sizeof("Features are"); @@ -574,10 +599,11 @@ void bad_target_string(const std::string &target) { } } user_error << "Did not understand Halide target " << target << "\n" - << "Expected format is arch-bits-os-feature1-feature2-...\n" + << "Expected format is arch-bits-os-processor-feature1-feature2-...\n" << "Where arch is: " << architectures << ".\n" << "bits is either 32 or 64.\n" << "os is: " << oses << ".\n" + << "processor is: " << processors << ".\n" << "\n" << "If arch, bits, or os are omitted, they default to the host.\n" << "\n" @@ -647,6 +673,12 @@ std::string Target::to_string() const { break; } } + for (const auto &processor_entry : processor_name_map) { + if (processor_entry.second == processor) { + result += "-" + processor_entry.first; + break; + } + } for (const auto &feature_entry : feature_name_map) { if (has_feature(feature_entry.second)) { result += "-" + feature_entry.first; @@ -1066,7 +1098,7 @@ bool Target::get_runtime_compatible_target(const Target &other, Target &result) // Union of features is computed through bitwise-or, and masked away by the features we care about // Intersection of features is computed through bitwise-and and masked away, too. // We merge the bits via bitwise or. - Target output = Target{os, arch, bits}; + Target output = Target{os, arch, bits, processor}; output.features = ((features | other.features) & union_mask) | ((features | other.features) & matching_mask) | ((features & other.features) & intersection_mask); // Pick tight lower bound for CUDA capability. Use fall-through to clear redundant features diff --git a/src/Target.h b/src/Target.h index 6c22c3651d5e..c0d9043e8081 100644 --- a/src/Target.h +++ b/src/Target.h @@ -50,6 +50,24 @@ struct Target { /** The bit-width of the target machine. Must be 0 for unknown, or 32 or 64. */ int bits = 0; + /** The specific processor to be targeted, tuned for. + * Corresponds to processor_name_map in Target.cpp. 
*/ + enum Processor { + ProcessorGeneric = 0, + K8, + K8_SSE3, + AMDFam10, + BtVer1, + BdVer1, + BdVer2, + BdVer3, + BdVer4, + BtVer2, + ZnVer1, + ZnVer2, + ZnVer3, + } processor = ProcessorGeneric; + /** Optional features a target can have. * Corresponds to feature_name_map in Target.cpp. * See definitions in HalideRuntime.h for full information. @@ -133,23 +151,11 @@ struct Target { ARMv81a = halide_target_feature_armv81a, SanitizerCoverage = halide_target_feature_sanitizer_coverage, ProfileByTimer = halide_target_feature_profile_by_timer, - TuneK8 = halide_target_feature_tune_k8, - TuneK8_SSE3 = halide_target_feature_tune_k8_sse3, - TuneAMDFam10 = halide_target_feature_tune_amdfam10, - TuneBtVer1 = halide_target_feature_tune_btver1, - TuneBdVer1 = halide_target_feature_tune_bdver1, - TuneBdVer2 = halide_target_feature_tune_bdver2, - TuneBdVer3 = halide_target_feature_tune_bdver3, - TuneBdVer4 = halide_target_feature_tune_bdver4, - TuneBtVer2 = halide_target_feature_tune_btver2, - TuneZnVer1 = halide_target_feature_tune_znver1, - TuneZnVer2 = halide_target_feature_tune_znver2, - TuneZnVer3 = halide_target_feature_tune_znver3, FeatureEnd = halide_target_feature_end }; Target() = default; - Target(OS o, Arch a, int b, const std::vector &initial_features = std::vector()) - : os(o), arch(a), bits(b) { + Target(OS o, Arch a, int b, Processor p, const std::vector &initial_features = std::vector()) + : os(o), arch(a), bits(b), processor(p) { for (const auto &f : initial_features) { set_feature(f); } @@ -238,6 +244,7 @@ struct Target { return os == other.os && arch == other.arch && bits == other.bits && + processor == other.processor && features == other.features; } @@ -259,7 +266,7 @@ struct Target { /** Convert the Target into a string form that can be reconstituted * by merge_string(), which will always be of the form * - * arch-bits-os-feature1-feature2...featureN. + * arch-bits-os-processor-feature1-feature2...featureN. 
* * Note that is guaranteed that Target(t1.to_string()) == t1, * but not that Target(s).to_string() == s (since there can be diff --git a/src/runtime/HalideRuntime.h b/src/runtime/HalideRuntime.h index 8df5c326ae59..6089110420a7 100644 --- a/src/runtime/HalideRuntime.h +++ b/src/runtime/HalideRuntime.h @@ -1348,18 +1348,6 @@ typedef enum halide_target_feature_t { halide_target_feature_armv81a, ///< Enable ARMv8.1-a instructions halide_target_feature_sanitizer_coverage, ///< Enable hooks for SanitizerCoverage support. halide_target_feature_profile_by_timer, ///< Alternative to halide_target_feature_profile using timer interrupt for systems without threads or applicartions that need to avoid them. - halide_target_feature_tune_k8, ///< Tune specifically for the AMD K8 CPU. - halide_target_feature_tune_k8_sse3, ///< Tune specifically for the AMD K8 w/SSE3 CPU. - halide_target_feature_tune_amdfam10, ///< Tune specifically for the AMD FAM10 CPU. - halide_target_feature_tune_btver1, ///< Tune specifically for the AMD BtVer1 CPU. - halide_target_feature_tune_bdver1, ///< Tune specifically for the AMD BdVer1 CPU. - halide_target_feature_tune_bdver2, ///< Tune specifically for the AMD BdVer2 CPU. - halide_target_feature_tune_bdver3, ///< Tune specifically for the AMD BdVer3 CPU. - halide_target_feature_tune_bdver4, ///< Tune specifically for the AMD BdVer4 CPU. - halide_target_feature_tune_btver2, ///< Tune specifically for the AMD BtVer2 CPU. - halide_target_feature_tune_znver1, ///< Tune specifically for the AMD ZnVer1 CPU. - halide_target_feature_tune_znver2, ///< Tune specifically for the AMD ZnVer2 CPU. - halide_target_feature_tune_znver3, ///< Tune specifically for the AMD ZnVer3 CPU. halide_target_feature_end ///< A sentinel. Every target is considered to have this feature, and setting this feature does nothing. 
} halide_target_feature_t; diff --git a/test/correctness/simd_op_check_hvx.cpp b/test/correctness/simd_op_check_hvx.cpp index 8b1b42e94eda..1e9940ce9b3f 100644 --- a/test/correctness/simd_op_check_hvx.cpp +++ b/test/correctness/simd_op_check_hvx.cpp @@ -708,7 +708,7 @@ int main(int argc, char **argv) { printf("host is: %s\n", host.to_string().c_str()); printf("HL_TARGET is: %s\n", hl_target.to_string().c_str()); - Target t(Target::NoOS, Target::Hexagon, 32); + Target t(Target::NoOS, Target::Hexagon, 32, Target::ProcessorGeneric); for (const auto &f : {Target::HVX, Target::HVX_v62, Target::HVX_v65, diff --git a/test/correctness/target.cpp b/test/correctness/target.cpp index 7c575c5233ee..2da8f9a0038f 100644 --- a/test/correctness/target.cpp +++ b/test/correctness/target.cpp @@ -38,7 +38,7 @@ int main(int argc, char **argv) { // } // Full specification round-trip: - t1 = Target(Target::Linux, Target::X86, 32, {Target::SSE41}); + t1 = Target(Target::Linux, Target::X86, 32, Target::ProcessorGeneric, {Target::SSE41}); ts = t1.to_string(); if (ts != "x86-32-linux-sse41") { printf("to_string failure: %s\n", ts.c_str()); @@ -50,7 +50,7 @@ int main(int argc, char **argv) { } // Full specification round-trip, crazy features - t1 = Target(Target::Android, Target::ARM, 32, + t1 = Target(Target::Android, Target::ARM, 32, Target::ProcessorGeneric, {Target::JIT, Target::SSE41, Target::AVX, Target::AVX2, Target::CUDA, Target::OpenCL, Target::OpenGLCompute, Target::Debug}); @@ -99,7 +99,7 @@ int main(int argc, char **argv) { } // with_feature - t1 = Target(Target::Linux, Target::X86, 32, {Target::SSE41}); + t1 = Target(Target::Linux, Target::X86, 32, Target::ProcessorGeneric, {Target::SSE41}); t2 = t1.with_feature(Target::NoAsserts).with_feature(Target::NoBoundsQuery); ts = t2.to_string(); if (ts != "x86-32-linux-no_asserts-no_bounds_query-sse41") { @@ -108,7 +108,7 @@ int main(int argc, char **argv) { } // without_feature - t1 = Target(Target::Linux, Target::X86, 32, 
{Target::SSE41, Target::NoAsserts}); + t1 = Target(Target::Linux, Target::X86, 32, Target::ProcessorGeneric, {Target::SSE41, Target::NoAsserts}); // Note that NoBoundsQuery wasn't set here, so 'without' is a no-op t2 = t1.without_feature(Target::NoAsserts).without_feature(Target::NoBoundsQuery); ts = t2.to_string(); @@ -119,7 +119,7 @@ int main(int argc, char **argv) { // natural_vector_size // SSE4.1 is 16 bytes wide - t1 = Target(Target::Linux, Target::X86, 32, {Target::SSE41}); + t1 = Target(Target::Linux, Target::X86, 32, Target::ProcessorGeneric, {Target::SSE41}); if (t1.natural_vector_size() != 16) { printf("natural_vector_size failure\n"); return -1; @@ -139,7 +139,7 @@ int main(int argc, char **argv) { // AVX is 32 bytes wide for float, but we treat as only 16 for integral types, // due to suboptimal integer instructions - t1 = Target(Target::Linux, Target::X86, 32, {Target::SSE41, Target::AVX}); + t1 = Target(Target::Linux, Target::X86, 32, Target::ProcessorGeneric, {Target::SSE41, Target::AVX}); if (t1.natural_vector_size() != 16) { printf("natural_vector_size failure\n"); return -1; @@ -158,7 +158,7 @@ int main(int argc, char **argv) { } // AVX2 is 32 bytes wide - t1 = Target(Target::Linux, Target::X86, 32, {Target::SSE41, Target::AVX, Target::AVX2}); + t1 = Target(Target::Linux, Target::X86, 32, Target::ProcessorGeneric, {Target::SSE41, Target::AVX, Target::AVX2}); if (t1.natural_vector_size() != 32) { printf("natural_vector_size failure\n"); return -1; @@ -177,7 +177,7 @@ int main(int argc, char **argv) { } // NEON is 16 bytes wide - t1 = Target(Target::Linux, Target::ARM, 32); + t1 = Target(Target::Linux, Target::ARM, 32, Target::ProcessorGeneric); if (t1.natural_vector_size() != 16) { printf("natural_vector_size failure\n"); return -1; From 78bfb586bbe4f1140caf7f5fa7c641a7629d39be Mon Sep 17 00:00:00 2001 From: Roman Lebedev Date: Sat, 26 Mar 2022 20:11:35 +0300 Subject: [PATCH 03/12] clang-format --- python_bindings/src/PyTarget.cpp | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/python_bindings/src/PyTarget.cpp b/python_bindings/src/PyTarget.cpp index 21f87038bd15..7984dfc60441 100644 --- a/python_bindings/src/PyTarget.cpp +++ b/python_bindings/src/PyTarget.cpp @@ -38,7 +38,7 @@ void define_target(py::module &m) { .def("__str__", &Target::to_string) .def("to_string", &Target::to_string) - .def("has_feature", (bool(Target::*)(Target::Feature) const) & Target::has_feature) + .def("has_feature", (bool (Target::*)(Target::Feature) const) & Target::has_feature) .def("features_any_of", &Target::features_any_of, py::arg("features")) .def("features_all_of", &Target::features_all_of, py::arg("features")) From 8b9aaefd8158c28ef6932e2c67083fcf30b2fbe1 Mon Sep 17 00:00:00 2001 From: Roman Lebedev Date: Wed, 30 Mar 2022 02:04:21 +0300 Subject: [PATCH 04/12] Target: make Processor more optional --- python_bindings/src/PyTarget.cpp | 2 ++ src/Target.h | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/python_bindings/src/PyTarget.cpp b/python_bindings/src/PyTarget.cpp index 7984dfc60441..8f3f9a631c58 100644 --- a/python_bindings/src/PyTarget.cpp +++ b/python_bindings/src/PyTarget.cpp @@ -23,7 +23,9 @@ void define_target(py::module &m) { py::class_(m, "Target") .def(py::init<>()) .def(py::init()) + .def(py::init()) .def(py::init()) + .def(py::init>()) .def(py::init>()) .def("__eq__", [](const Target &value, Target *value2) { return value2 && value == *value2; }) diff --git a/src/Target.h b/src/Target.h index c0d9043e8081..2e1e365a8b93 100644 --- a/src/Target.h +++ b/src/Target.h @@ -154,7 +154,7 @@ struct Target { FeatureEnd = halide_target_feature_end }; Target() = default; - Target(OS o, Arch a, int b, Processor p, const std::vector &initial_features = std::vector()) + Target(OS o, Arch a, int b, Processor p = ProcessorGeneric, const std::vector &initial_features = std::vector()) : os(o), arch(a), bits(b), processor(p) { for (const auto &f : initial_features) { set_feature(f); From 
2c3e5f9c5a03765793cdc018c54d582cf3802c41 Mon Sep 17 00:00:00 2001 From: Roman Lebedev Date: Wed, 30 Mar 2022 02:04:43 +0300 Subject: [PATCH 05/12] Processor: add explanatory comments which CPU is what --- src/Target.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/Target.h b/src/Target.h index 2e1e365a8b93..48feb86acf3c 100644 --- a/src/Target.h +++ b/src/Target.h @@ -53,18 +53,32 @@ struct Target { /** The specific processor to be targeted, tuned for. * Corresponds to processor_name_map in Target.cpp. */ enum Processor { + /// Do not tune for any specific CPU. In practice, this means that + /// halide will decide the tune CPU based on the enabled features. ProcessorGeneric = 0, + /// Tune for AMD K8 Hammer CPU (AMD Family 0Fh, launched 2003). K8, + /// Tune for later versions of AMD K8 CPU, with SSE3 support. K8_SSE3, + /// Tune for AMD K10 "Barcelona" CPU (AMD Family 10h, launched 2007). AMDFam10, + /// Tune for AMD Bobcat CPU (AMD Family 14h, launched 2011). BtVer1, + /// Tune for AMD Bulldozer CPU (AMD Family 15h, launched 2011). BdVer1, + /// Tune for AMD Piledriver CPU (AMD Family 15h (2nd-gen), launched 2012). BdVer2, + /// Tune for AMD Steamroller CPU (AMD Family 15h (3nd-gen), launched 2014). BdVer3, + /// Tune for AMD Steamroller CPU (AMD Family 15h (4th-gen), launched 2015). BdVer4, + /// Tune for AMD Jaguar CPU (AMD Family 16h, launched 2011). BtVer2, + /// Tune for AMD Zen CPU (AMD Family 17h, launched 2017). ZnVer1, + /// Tune for AMD Zen 2 CPU (AMD Family 17h, launched 2019). ZnVer2, + /// Tune for AMD Zen 3 CPU (AMD Family 19h, launched 2020). 
ZnVer3, } processor = ProcessorGeneric; From a1128b45f371568616a5dc61885452782b1f731e Mon Sep 17 00:00:00 2001 From: Roman Lebedev Date: Wed, 30 Mar 2022 02:05:01 +0300 Subject: [PATCH 06/12] Drop outdated changes --- src/Target.cpp | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/src/Target.cpp b/src/Target.cpp index 9f2619231bff..93ef617b5a64 100644 --- a/src/Target.cpp +++ b/src/Target.cpp @@ -516,13 +516,6 @@ bool merge_string(Target &t, const std::string &target) { } processor_specified = true; } else if (lookup_feature(tok, feature)) { - if (tok.substr(0, std::strlen("tune_")) == "tune_") { - if (processor_specified) { - // Only a single tune makes sense. - return false; - } - processor_specified = true; - } t.set_feature(feature); features_specified = true; } else if (tok == "trace_all") { @@ -1031,7 +1024,7 @@ bool Target::get_runtime_compatible_target(const Target &other, Target &result) // clang-format on // clang-format off - const std::array intersection_features = {{ + const std::array intersection_features = {{ ARMv7s, ARMv81a, AVX, From 1fe950f39c0ed7603d9cc78debdaaf63ef5ddfce Mon Sep 17 00:00:00 2001 From: Roman Lebedev Date: Wed, 30 Mar 2022 02:12:28 +0300 Subject: [PATCH 07/12] Make comments in Processor more readable / fix BtVer2 comment --- src/Target.h | 36 ++++++++++++------------------------ 1 file changed, 12 insertions(+), 24 deletions(-) diff --git a/src/Target.h b/src/Target.h index 48feb86acf3c..062871717b09 100644 --- a/src/Target.h +++ b/src/Target.h @@ -56,30 +56,18 @@ struct Target { /// Do not tune for any specific CPU. In practice, this means that /// halide will decide the tune CPU based on the enabled features. ProcessorGeneric = 0, - /// Tune for AMD K8 Hammer CPU (AMD Family 0Fh, launched 2003). - K8, - /// Tune for later versions of AMD K8 CPU, with SSE3 support. - K8_SSE3, - /// Tune for AMD K10 "Barcelona" CPU (AMD Family 10h, launched 2007). 
- AMDFam10, - /// Tune for AMD Bobcat CPU (AMD Family 14h, launched 2011). - BtVer1, - /// Tune for AMD Bulldozer CPU (AMD Family 15h, launched 2011). - BdVer1, - /// Tune for AMD Piledriver CPU (AMD Family 15h (2nd-gen), launched 2012). - BdVer2, - /// Tune for AMD Steamroller CPU (AMD Family 15h (3nd-gen), launched 2014). - BdVer3, - /// Tune for AMD Steamroller CPU (AMD Family 15h (4th-gen), launched 2015). - BdVer4, - /// Tune for AMD Jaguar CPU (AMD Family 16h, launched 2011). - BtVer2, - /// Tune for AMD Zen CPU (AMD Family 17h, launched 2017). - ZnVer1, - /// Tune for AMD Zen 2 CPU (AMD Family 17h, launched 2019). - ZnVer2, - /// Tune for AMD Zen 3 CPU (AMD Family 19h, launched 2020). - ZnVer3, + K8, /// Tune for AMD K8 Hammer CPU (AMD Family 0Fh, launched 2003). + K8_SSE3, /// Tune for later versions of AMD K8 CPU, with SSE3 support. + AMDFam10, /// Tune for AMD K10 "Barcelona" CPU (AMD Family 10h, launched 2007). + BtVer1, /// Tune for AMD Bobcat CPU (AMD Family 14h, launched 2011). + BdVer1, /// Tune for AMD Bulldozer CPU (AMD Family 15h, launched 2011). + BdVer2, /// Tune for AMD Piledriver CPU (AMD Family 15h (2nd-gen), launched 2012). + BdVer3, /// Tune for AMD Steamroller CPU (AMD Family 15h (3nd-gen), launched 2014). + BdVer4, /// Tune for AMD Excavator CPU (AMD Family 15h (4th-gen), launched 2015). + BtVer2, /// Tune for AMD Jaguar CPU (AMD Family 16h, launched 2011). + ZnVer1, /// Tune for AMD Zen CPU (AMD Family 17h, launched 2017). + ZnVer2, /// Tune for AMD Zen 2 CPU (AMD Family 17h, launched 2019). + ZnVer3, /// Tune for AMD Zen 3 CPU (AMD Family 19h, launched 2020). } processor = ProcessorGeneric; /** Optional features a target can have. 
From bc4d8fbef597e35d0dd9cf2b444c97187d224dfe Mon Sep 17 00:00:00 2001 From: Roman Lebedev Date: Wed, 30 Mar 2022 02:18:45 +0300 Subject: [PATCH 08/12] Target: don't require passing Processor --- src/CPlusPlusMangle.cpp | 16 ++++++++-------- src/HexagonOffload.cpp | 2 +- src/Target.h | 6 +++++- test/correctness/simd_op_check_hvx.cpp | 2 +- test/correctness/target.cpp | 16 ++++++++-------- 5 files changed, 23 insertions(+), 19 deletions(-) diff --git a/src/CPlusPlusMangle.cpp b/src/CPlusPlusMangle.cpp index 37efa6f904b8..05c9d552e68f 100644 --- a/src/CPlusPlusMangle.cpp +++ b/src/CPlusPlusMangle.cpp @@ -942,14 +942,14 @@ void main_tests(const MangleResult *expecteds, const Target &target) { void cplusplus_mangle_test() { Target targets[kTestTargetCount]{ - Target(Target::Linux, Target::X86, 32, Target::Processor::ProcessorGeneric), - Target(Target::Linux, Target::X86, 64, Target::Processor::ProcessorGeneric), - Target(Target::OSX, Target::X86, 32, Target::Processor::ProcessorGeneric), - Target(Target::OSX, Target::X86, 64, Target::Processor::ProcessorGeneric), - Target(Target::IOS, Target::ARM, 32, Target::Processor::ProcessorGeneric), - Target(Target::IOS, Target::ARM, 64, Target::Processor::ProcessorGeneric), - Target(Target::Windows, Target::X86, 32, Target::Processor::ProcessorGeneric), - Target(Target::Windows, Target::X86, 64, Target::Processor::ProcessorGeneric)}; + Target(Target::Linux, Target::X86, 32), + Target(Target::Linux, Target::X86, 64), + Target(Target::OSX, Target::X86, 32), + Target(Target::OSX, Target::X86, 64), + Target(Target::IOS, Target::ARM, 32), + Target(Target::IOS, Target::ARM, 64), + Target(Target::Windows, Target::X86, 32), + Target(Target::Windows, Target::X86, 64)}; MangleResult *expecteds[kTestTargetCount]{ ItaniumABIMangling_main, ItaniumABIMangling_main, ItaniumABIMangling_main, ItaniumABIMangling_main, diff --git a/src/HexagonOffload.cpp b/src/HexagonOffload.cpp index 05b16a953ec6..8ffd1d0c2e4d 100644 --- a/src/HexagonOffload.cpp 
+++ b/src/HexagonOffload.cpp @@ -967,7 +967,7 @@ class InjectHexagonRpc : public IRMutator { Stmt inject_hexagon_rpc(Stmt s, const Target &host_target, Module &containing_module) { // Make a new target for the device module. - Target target(Target::NoOS, Target::Hexagon, 32, Target::ProcessorGeneric); + Target target(Target::NoOS, Target::Hexagon, 32); // There are two ways of offloading, on device and on host. // In the former we have true QuRT available, while on the // latter we simulate the Hexagon side code with a barebones diff --git a/src/Target.h b/src/Target.h index 062871717b09..2667d6ed2504 100644 --- a/src/Target.h +++ b/src/Target.h @@ -156,13 +156,17 @@ struct Target { FeatureEnd = halide_target_feature_end }; Target() = default; - Target(OS o, Arch a, int b, Processor p = ProcessorGeneric, const std::vector &initial_features = std::vector()) + Target(OS o, Arch a, int b, Processor p, const std::vector &initial_features = std::vector()) : os(o), arch(a), bits(b), processor(p) { for (const auto &f : initial_features) { set_feature(f); } } + Target(OS o, Arch a, int b, const std::vector &initial_features = std::vector()) + : Target(o, a, b, ProcessorGeneric, initial_features) { + } + /** Given a string of the form used in HL_TARGET * (e.g. "x86-64-avx"), construct the Target it specifies. 
Note * that this always starts with the result of get_host_target(), diff --git a/test/correctness/simd_op_check_hvx.cpp b/test/correctness/simd_op_check_hvx.cpp index 1e9940ce9b3f..8b1b42e94eda 100644 --- a/test/correctness/simd_op_check_hvx.cpp +++ b/test/correctness/simd_op_check_hvx.cpp @@ -708,7 +708,7 @@ int main(int argc, char **argv) { printf("host is: %s\n", host.to_string().c_str()); printf("HL_TARGET is: %s\n", hl_target.to_string().c_str()); - Target t(Target::NoOS, Target::Hexagon, 32, Target::ProcessorGeneric); + Target t(Target::NoOS, Target::Hexagon, 32); for (const auto &f : {Target::HVX, Target::HVX_v62, Target::HVX_v65, diff --git a/test/correctness/target.cpp b/test/correctness/target.cpp index 2da8f9a0038f..7c575c5233ee 100644 --- a/test/correctness/target.cpp +++ b/test/correctness/target.cpp @@ -38,7 +38,7 @@ int main(int argc, char **argv) { // } // Full specification round-trip: - t1 = Target(Target::Linux, Target::X86, 32, Target::ProcessorGeneric, {Target::SSE41}); + t1 = Target(Target::Linux, Target::X86, 32, {Target::SSE41}); ts = t1.to_string(); if (ts != "x86-32-linux-sse41") { printf("to_string failure: %s\n", ts.c_str()); @@ -50,7 +50,7 @@ int main(int argc, char **argv) { } // Full specification round-trip, crazy features - t1 = Target(Target::Android, Target::ARM, 32, Target::ProcessorGeneric, + t1 = Target(Target::Android, Target::ARM, 32, {Target::JIT, Target::SSE41, Target::AVX, Target::AVX2, Target::CUDA, Target::OpenCL, Target::OpenGLCompute, Target::Debug}); @@ -99,7 +99,7 @@ int main(int argc, char **argv) { } // with_feature - t1 = Target(Target::Linux, Target::X86, 32, Target::ProcessorGeneric, {Target::SSE41}); + t1 = Target(Target::Linux, Target::X86, 32, {Target::SSE41}); t2 = t1.with_feature(Target::NoAsserts).with_feature(Target::NoBoundsQuery); ts = t2.to_string(); if (ts != "x86-32-linux-no_asserts-no_bounds_query-sse41") { @@ -108,7 +108,7 @@ int main(int argc, char **argv) { } // without_feature - t1 = 
Target(Target::Linux, Target::X86, 32, Target::ProcessorGeneric, {Target::SSE41, Target::NoAsserts}); + t1 = Target(Target::Linux, Target::X86, 32, {Target::SSE41, Target::NoAsserts}); // Note that NoBoundsQuery wasn't set here, so 'without' is a no-op t2 = t1.without_feature(Target::NoAsserts).without_feature(Target::NoBoundsQuery); ts = t2.to_string(); @@ -119,7 +119,7 @@ int main(int argc, char **argv) { // natural_vector_size // SSE4.1 is 16 bytes wide - t1 = Target(Target::Linux, Target::X86, 32, Target::ProcessorGeneric, {Target::SSE41}); + t1 = Target(Target::Linux, Target::X86, 32, {Target::SSE41}); if (t1.natural_vector_size() != 16) { printf("natural_vector_size failure\n"); return -1; @@ -139,7 +139,7 @@ int main(int argc, char **argv) { // AVX is 32 bytes wide for float, but we treat as only 16 for integral types, // due to suboptimal integer instructions - t1 = Target(Target::Linux, Target::X86, 32, Target::ProcessorGeneric, {Target::SSE41, Target::AVX}); + t1 = Target(Target::Linux, Target::X86, 32, {Target::SSE41, Target::AVX}); if (t1.natural_vector_size() != 16) { printf("natural_vector_size failure\n"); return -1; @@ -158,7 +158,7 @@ int main(int argc, char **argv) { } // AVX2 is 32 bytes wide - t1 = Target(Target::Linux, Target::X86, 32, Target::ProcessorGeneric, {Target::SSE41, Target::AVX, Target::AVX2}); + t1 = Target(Target::Linux, Target::X86, 32, {Target::SSE41, Target::AVX, Target::AVX2}); if (t1.natural_vector_size() != 32) { printf("natural_vector_size failure\n"); return -1; @@ -177,7 +177,7 @@ int main(int argc, char **argv) { } // NEON is 16 bytes wide - t1 = Target(Target::Linux, Target::ARM, 32, Target::ProcessorGeneric); + t1 = Target(Target::Linux, Target::ARM, 32); if (t1.natural_vector_size() != 16) { printf("natural_vector_size failure\n"); return -1; From e218a4bfbdb987c39faf9c346878ac9160657b10 Mon Sep 17 00:00:00 2001 From: Roman Lebedev Date: Wed, 30 Mar 2022 02:40:43 +0300 Subject: [PATCH 09/12] Make processor more 
optional in the features string serialization/verification --- src/Target.cpp | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/Target.cpp b/src/Target.cpp index 93ef617b5a64..3bb966955fad 100644 --- a/src/Target.cpp +++ b/src/Target.cpp @@ -600,6 +600,8 @@ void bad_target_string(const std::string &target) { << "\n" << "If arch, bits, or os are omitted, they default to the host.\n" << "\n" + << "If processor is omitted, it default to the tune_generic.\n" + << "\n" << "Features are: " << features << ".\n" << "\n" << "The target can also begin with \"host\", which sets the " @@ -666,10 +668,12 @@ std::string Target::to_string() const { break; } } - for (const auto &processor_entry : processor_name_map) { - if (processor_entry.second == processor) { - result += "-" + processor_entry.first; - break; + if (processor != ProcessorGeneric) { + for (const auto &processor_entry : processor_name_map) { + if (processor_entry.second == processor) { + result += "-" + processor_entry.first; + break; + } } } for (const auto &feature_entry : feature_name_map) { From 9bda615fb23535cf06557bde3871594b765510ba Mon Sep 17 00:00:00 2001 From: Roman Lebedev Date: Wed, 30 Mar 2022 20:11:20 +0300 Subject: [PATCH 10/12] Address review notes --- python_bindings/src/PyEnums.cpp | 9 +++++---- src/CodeGen_X86.cpp | 15 ++++++++------- src/Target.cpp | 17 ++++++++++++----- src/Target.h | 32 ++++++++++++++++---------------- src/runtime/HalideRuntime.h | 21 +++++++++++++++++++++ 5 files changed, 62 insertions(+), 32 deletions(-) diff --git a/python_bindings/src/PyEnums.cpp b/python_bindings/src/PyEnums.cpp index 7e96b60f02a2..5b2a3204db19 100644 --- a/python_bindings/src/PyEnums.cpp +++ b/python_bindings/src/PyEnums.cpp @@ -82,17 +82,18 @@ void define_enums(py::module &m) { .value("RISCV", Target::Arch::RISCV) .value("WebAssembly", Target::Arch::WebAssembly); + // Please keep sorted. 
py::enum_(m, "TargetProcessor") - .value("TuneGeneric", Target::Processor::ProcessorGeneric) - .value("TuneK8", Target::Processor::K8) - .value("TuneK8_SSE3", Target::Processor::K8_SSE3) .value("TuneAMDFam10", Target::Processor::AMDFam10) - .value("TuneBtVer1", Target::Processor::BtVer1) .value("TuneBdVer1", Target::Processor::BdVer1) .value("TuneBdVer2", Target::Processor::BdVer2) .value("TuneBdVer3", Target::Processor::BdVer3) .value("TuneBdVer4", Target::Processor::BdVer4) + .value("TuneBtVer1", Target::Processor::BtVer1) .value("TuneBtVer2", Target::Processor::BtVer2) + .value("TuneGeneric", Target::Processor::ProcessorGeneric) + .value("TuneK8", Target::Processor::K8) + .value("TuneK8_SSE3", Target::Processor::K8_SSE3) .value("TuneZnVer1", Target::Processor::ZnVer1) .value("TuneZnVer2", Target::Processor::ZnVer2) .value("TuneZnVer3", Target::Processor::ZnVer3); diff --git a/src/CodeGen_X86.cpp b/src/CodeGen_X86.cpp index f91f12045d72..c1a957242848 100644 --- a/src/CodeGen_X86.cpp +++ b/src/CodeGen_X86.cpp @@ -687,15 +687,9 @@ void CodeGen_X86::visit(const Store *op) { string CodeGen_X86::mcpu() const { // First, check if any explicit request for tuning exists. - switch (target.processor) { - case Target::Processor::K8: - return "k8"; - case Target::Processor::K8_SSE3: - return "k8-sse3"; + switch (target.processor) { // Please keep sorted. 
case Target::Processor::AMDFam10: return "amdfam10"; - case Target::Processor::BtVer1: - return "btver1"; case Target::Processor::BdVer1: return "bdver1"; case Target::Processor::BdVer2: @@ -704,14 +698,21 @@ string CodeGen_X86::mcpu() const { return "bdver3"; case Target::Processor::BdVer4: return "bdver4"; + case Target::Processor::BtVer1: + return "btver1"; case Target::Processor::BtVer2: return "btver2"; + case Target::Processor::K8: + return "k8"; + case Target::Processor::K8_SSE3: + return "k8-sse3"; case Target::Processor::ZnVer1: return "znver1"; case Target::Processor::ZnVer2: return "znver2"; case Target::Processor::ZnVer3: return "znver3"; + case Target::Processor::ProcessorGeneric: break; // Detect "best" CPU from the enabled ISA's. } diff --git a/src/Target.cpp b/src/Target.cpp index 3bb966955fad..7e529ef924ee 100644 --- a/src/Target.cpp +++ b/src/Target.cpp @@ -308,17 +308,24 @@ bool lookup_arch(const std::string &tok, Target::Arch &result) { return false; } +/// Important design consideration: currently, the string key is +/// effectively identical to the LLVM CPU string, and it would be really really +/// good to keep it that way, so the proper tune_* can be autogenerated easily +/// from the LLVM CPU string (currently, by replacing "-" with "_", +/// and prepending "tune_" prefix) +/// +/// Please keep sorted. 
const std::map processor_name_map = { - {"tune_generic", Target::Processor::ProcessorGeneric}, - {"tune_k8", Target::Processor::K8}, - {"tune_k8_sse3", Target::Processor::K8_SSE3}, {"tune_amdfam10", Target::Processor::AMDFam10}, - {"tune_btver1", Target::Processor::BtVer1}, {"tune_bdver1", Target::Processor::BdVer1}, {"tune_bdver2", Target::Processor::BdVer2}, {"tune_bdver3", Target::Processor::BdVer3}, {"tune_bdver4", Target::Processor::BdVer4}, + {"tune_btver1", Target::Processor::BtVer1}, {"tune_btver2", Target::Processor::BtVer2}, + {"tune_generic", Target::Processor::ProcessorGeneric}, + {"tune_k8", Target::Processor::K8}, + {"tune_k8_sse3", Target::Processor::K8_SSE3}, {"tune_znver1", Target::Processor::ZnVer1}, {"tune_znver2", Target::Processor::ZnVer2}, {"tune_znver3", Target::Processor::ZnVer3}, @@ -600,7 +607,7 @@ void bad_target_string(const std::string &target) { << "\n" << "If arch, bits, or os are omitted, they default to the host.\n" << "\n" - << "If processor is omitted, it default to the tune_generic.\n" + << "If processor is omitted, it defaults to tune_generic.\n" << "\n" << "Features are: " << features << ".\n" << "\n" diff --git a/src/Target.h b/src/Target.h index 2667d6ed2504..45aaeefa9d84 100644 --- a/src/Target.h +++ b/src/Target.h @@ -51,23 +51,23 @@ struct Target { int bits = 0; /** The specific processor to be targeted, tuned for. - * Corresponds to processor_name_map in Target.cpp. */ + * Corresponds to processor_name_map in Target.cpp. + * + * Please keep sorted. */ enum Processor { - /// Do not tune for any specific CPU. In practice, this means that - /// halide will decide the tune CPU based on the enabled features. - ProcessorGeneric = 0, - K8, /// Tune for AMD K8 Hammer CPU (AMD Family 0Fh, launched 2003). - K8_SSE3, /// Tune for later versions of AMD K8 CPU, with SSE3 support. - AMDFam10, /// Tune for AMD K10 "Barcelona" CPU (AMD Family 10h, launched 2007). - BtVer1, /// Tune for AMD Bobcat CPU (AMD Family 14h, launched 2011). 
- BdVer1, /// Tune for AMD Bulldozer CPU (AMD Family 15h, launched 2011). - BdVer2, /// Tune for AMD Piledriver CPU (AMD Family 15h (2nd-gen), launched 2012). - BdVer3, /// Tune for AMD Steamroller CPU (AMD Family 15h (3nd-gen), launched 2014). - BdVer4, /// Tune for AMD Excavator CPU (AMD Family 15h (4th-gen), launched 2015). - BtVer2, /// Tune for AMD Jaguar CPU (AMD Family 16h, launched 2011). - ZnVer1, /// Tune for AMD Zen CPU (AMD Family 17h, launched 2017). - ZnVer2, /// Tune for AMD Zen 2 CPU (AMD Family 17h, launched 2019). - ZnVer3, /// Tune for AMD Zen 3 CPU (AMD Family 19h, launched 2020). + AMDFam10 = halide_target_processor_amdfam10, /// Tune for AMD K10 "Barcelona" CPU (AMD Family 10h, launched 2007). + BdVer1 = halide_target_processor_bdver1, /// Tune for AMD Bulldozer CPU (AMD Family 15h, launched 2011). + BdVer2 = halide_target_processor_bdver2, /// Tune for AMD Piledriver CPU (AMD Family 15h (2nd-gen), launched 2012). + BdVer3 = halide_target_processor_bdver3, /// Tune for AMD Steamroller CPU (AMD Family 15h (3nd-gen), launched 2014). + BdVer4 = halide_target_processor_bdver4, /// Tune for AMD Excavator CPU (AMD Family 15h (4th-gen), launched 2015). + BtVer1 = halide_target_processor_btver1, /// Tune for AMD Bobcat CPU (AMD Family 14h, launched 2011). + BtVer2 = halide_target_processor_btver2, /// Tune for AMD Jaguar CPU (AMD Family 16h, launched 2011). + K8 = halide_target_processor_k8, /// Tune for AMD K8 Hammer CPU (AMD Family 0Fh, launched 2003). + K8_SSE3 = halide_target_processor_k8_sse3, /// Tune for later versions of AMD K8 CPU, with SSE3 support. + ProcessorGeneric = halide_target_processor_generic, /// Do not tune for any specific CPU. In practice, this means that halide will decide the tune CPU based on the enabled features. + ZnVer1 = halide_target_processor_znver1, /// Tune for AMD Zen CPU (AMD Family 17h, launched 2017). + ZnVer2 = halide_target_processor_znver2, /// Tune for AMD Zen 2 CPU (AMD Family 17h, launched 2019). 
+ ZnVer3 = halide_target_processor_znver3, /// Tune for AMD Zen 3 CPU (AMD Family 19h, launched 2020). } processor = ProcessorGeneric; /** Optional features a target can have. diff --git a/src/runtime/HalideRuntime.h b/src/runtime/HalideRuntime.h index 6089110420a7..9b302c97e003 100644 --- a/src/runtime/HalideRuntime.h +++ b/src/runtime/HalideRuntime.h @@ -1253,6 +1253,27 @@ extern int halide_error_storage_bound_too_small(void *user_context, const char * extern int halide_error_device_crop_failed(void *user_context); // @} +/** The specific processor to be targeted, tuned for. + * Be sure to keep this in sync with the Processor enum in Target.h + * + * New entries should be added to the end, before halide_target_processor_end. */ +typedef enum halide_target_processor_t { + halide_target_processor_generic = 0, ///< Do not tune for any specific CPU. In practice, this means that halide will decide the tune CPU based on the enabled features. + halide_target_processor_k8, ///< Tune for AMD K8 Hammer CPU (AMD Family 0Fh, launched 2003). + halide_target_processor_k8_sse3, ///< Tune for later versions of AMD K8 CPU, with SSE3 support. + halide_target_processor_amdfam10, ///< Tune for AMD K10 "Barcelona" CPU (AMD Family 10h, launched 2007). + halide_target_processor_btver1, ///< Tune for AMD Bobcat CPU (AMD Family 14h, launched 2011). + halide_target_processor_bdver1, ///< Tune for AMD Bulldozer CPU (AMD Family 15h, launched 2011). + halide_target_processor_bdver2, ///< Tune for AMD Piledriver CPU (AMD Family 15h (2nd-gen), launched 2012). + halide_target_processor_bdver3, ///< Tune for AMD Steamroller CPU (AMD Family 15h (3nd-gen), launched 2014). + halide_target_processor_bdver4, ///< Tune for AMD Excavator CPU (AMD Family 15h (4th-gen), launched 2015). + halide_target_processor_btver2, ///< Tune for AMD Jaguar CPU (AMD Family 16h, launched 2011). + halide_target_processor_znver1, ///< Tune for AMD Zen CPU (AMD Family 17h, launched 2017). 
+ halide_target_processor_znver2, ///< Tune for AMD Zen 2 CPU (AMD Family 17h, launched 2019). + halide_target_processor_znver3, ///< Tune for AMD Zen 3 CPU (AMD Family 19h, launched 2020). + halide_target_processor_end, ///< A sentinel. +} halide_target_processor_t; + /** Optional features a compilation Target can have. * Be sure to keep this in sync with the Feature enum in Target.h and the implementation of * get_runtime_compatible_target in Target.cpp if you add a new feature. From f2086fd803fc3cc58faae9fdf186b92ed01f6afa Mon Sep 17 00:00:00 2001 From: Roman Lebedev Date: Wed, 30 Mar 2022 22:12:02 +0300 Subject: [PATCH 11/12] Undo introduction of halide_target_processor_t --- src/Target.h | 29 +++++++++++++++-------------- src/runtime/HalideRuntime.h | 21 --------------------- 2 files changed, 15 insertions(+), 35 deletions(-) diff --git a/src/Target.h b/src/Target.h index 45aaeefa9d84..db772b6bef1b 100644 --- a/src/Target.h +++ b/src/Target.h @@ -53,21 +53,22 @@ struct Target { /** The specific processor to be targeted, tuned for. * Corresponds to processor_name_map in Target.cpp. * - * Please keep sorted. */ + * New entries should be added to the end. */ enum Processor { - AMDFam10 = halide_target_processor_amdfam10, /// Tune for AMD K10 "Barcelona" CPU (AMD Family 10h, launched 2007). - BdVer1 = halide_target_processor_bdver1, /// Tune for AMD Bulldozer CPU (AMD Family 15h, launched 2011). - BdVer2 = halide_target_processor_bdver2, /// Tune for AMD Piledriver CPU (AMD Family 15h (2nd-gen), launched 2012). - BdVer3 = halide_target_processor_bdver3, /// Tune for AMD Steamroller CPU (AMD Family 15h (3nd-gen), launched 2014). - BdVer4 = halide_target_processor_bdver4, /// Tune for AMD Excavator CPU (AMD Family 15h (4th-gen), launched 2015). - BtVer1 = halide_target_processor_btver1, /// Tune for AMD Bobcat CPU (AMD Family 14h, launched 2011). - BtVer2 = halide_target_processor_btver2, /// Tune for AMD Jaguar CPU (AMD Family 16h, launched 2011). 
- K8 = halide_target_processor_k8, /// Tune for AMD K8 Hammer CPU (AMD Family 0Fh, launched 2003). - K8_SSE3 = halide_target_processor_k8_sse3, /// Tune for later versions of AMD K8 CPU, with SSE3 support. - ProcessorGeneric = halide_target_processor_generic, /// Do not tune for any specific CPU. In practice, this means that halide will decide the tune CPU based on the enabled features. - ZnVer1 = halide_target_processor_znver1, /// Tune for AMD Zen CPU (AMD Family 17h, launched 2017). - ZnVer2 = halide_target_processor_znver2, /// Tune for AMD Zen 2 CPU (AMD Family 17h, launched 2019). - ZnVer3 = halide_target_processor_znver3, /// Tune for AMD Zen 3 CPU (AMD Family 19h, launched 2020). + /// Do not tune for any specific CPU. In practice, this means that halide will decide the tune CPU based on the enabled features. + ProcessorGeneric = 0, + K8, /// Tune for AMD K8 Hammer CPU (AMD Family 0Fh, launched 2003). + K8_SSE3, /// Tune for later versions of AMD K8 CPU, with SSE3 support. + AMDFam10, /// Tune for AMD K10 "Barcelona" CPU (AMD Family 10h, launched 2007). + BtVer1, /// Tune for AMD Bobcat CPU (AMD Family 14h, launched 2011). + BdVer1, /// Tune for AMD Bulldozer CPU (AMD Family 15h, launched 2011). + BdVer2, /// Tune for AMD Piledriver CPU (AMD Family 15h (2nd-gen), launched 2012). + BdVer3, /// Tune for AMD Steamroller CPU (AMD Family 15h (3nd-gen), launched 2014). + BdVer4, /// Tune for AMD Excavator CPU (AMD Family 15h (4th-gen), launched 2015). + BtVer2, /// Tune for AMD Jaguar CPU (AMD Family 16h, launched 2011). + ZnVer1, /// Tune for AMD Zen CPU (AMD Family 17h, launched 2017). + ZnVer2, /// Tune for AMD Zen 2 CPU (AMD Family 17h, launched 2019). + ZnVer3, /// Tune for AMD Zen 3 CPU (AMD Family 19h, launched 2020). } processor = ProcessorGeneric; /** Optional features a target can have. 
diff --git a/src/runtime/HalideRuntime.h b/src/runtime/HalideRuntime.h index 9b302c97e003..6089110420a7 100644 --- a/src/runtime/HalideRuntime.h +++ b/src/runtime/HalideRuntime.h @@ -1253,27 +1253,6 @@ extern int halide_error_storage_bound_too_small(void *user_context, const char * extern int halide_error_device_crop_failed(void *user_context); // @} -/** The specific processor to be targeted, tuned for. - * Be sure to keep this in sync with the Processor enum in Target.h - * - * New entries should be added to the end, before halide_target_processor_end. */ -typedef enum halide_target_processor_t { - halide_target_processor_generic = 0, ///< Do not tune for any specific CPU. In practice, this means that halide will decide the tune CPU based on the enabled features. - halide_target_processor_k8, ///< Tune for AMD K8 Hammer CPU (AMD Family 0Fh, launched 2003). - halide_target_processor_k8_sse3, ///< Tune for later versions of AMD K8 CPU, with SSE3 support. - halide_target_processor_amdfam10, ///< Tune for AMD K10 "Barcelona" CPU (AMD Family 10h, launched 2007). - halide_target_processor_btver1, ///< Tune for AMD Bobcat CPU (AMD Family 14h, launched 2011). - halide_target_processor_bdver1, ///< Tune for AMD Bulldozer CPU (AMD Family 15h, launched 2011). - halide_target_processor_bdver2, ///< Tune for AMD Piledriver CPU (AMD Family 15h (2nd-gen), launched 2012). - halide_target_processor_bdver3, ///< Tune for AMD Steamroller CPU (AMD Family 15h (3nd-gen), launched 2014). - halide_target_processor_bdver4, ///< Tune for AMD Excavator CPU (AMD Family 15h (4th-gen), launched 2015). - halide_target_processor_btver2, ///< Tune for AMD Jaguar CPU (AMD Family 16h, launched 2011). - halide_target_processor_znver1, ///< Tune for AMD Zen CPU (AMD Family 17h, launched 2017). - halide_target_processor_znver2, ///< Tune for AMD Zen 2 CPU (AMD Family 17h, launched 2019). - halide_target_processor_znver3, ///< Tune for AMD Zen 3 CPU (AMD Family 19h, launched 2020). 
- halide_target_processor_end, ///< A sentinel. -} halide_target_processor_t; - /** Optional features a compilation Target can have. * Be sure to keep this in sync with the Feature enum in Target.h and the implementation of * get_runtime_compatible_target in Target.cpp if you add a new feature. From d78fb63319cfa7bf81a33251a1ace7d5344226fc Mon Sep 17 00:00:00 2001 From: Roman Lebedev Date: Wed, 30 Mar 2022 22:17:41 +0300 Subject: [PATCH 12/12] Fix year for btver2/jaguar --- src/Target.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Target.h b/src/Target.h index db772b6bef1b..514d3466f9dc 100644 --- a/src/Target.h +++ b/src/Target.h @@ -65,7 +65,7 @@ struct Target { BdVer2, /// Tune for AMD Piledriver CPU (AMD Family 15h (2nd-gen), launched 2012). BdVer3, /// Tune for AMD Steamroller CPU (AMD Family 15h (3nd-gen), launched 2014). BdVer4, /// Tune for AMD Excavator CPU (AMD Family 15h (4th-gen), launched 2015). - BtVer2, /// Tune for AMD Jaguar CPU (AMD Family 16h, launched 2011). + BtVer2, /// Tune for AMD Jaguar CPU (AMD Family 16h, launched 2013). ZnVer1, /// Tune for AMD Zen CPU (AMD Family 17h, launched 2017). ZnVer2, /// Tune for AMD Zen 2 CPU (AMD Family 17h, launched 2019). ZnVer3, /// Tune for AMD Zen 3 CPU (AMD Family 19h, launched 2020).