diff --git a/python_bindings/src/PyEnums.cpp b/python_bindings/src/PyEnums.cpp index b7e0d2518368..5b2a3204db19 100644 --- a/python_bindings/src/PyEnums.cpp +++ b/python_bindings/src/PyEnums.cpp @@ -82,6 +82,22 @@ void define_enums(py::module &m) { .value("RISCV", Target::Arch::RISCV) .value("WebAssembly", Target::Arch::WebAssembly); + // Please keep sorted. + py::enum_(m, "TargetProcessor") + .value("TuneAMDFam10", Target::Processor::AMDFam10) + .value("TuneBdVer1", Target::Processor::BdVer1) + .value("TuneBdVer2", Target::Processor::BdVer2) + .value("TuneBdVer3", Target::Processor::BdVer3) + .value("TuneBdVer4", Target::Processor::BdVer4) + .value("TuneBtVer1", Target::Processor::BtVer1) + .value("TuneBtVer2", Target::Processor::BtVer2) + .value("TuneGeneric", Target::Processor::ProcessorGeneric) + .value("TuneK8", Target::Processor::K8) + .value("TuneK8_SSE3", Target::Processor::K8_SSE3) + .value("TuneZnVer1", Target::Processor::ZnVer1) + .value("TuneZnVer2", Target::Processor::ZnVer2) + .value("TuneZnVer3", Target::Processor::ZnVer3); + py::enum_(m, "TargetFeature") .value("JIT", Target::Feature::JIT) .value("Debug", Target::Feature::Debug) diff --git a/python_bindings/src/PyTarget.cpp b/python_bindings/src/PyTarget.cpp index 718936332ea9..8f3f9a631c58 100644 --- a/python_bindings/src/PyTarget.cpp +++ b/python_bindings/src/PyTarget.cpp @@ -24,7 +24,9 @@ void define_target(py::module &m) { .def(py::init<>()) .def(py::init()) .def(py::init()) + .def(py::init()) .def(py::init>()) + .def(py::init>()) .def("__eq__", [](const Target &value, Target *value2) { return value2 && value == *value2; }) .def("__ne__", [](const Target &value, Target *value2) { return !value2 || value != *value2; }) @@ -32,6 +34,7 @@ void define_target(py::module &m) { .def_readwrite("os", &Target::os) .def_readwrite("arch", &Target::arch) .def_readwrite("bits", &Target::bits) + .def_readwrite("processor", &Target::processor) .def("__repr__", &target_repr) .def("__str__", &Target::to_string) 
diff --git a/src/CodeGen_X86.cpp b/src/CodeGen_X86.cpp index b75500ee2684..c1a957242848 100644 --- a/src/CodeGen_X86.cpp +++ b/src/CodeGen_X86.cpp @@ -686,6 +686,38 @@ void CodeGen_X86::visit(const Store *op) { } string CodeGen_X86::mcpu() const { + // First, check if any explicit request for tuning exists. + switch (target.processor) { // Please keep sorted. + case Target::Processor::AMDFam10: + return "amdfam10"; + case Target::Processor::BdVer1: + return "bdver1"; + case Target::Processor::BdVer2: + return "bdver2"; + case Target::Processor::BdVer3: + return "bdver3"; + case Target::Processor::BdVer4: + return "bdver4"; + case Target::Processor::BtVer1: + return "btver1"; + case Target::Processor::BtVer2: + return "btver2"; + case Target::Processor::K8: + return "k8"; + case Target::Processor::K8_SSE3: + return "k8-sse3"; + case Target::Processor::ZnVer1: + return "znver1"; + case Target::Processor::ZnVer2: + return "znver2"; + case Target::Processor::ZnVer3: + return "znver3"; + + case Target::Processor::ProcessorGeneric: + break; // Detect "best" CPU from the enabled ISAs. + } + + // And only after that, make an ad-hoc guess at the tune CPU from the enabled features. 
if (target.has_feature(Target::AVX512_SapphireRapids)) { return "sapphirerapids"; } else if (target.has_feature(Target::AVX512_Cannonlake)) { diff --git a/src/Module.cpp b/src/Module.cpp index ec05676a0ecd..50e7a9787c22 100644 --- a/src/Module.cpp +++ b/src/Module.cpp @@ -615,7 +615,7 @@ void Module::compile(const std::map &output_files) } } debug(1) << "Module.compile(): static_library " << output_files.at(OutputFileType::static_library) << "\n"; - Target base_target(target().os, target().arch, target().bits); + Target base_target(target().os, target().arch, target().bits, target().processor); create_static_library(temp_dir.files(), base_target, output_files.at(OutputFileType::static_library)); } if (contains(output_files, OutputFileType::assembly)) { @@ -923,7 +923,7 @@ void compile_multitarget(const std::string &fn_name, // and add that to the result. if (!base_target.has_feature(Target::NoRuntime)) { // Start with a bare Target, set only the features we know are common to all. - Target runtime_target(base_target.os, base_target.arch, base_target.bits); + Target runtime_target(base_target.os, base_target.arch, base_target.bits, base_target.processor); for (int i = 0; i < Target::FeatureEnd; ++i) { // We never want NoRuntime set here. if (i == Target::NoRuntime) { diff --git a/src/Target.cpp b/src/Target.cpp index cd9c08fe9448..7e529ef924ee 100644 --- a/src/Target.cpp +++ b/src/Target.cpp @@ -76,6 +76,7 @@ Target calculate_host_target() { bool use_64_bits = (sizeof(size_t) == 8); int bits = use_64_bits ? 
64 : 32; + Target::Processor processor = Target::Processor::ProcessorGeneric; std::vector initial_features; #if __riscv @@ -189,7 +190,7 @@ Target calculate_host_target() { #endif #endif - return {os, arch, bits, initial_features}; + return {os, arch, bits, processor, initial_features}; } bool is_using_hexagon(const Target &t) { @@ -307,6 +308,38 @@ bool lookup_arch(const std::string &tok, Target::Arch &result) { return false; } +/// Important design consideration: currently, the string key is +/// effectively identical to the LLVM CPU string, and it would be really really +/// good to keep it that way, so the proper tune_* can be autogenerated easily +/// from the LLVM CPU string (currently, by replacing "-" with "_", +/// and prepending "tune_" prefix) +/// +/// Please keep sorted. +const std::map processor_name_map = { + {"tune_amdfam10", Target::Processor::AMDFam10}, + {"tune_bdver1", Target::Processor::BdVer1}, + {"tune_bdver2", Target::Processor::BdVer2}, + {"tune_bdver3", Target::Processor::BdVer3}, + {"tune_bdver4", Target::Processor::BdVer4}, + {"tune_btver1", Target::Processor::BtVer1}, + {"tune_btver2", Target::Processor::BtVer2}, + {"tune_generic", Target::Processor::ProcessorGeneric}, + {"tune_k8", Target::Processor::K8}, + {"tune_k8_sse3", Target::Processor::K8_SSE3}, + {"tune_znver1", Target::Processor::ZnVer1}, + {"tune_znver2", Target::Processor::ZnVer2}, + {"tune_znver3", Target::Processor::ZnVer3}, +}; + +bool lookup_processor(const std::string &tok, Target::Processor &result) { + auto processor_iter = processor_name_map.find(tok); + if (processor_iter != processor_name_map.end()) { + result = processor_iter->second; + return true; + } + return false; +} + const std::map feature_name_map = { {"jit", Target::JIT}, {"debug", Target::Debug}, @@ -454,7 +487,7 @@ bool merge_string(Target &t, const std::string &target) { } tokens.push_back(rest); - bool os_specified = false, arch_specified = false, bits_specified = false, features_specified = false; + 
bool os_specified = false, arch_specified = false, bits_specified = false, processor_specified = false, features_specified = false; bool is_host = false; for (size_t i = 0; i < tokens.size(); i++) { @@ -484,6 +517,11 @@ bool merge_string(Target &t, const std::string &target) { return false; } os_specified = true; + } else if (lookup_processor(tok, t.processor)) { + if (processor_specified) { + return false; + } + processor_specified = true; } else if (lookup_feature(tok, feature)) { t.set_feature(feature); features_specified = true; @@ -541,6 +579,12 @@ void bad_target_string(const std::string &target) { separator = ", "; } separator = ""; + std::string processors; + for (const auto &processor_entry : processor_name_map) { + processors += separator + processor_entry.first; + separator = ", "; + } + separator = ""; // Format the features to go one feature over 70 characters per line, // assume the first line starts with "Features are ". int line_char_start = -(int)sizeof("Features are"); @@ -555,13 +599,16 @@ void bad_target_string(const std::string &target) { } } user_error << "Did not understand Halide target " << target << "\n" - << "Expected format is arch-bits-os-feature1-feature2-...\n" + << "Expected format is arch-bits-os-processor-feature1-feature2-...\n" << "Where arch is: " << architectures << ".\n" << "bits is either 32 or 64.\n" << "os is: " << oses << ".\n" + << "processor is: " << processors << ".\n" << "\n" << "If arch, bits, or os are omitted, they default to the host.\n" << "\n" + << "If processor is omitted, it defaults to tune_generic.\n" + << "\n" << "Features are: " << features << ".\n" << "\n" << "The target can also begin with \"host\", which sets the " @@ -628,6 +675,14 @@ std::string Target::to_string() const { break; } } + if (processor != ProcessorGeneric) { + for (const auto &processor_entry : processor_name_map) { + if (processor_entry.second == processor) { + result += "-" + processor_entry.first; + break; + } + } + } for (const auto 
&feature_entry : feature_name_map) { if (has_feature(feature_entry.second)) { result += "-" + feature_entry.first; @@ -1047,7 +1102,7 @@ bool Target::get_runtime_compatible_target(const Target &other, Target &result) // Union of features is computed through bitwise-or, and masked away by the features we care about // Intersection of features is computed through bitwise-and and masked away, too. // We merge the bits via bitwise or. - Target output = Target{os, arch, bits}; + Target output = Target{os, arch, bits, processor}; output.features = ((features | other.features) & union_mask) | ((features | other.features) & matching_mask) | ((features & other.features) & intersection_mask); // Pick tight lower bound for CUDA capability. Use fall-through to clear redundant features diff --git a/src/Target.h b/src/Target.h index 5b9588ab60d9..514d3466f9dc 100644 --- a/src/Target.h +++ b/src/Target.h @@ -50,6 +50,27 @@ struct Target { /** The bit-width of the target machine. Must be 0 for unknown, or 32 or 64. */ int bits = 0; + /** The specific processor to be targeted, tuned for. + * Corresponds to processor_name_map in Target.cpp. + * + * New entries should be added to the end. */ + enum Processor { + /// Do not tune for any specific CPU. In practice, this means that halide will decide the tune CPU based on the enabled features. + ProcessorGeneric = 0, + K8, /// Tune for AMD K8 Hammer CPU (AMD Family 0Fh, launched 2003). + K8_SSE3, /// Tune for later versions of AMD K8 CPU, with SSE3 support. + AMDFam10, /// Tune for AMD K10 "Barcelona" CPU (AMD Family 10h, launched 2007). + BtVer1, /// Tune for AMD Bobcat CPU (AMD Family 14h, launched 2011). + BdVer1, /// Tune for AMD Bulldozer CPU (AMD Family 15h, launched 2011). + BdVer2, /// Tune for AMD Piledriver CPU (AMD Family 15h (2nd-gen), launched 2012). + BdVer3, /// Tune for AMD Steamroller CPU (AMD Family 15h (3rd-gen), launched 2014). + BdVer4, /// Tune for AMD Excavator CPU (AMD Family 15h (4th-gen), launched 2015). 
+ BtVer2, /// Tune for AMD Jaguar CPU (AMD Family 16h, launched 2013). + ZnVer1, /// Tune for AMD Zen CPU (AMD Family 17h, launched 2017). + ZnVer2, /// Tune for AMD Zen 2 CPU (AMD Family 17h, launched 2019). + ZnVer3, /// Tune for AMD Zen 3 CPU (AMD Family 19h, launched 2020). + } processor = ProcessorGeneric; + /** Optional features a target can have. * Corresponds to feature_name_map in Target.cpp. * See definitions in HalideRuntime.h for full information. @@ -136,13 +157,17 @@ struct Target { FeatureEnd = halide_target_feature_end }; Target() = default; - Target(OS o, Arch a, int b, const std::vector &initial_features = std::vector()) - : os(o), arch(a), bits(b) { + Target(OS o, Arch a, int b, Processor p, const std::vector &initial_features = std::vector()) + : os(o), arch(a), bits(b), processor(p) { for (const auto &f : initial_features) { set_feature(f); } } + Target(OS o, Arch a, int b, const std::vector &initial_features = std::vector()) + : Target(o, a, b, ProcessorGeneric, initial_features) { + } + /** Given a string of the form used in HL_TARGET * (e.g. "x86-64-avx"), construct the Target it specifies. Note * that this always starts with the result of get_host_target(), @@ -226,6 +251,7 @@ struct Target { return os == other.os && arch == other.arch && bits == other.bits && + processor == other.processor && features == other.features; } @@ -247,7 +273,7 @@ struct Target { /** Convert the Target into a string form that can be reconstituted * by merge_string(), which will always be of the form * - * arch-bits-os-feature1-feature2...featureN. + * arch-bits-os-processor-feature1-feature2...featureN. * * Note that is guaranteed that Target(t1.to_string()) == t1, * but not that Target(s).to_string() == s (since there can be