Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

-mtune=/-mcpu= support for x86 AMD CPU's #6655

Merged
merged 12 commits into from
Mar 31, 2022
16 changes: 16 additions & 0 deletions python_bindings/src/PyEnums.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,22 @@ void define_enums(py::module &m) {
.value("RISCV", Target::Arch::RISCV)
.value("WebAssembly", Target::Arch::WebAssembly);

// Please keep sorted.
py::enum_<Target::Processor>(m, "TargetProcessor")
.value("TuneAMDFam10", Target::Processor::AMDFam10)
.value("TuneBdVer1", Target::Processor::BdVer1)
.value("TuneBdVer2", Target::Processor::BdVer2)
.value("TuneBdVer3", Target::Processor::BdVer3)
.value("TuneBdVer4", Target::Processor::BdVer4)
.value("TuneBtVer1", Target::Processor::BtVer1)
.value("TuneBtVer2", Target::Processor::BtVer2)
.value("TuneGeneric", Target::Processor::ProcessorGeneric)
.value("TuneK8", Target::Processor::K8)
.value("TuneK8_SSE3", Target::Processor::K8_SSE3)
.value("TuneZnVer1", Target::Processor::ZnVer1)
.value("TuneZnVer2", Target::Processor::ZnVer2)
.value("TuneZnVer3", Target::Processor::ZnVer3);

py::enum_<Target::Feature>(m, "TargetFeature")
.value("JIT", Target::Feature::JIT)
.value("Debug", Target::Feature::Debug)
Expand Down
3 changes: 3 additions & 0 deletions python_bindings/src/PyTarget.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,17 @@ void define_target(py::module &m) {
.def(py::init<>())
.def(py::init<const std::string &>())
.def(py::init<Target::OS, Target::Arch, int>())
.def(py::init<Target::OS, Target::Arch, int, Target::Processor>())
.def(py::init<Target::OS, Target::Arch, int, std::vector<Target::Feature>>())
.def(py::init<Target::OS, Target::Arch, int, Target::Processor, std::vector<Target::Feature>>())

.def("__eq__", [](const Target &value, Target *value2) { return value2 && value == *value2; })
.def("__ne__", [](const Target &value, Target *value2) { return !value2 || value != *value2; })

.def_readwrite("os", &Target::os)
.def_readwrite("arch", &Target::arch)
.def_readwrite("bits", &Target::bits)
.def_readwrite("processor", &Target::processor)

.def("__repr__", &target_repr)
.def("__str__", &Target::to_string)
Expand Down
32 changes: 32 additions & 0 deletions src/CodeGen_X86.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -686,6 +686,38 @@ void CodeGen_X86::visit(const Store *op) {
}

string CodeGen_X86::mcpu() const {
// First, check if any explicit request for tuning exists.
switch (target.processor) { // Please keep sorted.
case Target::Processor::AMDFam10:
return "amdfam10";
case Target::Processor::BdVer1:
return "bdver1";
case Target::Processor::BdVer2:
return "bdver2";
case Target::Processor::BdVer3:
return "bdver3";
case Target::Processor::BdVer4:
return "bdver4";
case Target::Processor::BtVer1:
return "btver1";
case Target::Processor::BtVer2:
return "btver2";
case Target::Processor::K8:
return "k8";
case Target::Processor::K8_SSE3:
return "k8-sse3";
case Target::Processor::ZnVer1:
return "znver1";
case Target::Processor::ZnVer2:
return "znver2";
case Target::Processor::ZnVer3:
return "znver3";

case Target::Processor::ProcessorGeneric:
break; // Detect "best" CPU from the enabled ISA's.
}

// And only after that, perform an ad-hoc guess for the tune given features.
if (target.has_feature(Target::AVX512_SapphireRapids)) {
return "sapphirerapids";
} else if (target.has_feature(Target::AVX512_Cannonlake)) {
Expand Down
4 changes: 2 additions & 2 deletions src/Module.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -615,7 +615,7 @@ void Module::compile(const std::map<OutputFileType, std::string> &output_files)
}
}
debug(1) << "Module.compile(): static_library " << output_files.at(OutputFileType::static_library) << "\n";
Target base_target(target().os, target().arch, target().bits);
Target base_target(target().os, target().arch, target().bits, target().processor);
create_static_library(temp_dir.files(), base_target, output_files.at(OutputFileType::static_library));
}
if (contains(output_files, OutputFileType::assembly)) {
Expand Down Expand Up @@ -923,7 +923,7 @@ void compile_multitarget(const std::string &fn_name,
// and add that to the result.
if (!base_target.has_feature(Target::NoRuntime)) {
// Start with a bare Target, set only the features we know are common to all.
Target runtime_target(base_target.os, base_target.arch, base_target.bits);
Target runtime_target(base_target.os, base_target.arch, base_target.bits, base_target.processor);
for (int i = 0; i < Target::FeatureEnd; ++i) {
// We never want NoRuntime set here.
if (i == Target::NoRuntime) {
Expand Down
63 changes: 59 additions & 4 deletions src/Target.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ Target calculate_host_target() {

bool use_64_bits = (sizeof(size_t) == 8);
int bits = use_64_bits ? 64 : 32;
Target::Processor processor = Target::Processor::ProcessorGeneric;
std::vector<Target::Feature> initial_features;

#if __riscv
Expand Down Expand Up @@ -189,7 +190,7 @@ Target calculate_host_target() {
#endif
#endif

return {os, arch, bits, initial_features};
return {os, arch, bits, processor, initial_features};
}

bool is_using_hexagon(const Target &t) {
Expand Down Expand Up @@ -307,6 +308,38 @@ bool lookup_arch(const std::string &tok, Target::Arch &result) {
return false;
}

/// Maps each processor token accepted in a Halide target string to the
/// corresponding Target::Processor value.
///
/// Important design consideration: each key is currently the LLVM CPU string
/// with "-" replaced by "_" and a "tune_" prefix prepended (for example,
/// "k8-sse3" becomes "tune_k8_sse3"). It would be really good to keep it that
/// way, so that the proper tune_* key can be generated mechanically from the
/// LLVM CPU string.
///
/// Please keep sorted.
const std::map<std::string, Target::Processor> processor_name_map = {
{"tune_amdfam10", Target::Processor::AMDFam10},
{"tune_bdver1", Target::Processor::BdVer1},
{"tune_bdver2", Target::Processor::BdVer2},
{"tune_bdver3", Target::Processor::BdVer3},
{"tune_bdver4", Target::Processor::BdVer4},
{"tune_btver1", Target::Processor::BtVer1},
{"tune_btver2", Target::Processor::BtVer2},
{"tune_generic", Target::Processor::ProcessorGeneric},
{"tune_k8", Target::Processor::K8},
{"tune_k8_sse3", Target::Processor::K8_SSE3},
{"tune_znver1", Target::Processor::ZnVer1},
{"tune_znver2", Target::Processor::ZnVer2},
{"tune_znver3", Target::Processor::ZnVer3},
};

/// Look up a processor token (e.g. "tune_znver2") in processor_name_map.
///
/// On a match, stores the mapped Target::Processor in `result` and returns
/// true. Otherwise returns false and leaves `result` untouched.
bool lookup_processor(const std::string &tok, Target::Processor &result) {
    const auto match = processor_name_map.find(tok);
    if (match == processor_name_map.end()) {
        // Unknown token: the caller goes on to try other token kinds.
        return false;
    }
    result = match->second;
    return true;
}

const std::map<std::string, Target::Feature> feature_name_map = {
{"jit", Target::JIT},
{"debug", Target::Debug},
Expand Down Expand Up @@ -454,7 +487,7 @@ bool merge_string(Target &t, const std::string &target) {
}
tokens.push_back(rest);

bool os_specified = false, arch_specified = false, bits_specified = false, features_specified = false;
bool os_specified = false, arch_specified = false, bits_specified = false, processor_specified = false, features_specified = false;
bool is_host = false;

for (size_t i = 0; i < tokens.size(); i++) {
Expand Down Expand Up @@ -484,6 +517,11 @@ bool merge_string(Target &t, const std::string &target) {
return false;
}
os_specified = true;
} else if (lookup_processor(tok, t.processor)) {
if (processor_specified) {
return false;
}
processor_specified = true;
} else if (lookup_feature(tok, feature)) {
t.set_feature(feature);
features_specified = true;
Expand Down Expand Up @@ -541,6 +579,12 @@ void bad_target_string(const std::string &target) {
separator = ", ";
}
separator = "";
std::string processors;
for (const auto &processor_entry : processor_name_map) {
processors += separator + processor_entry.first;
separator = ", ";
}
separator = "";
// Format the features to go one feature over 70 characters per line,
// assume the first line starts with "Features are ".
int line_char_start = -(int)sizeof("Features are");
Expand All @@ -555,13 +599,16 @@ void bad_target_string(const std::string &target) {
}
}
user_error << "Did not understand Halide target " << target << "\n"
<< "Expected format is arch-bits-os-feature1-feature2-...\n"
<< "Expected format is arch-bits-os-processor-feature1-feature2-...\n"
LebedevRI marked this conversation as resolved.
Show resolved Hide resolved
<< "Where arch is: " << architectures << ".\n"
<< "bits is either 32 or 64.\n"
<< "os is: " << oses << ".\n"
<< "processor is: " << processors << ".\n"
<< "\n"
<< "If arch, bits, or os are omitted, they default to the host.\n"
<< "\n"
<< "If processor is omitted, it defaults to tune_generic.\n"
<< "\n"
<< "Features are: " << features << ".\n"
<< "\n"
<< "The target can also begin with \"host\", which sets the "
Expand Down Expand Up @@ -628,6 +675,14 @@ std::string Target::to_string() const {
break;
}
}
if (processor != ProcessorGeneric) {
for (const auto &processor_entry : processor_name_map) {
if (processor_entry.second == processor) {
result += "-" + processor_entry.first;
break;
}
}
}
for (const auto &feature_entry : feature_name_map) {
if (has_feature(feature_entry.second)) {
result += "-" + feature_entry.first;
Expand Down Expand Up @@ -1047,7 +1102,7 @@ bool Target::get_runtime_compatible_target(const Target &other, Target &result)
// Union of features is computed through bitwise-or, and masked away by the features we care about
// Intersection of features is computed through bitwise-and and masked away, too.
// We merge the bits via bitwise or.
Target output = Target{os, arch, bits};
Target output = Target{os, arch, bits, processor};
output.features = ((features | other.features) & union_mask) | ((features | other.features) & matching_mask) | ((features & other.features) & intersection_mask);

// Pick tight lower bound for CUDA capability. Use fall-through to clear redundant features
Expand Down
32 changes: 29 additions & 3 deletions src/Target.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,27 @@ struct Target {
/** The bit-width of the target machine. Must be 0 for unknown, or 32 or 64. */
int bits = 0;

/** The specific processor to target and tune for.
LebedevRI marked this conversation as resolved.
Show resolved Hide resolved
* Corresponds to processor_name_map in Target.cpp.
*
* New entries should be added to the end. */
enum Processor {
LebedevRI marked this conversation as resolved.
Show resolved Hide resolved
/// Do not tune for any specific CPU. In practice, this means that Halide
/// will pick the CPU to tune for based on the enabled features.
ProcessorGeneric = 0,
K8, ///< Tune for AMD K8 Hammer CPU (AMD Family 0Fh, launched 2003).
K8_SSE3, ///< Tune for later versions of AMD K8 CPU, with SSE3 support.
AMDFam10, ///< Tune for AMD K10 "Barcelona" CPU (AMD Family 10h, launched 2007).
BtVer1, ///< Tune for AMD Bobcat CPU (AMD Family 14h, launched 2011).
BdVer1, ///< Tune for AMD Bulldozer CPU (AMD Family 15h, launched 2011).
BdVer2, ///< Tune for AMD Piledriver CPU (AMD Family 15h (2nd-gen), launched 2012).
BdVer3, ///< Tune for AMD Steamroller CPU (AMD Family 15h (3rd-gen), launched 2014).
BdVer4, ///< Tune for AMD Excavator CPU (AMD Family 15h (4th-gen), launched 2015).
BtVer2, ///< Tune for AMD Jaguar CPU (AMD Family 16h, launched 2013).
ZnVer1, ///< Tune for AMD Zen CPU (AMD Family 17h, launched 2017).
ZnVer2, ///< Tune for AMD Zen 2 CPU (AMD Family 17h, launched 2019).
ZnVer3, ///< Tune for AMD Zen 3 CPU (AMD Family 19h, launched 2020).
} processor = ProcessorGeneric;

/** Optional features a target can have.
* Corresponds to feature_name_map in Target.cpp.
* See definitions in HalideRuntime.h for full information.
Expand Down Expand Up @@ -136,13 +157,17 @@ struct Target {
FeatureEnd = halide_target_feature_end
};
Target() = default;
Target(OS o, Arch a, int b, const std::vector<Feature> &initial_features = std::vector<Feature>())
: os(o), arch(a), bits(b) {
/** Construct a Target from an OS, architecture, bit width and the processor
 * to tune for, then pass every entry of initial_features to set_feature(). */
Target(OS o, Arch a, int b, Processor p, const std::vector<Feature> &initial_features = std::vector<Feature>())
    : os(o), arch(a), bits(b), processor(p) {
    for (Feature requested : initial_features) {
        set_feature(requested);
    }
}

/** Overload without a processor argument: delegates to the constructor that
 * takes a Processor, passing ProcessorGeneric. */
Target(OS o, Arch a, int b, const std::vector<Feature> &initial_features = std::vector<Feature>())
: Target(o, a, b, ProcessorGeneric, initial_features) {
}

/** Given a string of the form used in HL_TARGET
* (e.g. "x86-64-avx"), construct the Target it specifies. Note
* that this always starts with the result of get_host_target(),
Expand Down Expand Up @@ -226,6 +251,7 @@ struct Target {
return os == other.os &&
arch == other.arch &&
bits == other.bits &&
processor == other.processor &&
features == other.features;
}

Expand All @@ -247,7 +273,7 @@ struct Target {
/** Convert the Target into a string form that can be reconstituted
* by merge_string(), which will always be of the form
*
* arch-bits-os-feature1-feature2...featureN.
* arch-bits-os-processor-feature1-feature2...featureN.
*
* Note that it is guaranteed that Target(t1.to_string()) == t1,
* but not that Target(s).to_string() == s (since there can be
Expand Down