Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

-mtune=/-mcpu= support for x86 AMD CPU's #6655

Merged
merged 12 commits into from
Mar 31, 2022
15 changes: 15 additions & 0 deletions python_bindings/src/PyEnums.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,21 @@ void define_enums(py::module &m) {
.value("RISCV", Target::Arch::RISCV)
.value("WebAssembly", Target::Arch::WebAssembly);

py::enum_<Target::Processor>(m, "TargetProcessor")
.value("TuneGeneric", Target::Processor::ProcessorGeneric)
LebedevRI marked this conversation as resolved.
Show resolved Hide resolved
.value("TuneK8", Target::Processor::K8)
.value("TuneK8_SSE3", Target::Processor::K8_SSE3)
.value("TuneAMDFam10", Target::Processor::AMDFam10)
.value("TuneBtVer1", Target::Processor::BtVer1)
.value("TuneBdVer1", Target::Processor::BdVer1)
.value("TuneBdVer2", Target::Processor::BdVer2)
.value("TuneBdVer3", Target::Processor::BdVer3)
.value("TuneBdVer4", Target::Processor::BdVer4)
.value("TuneBtVer2", Target::Processor::BtVer2)
.value("TuneZnVer1", Target::Processor::ZnVer1)
.value("TuneZnVer2", Target::Processor::ZnVer2)
.value("TuneZnVer3", Target::Processor::ZnVer3);

py::enum_<Target::Feature>(m, "TargetFeature")
.value("JIT", Target::Feature::JIT)
.value("Debug", Target::Feature::Debug)
Expand Down
5 changes: 3 additions & 2 deletions python_bindings/src/PyTarget.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,15 +23,16 @@ void define_target(py::module &m) {
py::class_<Target>(m, "Target")
.def(py::init<>())
.def(py::init<const std::string &>())
.def(py::init<Target::OS, Target::Arch, int>())
.def(py::init<Target::OS, Target::Arch, int, std::vector<Target::Feature>>())
.def(py::init<Target::OS, Target::Arch, int, Target::Processor>())
.def(py::init<Target::OS, Target::Arch, int, Target::Processor, std::vector<Target::Feature>>())

.def("__eq__", [](const Target &value, Target *value2) { return value2 && value == *value2; })
.def("__ne__", [](const Target &value, Target *value2) { return !value2 || value != *value2; })

.def_readwrite("os", &Target::os)
.def_readwrite("arch", &Target::arch)
.def_readwrite("bits", &Target::bits)
.def_readwrite("processor", &Target::processor)

.def("__repr__", &target_repr)
.def("__str__", &Target::to_string)
Expand Down
16 changes: 8 additions & 8 deletions src/CPlusPlusMangle.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -942,14 +942,14 @@ void main_tests(const MangleResult *expecteds, const Target &target) {

void cplusplus_mangle_test() {
Target targets[kTestTargetCount]{
Target(Target::Linux, Target::X86, 32),
Target(Target::Linux, Target::X86, 64),
Target(Target::OSX, Target::X86, 32),
Target(Target::OSX, Target::X86, 64),
Target(Target::IOS, Target::ARM, 32),
Target(Target::IOS, Target::ARM, 64),
Target(Target::Windows, Target::X86, 32),
Target(Target::Windows, Target::X86, 64)};
Target(Target::Linux, Target::X86, 32, Target::Processor::ProcessorGeneric),
Target(Target::Linux, Target::X86, 64, Target::Processor::ProcessorGeneric),
Target(Target::OSX, Target::X86, 32, Target::Processor::ProcessorGeneric),
Target(Target::OSX, Target::X86, 64, Target::Processor::ProcessorGeneric),
Target(Target::IOS, Target::ARM, 32, Target::Processor::ProcessorGeneric),
Target(Target::IOS, Target::ARM, 64, Target::Processor::ProcessorGeneric),
Target(Target::Windows, Target::X86, 32, Target::Processor::ProcessorGeneric),
Target(Target::Windows, Target::X86, 64, Target::Processor::ProcessorGeneric)};
MangleResult *expecteds[kTestTargetCount]{
ItaniumABIMangling_main, ItaniumABIMangling_main,
ItaniumABIMangling_main, ItaniumABIMangling_main,
Expand Down
31 changes: 31 additions & 0 deletions src/CodeGen_X86.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -686,6 +686,37 @@ void CodeGen_X86::visit(const Store *op) {
}

string CodeGen_X86::mcpu() const {
// First, check if any explicit request for tuning exists.
switch (target.processor) {
case Target::Processor::K8:
return "k8";
case Target::Processor::K8_SSE3:
return "k8-sse3";
case Target::Processor::AMDFam10:
return "amdfam10";
case Target::Processor::BtVer1:
return "btver1";
case Target::Processor::BdVer1:
return "bdver1";
case Target::Processor::BdVer2:
return "bdver2";
case Target::Processor::BdVer3:
return "bdver3";
case Target::Processor::BdVer4:
return "bdver4";
case Target::Processor::BtVer2:
return "btver2";
case Target::Processor::ZnVer1:
return "znver1";
case Target::Processor::ZnVer2:
return "znver2";
case Target::Processor::ZnVer3:
return "znver3";
case Target::Processor::ProcessorGeneric:
break; // Detect "best" CPU from the enabled ISA's.
}

// And only after that, perform an ad-hoc guess for the tune given features.
if (target.has_feature(Target::AVX512_SapphireRapids)) {
return "sapphirerapids";
} else if (target.has_feature(Target::AVX512_Cannonlake)) {
Expand Down
2 changes: 1 addition & 1 deletion src/HexagonOffload.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -967,7 +967,7 @@ class InjectHexagonRpc : public IRMutator {
Stmt inject_hexagon_rpc(Stmt s, const Target &host_target,
Module &containing_module) {
// Make a new target for the device module.
Target target(Target::NoOS, Target::Hexagon, 32);
Target target(Target::NoOS, Target::Hexagon, 32, Target::ProcessorGeneric);
// There are two ways of offloading, on device and on host.
// In the former we have true QuRT available, while on the
// latter we simulate the Hexagon side code with a barebones
Expand Down
4 changes: 2 additions & 2 deletions src/Module.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -615,7 +615,7 @@ void Module::compile(const std::map<OutputFileType, std::string> &output_files)
}
}
debug(1) << "Module.compile(): static_library " << output_files.at(OutputFileType::static_library) << "\n";
Target base_target(target().os, target().arch, target().bits);
Target base_target(target().os, target().arch, target().bits, target().processor);
create_static_library(temp_dir.files(), base_target, output_files.at(OutputFileType::static_library));
}
if (contains(output_files, OutputFileType::assembly)) {
Expand Down Expand Up @@ -923,7 +923,7 @@ void compile_multitarget(const std::string &fn_name,
// and add that to the result.
if (!base_target.has_feature(Target::NoRuntime)) {
// Start with a bare Target, set only the features we know are common to all.
Target runtime_target(base_target.os, base_target.arch, base_target.bits);
Target runtime_target(base_target.os, base_target.arch, base_target.bits, base_target.processor);
for (int i = 0; i < Target::FeatureEnd; ++i) {
// We never want NoRuntime set here.
if (i == Target::NoRuntime) {
Expand Down
61 changes: 56 additions & 5 deletions src/Target.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ Target calculate_host_target() {

bool use_64_bits = (sizeof(size_t) == 8);
int bits = use_64_bits ? 64 : 32;
Target::Processor processor = Target::Processor::ProcessorGeneric;
std::vector<Target::Feature> initial_features;

#if __riscv
Expand Down Expand Up @@ -189,7 +190,7 @@ Target calculate_host_target() {
#endif
#endif

return {os, arch, bits, initial_features};
return {os, arch, bits, processor, initial_features};
}

bool is_using_hexagon(const Target &t) {
Expand Down Expand Up @@ -307,6 +308,31 @@ bool lookup_arch(const std::string &tok, Target::Arch &result) {
return false;
}

// Maps the "tune_*" tokens accepted in a Halide target string to the
// corresponding Target::Processor value. It is consulted by
// lookup_processor() when parsing, iterated by bad_target_string() to list
// the accepted processor names, and searched by value in Target::to_string()
// to print the processor back out. Every Target::Processor enumerator has
// exactly one entry here.
const std::map<std::string, Target::Processor> processor_name_map = {
{"tune_generic", Target::Processor::ProcessorGeneric},
{"tune_k8", Target::Processor::K8},
{"tune_k8_sse3", Target::Processor::K8_SSE3},
{"tune_amdfam10", Target::Processor::AMDFam10},
{"tune_btver1", Target::Processor::BtVer1},
{"tune_bdver1", Target::Processor::BdVer1},
{"tune_bdver2", Target::Processor::BdVer2},
{"tune_bdver3", Target::Processor::BdVer3},
{"tune_bdver4", Target::Processor::BdVer4},
{"tune_btver2", Target::Processor::BtVer2},
{"tune_znver1", Target::Processor::ZnVer1},
{"tune_znver2", Target::Processor::ZnVer2},
{"tune_znver3", Target::Processor::ZnVer3},
LebedevRI marked this conversation as resolved.
Show resolved Hide resolved
};

// Translate a single target-string token into a Target::Processor.
// Returns true and stores the processor in `result` when `tok` is a key of
// processor_name_map; otherwise returns false and leaves `result` unchanged.
bool lookup_processor(const std::string &tok, Target::Processor &result) {
    const auto match = processor_name_map.find(tok);
    if (match == processor_name_map.end()) {
        // Not a processor token; the caller goes on to try other token kinds.
        return false;
    }
    result = match->second;
    return true;
}

const std::map<std::string, Target::Feature> feature_name_map = {
{"jit", Target::JIT},
{"debug", Target::Debug},
Expand Down Expand Up @@ -454,7 +480,7 @@ bool merge_string(Target &t, const std::string &target) {
}
tokens.push_back(rest);

bool os_specified = false, arch_specified = false, bits_specified = false, features_specified = false;
bool os_specified = false, arch_specified = false, bits_specified = false, processor_specified = false, features_specified = false;
bool is_host = false;

for (size_t i = 0; i < tokens.size(); i++) {
Expand Down Expand Up @@ -484,7 +510,19 @@ bool merge_string(Target &t, const std::string &target) {
return false;
}
os_specified = true;
} else if (lookup_processor(tok, t.processor)) {
if (processor_specified) {
return false;
}
processor_specified = true;
} else if (lookup_feature(tok, feature)) {
if (tok.substr(0, std::strlen("tune_")) == "tune_") {
LebedevRI marked this conversation as resolved.
Show resolved Hide resolved
if (processor_specified) {
// Only a single tune makes sense.
return false;
}
processor_specified = true;
}
t.set_feature(feature);
features_specified = true;
} else if (tok == "trace_all") {
Expand Down Expand Up @@ -541,6 +579,12 @@ void bad_target_string(const std::string &target) {
separator = ", ";
}
separator = "";
std::string processors;
for (const auto &processor_entry : processor_name_map) {
processors += separator + processor_entry.first;
separator = ", ";
}
separator = "";
// Format the features to go one feature over 70 characters per line,
// assume the first line starts with "Features are ".
int line_char_start = -(int)sizeof("Features are");
Expand All @@ -555,10 +599,11 @@ void bad_target_string(const std::string &target) {
}
}
user_error << "Did not understand Halide target " << target << "\n"
<< "Expected format is arch-bits-os-feature1-feature2-...\n"
<< "Expected format is arch-bits-os-processor-feature1-feature2-...\n"
LebedevRI marked this conversation as resolved.
Show resolved Hide resolved
<< "Where arch is: " << architectures << ".\n"
<< "bits is either 32 or 64.\n"
<< "os is: " << oses << ".\n"
<< "processor is: " << processors << ".\n"
<< "\n"
<< "If arch, bits, or os are omitted, they default to the host.\n"
<< "\n"
Expand Down Expand Up @@ -628,6 +673,12 @@ std::string Target::to_string() const {
break;
}
}
for (const auto &processor_entry : processor_name_map) {
if (processor_entry.second == processor) {
result += "-" + processor_entry.first;
break;
}
}
for (const auto &feature_entry : feature_name_map) {
if (has_feature(feature_entry.second)) {
result += "-" + feature_entry.first;
Expand Down Expand Up @@ -980,7 +1031,7 @@ bool Target::get_runtime_compatible_target(const Target &other, Target &result)
// clang-format on

// clang-format off
const std::array<Feature, 14> intersection_features = {{
const std::array<Feature, 15> intersection_features = {{
ARMv7s,
ARMv81a,
AVX,
Expand Down Expand Up @@ -1047,7 +1098,7 @@ bool Target::get_runtime_compatible_target(const Target &other, Target &result)
// Union of features is computed through bitwise-or, and masked away by the features we care about
// Intersection of features is computed through bitwise-and and masked away, too.
// We merge the bits via bitwise or.
Target output = Target{os, arch, bits};
Target output = Target{os, arch, bits, processor};
output.features = ((features | other.features) & union_mask) | ((features | other.features) & matching_mask) | ((features & other.features) & intersection_mask);

// Pick tight lower bound for CUDA capability. Use fall-through to clear redundant features
Expand Down
25 changes: 22 additions & 3 deletions src/Target.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,24 @@ struct Target {
/** The bit-width of the target machine. Must be 0 for unknown, or 32 or 64. */
int bits = 0;

/** The specific processor to target and tune the generated code for.
LebedevRI marked this conversation as resolved.
Show resolved Hide resolved
* Corresponds to processor_name_map in Target.cpp. */
enum Processor {
LebedevRI marked this conversation as resolved.
Show resolved Hide resolved
// No explicit tuning request. CodeGen_X86::mcpu() then falls back to
// guessing a CPU from the enabled target features.
ProcessorGeneric = 0,
// Explicit tuning requests. CodeGen_X86::mcpu() returns the matching CPU
// name for each of these (e.g. K8 -> "k8", ZnVer3 -> "znver3"), and each
// has a "tune_*" token in processor_name_map (e.g. "tune_k8").
K8,
K8_SSE3,
AMDFam10,
BtVer1,
BdVer1,
BdVer2,
BdVer3,
BdVer4,
BtVer2,
ZnVer1,
ZnVer2,
ZnVer3,
} processor = ProcessorGeneric;

/** Optional features a target can have.
* Corresponds to feature_name_map in Target.cpp.
* See definitions in HalideRuntime.h for full information.
Expand Down Expand Up @@ -136,8 +154,8 @@ struct Target {
FeatureEnd = halide_target_feature_end
};
Target() = default;
Target(OS o, Arch a, int b, const std::vector<Feature> &initial_features = std::vector<Feature>())
: os(o), arch(a), bits(b) {
Target(OS o, Arch a, int b, Processor p, const std::vector<Feature> &initial_features = std::vector<Feature>())
: os(o), arch(a), bits(b), processor(p) {
for (const auto &f : initial_features) {
set_feature(f);
}
Expand Down Expand Up @@ -226,6 +244,7 @@ struct Target {
return os == other.os &&
arch == other.arch &&
bits == other.bits &&
processor == other.processor &&
features == other.features;
}

Expand All @@ -247,7 +266,7 @@ struct Target {
/** Convert the Target into a string form that can be reconstituted
* by merge_string(), which will always be of the form
*
* arch-bits-os-feature1-feature2...featureN.
* arch-bits-os-processor-feature1-feature2...featureN.
*
* Note that it is guaranteed that Target(t1.to_string()) == t1,
* but not that Target(s).to_string() == s (since there can be
Expand Down
2 changes: 1 addition & 1 deletion test/correctness/simd_op_check_hvx.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -708,7 +708,7 @@ int main(int argc, char **argv) {
printf("host is: %s\n", host.to_string().c_str());
printf("HL_TARGET is: %s\n", hl_target.to_string().c_str());

Target t(Target::NoOS, Target::Hexagon, 32);
Target t(Target::NoOS, Target::Hexagon, 32, Target::ProcessorGeneric);
for (const auto &f : {Target::HVX,
Target::HVX_v62,
Target::HVX_v65,
Expand Down
16 changes: 8 additions & 8 deletions test/correctness/target.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ int main(int argc, char **argv) {
// }

// Full specification round-trip:
t1 = Target(Target::Linux, Target::X86, 32, {Target::SSE41});
t1 = Target(Target::Linux, Target::X86, 32, Target::ProcessorGeneric, {Target::SSE41});
ts = t1.to_string();
if (ts != "x86-32-linux-sse41") {
printf("to_string failure: %s\n", ts.c_str());
Expand All @@ -50,7 +50,7 @@ int main(int argc, char **argv) {
}

// Full specification round-trip, crazy features
t1 = Target(Target::Android, Target::ARM, 32,
t1 = Target(Target::Android, Target::ARM, 32, Target::ProcessorGeneric,
{Target::JIT, Target::SSE41, Target::AVX, Target::AVX2,
Target::CUDA, Target::OpenCL, Target::OpenGLCompute,
Target::Debug});
Expand Down Expand Up @@ -99,7 +99,7 @@ int main(int argc, char **argv) {
}

// with_feature
t1 = Target(Target::Linux, Target::X86, 32, {Target::SSE41});
t1 = Target(Target::Linux, Target::X86, 32, Target::ProcessorGeneric, {Target::SSE41});
t2 = t1.with_feature(Target::NoAsserts).with_feature(Target::NoBoundsQuery);
ts = t2.to_string();
if (ts != "x86-32-linux-no_asserts-no_bounds_query-sse41") {
Expand All @@ -108,7 +108,7 @@ int main(int argc, char **argv) {
}

// without_feature
t1 = Target(Target::Linux, Target::X86, 32, {Target::SSE41, Target::NoAsserts});
t1 = Target(Target::Linux, Target::X86, 32, Target::ProcessorGeneric, {Target::SSE41, Target::NoAsserts});
// Note that NoBoundsQuery wasn't set here, so 'without' is a no-op
t2 = t1.without_feature(Target::NoAsserts).without_feature(Target::NoBoundsQuery);
ts = t2.to_string();
Expand All @@ -119,7 +119,7 @@ int main(int argc, char **argv) {

// natural_vector_size
// SSE4.1 is 16 bytes wide
t1 = Target(Target::Linux, Target::X86, 32, {Target::SSE41});
t1 = Target(Target::Linux, Target::X86, 32, Target::ProcessorGeneric, {Target::SSE41});
if (t1.natural_vector_size<uint8_t>() != 16) {
printf("natural_vector_size failure\n");
return -1;
Expand All @@ -139,7 +139,7 @@ int main(int argc, char **argv) {

// AVX is 32 bytes wide for float, but we treat as only 16 for integral types,
// due to suboptimal integer instructions
t1 = Target(Target::Linux, Target::X86, 32, {Target::SSE41, Target::AVX});
t1 = Target(Target::Linux, Target::X86, 32, Target::ProcessorGeneric, {Target::SSE41, Target::AVX});
if (t1.natural_vector_size<uint8_t>() != 16) {
printf("natural_vector_size failure\n");
return -1;
Expand All @@ -158,7 +158,7 @@ int main(int argc, char **argv) {
}

// AVX2 is 32 bytes wide
t1 = Target(Target::Linux, Target::X86, 32, {Target::SSE41, Target::AVX, Target::AVX2});
t1 = Target(Target::Linux, Target::X86, 32, Target::ProcessorGeneric, {Target::SSE41, Target::AVX, Target::AVX2});
if (t1.natural_vector_size<uint8_t>() != 32) {
printf("natural_vector_size failure\n");
return -1;
Expand All @@ -177,7 +177,7 @@ int main(int argc, char **argv) {
}

// NEON is 16 bytes wide
t1 = Target(Target::Linux, Target::ARM, 32);
t1 = Target(Target::Linux, Target::ARM, 32, Target::ProcessorGeneric);
if (t1.natural_vector_size<uint8_t>() != 16) {
printf("natural_vector_size failure\n");
return -1;
Expand Down