Skip to content

Commit

Permalink
merge main into amd-staging
Browse files Browse the repository at this point in the history
Change-Id: Idde6f9cb3e60219d4beeff98b5fe5b575432fa16
  • Loading branch information
ronlieb committed Nov 26, 2024
2 parents 76c5481 + 1ea7ced commit a2e7740
Show file tree
Hide file tree
Showing 217 changed files with 14,196 additions and 3,432 deletions.
8 changes: 8 additions & 0 deletions bolt/include/bolt/Profile/DataAggregator.h
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,9 @@ class DataAggregator : public DataReader {
std::string BuildIDBinaryName;

/// Memory map info for a single file as recorded in perf.data
/// When a binary has multiple text segments, the Size is computed as the
/// difference of the last address of these segments from the BaseAddress.
/// The base addresses of all text segments must be the same.
struct MMapInfo {
uint64_t BaseAddress{0}; /// Base address of the mapped binary.
uint64_t MMapAddress{0}; /// Address of the executable segment.
Expand Down Expand Up @@ -493,6 +496,11 @@ class DataAggregator : public DataReader {
/// and return a file name matching a given \p FileBuildID.
std::optional<StringRef> getFileNameForBuildID(StringRef FileBuildID);

/// Get a constant reference to the parsed binary mmap entries.
const std::unordered_map<uint64_t, MMapInfo> &getBinaryMMapInfo() {
return BinaryMMapInfo;
}

friend class YAMLProfileWriter;
};
} // namespace bolt
Expand Down
43 changes: 29 additions & 14 deletions bolt/lib/Profile/DataAggregator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,12 @@ cl::opt<bool> ReadPreAggregated(
"pa", cl::desc("skip perf and read data from a pre-aggregated file format"),
cl::cat(AggregatorCategory));

cl::opt<std::string>
ReadPerfEvents("perf-script-events",
cl::desc("skip perf event collection by supplying a "
"perf-script output in a textual format"),
cl::ReallyHidden, cl::init(""), cl::cat(AggregatorCategory));

static cl::opt<bool>
TimeAggregator("time-aggr",
cl::desc("time BOLT aggregator"),
Expand Down Expand Up @@ -167,8 +173,9 @@ void DataAggregator::findPerfExecutable() {
void DataAggregator::start() {
outs() << "PERF2BOLT: Starting data aggregation job for " << Filename << "\n";

// Don't launch perf for pre-aggregated files
if (opts::ReadPreAggregated)
// Don't launch perf for pre-aggregated files or when perf input is specified
// by the user.
if (opts::ReadPreAggregated || !opts::ReadPerfEvents.empty())
return;

findPerfExecutable();
Expand Down Expand Up @@ -464,6 +471,13 @@ void DataAggregator::filterBinaryMMapInfo() {

int DataAggregator::prepareToParse(StringRef Name, PerfProcessInfo &Process,
PerfProcessErrorCallbackTy Callback) {
if (!opts::ReadPerfEvents.empty()) {
outs() << "PERF2BOLT: using pre-processed perf events for '" << Name
<< "' (perf-script-events)\n";
ParsingBuf = opts::ReadPerfEvents;
return 0;
}

std::string Error;
outs() << "PERF2BOLT: waiting for perf " << Name
<< " collection to finish...\n";
Expand Down Expand Up @@ -2056,15 +2070,6 @@ std::error_code DataAggregator::parseMMapEvents() {
if (FileMMapInfo.first == "(deleted)")
continue;

// Consider only the first mapping of the file for any given PID
auto Range = GlobalMMapInfo.equal_range(FileMMapInfo.first);
bool PIDExists = llvm::any_of(make_range(Range), [&](const auto &MI) {
return MI.second.PID == FileMMapInfo.second.PID;
});

if (PIDExists)
continue;

GlobalMMapInfo.insert(FileMMapInfo);
}

Expand Down Expand Up @@ -2116,12 +2121,22 @@ std::error_code DataAggregator::parseMMapEvents() {
<< " using file offset 0x" << Twine::utohexstr(MMapInfo.Offset)
<< ". Ignoring profile data for this mapping\n";
continue;
} else {
MMapInfo.BaseAddress = *BaseAddress;
}
MMapInfo.BaseAddress = *BaseAddress;
}

BinaryMMapInfo.insert(std::make_pair(MMapInfo.PID, MMapInfo));
// Try to add MMapInfo to the map and update its size. Large binaries may
// span to multiple text segments, so the mapping is inserted only on the
// first occurrence.
if (!BinaryMMapInfo.insert(std::make_pair(MMapInfo.PID, MMapInfo)).second)
assert(MMapInfo.BaseAddress == BinaryMMapInfo[MMapInfo.PID].BaseAddress &&
"Base address on multiple segment mappings should match");

// Update mapping size.
const uint64_t EndAddress = MMapInfo.MMapAddress + MMapInfo.Size;
const uint64_t Size = EndAddress - BinaryMMapInfo[MMapInfo.PID].BaseAddress;
if (Size > BinaryMMapInfo[MMapInfo.PID].Size)
BinaryMMapInfo[MMapInfo.PID].Size = Size;
}

if (BinaryMMapInfo.empty()) {
Expand Down
3 changes: 3 additions & 0 deletions bolt/unittests/Core/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ set(LLVM_LINK_COMPONENTS
add_bolt_unittest(CoreTests
BinaryContext.cpp
MCPlusBuilder.cpp
MemoryMaps.cpp
DynoStats.cpp

DISABLE_LLVM_LINK_LLVM_DYLIB
Expand All @@ -17,6 +18,8 @@ target_link_libraries(CoreTests
PRIVATE
LLVMBOLTCore
LLVMBOLTRewrite
LLVMBOLTProfile
LLVMTestingSupport
)

foreach (tgt ${BOLT_TARGETS_TO_BUILD})
Expand Down
142 changes: 142 additions & 0 deletions bolt/unittests/Core/MemoryMaps.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
//===- bolt/unittest/Core/MemoryMaps.cpp ----------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "bolt/Core/BinaryContext.h"
#include "bolt/Profile/DataAggregator.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/DebugInfo/DWARF/DWARFContext.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Testing/Support/Error.h"
#include "gtest/gtest.h"

using namespace llvm;
using namespace llvm::object;
using namespace llvm::ELF;
using namespace bolt;

namespace opts {
extern cl::opt<std::string> ReadPerfEvents;
} // namespace opts

namespace {

/// Perform checks on memory map events normally captured in perf. Tests use
/// the 'opts::ReadPerfEvents' flag to emulate these events, passing a custom
/// 'perf script' output to DataAggregator.
struct MemoryMapsTester : public testing::TestWithParam<Triple::ArchType> {
void SetUp() override {
initalizeLLVM();
prepareElf();
initializeBOLT();
}

protected:
void initalizeLLVM() {
llvm::InitializeAllTargetInfos();
llvm::InitializeAllTargetMCs();
llvm::InitializeAllAsmParsers();
llvm::InitializeAllDisassemblers();
llvm::InitializeAllTargets();
llvm::InitializeAllAsmPrinters();
}

void prepareElf() {
memcpy(ElfBuf, "\177ELF", 4);
ELF64LE::Ehdr *EHdr = reinterpret_cast<typename ELF64LE::Ehdr *>(ElfBuf);
EHdr->e_ident[llvm::ELF::EI_CLASS] = llvm::ELF::ELFCLASS64;
EHdr->e_ident[llvm::ELF::EI_DATA] = llvm::ELF::ELFDATA2LSB;
EHdr->e_machine = GetParam() == Triple::aarch64 ? EM_AARCH64 : EM_X86_64;
MemoryBufferRef Source(StringRef(ElfBuf, sizeof(ElfBuf)), "ELF");
ObjFile = cantFail(ObjectFile::createObjectFile(Source));
}

void initializeBOLT() {
Relocation::Arch = ObjFile->makeTriple().getArch();
BC = cantFail(BinaryContext::createBinaryContext(
ObjFile->makeTriple(), ObjFile->getFileName(), nullptr, true,
DWARFContext::create(*ObjFile.get()), {llvm::outs(), llvm::errs()}));
ASSERT_FALSE(!BC);
}

char ElfBuf[sizeof(typename ELF64LE::Ehdr)] = {};
std::unique_ptr<ObjectFile> ObjFile;
std::unique_ptr<BinaryContext> BC;
};
} // namespace

#ifdef X86_AVAILABLE

INSTANTIATE_TEST_SUITE_P(X86, MemoryMapsTester,
::testing::Values(Triple::x86_64));

#endif

#ifdef AARCH64_AVAILABLE

INSTANTIATE_TEST_SUITE_P(AArch64, MemoryMapsTester,
::testing::Values(Triple::aarch64));

#endif

/// Check that the correct mmap size is computed when we have multiple text
/// segment mappings.
TEST_P(MemoryMapsTester, ParseMultipleSegments) {
const int Pid = 1234;
StringRef Filename = "BINARY";
opts::ReadPerfEvents = formatv(
"name 0 [000] 0.000000: PERF_RECORD_MMAP2 {0}/{0}: "
"[0xabc0000000(0x1000000) @ 0x11c0000 103:01 1573523 0]: r-xp {1}\n"
"name 0 [000] 0.000000: PERF_RECORD_MMAP2 {0}/{0}: "
"[0xabc2000000(0x8000000) @ 0x31d0000 103:01 1573523 0]: r-xp {1}\n",
Pid, Filename);

BC->SegmentMapInfo[0x11da000] =
SegmentInfo{0x11da000, 0x10da000, 0x11ca000, 0x10da000, 0x10000, true};
BC->SegmentMapInfo[0x31d0000] =
SegmentInfo{0x31d0000, 0x51ac82c, 0x31d0000, 0x3000000, 0x200000, true};

DataAggregator DA("");
BC->setFilename(Filename);
Error Err = DA.preprocessProfile(*BC);

// Ignore errors from perf2bolt when parsing memory events later on.
ASSERT_THAT_ERROR(std::move(Err), Succeeded());

auto &BinaryMMapInfo = DA.getBinaryMMapInfo();
auto El = BinaryMMapInfo.find(Pid);
// Check that memory mapping is present and has the expected size.
ASSERT_NE(El, BinaryMMapInfo.end());
ASSERT_EQ(El->second.Size, static_cast<uint64_t>(0xb1d0000));
}

/// Check that DataAggregator aborts when pre-processing an input binary
/// with multiple text segments that have different base addresses.
TEST_P(MemoryMapsTester, MultipleSegmentsMismatchedBaseAddress) {
const int Pid = 1234;
StringRef Filename = "BINARY";
opts::ReadPerfEvents = formatv(
"name 0 [000] 0.000000: PERF_RECORD_MMAP2 {0}/{0}: "
"[0xabc0000000(0x1000000) @ 0x11c0000 103:01 1573523 0]: r-xp {1}\n"
"name 0 [000] 0.000000: PERF_RECORD_MMAP2 {0}/{0}: "
"[0xabc2000000(0x8000000) @ 0x31d0000 103:01 1573523 0]: r-xp {1}\n",
Pid, Filename);

BC->SegmentMapInfo[0x11da000] =
SegmentInfo{0x11da000, 0x10da000, 0x11ca000, 0x10da000, 0x10000, true};
// Using '0x31d0fff' FileOffset which triggers a different base address
// for this second text segment.
BC->SegmentMapInfo[0x31d0000] =
SegmentInfo{0x31d0000, 0x51ac82c, 0x31d0fff, 0x3000000, 0x200000, true};

DataAggregator DA("");
BC->setFilename(Filename);
ASSERT_DEATH(
{ Error Err = DA.preprocessProfile(*BC); },
"Base address on multiple segment mappings should match");
}
4 changes: 2 additions & 2 deletions bolt/utils/bughunter.sh
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ if [[ $FAIL -eq "0" ]]; then
fi
else
echo "Did it pass? Type the return code [0 = pass, 1 = fail]"
read -n1 PASS
read -n1 FAIL
fi
if [[ $FAIL -eq "0" ]] ; then
echo " Warning: optimized binary passes."
Expand Down Expand Up @@ -205,7 +205,7 @@ while [[ "$CONTINUE" -ne "0" ]] ; do
echo " OPTIMIZED_BINARY failure=$FAIL"
else
echo "Did it pass? Type the return code [0 = pass, 1 = fail]"
read -n1 PASS
read -n1 FAIL
fi
else
FAIL=1
Expand Down
2 changes: 2 additions & 0 deletions clang/docs/ReleaseNotes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -340,6 +340,8 @@ C23 Feature Support

- Clang now supports `N3029 <https://www.open-std.org/jtc1/sc22/wg14/www/docs/n3029.htm>`_ Improved Normal Enumerations.
- Clang now officially supports `N3030 <https://www.open-std.org/jtc1/sc22/wg14/www/docs/n3030.htm>`_ Enhancements to Enumerations. Clang already supported it as an extension, so there were no changes to compiler behavior.
- Fixed the value of ``BOOL_WIDTH`` in ``<limits.h>`` to return ``1``
explicitly, as mandated by the standard. Fixes #GH117348

Non-comprehensive list of changes in this release
-------------------------------------------------
Expand Down
14 changes: 14 additions & 0 deletions clang/docs/SanitizerCoverage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -385,6 +385,20 @@ Users need to implement a single function to capture the CF table at startup:
// the collected control flow.
}
Gated Trace Callbacks
=====================

Gate the invocation of the tracing callbacks with
``-sanitizer-coverage-gated-trace-callbacks``.

When this option is enabled, the instrumentation will not call into the
runtime-provided callbacks for tracing, thus only incurring in a trivial
branch without going through a function call.

It is up to the runtime to toggle the value of the global variable in order to
enable tracing.

This option is only supported for trace-pc-guard and trace-cmp.

Disabling instrumentation with ``__attribute__((no_sanitize("coverage")))``
===========================================================================
Expand Down
5 changes: 5 additions & 0 deletions clang/include/clang/Basic/BuiltinsAMDGPU.def
Original file line number Diff line number Diff line change
Expand Up @@ -462,6 +462,11 @@ TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_32x32x64_fp8_fp8, "V16fV4iV8iV16fiIiI
TARGET_BUILTIN(__builtin_amdgcn_permlane16_swap, "V2UiUiUiIbIb", "nc", "permlane16-swap")
TARGET_BUILTIN(__builtin_amdgcn_permlane32_swap, "V2UiUiUiIbIb", "nc", "permlane32-swap")

TARGET_BUILTIN(__builtin_amdgcn_ds_read_tr4_b64_v2i32, "V2iV2i*3", "nc", "gfx950-insts")
TARGET_BUILTIN(__builtin_amdgcn_ds_read_tr6_b96_v3i32, "V3iV3i*3", "nc", "gfx950-insts")
TARGET_BUILTIN(__builtin_amdgcn_ds_read_tr8_b64_v2i32, "V2iV2i*3", "nc", "gfx950-insts")
TARGET_BUILTIN(__builtin_amdgcn_ds_read_tr16_b64_v4i16, "V4sV4s*3", "nc", "gfx950-insts")

//===----------------------------------------------------------------------===//
// GFX12+ only builtins.
//===----------------------------------------------------------------------===//
Expand Down
23 changes: 21 additions & 2 deletions clang/lib/AST/ByteCode/Compiler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -448,6 +448,10 @@ bool Compiler<Emitter>::VisitCastExpr(const CastExpr *CE) {

QualType SubExprTy = SubExpr->getType();
std::optional<PrimType> FromT = classify(SubExprTy);
// Casts from integer to vectors in C.
if (FromT && CE->getType()->isVectorType())
return this->emitBuiltinBitCast(CE);

std::optional<PrimType> ToT = classify(CE->getType());
if (!FromT || !ToT)
return false;
Expand Down Expand Up @@ -6494,8 +6498,23 @@ bool Compiler<Emitter>::emitBuiltinBitCast(const CastExpr *E) {
}

// Get a pointer to the value-to-cast on the stack.
if (!this->visit(SubExpr))
return false;
// For CK_LValueToRValueBitCast, this is always an lvalue and
// we later assume it to be one (i.e. a PT_Ptr). However,
// we call this function for other utility methods where
// a bitcast might be useful, so convert it to a PT_Ptr in that case.
if (SubExpr->isGLValue()) {
if (!this->visit(SubExpr))
return false;
} else if (std::optional<PrimType> FromT = classify(SubExpr)) {
unsigned TempOffset = allocateLocalPrimitive(
SubExpr, *FromT, /*IsConst=*/true, /*IsExtended=*/false);
if (!this->visit(SubExpr))
return false;
if (!this->emitSetLocal(*FromT, TempOffset, E))
return false;
if (!this->emitGetPtrLocal(TempOffset, E))
return false;
}

if (!ToT || ToT == PT_Ptr) {
if (!this->emitBitCastPtr(E))
Expand Down
19 changes: 17 additions & 2 deletions clang/lib/CodeGen/CGBuiltin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19694,8 +19694,11 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4bf16:
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16:
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8f16:
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8bf16: {

case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8bf16:
case AMDGPU::BI__builtin_amdgcn_ds_read_tr4_b64_v2i32:
case AMDGPU::BI__builtin_amdgcn_ds_read_tr8_b64_v2i32:
case AMDGPU::BI__builtin_amdgcn_ds_read_tr6_b96_v3i32:
case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4i16: {
Intrinsic::ID IID;
switch (BuiltinID) {
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32:
Expand All @@ -19710,6 +19713,18 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8bf16:
IID = Intrinsic::amdgcn_global_load_tr_b128;
break;
case AMDGPU::BI__builtin_amdgcn_ds_read_tr4_b64_v2i32:
IID = Intrinsic::amdgcn_ds_read_tr4_b64;
break;
case AMDGPU::BI__builtin_amdgcn_ds_read_tr8_b64_v2i32:
IID = Intrinsic::amdgcn_ds_read_tr8_b64;
break;
case AMDGPU::BI__builtin_amdgcn_ds_read_tr6_b96_v3i32:
IID = Intrinsic::amdgcn_ds_read_tr6_b96;
break;
case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4i16:
IID = Intrinsic::amdgcn_ds_read_tr16_b64;
break;
}
llvm::Type *LoadTy = ConvertType(E->getType());
llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
Expand Down
Loading

0 comments on commit a2e7740

Please sign in to comment.