Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support Arm64 "constructed" constants in SuperPMI asm diffs #76616

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 43 additions & 0 deletions src/coreclr/tools/superpmi/superpmi-shared/spmiutil.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -335,3 +335,46 @@ void PutThumb2BlRel24(UINT16* p, INT32 imm24)
p[0] = Opcode0;
p[1] = Opcode1;
}

// GetArm64MovConstant / GetArm64MovkConstant: Decode arm64 mov / movk instructions, e.g.:
// d29ff600 mov x0, #65456
// f2ab8640 movk x0, #23602, lsl #16
// f2c04bc0 movk x0, #606, lsl #32
//
// This is used in the NearDiffer to determine if a sequence of mov/movk is actually an address.
//
// Return `true` if the instruction pointed to by `p` is a mov/movk, `false` otherwise.
// If true, fill out the target register in `*pReg`, constant in `*pCon`, and (for movk) shift value in `*pShift`.

// GetArm64MovConstant: decode a 64-bit arm64 `mov Xd, #imm16` (MOVZ, lsl #0) instruction.
//
// Arguments:
//    p    - pointer to the 32-bit instruction word to decode
//    pReg - [out] destination register number (Rd, bits [4:0])
//    pCon - [out] 16-bit immediate constant (imm16, bits [20:5])
//
// Return Value:
//    `true` if the instruction is a mov (MOVZ with hw == 0); `false` otherwise.
//    The out-params are only written when `true` is returned.
bool GetArm64MovConstant(UINT32* p, unsigned* pReg, unsigned* pCon)
{
    const UINT32 instruction = *p;

    // Mask 0xffe00000 covers the opcode bits plus the `hw` shift field,
    // so only an unshifted (lsl #0) mov matches.
    if ((instruction & 0xffe00000) != 0xd2800000)
    {
        return false;
    }

    *pReg = instruction & 0x1f;
    *pCon = (instruction >> 5) & 0xffff;
    return true;
}

// GetArm64MovkConstant: decode a 64-bit arm64 `movk Xd, #imm16, lsl #shift` instruction.
//
// Arguments:
//    p      - pointer to the 32-bit instruction word to decode
//    pReg   - [out] destination register number (Rd, bits [4:0])
//    pCon   - [out] 16-bit immediate constant (imm16, bits [20:5])
//    pShift - [out] shift amount in bits (hw field scaled by 16: 0, 16, 32, or 48)
//
// Return Value:
//    `true` if the instruction is a movk; `false` otherwise.
//    The out-params are only written when `true` is returned.
bool GetArm64MovkConstant(UINT32* p, unsigned* pReg, unsigned* pCon, unsigned* pShift)
{
    const UINT32 instruction = *p;

    // Mask 0xff800000 leaves the 2-bit `hw` field free, so any shift matches.
    if ((instruction & 0xff800000) != 0xf2800000)
    {
        return false;
    }

    *pReg   = instruction & 0x1f;
    *pCon   = (instruction >> 5) & 0xffff;
    *pShift = ((instruction >> 21) & 0x3) * 16; // hw * 16 => bit shift
    return true;
}

// PutArm64MovkConstant: overwrite the imm16 field (bits [20:5]) of an arm64 `movk`
// instruction with `con`, preserving all other bits of the instruction word.
//
// Arguments:
//    p   - pointer to the 32-bit `movk` instruction word to patch
//    con - new constant; only the low 16 bits are used
void PutArm64MovkConstant(UINT32* p, unsigned con)
{
    const UINT32 immFieldMask = (UINT32)0xffff << 5; // imm16 occupies bits [20:5]
    const UINT32 newImmField  = ((UINT32)con & 0xffff) << 5;
    *p = (*p & ~immFieldMask) | newImmField;
}
5 changes: 5 additions & 0 deletions src/coreclr/tools/superpmi/superpmi-shared/spmiutil.h
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,11 @@ void PutArm64Rel12(UINT32* pCode, INT32 imm12);
void PutThumb2Mov32(UINT16* p, UINT32 imm32);
void PutThumb2BlRel24(UINT16* p, INT32 imm24);

// Decode a 64-bit arm64 `mov Xd, #imm16` (MOVZ, lsl #0); on success fills the
// target register and 16-bit constant. Returns false if `*p` is not such a mov.
bool GetArm64MovConstant(UINT32* p, unsigned* pReg, unsigned* pCon);
// Decode a 64-bit arm64 `movk Xd, #imm16, lsl #shift`; on success fills the
// target register, 16-bit constant, and shift amount in bits (0/16/32/48).
bool GetArm64MovkConstant(UINT32* p, unsigned* pReg, unsigned* pCon, unsigned* pShift);

// Overwrite the imm16 field of the `movk` instruction at `*p` with the low
// 16 bits of `con`, leaving all other instruction bits unchanged.
void PutArm64MovkConstant(UINT32* p, unsigned con);

template <typename T, int size>
inline constexpr unsigned ArrLen(T (&)[size])
{
Expand Down
169 changes: 153 additions & 16 deletions src/coreclr/tools/superpmi/superpmi/neardiffer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -298,22 +298,24 @@ struct DiffData
CompileResult* cr2;

// Details of the first block
size_t blocksize1;
size_t datablock1;
size_t datablockSize1;
size_t originalBlock1;
size_t originalDataBlock1;
size_t otherCodeBlock1;
size_t otherCodeBlockSize1;
unsigned char* block1;
size_t blocksize1;
unsigned char* datablock1;
size_t datablockSize1;
size_t originalBlock1;
size_t originalDataBlock1;
size_t otherCodeBlock1;
size_t otherCodeBlockSize1;

// Details of the second block
size_t blocksize2;
size_t datablock2;
size_t datablockSize2;
size_t originalBlock2;
size_t originalDataBlock2;
size_t otherCodeBlock2;
size_t otherCodeBlockSize2;
unsigned char* block2;
size_t blocksize2;
unsigned char* datablock2;
size_t datablockSize2;
size_t originalBlock2;
size_t originalDataBlock2;
size_t otherCodeBlock2;
size_t otherCodeBlockSize2;
};

//
Expand All @@ -330,6 +332,7 @@ bool NearDiffer::compareOffsets(
return true;
}

const SPMI_TARGET_ARCHITECTURE targetArch = GetSpmiTargetArchitecture();
const DiffData* data = (const DiffData*)payload;
size_t ip1 = data->originalBlock1 + blockOffset;
size_t ip2 = data->originalBlock2 + blockOffset;
Expand Down Expand Up @@ -435,6 +438,140 @@ bool NearDiffer::compareOffsets(
if ((mapped1 == mapped2) && (mapped1 != (size_t)-1))
return true;

// There are some cases on arm64 where we generate multiple instruction register construction of addresses
// but we don't have a relocation for them (so they aren't handled by `applyRelocs`). One case is
// allocPgoInstrumentationBySchema(), which returns an address the JIT writes into the code stream
// (used to store dynamic PGO probe data).
//
// The instruction sequence is something like this:
// mov x0, #63408
// movk x0, #23602, lsl #16
// movk x0, #606, lsl #32
//
// Here, we try to match this sequence and look it up in the address map.
//
// Since the mov/movk sequence is specific to the replay address constant, we don't assume the baseline
// and diff have the same number of instructions (e.g., it's possible to skip a `movk` if it is zero).
//
// Some version of this logic might apply to ARM as well.
//
if (targetArch == SPMI_TARGET_ARCHITECTURE_ARM64)
{
bool movk2_1 = false, movk3_1 = false;
bool movk2_2 = false, movk3_2 = false;

unsigned reg1_1 = 0, reg2_1, reg3_1, reg4_1;
unsigned reg1_2 = 0, reg2_2, reg3_2, reg4_2;
unsigned con1_1, con2_1, con3_1, con4_1;
unsigned con1_2, con2_2, con3_2, con4_2;
unsigned shift2_1, shift3_1, shift4_1;
unsigned shift2_2, shift3_2, shift4_2;

UINT32* iaddr1 = (UINT32*)(data->block1 + blockOffset);
UINT32* iaddr2 = (UINT32*)(data->block2 + blockOffset);
UINT32* iaddr1end = (UINT32*)(data->block1 + data->blocksize1);
UINT32* iaddr2end = (UINT32*)(data->block2 + data->blocksize2);

DWORDLONG addr1 = 0;
DWORDLONG addr2 = 0;

// Look for a mov/movk address pattern in code stream 1.

if ((iaddr1 < iaddr1end) &&
GetArm64MovConstant(iaddr1, &reg1_1, &con1_1))
{
// We assume the address requires at least 1 'movk' instruction.
if ((iaddr1 + 1 < iaddr1end) &&
GetArm64MovkConstant(iaddr1 + 1, &reg2_1, &con2_1, &shift2_1) &&
(reg1_1 == reg2_1))
{
addr1 = (DWORDLONG)con1_1 + ((DWORDLONG)con2_1 << shift2_1);

if ((iaddr1 + 2 < iaddr1end) &&
GetArm64MovkConstant(iaddr1 + 2, &reg3_1, &con3_1, &shift3_1) &&
(reg1_1 == reg3_1))
{
movk2_1 = true;
addr1 += (DWORDLONG)con3_1 << shift3_1;

if ((iaddr1 + 3 < iaddr1end) &&
GetArm64MovkConstant(iaddr1 + 3, &reg4_1, &con4_1, &shift4_1) &&
(reg1_1 == reg4_1))
{
movk3_1 = true;
addr1 += (DWORDLONG)con4_1 << shift4_1;
}
}
Copy link
Member

@jakobbotsch jakobbotsch Oct 4, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do I understand right that this only works if both sides end up with the same number of instructions? For example, it is conceivable that one side may omit a movk because that section of the constant is all zeros.

Ultimately, I feel like we need to redesign some of the JIT-EE parts to make this robust. E.g. @EgorBo ended up needing to redesign the JIT-EE interface in #76112 in a way that ends up fixing #53773 for good, by splitting the addresses that JIT embeds in codegen from the notion of where the values are. This also helps support things in R2R and NAOT.

Can the same sort of fix not be applied to PGO? I already see that allocPgoInstrumentationBySchema returns a profileMemory pointer. Is anything stopping us from returning the same pointer to both JITs? From my cursory glance, it seems like it would work and it would make things completely deterministic. The pointer seems to only be used for codegen purposes, not accessed by the JIT.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You're right: it's checking if the instruction stream is exactly the same modulo the constants, you're right that one movk could be omitted if it is zero, or perhaps (unlikely) one could have an address in the >48 bit range and one does not.

Is anything stopping us from returning the same pointer to both JITs?

The fact that two compilations are happening is unknown at this point in the code. In fact, we currently don't even remember we allocated the buffer (except to put it on a list to free).

We would need to record it as a map from ftnHnd => pointer. There's nothing preventing the JIT from calling the allocPgoInstrumentationBySchema (or similar API) multiple times and getting multiple different results.

I guess if the MethodContext remembers if it ever returned a pointer for a ftnHnd, then it can look up in the table and return the same pointer.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The fact that two compilations are happening is unknown at this point in the code. In fact, we currently don't even remember we allocated the buffer (except to put it on a list to free).

We would need to record it as a map from ftnHnd => pointer. There's nothing preventing the JIT from calling the allocPgoInstrumentationBySchema (or similar API) multiple times and getting multiple different results.

I guess if the MethodContext remembers if it ever returned a pointer for a ftnHnd, then it can look up in the table and return the same pointer.

That's true, it would need an SPMI update (and by extension, JIT-EE GUID update) to save profileMemory during recording. I think then keying it by ftnHnd would make sense, it is the relationship in the VM today because the VM needs to be able to hand the same pointer back for multiple versions of the same method.

Another benefit is that there are other things that can affect the codegen. Constant CSE could potentially affect the codegen depending on the addresses too. So it would be best if we get the same exact pointer that was returned at runtime to make sure we are seeing the exact same codegen.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When I wrote:

I guess if the MethodContext remembers if it ever returned a pointer for a ftnHnd, then it can look up in the table and return the same pointer.

I was thinking about during replay.

So it would be best if we get the same exact pointer that was returned at runtime to make sure we are seeing the exact same codegen.

By "runtime" do you mean during collection time? I don't think that's possible in general because the JIT might want to write to or in some cases read from that address, and we can't guarantee the same pointer at replay as at collection time.

Copy link
Member

@jakobbotsch jakobbotsch Oct 4, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

By "runtime" do you mean during collection time? I don't think that's possible in general because the JIT might want to write to or in some cases read from that address, and we can't guarantee the same pointer at replay as at collection time.

Yes, I mean the same pointer during replay as during collection. This is what I was saying previously. I think we should be trying to design the JIT-EE interface so that the JIT does not need to read directly from handles it is using for codegen purposes. This is necessary not just for SPMI, but usually also for R2R/NAOT where the handles are not actually memory addresses. @EgorBo is moving static field addresses to this model in #76112.

I think it is already the case with allocPgoInstrumentationBySchema today. From a cursory glance the JIT does not read or write to the returned profileMemory pointer. It is purely used for codegen purposes.

If the JIT wants to start reading or writing this pointer I think it should be accomplished with changes to the JIT-EE interface. That way we could also support instrumentation during prejit, for example. In those cases pointers like these are presumably going to be some opaque handle that cannot just be dereferenced.

}
}

// Look for a mov/movk address pattern in code stream 2.

if ((iaddr2 < iaddr2end) &&
GetArm64MovConstant(iaddr2, &reg1_2, &con1_2))
{
// We assume the address requires at least 1 'movk' instruction.
if ((iaddr2 + 1 < iaddr2end) &&
GetArm64MovkConstant(iaddr2 + 1, &reg2_2, &con2_2, &shift2_2) &&
(reg1_2 == reg2_2))
{
addr2 = (DWORDLONG)con1_2 + ((DWORDLONG)con2_2 << shift2_2);

if ((iaddr2 + 2 < iaddr2end) &&
GetArm64MovkConstant(iaddr2 + 2, &reg3_2, &con3_2, &shift3_2) &&
(reg1_2 == reg3_2))
{
movk2_2 = true;
addr2 += (DWORDLONG)con3_2 << shift3_2;

if ((iaddr2 + 3 < iaddr2end) &&
GetArm64MovkConstant(iaddr2 + 3, &reg4_2, &con4_2, &shift4_2) &&
(reg1_2 == reg4_2))
{
movk3_2 = true;
addr2 += (DWORDLONG)con4_2 << shift4_2;
}
}
}
}

// Check the constants. We don't need to check 'addr1 == addr2' because if that were
// true we wouldn't have gotten here.
//
// Note: when replaying on a 32-bit platform, we must have
// movk2_1 == movk2_2 == movk3_1 == movk3_2 == false

if ((addr1 != 0) && (addr2 != 0) && (reg1_1 == reg1_2))
{
DWORDLONG mapped1 = (DWORDLONG)data->cr1->searchAddressMap((void*)addr1);
DWORDLONG mapped2 = (DWORDLONG)data->cr2->searchAddressMap((void*)addr2);
if ((mapped1 == mapped2) && (mapped1 != (DWORDLONG)-1))
{
// Now, zero out the constants in the `movk` instructions so when the disassembler
// gets to them, they compare equal.
PutArm64MovkConstant(iaddr1 + 1, 0);
PutArm64MovkConstant(iaddr2 + 1, 0);
if (movk2_1)
{
PutArm64MovkConstant(iaddr1 + 2, 0);
}
if (movk2_2)
{
PutArm64MovkConstant(iaddr2 + 2, 0);
}
if (movk3_1)
{
PutArm64MovkConstant(iaddr1 + 3, 0);
}
if (movk3_2)
{
PutArm64MovkConstant(iaddr2 + 3, 0);
}
return true;
}
}
}

return false;
}

Expand Down Expand Up @@ -513,11 +650,11 @@ bool NearDiffer::compareCodeSection(MethodContext* mc,
cr2,

// Details of the first block
(size_t)blocksize1, (size_t)datablock1, (size_t)datablockSize1, (size_t)originalBlock1,
block1, (size_t)blocksize1, datablock1, (size_t)datablockSize1, (size_t)originalBlock1,
(size_t)originalDataBlock1, (size_t)otherCodeBlock1, (size_t)otherCodeBlockSize1,

// Details of the second block
(size_t)blocksize2, (size_t)datablock2, (size_t)datablockSize2, (size_t)originalBlock2,
block2, (size_t)blocksize2, datablock2, (size_t)datablockSize2, (size_t)originalBlock2,
(size_t)originalDataBlock2, (size_t)otherCodeBlock2, (size_t)otherCodeBlockSize2};

#ifdef USE_COREDISTOOLS
Expand Down