[JIT] X64 - More replacement sequences for integer multiplication by a constant #77137

Merged: 57 commits from mul-opt-x64-part2 into main, Nov 5, 2022.

Commits (57)
78c3247
Using 3 instruction sequence for x64 multiply
TIHan Oct 13, 2022
06a60e1
Do not do this in morph. Do it in codegen now.
TIHan Oct 13, 2022
7fb0095
Fixing codegen
TIHan Oct 13, 2022
86edb6c
Only allow values under 127 and do not skip mov - correctness testing
TIHan Oct 13, 2022
0955d11
Try to fix tests
TIHan Oct 13, 2022
865cf95
cleanup
TIHan Oct 13, 2022
999eac0
Moving to Lowering
TIHan Oct 14, 2022
6ebf58e
Quick fix
TIHan Oct 14, 2022
6e8f28c
Fully works in lowering now
TIHan Oct 14, 2022
94968a4
Account for all ints
TIHan Oct 14, 2022
64b523a
Take into account codegen opts
TIHan Oct 14, 2022
1cde9c0
Minor cleanup
TIHan Oct 14, 2022
c6fba6e
Minor cleanup
TIHan Oct 14, 2022
ff2b9a1
Fixed test
TIHan Oct 14, 2022
338dbe5
Added int multiply disasm checks. Fixed SuperFileCheck namespace bug.…
TIHan Oct 14, 2022
106c6b7
Update comments
TIHan Oct 14, 2022
8fc5b37
Update comments
TIHan Oct 14, 2022
e5835db
Update comments
TIHan Oct 14, 2022
3be48bf
Update comments
TIHan Oct 14, 2022
b3d4a5f
Formatting
TIHan Oct 15, 2022
843617a
Fixing build
TIHan Oct 16, 2022
74b1071
Fixing build again
TIHan Oct 17, 2022
90b7e7d
minor rename
TIHan Oct 17, 2022
ead83a5
Moving x64 multiply codegen optimizations to lowering
TIHan Oct 17, 2022
b5193c0
Using ReplaceWithLclVar
TIHan Oct 19, 2022
214625f
Feedback. Removed use of FULL-LINE as it is more readable not strictl…
TIHan Oct 19, 2022
f7181d2
Merged
TIHan Oct 19, 2022
99c11b4
Merge remote-tracking branch 'upstream/main' into mul-opt-x64-part2
TIHan Oct 19, 2022
9ff6393
Minor fix
TIHan Oct 19, 2022
75313fb
Formatting
TIHan Oct 20, 2022
c101d1b
merging
TIHan Oct 20, 2022
647cddb
Merging
TIHan Oct 20, 2022
79bc2d5
Adding codegen opts back, but disabling lowering opts if min-opts is …
TIHan Oct 20, 2022
47c818d
Little more changes
TIHan Oct 20, 2022
55b6212
Formatting
TIHan Oct 20, 2022
1a736ff
Remove codegen opts
TIHan Oct 20, 2022
c042862
Update comments
TIHan Oct 20, 2022
bba5ba0
Added test case for address-exposed
TIHan Oct 20, 2022
164f233
Added TryLowerLshWithConstant
TIHan Oct 21, 2022
88311d6
Cleanup
TIHan Oct 21, 2022
160c68e
Removed TryLowerLshWithConstant
TIHan Oct 21, 2022
4c9f7d3
Handle GT_LSH to emit lea if const is 3
TIHan Oct 21, 2022
a80dd11
Added tests.
TIHan Oct 21, 2022
d8b4228
Updated comments
TIHan Oct 21, 2022
e7b140d
Revert LowerShift
TIHan Oct 21, 2022
384ff0b
Remove IndexWithScale
TIHan Oct 21, 2022
a935699
Fix build
TIHan Oct 21, 2022
21335a6
Update comment
TIHan Oct 21, 2022
05ac3f7
Formatting
TIHan Oct 21, 2022
85ca0f4
Update comments. Fix disasm test.
TIHan Oct 21, 2022
e633fef
Another minor test change
TIHan Oct 21, 2022
5a459d8
Fix test
TIHan Oct 22, 2022
f40aa1c
Fix test
TIHan Oct 22, 2022
19d26d1
Feedback
TIHan Oct 25, 2022
923d10b
Update IntMultiply.cs
TIHan Oct 28, 2022
5ac902d
Update codegenxarch.cpp
TIHan Oct 31, 2022
c8fb73b
Update codegenxarch.cpp
TIHan Nov 1, 2022
28 changes: 15 additions & 13 deletions src/coreclr/jit/codegenxarch.cpp
@@ -1085,7 +1085,6 @@ void CodeGen::genCodeForMul(GenTreeOp* treeNode)

if (immOp != nullptr)
{
// CQ: When possible use LEA for mul by imm 3, 5 or 9
ssize_t imm = immOp->AsIntConCommon()->IconValue();

if (!requiresOverflowCheck && rmOp->isUsedFromReg() && ((imm == 3) || (imm == 5) || (imm == 9)))
@@ -1095,17 +1094,6 @@ void CodeGen::genCodeForMul(GenTreeOp* treeNode)
unsigned int scale = (unsigned int)(imm - 1);
GetEmitter()->emitIns_R_ARX(INS_lea, size, targetReg, rmOp->GetRegNum(), rmOp->GetRegNum(), scale, 0);
}
else if (!requiresOverflowCheck && rmOp->isUsedFromReg() && (imm == genFindLowestBit(imm)) && (imm != 0))
{
// Use shift for constant multiply when legal
uint64_t zextImm = static_cast<uint64_t>(static_cast<size_t>(imm));
unsigned int shiftAmount = genLog2(zextImm);

// Copy reg src to dest register
inst_Mov(targetType, targetReg, rmOp->GetRegNum(), /* canSkip */ true);

inst_RV_SH(INS_shl, size, targetReg, shiftAmount);
}
else
{
// use the 3-op form with immediate
@@ -4444,8 +4432,10 @@ void CodeGen::genCodeForShift(GenTree* tree)
{
emitAttr size = emitTypeSize(tree);

bool mightOptimizeLsh = tree->OperIs(GT_LSH) && !tree->gtOverflowEx() && !tree->gtSetFlags();

// Optimize "X<<1" to "lea [reg+reg]" or "add reg, reg"
if (tree->OperIs(GT_LSH) && !tree->gtOverflowEx() && !tree->gtSetFlags() && shiftBy->IsIntegralConst(1))
if (mightOptimizeLsh && shiftBy->IsIntegralConst(1))
{
if (tree->GetRegNum() == operandReg)
{
@@ -4456,6 +4446,18 @@ void CodeGen::genCodeForShift(GenTree* tree)
GetEmitter()->emitIns_R_ARX(INS_lea, size, tree->GetRegNum(), operandReg, operandReg, 1, 0);
}
}
// Optimize "X<<2" to "lea [reg*4]" - we only do this when the dst and src registers are different since it will
// remove a 'mov'.
else if (mightOptimizeLsh && shiftBy->IsIntegralConst(2) && tree->GetRegNum() != operandReg)
{
GetEmitter()->emitIns_R_ARX(INS_lea, size, tree->GetRegNum(), REG_NA, operandReg, 4, 0);
}
// Optimize "X<<3" to "lea [reg*8]" - we only do this when the dst and src registers are different since it will
// remove a 'mov'.
else if (mightOptimizeLsh && shiftBy->IsIntegralConst(3) && tree->GetRegNum() != operandReg)
{
GetEmitter()->emitIns_R_ARX(INS_lea, size, tree->GetRegNum(), REG_NA, operandReg, 8, 0);
}
else
{
int shiftByValue = (int)shiftBy->AsIntConCommon()->IconValue();
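The shift-by-constant cases above all map onto the x64 LEA addressing form, dst = base + index*scale + disp. The standalone C# sketch below is not part of the PR; LeaModel, Lea, and Check are names invented for illustration. It models that addressing arithmetic and checks the equivalences the new codegen relies on: X<<1 as lea [reg+reg] (or add reg, reg when the destination already holds the source), X<<2 as lea [4*reg], X<<3 as lea [8*reg], and the pre-existing multiply-by-3/5/9 case as lea [reg + {2,4,8}*reg].

using System;

// Minimal model of the x64 LEA addressing form: result = base + index*scale + disp.
// Illustrative only; LeaModel, Lea, and Check are invented for this sketch and are not JIT code.
static class LeaModel
{
    static ulong Lea(ulong baseReg, ulong indexReg, uint scale, ulong disp)
        => baseReg + indexReg * scale + disp;

    static void Check(bool condition, string what)
    {
        if (!condition)
            throw new Exception($"equivalence failed: {what}");
    }

    static void Main()
    {
        ulong x = 123456789;

        // X << 1  ->  lea [reg + reg]   (or "add reg, reg" when dst == src)
        Check(x << 1 == Lea(x, x, 1, 0), "X << 1");

        // X << 2  ->  lea [4*reg]       (used when dst != src, saving a mov)
        Check(x << 2 == Lea(0, x, 4, 0), "X << 2");

        // X << 3  ->  lea [8*reg]       (used when dst != src, saving a mov)
        Check(x << 3 == Lea(0, x, 8, 0), "X << 3");

        // X * 3, X * 5, X * 9  ->  lea [reg + {2,4,8}*reg]  (existing genCodeForMul path)
        Check(x * 3 == Lea(x, x, 2, 0), "X * 3");
        Check(x * 5 == Lea(x, x, 4, 0), "X * 5");
        Check(x * 9 == Lea(x, x, 8, 0), "X * 9");

        Console.WriteLine("All LEA equivalences hold.");
    }
}

Note that the X<<2 and X<<3 cases above are only taken when the destination and source registers differ; otherwise they fall through to the plain shl path, since the LEA form only pays off by removing the mov.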
2 changes: 1 addition & 1 deletion src/coreclr/jit/lower.h
@@ -319,7 +319,7 @@ class Lowering final : public Phase
void LowerPutArgStkOrSplit(GenTreePutArgStk* putArgNode);
#ifdef TARGET_XARCH
void LowerPutArgStk(GenTreePutArgStk* putArgStk);
GenTree* TryLowerMulToLshSubOrLshAdd(GenTreeOp* node);
GenTree* TryLowerMulWithConstant(GenTreeOp* node);
#endif // TARGET_XARCH

bool TryCreateAddrMode(GenTree* addr, bool isContainable, GenTree* parent);
48 changes: 32 additions & 16 deletions src/coreclr/jit/lowerxarch.cpp
@@ -106,7 +106,9 @@ void Lowering::LowerStoreIndir(GenTreeStoreInd* node)
}

//----------------------------------------------------------------------------------------------
// Lowering::TryLowerMulToLshSubOrLshAdd:
// Lowering::TryLowerMulWithConstant:
// Lowers a tree MUL(X, CNS) to LSH(X, CNS_SHIFT)
// or
// Lowers a tree MUL(X, CNS) to SUB(LSH(X, CNS_SHIFT), X)
// or
// Lowers a tree MUL(X, CNS) to ADD(LSH(X, CNS_SHIFT), X)
@@ -119,14 +121,14 @@ void Lowering::LowerStoreIndir(GenTreeStoreInd* node)
//
// Notes:
// Performs containment checks on the replacement node if one is created
GenTree* Lowering::TryLowerMulToLshSubOrLshAdd(GenTreeOp* node)
GenTree* Lowering::TryLowerMulWithConstant(GenTreeOp* node)
{
assert(node->OperIs(GT_MUL));

// We do not do this optimization in X86 as it is not recommended.
#if TARGET_X86
return nullptr;
#endif // TARGET_X86
// Do not do these optimizations when min-opts enabled.
if (comp->opts.MinOpts())
return nullptr;

if (!varTypeIsIntegral(node))
return nullptr;

@@ -139,25 +141,36 @@ GenTree* Lowering::TryLowerMulToLshSubOrLshAdd(GenTreeOp* node)
if (op1->isContained() || op2->isContained())
return nullptr;

if (!op1->OperIs(GT_LCL_VAR))
return nullptr;

if (!op2->IsCnsIntOrI())
return nullptr;

GenTreeIntConCommon* cns = op2->AsIntConCommon();
ssize_t cnsVal = cns->IconValue();

// Use GT_LSH if cnsVal is a power of two.
// This is handled in codegen.
if (isPow2(cnsVal))
return nullptr;

// Use GT_LEA if cnsVal is 3, 5, or 9.
// This is handled in codegen.
// These are handled in codegen.
if (cnsVal == 3 || cnsVal == 5 || cnsVal == 9)
return nullptr;

// Use GT_LSH if cnsVal is a power of two.
if (isPow2(cnsVal))
{
// Use shift for constant multiply when legal
unsigned int shiftAmount = genLog2(static_cast<uint64_t>(static_cast<size_t>(cnsVal)));

cns->SetIconValue(shiftAmount);
node->ChangeOper(GT_LSH);

ContainCheckShiftRotate(node);

return node;
}

// We do not do this optimization in X86 as it is not recommended.
#if TARGET_X86
return nullptr;
#endif // TARGET_X86

ssize_t cnsValPlusOne = cnsVal + 1;
ssize_t cnsValMinusOne = cnsVal - 1;

@@ -166,6 +179,9 @@ GenTree* Lowering::TryLowerMulToLshSubOrLshAdd(GenTreeOp* node)
if (!useSub && !isPow2(cnsValMinusOne))
return nullptr;

LIR::Use op1Use(BlockRange(), &node->gtOp1, node);
op1 = ReplaceWithLclVar(op1Use);

if (useSub)
{
cnsVal = cnsValPlusOne;
Expand Down Expand Up @@ -213,7 +229,7 @@ GenTree* Lowering::LowerMul(GenTreeOp* mul)

if (mul->OperIs(GT_MUL))
{
GenTree* replacementNode = TryLowerMulToLshSubOrLshAdd(mul);
GenTree* replacementNode = TryLowerMulWithConstant(mul);
if (replacementNode != nullptr)
{
return replacementNode->gtNext;
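To make the decision TryLowerMulWithConstant takes easier to follow, here is a standalone C# sketch that mirrors the same arithmetic over plain integers: constants 3, 5, and 9 are left for codegen's LEA path, powers of two become a left shift, a constant whose successor is a power of two becomes shift-then-subtract, a constant whose predecessor is a power of two becomes shift-then-add, and everything else stays an ordinary multiply. This is not JIT code; MulLoweringSketch and MulByConst are invented names, and the snippet assumes .NET 6 or later for BitOperations.IsPow2. Running it confirms each replacement agrees with x * c.

using System;
using System.Numerics;

// A sketch of the choice TryLowerMulWithConstant makes, expressed over plain integers.
// The JIT operates on GenTree nodes, not ulongs; MulByConst is invented for this example.
static class MulLoweringSketch
{
    static ulong MulByConst(ulong x, ulong c)
    {
        // 3, 5 and 9 are left alone here; codegen emits lea [reg + {2,4,8}*reg] for them.
        if (c == 3 || c == 5 || c == 9)
            return x + (c - 1) * x;

        // MUL(X, CNS) -> LSH(X, CNS_SHIFT) when the constant is a power of two.
        if (BitOperations.IsPow2(c))
            return x << BitOperations.Log2(c);

        // MUL(X, CNS) -> SUB(LSH(X, CNS_SHIFT), X) when CNS + 1 is a power of two.
        if (BitOperations.IsPow2(c + 1))
            return (x << BitOperations.Log2(c + 1)) - x;

        // MUL(X, CNS) -> ADD(LSH(X, CNS_SHIFT), X) when CNS - 1 is a power of two.
        if (BitOperations.IsPow2(c - 1))
            return (x << BitOperations.Log2(c - 1)) + x;

        // No cheap replacement sequence; an ordinary multiply (imul) is kept.
        return x * c;
    }

    static void Main()
    {
        ulong x = 0x12345;
        for (ulong c = 2; c <= 32; c++)
        {
            if (MulByConst(x, c) != x * c)
                throw new Exception($"replacement disagrees with x * c for c = {c}");
        }
        Console.WriteLine("All replacement sequences agree with the plain multiply.");
    }
}

One design detail the sketch hides: because X feeds both the shift and the add/sub, the real transform first re-reads op1 through ReplaceWithLclVar so the operand is only evaluated once.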
182 changes: 176 additions & 6 deletions src/tests/JIT/opt/Multiply/IntMultiply.cs
@@ -59,8 +59,7 @@ static ulong UInt64_MultiplyWith3(ulong value)
[MethodImpl(MethodImplOptions.NoInlining)]
static ulong UInt64_MultiplyWith4(ulong value)
{
// X64: mov [[REG0:[a-z]+]], [[REG1:[a-z]+]]
// X64-NEXT: shl [[REG0]], 2
// X64: lea [[REG0:[a-z]+]], {{\[}}4*[[REG1:[a-z]+]]{{\]}}
return value * 4;
}

@@ -82,17 +81,15 @@ static ulong UInt64_MultiplyWith6(ulong value)
[MethodImpl(MethodImplOptions.NoInlining)]
static ulong UInt64_MultiplyWith7(ulong value)
{
// X64: mov [[REG0:[a-z]+]], [[REG1:[a-z]+]]
// X64-NEXT: shl [[REG0]], 3
// X64: lea [[REG0:[a-z]+]], {{\[}}8*[[REG1:[a-z]+]]{{\]}}
// X64-NEXT: sub [[REG0]], [[REG1]]
return value * 7;
}

[MethodImpl(MethodImplOptions.NoInlining)]
static ulong UInt64_MultiplyWith8(ulong value)
{
// X64: mov [[REG0:[a-z]+]], [[REG1:[a-z]+]]
// X64-NEXT: shl [[REG0]], 3
// X64: lea [[REG0:[a-z]+]], {{\[}}8*[[REG1:[a-z]+]]{{\]}}
return value * 8;
}

@@ -103,9 +100,54 @@ static ulong UInt64_MultiplyWith9(ulong value)
return value * 9;
}

[MethodImpl(MethodImplOptions.NoInlining)]
static ulong UInt64_MultiplyWith10(ulong value)
{
// X64: lea [[REG0:[a-z]+]], {{\[}}[[REG1:[a-z]+]]+4*[[REG1]]{{\]}}
// X64-NEXT: add [[REG0]], [[REG0]]
return value * 10;
}

[MethodImpl(MethodImplOptions.NoInlining)]
static ulong UInt64_MultiplyWith11(ulong value)
{
// We expect 'imul' since the alternative replacement sequence would require 2 three-component LEA instructions which is slower.

// X64: imul
return value * 11;
}

[MethodImpl(MethodImplOptions.NoInlining)]
static ulong UInt64_MultiplyWith12(ulong value)
{
// X64: lea [[REG0:[a-z]+]], {{\[}}[[REG1:[a-z]+]]+2*[[REG1]]{{\]}}
// X64-NEXT: shl [[REG0]], 2
return value * 12;
}

[MethodImpl(MethodImplOptions.NoInlining)]
static ulong UInt64_MultiplyWith13(ulong value)
{
// We expect 'imul' since the alternative replacement sequence would require 2 three-component LEA instructions which is slower.

// X64: imul
return value * 13;
}

[MethodImpl(MethodImplOptions.NoInlining)]
static ulong UInt64_MultiplyWith14(ulong value)
{
// We expect 'imul' since the alternative replacement sequence would require 4 instructions which is too slow.

// X64: imul
return value * 14;
}

[MethodImpl(MethodImplOptions.NoInlining)]
static ulong UInt64_MultiplyWith15(ulong value)
{
// We expect these instructions since the alternative replacement sequence would require 2 three-component LEA instructions which is slower.

// X64: mov [[REG0:[a-z]+]], [[REG1:[a-z]+]]
// X64-NEXT: shl [[REG0]], 4
// X64-NEXT: sub [[REG0]], [[REG1]]
@@ -129,6 +171,92 @@ static ulong UInt64_MultiplyWith17(ulong value)
return value * 17;
}

[MethodImpl(MethodImplOptions.NoInlining)]
static ulong UInt64_MultiplyWith18(ulong value)
{
// X64: lea [[REG0:[a-z]+]], {{\[}}[[REG1:[a-z]+]]+8*[[REG1]]{{\]}}
// X64-NEXT: add [[REG0]], [[REG0]]
return value * 18;
}

[MethodImpl(MethodImplOptions.NoInlining)]
static ulong UInt64_MultiplyWith19(ulong value)
{
// We expect 'imul' since the alternative replacement sequence would require 2 three-component LEA instructions which is slower.

// X64: imul
return value * 19;
}

[MethodImpl(MethodImplOptions.NoInlining)]
static ulong UInt64_MultiplyWith20(ulong value)
{
// X64: lea [[REG0:[a-z]+]], {{\[}}[[REG1:[a-z]+]]+4*[[REG1]]{{\]}}
// X64-NEXT: shl [[REG0]], 2
return value * 20;
}

[MethodImpl(MethodImplOptions.NoInlining)]
static ulong UInt64_MultiplyWith21(ulong value)
{
// We expect 'imul' since the alternative replacement sequence would require 2 three-component LEA instructions which is slower.

// X64: imul
return value * 21;
}

[MethodImpl(MethodImplOptions.NoInlining)]
static ulong UInt64_MultiplyWith22(ulong value)
{
// We expect 'imul' since the alternative replacement sequence would require 2 three-component LEA instructions and 1 ADD instruction which is slower.

// X64: imul
return value * 22;
}

[MethodImpl(MethodImplOptions.NoInlining)]
static ulong UInt64_MultiplyWith23(ulong value)
{
// We expect 'imul' since the alternative replacement sequence would require 1 three-component LEA instruction, 1 SHL instruction, and 1 ADD instruction which is slower.

// X64: imul
return value * 23;
}

[MethodImpl(MethodImplOptions.NoInlining)]
static ulong UInt64_MultiplyWith24(ulong value)
{
// X64: lea [[REG0:[a-z]+]], {{\[}}[[REG1:[a-z]+]]+2*[[REG1]]{{\]}}
// X64-NEXT: shl [[REG0]], 3
return value * 24;
}

[MethodImpl(MethodImplOptions.NoInlining)]
static ulong UInt64_MultiplyWith25(ulong value)
{
// We expect 'imul' since the alternative replacement sequence would require 2 three-component LEA instructions which is slower.

// X64: imul
return value * 25;
}

[MethodImpl(MethodImplOptions.NoInlining)]
static ulong UInt64_MultiplyWith5_AddressExposed(ulong value)
{
// X64: mov [[REG0:[a-z]+]], qword ptr
// X64-NOT: mov
// X64-NEXT: lea [[REG1:[a-z]+]], {{\[}}[[REG0]]+4*[[REG0]]{{\]}}
var value2 = value * 5;
UInt64_AddressExposed(ref value);
return value2;
}

[MethodImpl(MethodImplOptions.NoInlining)]
static void UInt64_AddressExposed(ref ulong value)
{

}

static int Main()
{
if (UInt32_MultiplyWithUInt32MaxValue(1) != UInt32.MaxValue)
Expand Down Expand Up @@ -167,6 +295,21 @@ static int Main()
if (UInt64_MultiplyWith9(1) != 9)
return 0;

if (UInt64_MultiplyWith10(1) != 10)
return 0;

if (UInt64_MultiplyWith11(1) != 11)
return 0;

if (UInt64_MultiplyWith12(1) != 12)
return 0;

if (UInt64_MultiplyWith13(1) != 13)
return 0;

if (UInt64_MultiplyWith14(1) != 14)
return 0;

if (UInt64_MultiplyWith15(1) != 15)
return 0;

@@ -176,6 +319,33 @@ static int Main()
if (UInt64_MultiplyWith17(1) != 17)
return 0;

if (UInt64_MultiplyWith18(1) != 18)
return 0;

if (UInt64_MultiplyWith19(1) != 19)
return 0;

if (UInt64_MultiplyWith20(1) != 20)
return 0;

if (UInt64_MultiplyWith21(1) != 21)
return 0;

if (UInt64_MultiplyWith22(1) != 22)
return 0;

if (UInt64_MultiplyWith23(1) != 23)
return 0;

if (UInt64_MultiplyWith24(1) != 24)
return 0;

if (UInt64_MultiplyWith25(1) != 25)
return 0;

if (UInt64_MultiplyWith5_AddressExposed(1) != 5)
return 0;

return 100;
}
}
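The disasm expectations for multipliers 10, 12, 18, 20, and 24 follow a common pattern: each factors as {3, 5, 9} * 2^k, so one LEA of the form reg + {2,4,8}*reg followed by a single shl (or an add when only a doubling remains) reproduces the product. The C# sketch below is illustrative only; LeaFactorSketch and TryFactor are invented here and are not how the JIT or the tests are written. It simply verifies that factorization for the constants covered above.

using System;

// Illustrative only: TryFactor is invented for this sketch; neither the JIT nor the
// tests are written this way. It just checks the factorization behind the expected
// "lea + shl/add" sequences for 10, 12, 18, 20 and 24.
static class LeaFactorSketch
{
    // Try to write c as m * 2^k with m in {3, 5, 9}.
    static bool TryFactor(ulong c, out ulong m, out int k)
    {
        foreach (ulong candidate in new ulong[] { 3, 5, 9 })
        {
            ulong rest = c / candidate;
            if (c % candidate == 0 && rest != 0 && (rest & (rest - 1)) == 0)
            {
                m = candidate;
                k = 0;
                while (rest > 1) { rest >>= 1; k++; }
                return true;
            }
        }
        m = 0;
        k = 0;
        return false;
    }

    static void Main()
    {
        ulong x = 0xABCDEF;
        foreach (ulong c in new ulong[] { 10, 12, 18, 20, 24 })
        {
            if (!TryFactor(c, out ulong m, out int k))
                throw new Exception($"{c} has no lea-friendly factorization");

            // lea handles the multiply by m (reg + (m-1)*reg); the remaining power of
            // two is a shl, or an add when only a doubling is left.
            ulong viaLeaAndShift = (x + (m - 1) * x) << k;
            string second = k == 1 ? "add" : "shl";
            Console.WriteLine($"{c} = {m} * 2^{k}: lea + {second}, matches x * {c}: {viaLeaAndShift == x * c}");
        }
    }
}

For constants such as 11, 13, 19, or 25 no such short factorization pays off, which is why the tests above expect a plain imul for them.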