Skip to content

Commit 0a3fa33

Browse files
committed
[RISC-V] Enable constant CSE in riscv64
* Adjust costSz and costEx for GT_CNS_INT node
* Add riscv64 in const CSE jitconfigvalues
1 parent 0cb6f71 commit 0a3fa33

File tree

4 files changed

+233
-14
lines changed

4 files changed

+233
-14
lines changed

src/coreclr/jit/emitriscv64.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1480,7 +1480,7 @@ void emitter::emitLoadImmediate(emitAttr size, regNumber reg, ssize_t imm)
14801480
*
14811481
* First, determine at which position to partition imm into high32 and offset,
14821482
* so that it yields the least instruction.
1483-
* Where high32 = imm[y:x] and imm[63:y] are all zeroes or all ones.
1483+
* Where high32 = imm[y:x] and imm[63:y] are all zeros or all ones.
14841484
*
14851485
* From the above equation, the value of offset1 & offset2 are:
14861486
* -> offset1 = imm[x-1:0]

src/coreclr/jit/gentree.cpp

Lines changed: 221 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5151,9 +5151,228 @@ unsigned Compiler::gtSetEvalOrder(GenTree* tree)
51515151

51525152
case GT_CNS_LNG:
51535153
case GT_CNS_INT:
5154-
costEx = 1;
5155-
costSz = 4;
5154+
{
5155+
GenTreeIntConCommon* con = tree->AsIntConCommon();
5156+
bool iconNeedsReloc = con->ImmedValNeedsReloc(this);
5157+
INT64 imm = con->LngValue();
5158+
emitAttr size = EA_SIZE(emitActualTypeSize(tree));
5159+
5160+
if (iconNeedsReloc)
5161+
{
5162+
// TODO-RISCV64-CQ: tune the costs.
5163+
// The codegen(emitIns_R_AI) is not implemented yet.
5164+
// Assuming that it will require two instructions auipc + addi for relocations
5165+
costSz = 8;
5166+
costEx = 2;
5167+
}
5168+
else if (emitter::isValidSimm12((ssize_t)imm))
5169+
{
5170+
costSz = 4;
5171+
costEx = 1;
5172+
}
5173+
else
5174+
{
5175+
// The below logic mimics emitter::emitLoadImmediate
5176+
#define WordMask(x) (static_cast<unsigned>((1ull << (uint8_t)(x)) - 1))
5177+
5178+
// STEP 1: Determine x & y
5179+
5180+
int x;
5181+
int y;
5182+
if (((uint64_t)imm >> 63) & 0b1)
5183+
{
5184+
// last one position from MSB
5185+
y = 63 - BitOperations::LeadingZeroCount((uint64_t)~imm) + 1;
5186+
}
5187+
else
5188+
{
5189+
// last zero position from MSB
5190+
y = 63 - BitOperations::LeadingZeroCount((uint64_t)imm) + 1;
5191+
}
5192+
if (imm & 0b1)
5193+
{
5194+
// first zero position from LSB
5195+
x = BitOperations::TrailingZeroCount((uint64_t)~imm);
5196+
}
5197+
else
5198+
{
5199+
// first one position from LSB
5200+
x = BitOperations::TrailingZeroCount((uint64_t)imm);
5201+
}
5202+
5203+
// STEP 2: Determine whether to utilize SRLI or not.
5204+
5205+
constexpr int absMaxInsCount = emitter::instrDescLoadImm::absMaxInsCount;
5206+
constexpr int prefMaxInsCount = 5;
5207+
assert(prefMaxInsCount <= absMaxInsCount);
5208+
5209+
int insCountLimit = prefMaxInsCount;
5210+
if (this->compGeneratingProlog || this->compGeneratingEpilog)
5211+
{
5212+
insCountLimit = absMaxInsCount;
5213+
}
5214+
5215+
bool utilizeSRLI = false;
5216+
int srliShiftAmount = 0;
5217+
uint64_t originalImm = imm;
5218+
bool cond1 = (y - x) > 31;
5219+
if ((((uint64_t)imm >> 63) & 0b1) == 0 && cond1)
5220+
{
5221+
srliShiftAmount = BitOperations::LeadingZeroCount((uint64_t)imm);
5222+
uint64_t tempImm = (uint64_t)imm << srliShiftAmount;
5223+
int m = BitOperations::LeadingZeroCount(~tempImm);
5224+
int b = 64 - m;
5225+
int a = BitOperations::TrailingZeroCount(tempImm);
5226+
bool cond2 = (b - a) < 32;
5227+
bool cond3 = ((y - x) - (b - a)) >= 11;
5228+
if (cond2 || cond3)
5229+
{
5230+
imm = tempImm;
5231+
y = b;
5232+
x = a;
5233+
utilizeSRLI = true;
5234+
insCountLimit -= 1;
5235+
}
5236+
}
5237+
5238+
if (y < 32)
5239+
{
5240+
y = 31;
5241+
x = 0;
5242+
}
5243+
else if ((y - x) < 31)
5244+
{
5245+
y = x + 31;
5246+
}
5247+
else
5248+
{
5249+
x = y - 31;
5250+
}
5251+
5252+
uint32_t high32 = ((int64_t)imm >> x) & WordMask(32);
5253+
5254+
// STEP 3: Determine whether to use high32 + offset1 or high32 - offset2
5255+
5256+
// TODO-RISCV: Instead of using subtract / add mode, assume that we're always adding
5257+
// 12-bit chunks. However, if we encounter such 12-bit chunk with MSB == 1,
5258+
// add 1 to the previous chunk, and add the 12-bit chunk as is, which
5259+
// essentially does a subtraction. It will generate the least instruction to
5260+
// load offset.
5261+
// See the following discussion:
5262+
// https://github.com/dotnet/runtime/pull/113250#discussion_r1987576070
5263+
5264+
uint32_t offset1 = imm & WordMask((uint8_t)x);
5265+
uint32_t offset2 = (~(offset1 - 1)) & WordMask((uint8_t)x);
5266+
uint32_t offset = offset1;
5267+
bool isSubtractMode = false;
5268+
5269+
if ((high32 == 0x7FFFFFFF) && (y != 63))
5270+
{
5271+
int newX = x + 1;
5272+
uint32_t newOffset1 = imm & WordMask((uint8_t)newX);
5273+
uint32_t newOffset2 = (~(newOffset1 - 1)) & WordMask((uint8_t)newX);
5274+
if (newOffset2 < offset1)
5275+
{
5276+
x = newX;
5277+
high32 = ((int64_t)imm >> x) & WordMask(32);
5278+
offset2 = newOffset2;
5279+
isSubtractMode = true;
5280+
}
5281+
}
5282+
else if (offset2 < offset1)
5283+
{
5284+
isSubtractMode = true;
5285+
}
5286+
5287+
if (isSubtractMode)
5288+
{
5289+
offset = offset2;
5290+
high32 = (high32 + 1) & WordMask(32);
5291+
}
5292+
5293+
assert(absMaxInsCount >= 2);
5294+
int numberOfInstructions = 0;
5295+
instruction ins[absMaxInsCount];
5296+
int32_t values[absMaxInsCount];
5297+
5298+
// STEP 4: Generate instructions to load high32
5299+
5300+
uint32_t upper = (high32 >> 12) & WordMask(20);
5301+
uint32_t lower = high32 & WordMask(12);
5302+
int lowerMsb = (lower >> 11) & 0b1;
5303+
if (lowerMsb == 1)
5304+
{
5305+
upper += 1;
5306+
upper &= WordMask(20);
5307+
}
5308+
if (upper != 0)
5309+
{
5310+
numberOfInstructions += 1;
5311+
}
5312+
if (lower != 0)
5313+
{
5314+
numberOfInstructions += 1;
5315+
}
5316+
5317+
// STEP 5: Generate instructions to load offset in 11-bit chunks
5318+
5319+
int chunkLsbPos = (x < 11) ? 0 : (x - 11);
5320+
int shift = (x < 11) ? x : 11;
5321+
int chunkMask = (x < 11) ? WordMask((uint8_t)x) : WordMask(11);
5322+
while (true)
5323+
{
5324+
uint32_t chunk = (offset >> chunkLsbPos) & chunkMask;
5325+
5326+
if (chunk != 0)
5327+
{
5328+
/* We could move our 11 bit chunk window to the right for as many as the
5329+
* leading zeros.*/
5330+
int leadingZerosOn11BitsChunk = 11 - (32 - BitOperations::LeadingZeroCount(chunk));
5331+
if (leadingZerosOn11BitsChunk > 0)
5332+
{
5333+
int maxAdditionalShift =
5334+
(chunkLsbPos < leadingZerosOn11BitsChunk) ? chunkLsbPos : leadingZerosOn11BitsChunk;
5335+
chunkLsbPos -= maxAdditionalShift;
5336+
shift += maxAdditionalShift;
5337+
chunk = (offset >> chunkLsbPos) & chunkMask;
5338+
}
5339+
5340+
numberOfInstructions += 2;
5341+
if (numberOfInstructions > insCountLimit)
5342+
{
5343+
break;
5344+
}
5345+
shift = 0;
5346+
}
5347+
if (chunkLsbPos == 0)
5348+
{
5349+
break;
5350+
}
5351+
shift += (chunkLsbPos < 11) ? chunkLsbPos : 11;
5352+
chunkMask = (chunkLsbPos < 11) ? (chunkMask >> (11 - chunkLsbPos)) : WordMask(11);
5353+
chunkLsbPos -= (chunkLsbPos < 11) ? chunkLsbPos : 11;
5354+
}
5355+
if (shift > 0)
5356+
{
5357+
numberOfInstructions += 1;
5358+
}
5359+
5360+
// STEP 6: Determine whether to use emitDataConst or emit generated instructions
5361+
5362+
if (numberOfInstructions <= insCountLimit)
5363+
{
5364+
if (utilizeSRLI)
5365+
{
5366+
numberOfInstructions += 1;
5367+
assert(numberOfInstructions < absMaxInsCount);
5368+
}
5369+
}
5370+
costSz = 4 * numberOfInstructions;
5371+
costEx = numberOfInstructions;
5372+
#undef WordMask
5373+
}
51565374
goto COMMON_CNS;
5375+
}
51575376
#else
51585377
case GT_CNS_STR:
51595378
case GT_CNS_LNG:

src/coreclr/jit/jitconfigvalues.h

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -453,12 +453,12 @@ RELEASE_CONFIG_INTEGER(JitDisableSimdVN, "JitDisableSimdVN", 0)
453453
// If 3, enable the CSE of Constants including nearby offsets. (all platforms)
454454
// If 4, enable the CSE of Constants but don't combine with nearby offsets. (all platforms)
455455
//
456-
#define CONST_CSE_ENABLE_ARM 0
457-
#define CONST_CSE_DISABLE_ALL 1
458-
#define CONST_CSE_ENABLE_ARM_NO_SHARING 2
459-
#define CONST_CSE_ENABLE_ALL 3
460-
#define CONST_CSE_ENABLE_ALL_NO_SHARING 4
461-
RELEASE_CONFIG_INTEGER(JitConstCSE, "JitConstCSE", CONST_CSE_ENABLE_ARM)
456+
#define CONST_CSE_ENABLE_ARM_RISCV64 0
457+
#define CONST_CSE_DISABLE_ALL 1
458+
#define CONST_CSE_ENABLE_ARM_RISCV64_NO_SHARING 2
459+
#define CONST_CSE_ENABLE_ALL 3
460+
#define CONST_CSE_ENABLE_ALL_NO_SHARING 4
461+
RELEASE_CONFIG_INTEGER(JitConstCSE, "JitConstCSE", CONST_CSE_ENABLE_ARM_RISCV64)
462462

463463
// If nonzero, use the greedy RL policy.
464464
//

src/coreclr/jit/optcse.cpp

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5873,12 +5873,12 @@ bool Compiler::optSharedConstantCSEEnabled()
58735873
{
58745874
enableSharedConstCSE = true;
58755875
}
5876-
#if defined(TARGET_ARMARCH)
5877-
else if (configValue == CONST_CSE_ENABLE_ARM)
5876+
#if defined(TARGET_ARMARCH) || defined(TARGET_RISCV64)
5877+
else if (configValue == CONST_CSE_ENABLE_ARM_RISCV64)
58785878
{
58795879
enableSharedConstCSE = true;
58805880
}
5881-
#endif // TARGET_ARMARCH
5881+
#endif // TARGET_ARMARCH || TARGET_RISCV64
58825882

58835883
return enableSharedConstCSE;
58845884
}
@@ -5898,8 +5898,8 @@ bool Compiler::optConstantCSEEnabled()
58985898
{
58995899
enableConstCSE = true;
59005900
}
5901-
#if defined(TARGET_ARMARCH)
5902-
else if ((configValue == CONST_CSE_ENABLE_ARM) || (configValue == CONST_CSE_ENABLE_ARM_NO_SHARING))
5901+
#if defined(TARGET_ARMARCH) || defined(TARGET_RISCV64)
5902+
else if ((configValue == CONST_CSE_ENABLE_ARM_RISCV64) || (configValue == CONST_CSE_ENABLE_ARM_RISCV64_NO_SHARING))
59035903
{
59045904
enableConstCSE = true;
59055905
}

0 commit comments

Comments
 (0)