Skip to content

Commit 71909f9

Browse files
committed
[RISC-V] Enable constant CSE in riscv64
* Adjust costSz and costEx for GT_CNS_INT node
* Add riscv64 in const CSE jitconfigvalues
1 parent 0cb6f71 commit 71909f9

File tree

4 files changed

+269
-12
lines changed

4 files changed

+269
-12
lines changed

src/coreclr/jit/emitriscv64.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1480,7 +1480,7 @@ void emitter::emitLoadImmediate(emitAttr size, regNumber reg, ssize_t imm)
14801480
*
14811481
* First, determine at which position to partition imm into high32 and offset,
14821482
* so that it yields the least instruction.
1483-
* Where high32 = imm[y:x] and imm[63:y] are all zeroes or all ones.
1483+
* Where high32 = imm[y:x] and imm[63:y] are all zeros or all ones.
14841484
*
14851485
* From the above equation, the value of offset1 & offset2 are:
14861486
* -> offset1 = imm[x-1:0]

src/coreclr/jit/gentree.cpp

Lines changed: 260 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5151,9 +5151,267 @@ unsigned Compiler::gtSetEvalOrder(GenTree* tree)
51515151

51525152
case GT_CNS_LNG:
51535153
case GT_CNS_INT:
5154-
costEx = 1;
5155-
costSz = 4;
5154+
{
5155+
GenTreeIntConCommon* con = tree->AsIntConCommon();
5156+
bool iconNeedsReloc = con->ImmedValNeedsReloc(this);
5157+
INT64 imm = con->LngValue();
5158+
emitAttr size = EA_SIZE(emitActualTypeSize(tree));
5159+
5160+
if (iconNeedsReloc)
5161+
{
5162+
// TODO-RISCV64-CQ: tune the costs.
5163+
// The codegen(emitIns_R_AI) is not implemented yet.
5164+
// Assume that it will require two instructions (auipc + addi) for relocations.
5165+
costSz = 8;
5166+
costEx = 2;
5167+
}
5168+
else if (emitter::isValidSimm12((ssize_t)imm))
5169+
{
5170+
costSz = 4;
5171+
costEx = 1;
5172+
}
5173+
else
5174+
{
5175+
// The below logic mimics emitter::emitLoadImmediate
5176+
#define WordMask(x) (static_cast<unsigned>((1ull << (uint8_t)(x)) - 1))
5177+
5178+
// STEP 1: Determine x & y
5179+
int x;
5180+
int y;
5181+
if (((uint64_t)imm >> 63) & 0b1)
5182+
{
5183+
// last one position from MSB
5184+
y = 63 - BitOperations::LeadingZeroCount((uint64_t)~imm) + 1;
5185+
}
5186+
else
5187+
{
5188+
// last zero position from MSB
5189+
y = 63 - BitOperations::LeadingZeroCount((uint64_t)imm) + 1;
5190+
}
5191+
if (imm & 0b1)
5192+
{
5193+
// first zero position from LSB
5194+
x = BitOperations::TrailingZeroCount((uint64_t)~imm);
5195+
}
5196+
else
5197+
{
5198+
// first one position from LSB
5199+
x = BitOperations::TrailingZeroCount((uint64_t)imm);
5200+
}
5201+
5202+
// STEP 2: Determine whether to utilize SRLI or not.
5203+
5204+
constexpr int absMaxInsCount = emitter::instrDescLoadImm::absMaxInsCount;
5205+
constexpr int prefMaxInsCount = 5;
5206+
assert(prefMaxInsCount <= absMaxInsCount);
5207+
5208+
// If we generate more instructions than the preferred maximum instruction count, we'll instead use emitDataConst +
5209+
// emitIns_R_C combination.
5210+
int insCountLimit = prefMaxInsCount;
5211+
// If we are generating the prolog / epilog, we are not inside a method block; therefore, we should
5212+
// not use the emitDataConst + emitIns_R_C combination.
5213+
if (this->compGeneratingProlog || this->compGeneratingEpilog)
5214+
{
5215+
insCountLimit = absMaxInsCount;
5216+
}
5217+
5218+
bool utilizeSRLI = false;
5219+
int srliShiftAmount = 0;
5220+
uint64_t originalImm = imm;
5221+
bool cond1 = (y - x) > 31;
5222+
if ((((uint64_t)imm >> 63) & 0b1) == 0 && cond1)
5223+
{
5224+
srliShiftAmount = BitOperations::LeadingZeroCount((uint64_t)imm);
5225+
uint64_t tempImm = (uint64_t)imm << srliShiftAmount;
5226+
int m = BitOperations::LeadingZeroCount(~tempImm);
5227+
int b = 64 - m;
5228+
int a = BitOperations::TrailingZeroCount(tempImm);
5229+
bool cond2 = (b - a) < 32;
5230+
bool cond3 = ((y - x) - (b - a)) >= 11;
5231+
if (cond2 || cond3)
5232+
{
5233+
imm = tempImm;
5234+
y = b;
5235+
x = a;
5236+
utilizeSRLI = true;
5237+
insCountLimit -= 1;
5238+
}
5239+
}
5240+
5241+
if (y < 32)
5242+
{
5243+
y = 31;
5244+
x = 0;
5245+
}
5246+
else if ((y - x) < 31)
5247+
{
5248+
y = x + 31;
5249+
}
5250+
else
5251+
{
5252+
x = y - 31;
5253+
}
5254+
5255+
uint32_t high32 = ((int64_t)imm >> x) & WordMask(32);
5256+
5257+
// STEP 3: Determine whether to use high32 + offset1 or high32 - offset2
5258+
5259+
uint32_t offset1 = imm & WordMask((uint8_t)x);
5260+
uint32_t offset2 = (~(offset1 - 1)) & WordMask((uint8_t)x);
5261+
uint32_t offset = offset1;
5262+
bool isSubtractMode = false;
5263+
5264+
if ((high32 == 0x7FFFFFFF) && (y != 63))
5265+
{
5266+
// Handle corner case: we cannot do subtract mode if high32 == 0x7FFFFFFF
5267+
// Since adding 1 to it will change the sign bit. Instead, shift x and y
5268+
// to the left by one.
5269+
int newX = x + 1;
5270+
uint32_t newOffset1 = imm & WordMask((uint8_t)newX);
5271+
uint32_t newOffset2 = (~(newOffset1 - 1)) & WordMask((uint8_t)newX);
5272+
if (newOffset2 < offset1)
5273+
{
5274+
x = newX;
5275+
high32 = ((int64_t)imm >> x) & WordMask(32);
5276+
offset2 = newOffset2;
5277+
isSubtractMode = true;
5278+
}
5279+
}
5280+
else if (offset2 < offset1)
5281+
{
5282+
isSubtractMode = true;
5283+
}
5284+
5285+
if (isSubtractMode)
5286+
{
5287+
offset = offset2;
5288+
high32 = (high32 + 1) & WordMask(32);
5289+
}
5290+
5291+
assert(absMaxInsCount >= 2);
5292+
int numberOfInstructions = 0;
5293+
instruction ins[absMaxInsCount];
5294+
int32_t values[absMaxInsCount];
5295+
5296+
// STEP 4: Generate instructions to load high32
5297+
5298+
uint32_t upper = (high32 >> 12) & WordMask(20);
5299+
uint32_t lower = high32 & WordMask(12);
5300+
int lowerMsb = (lower >> 11) & 0b1;
5301+
if (lowerMsb == 1)
5302+
{
5303+
upper += 1;
5304+
upper &= WordMask(20);
5305+
}
5306+
if (upper != 0)
5307+
{
5308+
// ins[numberOfInstructions] = INS_lui;
5309+
// values[numberOfInstructions] = ((upper >> 19) & 0b1) ? (upper + 0xFFF00000) : upper;
5310+
numberOfInstructions += 1;
5311+
}
5312+
if (lower != 0)
5313+
{
5314+
// ins[numberOfInstructions] = INS_addiw;
5315+
// values[numberOfInstructions] = lower;
5316+
numberOfInstructions += 1;
5317+
}
5318+
5319+
// STEP 5: Generate instructions to load offset in 11-bits chunks
5320+
5321+
int chunkLsbPos = (x < 11) ? 0 : (x - 11);
5322+
int shift = (x < 11) ? x : 11;
5323+
int chunkMask = (x < 11) ? WordMask((uint8_t)x) : WordMask(11);
5324+
while (true)
5325+
{
5326+
uint32_t chunk = (offset >> chunkLsbPos) & chunkMask;
5327+
5328+
if (chunk != 0)
5329+
{
5330+
/* We could move our 11 bit chunk window to the right for as many as the
5331+
* leading zeros.*/
5332+
int leadingZerosOn11BitsChunk = 11 - (32 - BitOperations::LeadingZeroCount(chunk));
5333+
if (leadingZerosOn11BitsChunk > 0)
5334+
{
5335+
int maxAdditionalShift =
5336+
(chunkLsbPos < leadingZerosOn11BitsChunk) ? chunkLsbPos : leadingZerosOn11BitsChunk;
5337+
chunkLsbPos -= maxAdditionalShift;
5338+
shift += maxAdditionalShift;
5339+
chunk = (offset >> chunkLsbPos) & chunkMask;
5340+
}
5341+
5342+
numberOfInstructions += 2;
5343+
if (numberOfInstructions > insCountLimit)
5344+
{
5345+
break;
5346+
}
5347+
// ins[numberOfInstructions - 2] = INS_slli;
5348+
// values[numberOfInstructions - 2] = shift;
5349+
// if (isSubtractMode)
5350+
// {
5351+
// ins[numberOfInstructions - 1] = INS_addi;
5352+
// values[numberOfInstructions - 1] = -(int32_t)chunk;
5353+
// }
5354+
// else
5355+
// {
5356+
// ins[numberOfInstructions - 1] = INS_addi;
5357+
// values[numberOfInstructions - 1] = chunk;
5358+
// }
5359+
shift = 0;
5360+
}
5361+
if (chunkLsbPos == 0)
5362+
{
5363+
break;
5364+
}
5365+
shift += (chunkLsbPos < 11) ? chunkLsbPos : 11;
5366+
chunkMask = (chunkLsbPos < 11) ? (chunkMask >> (11 - chunkLsbPos)) : WordMask(11);
5367+
chunkLsbPos -= (chunkLsbPos < 11) ? chunkLsbPos : 11;
5368+
}
5369+
if (shift > 0)
5370+
{
5371+
numberOfInstructions += 1;
5372+
// if (numberOfInstructions <= insCountLimit)
5373+
// {
5374+
// ins[numberOfInstructions - 1] = INS_slli;
5375+
// values[numberOfInstructions - 1] = shift;
5376+
// }
5377+
}
5378+
5379+
// STEP 6: Determine whether to use emitDataConst or emit generated instructions
5380+
5381+
if (numberOfInstructions <= insCountLimit)
5382+
{
5383+
// instrDescLoadImm* id = static_cast<instrDescLoadImm*>(emitNewInstrLoadImm(size, originalImm));
5384+
// id->idReg1(reg);
5385+
// memcpy(id->ins, ins, sizeof(instruction) * numberOfInstructions);
5386+
// memcpy(id->values, values, sizeof(int32_t) * numberOfInstructions);
5387+
if (utilizeSRLI)
5388+
{
5389+
numberOfInstructions += 1;
5390+
assert(numberOfInstructions < absMaxInsCount);
5391+
// id->ins[numberOfInstructions - 1] = INS_srli;
5392+
// id->values[numberOfInstructions - 1] = srliShiftAmount;
5393+
}
5394+
// id->idCodeSize(numberOfInstructions * 4);
5395+
// id->idIns(id->ins[numberOfInstructions - 1]);
5396+
5397+
// appendToCurIG(id);
5398+
}
5399+
// else if (size == EA_PTRSIZE)
5400+
// {
5401+
// assert(!emitComp->compGeneratingProlog && !emitComp->compGeneratingEpilog);
5402+
// auto constAddr = emitDataConst(&originalImm, sizeof(long), sizeof(long), TYP_LONG);
5403+
// emitIns_R_C(INS_ld, EA_PTRSIZE, reg, REG_NA, emitComp->eeFindJitDataOffs(constAddr));
5404+
// }
5405+
// else
5406+
// {
5407+
// assert(false && "If number of instruction exceeds MAX_NUM_OF_LOAD_IMM_INS, imm must be 8 bytes");
5408+
// }
5409+
costSz = 4 * numberOfInstructions;
5410+
costEx = numberOfInstructions;
5411+
#undef WordMask
5412+
}
51565413
goto COMMON_CNS;
5414+
}
51575415
#else
51585416
case GT_CNS_STR:
51595417
case GT_CNS_LNG:

src/coreclr/jit/jitconfigvalues.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -453,12 +453,12 @@ RELEASE_CONFIG_INTEGER(JitDisableSimdVN, "JitDisableSimdVN", 0)
453453
// If 3, enable the CSE of Constants including nearby offsets. (all platforms)
454454
// If 4, enable the CSE of Constants but don't combine with nearby offsets. (all platforms)
455455
//
456-
#define CONST_CSE_ENABLE_ARM 0
456+
#define CONST_CSE_ENABLE_ARM_RISCV64 0
457457
#define CONST_CSE_DISABLE_ALL 1
458-
#define CONST_CSE_ENABLE_ARM_NO_SHARING 2
458+
#define CONST_CSE_ENABLE_ARM_RISCV64_NO_SHARING 2
459459
#define CONST_CSE_ENABLE_ALL 3
460460
#define CONST_CSE_ENABLE_ALL_NO_SHARING 4
461-
RELEASE_CONFIG_INTEGER(JitConstCSE, "JitConstCSE", CONST_CSE_ENABLE_ARM)
461+
RELEASE_CONFIG_INTEGER(JitConstCSE, "JitConstCSE", CONST_CSE_ENABLE_ARM_RISCV64)
462462

463463
// If nonzero, use the greedy RL policy.
464464
//

src/coreclr/jit/optcse.cpp

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2061,7 +2061,6 @@ bool CSE_HeuristicCommon::CanConsiderTree(GenTree* tree, bool isReturn)
20612061
{
20622062
return false;
20632063
}
2064-
20652064
return true;
20662065
}
20672066

@@ -5873,12 +5872,12 @@ bool Compiler::optSharedConstantCSEEnabled()
58735872
{
58745873
enableSharedConstCSE = true;
58755874
}
5876-
#if defined(TARGET_ARMARCH)
5877-
else if (configValue == CONST_CSE_ENABLE_ARM)
5875+
#if defined(TARGET_ARMARCH) || defined(TARGET_RISCV64)
5876+
else if (configValue == CONST_CSE_ENABLE_ARM_RISCV64)
58785877
{
58795878
enableSharedConstCSE = true;
58805879
}
5881-
#endif // TARGET_ARMARCH
5880+
#endif // TARGET_ARMARCH || TARGET_RISCV64
58825881

58835882
return enableSharedConstCSE;
58845883
}
@@ -5898,8 +5897,8 @@ bool Compiler::optConstantCSEEnabled()
58985897
{
58995898
enableConstCSE = true;
59005899
}
5901-
#if defined(TARGET_ARMARCH)
5902-
else if ((configValue == CONST_CSE_ENABLE_ARM) || (configValue == CONST_CSE_ENABLE_ARM_NO_SHARING))
5900+
#if defined(TARGET_ARMARCH) || defined(TARGET_RISCV64)
5901+
else if ((configValue == CONST_CSE_ENABLE_ARM_RISCV64) || (configValue == CONST_CSE_ENABLE_ARM_RISCV64_NO_SHARING))
59035902
{
59045903
enableConstCSE = true;
59055904
}

0 commit comments

Comments
 (0)