Skip to content

Commit ff6474a

Browse files
authored
[RISC-V] Use zero register as argument for atomics (#112693)
* zero register in simple atomics * Use zero register for CAS comparand * Use zero register for CAS data node * Fix unextended CAS comparand * Address previous nit * Remove unnecessary diff
1 parent c8f32e5 commit ff6474a

File tree

7 files changed

+141
-58
lines changed

7 files changed

+141
-58
lines changed

src/coreclr/jit/codegenriscv64.cpp

+32-36
Original file line numberDiff line numberDiff line change
@@ -1091,31 +1091,23 @@ void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, GenTre
10911091
double constValue = tree->AsDblCon()->DconValue();
10921092

10931093
assert(emitter::isFloatReg(targetReg));
1094-
1095-
// Make sure we use "fmv.w.x reg, zero" only for positive zero (0.0) and not for negative zero (-0.0)
1096-
if (FloatingPointUtils::isPositiveZero(constValue))
1097-
{
1098-
// A faster/smaller way to generate 0.0
1099-
// We will just zero out the entire register for both float and double
1100-
emit->emitIns_R_R(size == EA_4BYTE ? INS_fmv_w_x : INS_fmv_d_x, size, targetReg, REG_R0);
1101-
break;
1102-
}
1103-
1104-
int64_t bits =
1105-
(size == EA_4BYTE)
1106-
? (int32_t)BitOperations::SingleToUInt32Bits(FloatingPointUtils::convertToSingle(constValue))
1107-
: (int64_t)BitOperations::DoubleToUInt64Bits(constValue);
1108-
bool fitsInLui = ((bits & 0xfff) == 0) && emitter::isValidSimm20(bits >> 12);
1109-
if (fitsInLui || emitter::isValidSimm12(bits)) // can we synthesize bits with a single instruction?
1094+
int64_t bits;
1095+
if (emitter::isSingleInstructionFpImm(constValue, size, &bits))
11101096
{
1111-
regNumber temp = internalRegisters.GetSingle(tree);
1112-
if (fitsInLui)
1097+
regNumber temp = REG_ZERO;
1098+
if (bits != 0)
11131099
{
1114-
emit->emitIns_R_I(INS_lui, size, temp, bits >> 12);
1115-
}
1116-
else
1117-
{
1118-
emit->emitIns_R_R_I(INS_addi, size, temp, REG_ZERO, bits);
1100+
temp = internalRegisters.GetSingle(tree);
1101+
if (emitter::isValidSimm12(bits))
1102+
{
1103+
emit->emitIns_R_R_I(INS_addi, size, temp, REG_ZERO, bits);
1104+
}
1105+
else
1106+
{
1107+
int64_t upperBits = bits >> 12;
1108+
assert((upperBits << 12) == bits);
1109+
emit->emitIns_R_I(INS_lui, size, temp, upperBits);
1110+
}
11191111
}
11201112

11211113
emit->emitIns_R_R(size == EA_4BYTE ? INS_fmv_w_x : INS_fmv_d_x, size, targetReg, temp);
@@ -2371,12 +2363,12 @@ void CodeGen::genLockedInstructions(GenTreeOp* treeNode)
23712363

23722364
GenTree* data = treeNode->AsOp()->gtOp2;
23732365
GenTree* addr = treeNode->AsOp()->gtOp1;
2374-
regNumber dataReg = data->GetRegNum();
2366+
regNumber dataReg = !data->isContained() ? data->GetRegNum() : REG_ZERO;
23752367
regNumber addrReg = addr->GetRegNum();
23762368
regNumber targetReg = treeNode->GetRegNum();
23772369
if (targetReg == REG_NA)
23782370
{
2379-
targetReg = REG_R0;
2371+
targetReg = REG_ZERO;
23802372
}
23812373

23822374
genConsumeAddress(addr);
@@ -2385,8 +2377,6 @@ void CodeGen::genLockedInstructions(GenTreeOp* treeNode)
23852377
emitAttr dataSize = emitActualTypeSize(data);
23862378
bool is4 = (dataSize == EA_4BYTE);
23872379

2388-
assert(!data->isContainedIntOrIImmed());
2389-
23902380
instruction ins = INS_none;
23912381
switch (treeNode->gtOper)
23922382
{
@@ -2407,7 +2397,7 @@ void CodeGen::genLockedInstructions(GenTreeOp* treeNode)
24072397
}
24082398
GetEmitter()->emitIns_R_R_R(ins, dataSize, targetReg, addrReg, dataReg);
24092399

2410-
if (targetReg != REG_R0)
2400+
if (targetReg != REG_ZERO)
24112401
{
24122402
genProduceReg(treeNode);
24132403
}
@@ -2430,9 +2420,19 @@ void CodeGen::genCodeForCmpXchg(GenTreeCmpXchg* treeNode)
24302420

24312421
regNumber target = treeNode->GetRegNum();
24322422
regNumber loc = locOp->GetRegNum();
2433-
regNumber val = valOp->GetRegNum();
2434-
regNumber comparand = comparandOp->GetRegNum();
2435-
regNumber storeErr = internalRegisters.Extract(treeNode, RBM_ALLINT);
2423+
regNumber val = !valOp->isContained() ? valOp->GetRegNum() : REG_ZERO;
2424+
regNumber comparand = REG_ZERO;
2425+
if (!comparandOp->isContained())
2426+
{
2427+
comparand = comparandOp->GetRegNum();
2428+
if (comparandOp->TypeIs(TYP_INT, TYP_UINT))
2429+
{
2430+
regNumber signExtendedComparand = internalRegisters.Extract(treeNode);
2431+
GetEmitter()->emitIns_R_R(INS_sext_w, EA_4BYTE, signExtendedComparand, comparand);
2432+
comparand = signExtendedComparand;
2433+
}
2434+
}
2435+
regNumber storeErr = internalRegisters.GetSingle(treeNode);
24362436

24372437
// Register allocator should have extended the lifetimes of all input and internal registers
24382438
// They should all be different
@@ -2443,16 +2443,12 @@ void CodeGen::genCodeForCmpXchg(GenTreeCmpXchg* treeNode)
24432443
noway_assert(loc != val);
24442444
noway_assert(loc != comparand);
24452445
noway_assert(loc != storeErr);
2446-
noway_assert(val != comparand);
2446+
noway_assert((val != comparand) || (val == REG_ZERO));
24472447
noway_assert(val != storeErr);
24482448
noway_assert(comparand != storeErr);
24492449
noway_assert(target != REG_NA);
24502450
noway_assert(storeErr != REG_NA);
24512451

2452-
assert(locOp->isUsedFromReg());
2453-
assert(valOp->isUsedFromReg());
2454-
assert(!comparandOp->isUsedFromMemory());
2455-
24562452
genConsumeAddress(locOp);
24572453
genConsumeRegs(valOp);
24582454
genConsumeRegs(comparandOp);

src/coreclr/jit/emitriscv64.cpp

+5-1
Original file line numberDiff line numberDiff line change
@@ -919,7 +919,11 @@ void emitter::emitIns_R_R_R(
919919
{
920920
code |= 0x7 << 12;
921921
}
922-
else if (INS_lr_w <= ins && ins <= INS_amomaxu_d)
922+
else if (ins == INS_sc_w || ins == INS_sc_d)
923+
{
924+
code |= 0b10 << 25; // release ordering, it ends the lr-sc loop
925+
}
926+
else if ((ins == INS_lr_w || ins == INS_lr_d) || (INS_amoswap_w <= ins && ins <= INS_amomaxu_d))
923927
{
924928
// For now all atomics are seq. consistent as Interlocked.* APIs don't expose acquire/release ordering
925929
code |= 0b11 << 25;

src/coreclr/jit/emitriscv64.h

+20
Original file line numberDiff line numberDiff line change
@@ -200,6 +200,26 @@ static bool isValidSimm32(ssize_t value)
200200
return (-(((ssize_t)1) << 31) - 0x800) <= value && value < (((ssize_t)1) << 31) - 0x800;
201201
}
202202

203+
//------------------------------------------------------------------------
204+
// isSingleInstructionFpImm: checks if the floating-point constant can be synthesized with one instruction
205+
//
206+
// Arguments:
207+
// value - the constant to be imm'ed
208+
// size - size of the target immediate
209+
// outBits - [out] the bits of the immediate
210+
//
211+
// Return Value:
212+
// Whether the floating-point immediate can be synthesized with one instruction
213+
//
214+
static bool isSingleInstructionFpImm(double value, emitAttr size, int64_t* outBits)
215+
{
216+
assert(size == EA_4BYTE || size == EA_8BYTE);
217+
*outBits = (size == EA_4BYTE)
218+
? (int32_t)BitOperations::SingleToUInt32Bits(FloatingPointUtils::convertToSingle(value))
219+
: (int64_t)BitOperations::DoubleToUInt64Bits(value);
220+
return isValidSimm12(*outBits) || (((*outBits & 0xfff) == 0) && isValidSimm20(*outBits >> 12));
221+
}
222+
203223
// Returns the number of bits used by the given 'size'.
204224
inline static unsigned getBitWidth(emitAttr size)
205225
{

src/coreclr/jit/lower.cpp

+2
Original file line numberDiff line numberDiff line change
@@ -651,12 +651,14 @@ GenTree* Lowering::LowerNode(GenTree* node)
651651

652652
#if defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64)
653653
case GT_CMPXCHG:
654+
RISCV64_ONLY(CheckImmedAndMakeContained(node, node->AsCmpXchg()->Data()));
654655
CheckImmedAndMakeContained(node, node->AsCmpXchg()->Comparand());
655656
break;
656657

657658
case GT_XORR:
658659
case GT_XAND:
659660
case GT_XADD:
661+
case GT_XCHG:
660662
CheckImmedAndMakeContained(node, node->AsOp()->gtOp2);
661663
break;
662664
#elif defined(TARGET_XARCH)

src/coreclr/jit/lowerriscv64.cpp

+5
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,11 @@ bool Lowering::IsContainableImmed(GenTree* parentNode, GenTree* childNode) const
7979
case GT_JCMP:
8080
return true;
8181

82+
case GT_CMPXCHG:
83+
case GT_XORR:
84+
case GT_XAND:
85+
case GT_XADD:
86+
case GT_XCHG:
8287
case GT_STORE_LCL_FLD:
8388
case GT_STORE_LCL_VAR:
8489
if (immVal == 0)

src/coreclr/jit/lsrariscv64.cpp

+49-21
Original file line numberDiff line numberDiff line change
@@ -144,20 +144,11 @@ int LinearScan::BuildNode(GenTree* tree)
144144
case GT_CNS_DBL:
145145
{
146146
emitAttr size = emitActualTypeSize(tree);
147-
148-
double constValue = tree->AsDblCon()->DconValue();
149-
if (!FloatingPointUtils::isPositiveZero(constValue))
147+
int64_t bits;
148+
if (emitter::isSingleInstructionFpImm(tree->AsDblCon()->DconValue(), size, &bits) && bits != 0)
150149
{
151-
int64_t bits =
152-
(size == EA_4BYTE)
153-
? (int32_t)BitOperations::SingleToUInt32Bits(FloatingPointUtils::convertToSingle(constValue))
154-
: (int64_t)BitOperations::DoubleToUInt64Bits(constValue);
155-
bool fitsInLui = ((bits & 0xfff) == 0) && emitter::isValidSimm20(bits >> 12);
156-
if (fitsInLui || emitter::isValidSimm12(bits)) // can we synthesize bits with a single instruction?
157-
{
158-
buildInternalIntRegisterDefForNode(tree);
159-
buildInternalRegisterUses();
160-
}
150+
buildInternalIntRegisterDefForNode(tree);
151+
buildInternalRegisterUses();
161152
}
162153
}
163154
FALLTHROUGH;
@@ -441,16 +432,44 @@ int LinearScan::BuildNode(GenTree* tree)
441432
case GT_CMPXCHG:
442433
{
443434
GenTreeCmpXchg* cas = tree->AsCmpXchg();
444-
assert(!cas->Comparand()->isContained());
445-
srcCount = 3;
446435
assert(dstCount == 1);
447436

448-
buildInternalIntRegisterDefForNode(tree); // temp reg for store conditional error
437+
srcCount = 1;
449438
// Extend lifetimes of argument regs because they may be reused during retries
439+
assert(!cas->Addr()->isContained());
450440
setDelayFree(BuildUse(cas->Addr()));
451-
setDelayFree(BuildUse(cas->Data()));
452-
setDelayFree(BuildUse(cas->Comparand()));
453441

442+
GenTree* data = cas->Data();
443+
if (!data->isContained())
444+
{
445+
srcCount++;
446+
setDelayFree(BuildUse(data));
447+
}
448+
else
449+
{
450+
assert(data->IsIntegralConst(0));
451+
}
452+
453+
GenTree* comparand = cas->Comparand();
454+
if (!comparand->isContained())
455+
{
456+
srcCount++;
457+
RefPosition* use = BuildUse(comparand);
458+
if (comparand->TypeIs(TYP_INT, TYP_UINT))
459+
{
460+
buildInternalIntRegisterDefForNode(tree); // temp reg for sign-extended comparand
461+
}
462+
else
463+
{
464+
setDelayFree(use);
465+
}
466+
}
467+
else
468+
{
469+
assert(comparand->IsIntegralConst(0));
470+
}
471+
472+
buildInternalIntRegisterDefForNode(tree); // temp reg for store conditional error
454473
// Internals may not collide with target
455474
setInternalRegsDelayFree = true;
456475
buildInternalRegisterUses();
@@ -470,11 +489,20 @@ int LinearScan::BuildNode(GenTree* tree)
470489
assert(dstCount == (tree->TypeIs(TYP_VOID) ? 0 : 1));
471490
GenTree* addr = tree->gtGetOp1();
472491
GenTree* data = tree->gtGetOp2();
473-
assert(!addr->isContained() && !data->isContained());
474-
srcCount = 2;
492+
assert(!addr->isContained());
475493

494+
srcCount = 1;
476495
BuildUse(addr);
477-
BuildUse(data);
496+
if (!data->isContained())
497+
{
498+
srcCount++;
499+
BuildUse(data);
500+
}
501+
else
502+
{
503+
assert(data->IsIntegralConst(0));
504+
}
505+
478506
if (dstCount == 1)
479507
{
480508
BuildDef(tree);

src/tests/JIT/Intrinsics/Interlocked.cs

+28
Original file line numberDiff line numberDiff line change
@@ -300,5 +300,33 @@ private static void ThrowsNRE(Action action, [CallerLineNumber] int line = 0, [C
300300
Console.WriteLine($"Line {line}: test failed (expected: NullReferenceException)");
301301
_errors++;
302302
}
303+
304+
public struct FloatUint
305+
{
306+
public float f;
307+
public uint u;
308+
}
309+
310+
[MethodImpl(MethodImplOptions.AggressiveOptimization)]
311+
public static int DoTestCompareExchangeUnextended(FloatUint comparand)
312+
{
313+
uint val = 1;
314+
uint old = Interlocked.CompareExchange(ref val, 0, comparand.u);
315+
if (val != 0)
316+
return 101;
317+
if (old != 1)
318+
return 102;
319+
return 100;
320+
}
321+
322+
[Fact]
323+
public static int TestCompareExchangeUnextended()
324+
{
325+
// RISC-V comparisons are always full-register so the comparand reg must be extended.
326+
// The integer field of a struct passed according to the floating-point calling convention is not ABI-extended.
327+
// The reflection call poisons its remaining bits in runtime debug mode, making it a better repro.
328+
return (int)typeof(Program).GetMethod("DoTestCompareExchangeUnextended").Invoke(
329+
null, new object[] { new FloatUint{f = 0f, u = 1} });
330+
}
303331
}
304332
}

0 commit comments

Comments
 (0)