diff --git a/docs/design/coreclr/botr/clr-abi.md b/docs/design/coreclr/botr/clr-abi.md index 825ad5820a920..9141613db0c0d 100644 --- a/docs/design/coreclr/botr/clr-abi.md +++ b/docs/design/coreclr/botr/clr-abi.md @@ -205,7 +205,7 @@ For non-rude thread abort, the VM walks the stack, running any catch handler tha For example: -``` +```cs try { // try 1 try { // try 2 System.Threading.Thread.CurrentThread.Abort(); @@ -221,7 +221,7 @@ L: In this case, if the address returned in catch 2 corresponding to label L is outside try 1, then the ThreadAbortException re-raised by the VM will not be caught by catch 1, as is expected. The JIT needs to insert a block such that this is the effective code generation: -``` +```cs try { // try 1 try { // try 2 System.Threading.Thread.CurrentThread.Abort(); @@ -238,7 +238,7 @@ L: Similarly, the automatic re-raise address for a ThreadAbortException can't be within a finally handler, or the VM will abort the re-raise and swallow the exception. This can happen due to call-to-finally thunks marked as "cloned finally", as described above. For example (this is pseudo-assembly-code, not C#): -``` +```cs try { // try 1 try { // try 2 System.Threading.Thread.CurrentThread.Abort(); @@ -254,7 +254,7 @@ L: This would generate something like: -``` +```asm // beginning of 'try 1' // beginning of 'try 2' System.Threading.Thread.CurrentThread.Abort(); @@ -279,7 +279,7 @@ Finally1: Note that the JIT must already insert a "step" block so the finally will be called. However, this isn't sufficient to support ThreadAbortException processing, because "L1" is marked as "cloned finally". In this case, the JIT must insert another step block that is within "try 1" but outside the cloned finally block, that will allow for correct re-raise semantics. For example: -``` +```asm // beginning of 'try 1' // beginning of 'try 2' System.Threading.Thread.CurrentThread.Abort(); @@ -397,7 +397,7 @@ To implement this requirement, for any function with EH, we create a frame-local Note that the since a slot on x86 is 4 bytes, the minimum size is 16 bytes. The idea is to have 1 slot for each handler that could be possibly be invoked at the same time. For example, for: -``` +```cs try { ... } catch { @@ -417,7 +417,7 @@ When calling a finally, we set the appropriate level to 0xFC (aka "finally call" Thus, calling a finally from JIT generated code looks like: -``` +```asm mov dword ptr [L_02+0x4 ebp-10H], 0 // This must happen before the 0xFC is written mov dword ptr [L_02+0x8 ebp-0CH], 252 // 0xFC push G_M52300_IG07 @@ -428,7 +428,7 @@ In this case, `G_M52300_IG07` is not the address after the 'jmp', so a simple 'c The code this finally returns to looks like this: -``` +```asm mov dword ptr [L_02+0x8 ebp-0CH], 0 jmp SHORT G_M52300_IG05 ``` @@ -477,7 +477,7 @@ Because a main function body will **always** be on the stack when one of its fun There is one "corner case" in the VM implementation of WantsReportOnlyLeaf model that has implications for the code the JIT is allowed to generate. Consider this function with nested exception handling: -``` +```cs public void runtest() { try { try { @@ -804,3 +804,29 @@ In addition to the usual registers it also preserves all float registers and `rc `CORINFO_HELP_DISPATCH_INDIRECT_CALL` takes the call address in `rax` and it reserves the right to use and trash `r10` and `r11`. The JIT uses the dispatch helper on x64 whenever possible as it is expected that the code size benefits outweighs the less accurate branch prediction. 
However, note that the use of `r11` in the dispatcher makes it incompatible with VSD calls where the JIT must fall back to the validator and a manual call.
+
+# Notes on Memset/Memcpy
+
+Generally, `memset` and `memcpy` do not provide any guarantees of atomicity. This implies that they should only be used when the memory being modified by `memset`/`memcpy` is not observable by any other thread (including the GC), or when there are no atomicity requirements according to our [Memory Model](../../specs/Memory-model.md). This is especially important when we modify heap memory containing managed pointers - those must be updated atomically, e.g. using a pointer-sized `mov` instruction (managed pointers are always aligned) - see [Atomic Memory Access](../../specs/Memory-model.md#Atomic-memory-accesses). Note that "update" here means "set to zero" - writing an actual object reference would also require a write barrier.
+
+Examples:
+
+```cs
+struct MyStruct
+{
+    long a;
+    string b;
+}
+
+void Test1(ref MyStruct m)
+{
+    // We're not allowed to use memset here
+    m = default;
+}
+
+MyStruct Test2()
+{
+    // We can use memset here
+    return default;
+}
+```
\ No newline at end of file
diff --git a/src/coreclr/jit/codegen.h b/src/coreclr/jit/codegen.h
index eb190f97f0e47..dda0d1871e4bf 100644
--- a/src/coreclr/jit/codegen.h
+++ b/src/coreclr/jit/codegen.h
@@ -1226,6 +1226,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
 #ifndef TARGET_X86
     void genCodeForInitBlkHelper(GenTreeBlk* initBlkNode);
 #endif
+    void genCodeForInitBlkLoop(GenTreeBlk* initBlkNode);
     void genCodeForInitBlkRepStos(GenTreeBlk* initBlkNode);
     void genCodeForInitBlkUnroll(GenTreeBlk* initBlkNode);
     void genJumpTable(GenTree* tree);
diff --git a/src/coreclr/jit/codegenarmarch.cpp b/src/coreclr/jit/codegenarmarch.cpp
index 5434ef7b722ef..562dad738bd3e 100644
--- a/src/coreclr/jit/codegenarmarch.cpp
+++ b/src/coreclr/jit/codegenarmarch.cpp
@@ -3114,6 +3114,72 @@ void CodeGen::genCodeForInitBlkHelper(GenTreeBlk* initBlkNode)
     genEmitHelperCall(CORINFO_HELP_MEMSET, 0, EA_UNKNOWN);
 }

+//------------------------------------------------------------------------
+// genCodeForInitBlkLoop - Generate code for an InitBlk using an inlined for-loop.
+//    It's needed for cases when size is too big to unroll and we're not allowed
+//    to use memset call due to atomicity requirements.
+//
+// Arguments:
+//    initBlkNode - the GT_STORE_BLK node
+//
+void CodeGen::genCodeForInitBlkLoop(GenTreeBlk* initBlkNode)
+{
+    GenTree* const dstNode = initBlkNode->Addr();
+    genConsumeReg(dstNode);
+    const regNumber dstReg = dstNode->GetRegNum();
+
+#ifndef TARGET_ARM64
+    GenTree* const zeroNode = initBlkNode->Data();
+    genConsumeReg(zeroNode);
+    const regNumber zeroReg = zeroNode->GetRegNum();
+#else
+    const regNumber zeroReg = REG_ZR;
+#endif
+
+    if (initBlkNode->IsVolatile())
+    {
+        // issue a full memory barrier before a volatile initBlock Operation
+        instGen_MemoryBarrier();
+    }
+
+    // str xzr, [dstReg]
+    // mov offsetReg,
+    //.LOOP:
+    // str xzr, [dstReg, offsetReg]
+    // subs offsetReg, offsetReg, #8
+    // bne .LOOP
+
+    const unsigned size = initBlkNode->GetLayout()->GetSize();
+    assert((size >= TARGET_POINTER_SIZE) && ((size % TARGET_POINTER_SIZE) == 0));
+
+    // The loop is reversed - it makes it smaller.
+    // Although, we zero the first pointer before the loop (the loop doesn't zero it)
+    // it works as a nullcheck, otherwise the first iteration would try to access
+    // "null + potentially large offset" and hit AV.
+ GetEmitter()->emitIns_R_R(INS_str, EA_PTRSIZE, zeroReg, dstReg); + if (size > TARGET_POINTER_SIZE) + { + // Extend liveness of dstReg in case if it gets killed by the store. + gcInfo.gcMarkRegPtrVal(dstReg, dstNode->TypeGet()); + + const regNumber offsetReg = initBlkNode->GetSingleTempReg(); + instGen_Set_Reg_To_Imm(EA_PTRSIZE, offsetReg, size - TARGET_POINTER_SIZE); + + BasicBlock* loop = genCreateTempLabel(); + genDefineTempLabel(loop); + + GetEmitter()->emitIns_R_R_R(INS_str, EA_PTRSIZE, zeroReg, dstReg, offsetReg); +#ifdef TARGET_ARM64 + GetEmitter()->emitIns_R_R_I(INS_subs, EA_PTRSIZE, offsetReg, offsetReg, TARGET_POINTER_SIZE); +#else + GetEmitter()->emitIns_R_R_I(INS_sub, EA_PTRSIZE, offsetReg, offsetReg, TARGET_POINTER_SIZE, INS_FLAGS_SET); +#endif + inst_JMP(EJ_ne, loop); + + gcInfo.gcMarkRegSetNpt(genRegMask(dstReg)); + } +} + //------------------------------------------------------------------------ // genCall: Produce code for a GT_CALL node // @@ -4384,6 +4450,11 @@ void CodeGen::genCodeForStoreBlk(GenTreeBlk* blkOp) genCodeForCpObj(blkOp->AsBlk()); break; + case GenTreeBlk::BlkOpKindLoop: + assert(!isCopyBlk); + genCodeForInitBlkLoop(blkOp); + break; + case GenTreeBlk::BlkOpKindHelper: assert(!blkOp->gtBlkOpGcUnsafe); if (isCopyBlk) diff --git a/src/coreclr/jit/codegenloongarch64.cpp b/src/coreclr/jit/codegenloongarch64.cpp index 33bd2a77486c6..2f7444e0825ae 100644 --- a/src/coreclr/jit/codegenloongarch64.cpp +++ b/src/coreclr/jit/codegenloongarch64.cpp @@ -6375,6 +6375,54 @@ void CodeGen::genCodeForInitBlkHelper(GenTreeBlk* initBlkNode) genEmitHelperCall(CORINFO_HELP_MEMSET, 0, EA_UNKNOWN); } +//------------------------------------------------------------------------ +// genCodeForInitBlkLoop - Generate code for an InitBlk using an inlined for-loop. +// It's needed for cases when size is too big to unroll and we're not allowed +// to use memset call due to atomicity requirements. +// +// Arguments: +// initBlkNode - the GT_STORE_BLK node +// +void CodeGen::genCodeForInitBlkLoop(GenTreeBlk* initBlkNode) +{ + GenTree* const dstNode = initBlkNode->Addr(); + genConsumeReg(dstNode); + const regNumber dstReg = dstNode->GetRegNum(); + + if (initBlkNode->IsVolatile()) + { + // issue a full memory barrier before a volatile initBlock Operation + instGen_MemoryBarrier(); + } + + const unsigned size = initBlkNode->GetLayout()->GetSize(); + assert((size >= TARGET_POINTER_SIZE) && ((size % TARGET_POINTER_SIZE) == 0)); + + // The loop is reversed - it makes it smaller. + // Although, we zero the first pointer before the loop (the loop doesn't zero it) + // it works as a nullcheck, otherwise the first iteration would try to access + // "null + potentially large offset" and hit AV. + GetEmitter()->emitIns_R_R_I(INS_st_d, EA_PTRSIZE, REG_R0, dstReg, 0); + if (size > TARGET_POINTER_SIZE) + { + // Extend liveness of dstReg in case if it gets killed by the store. 
+ gcInfo.gcMarkRegPtrVal(dstReg, dstNode->TypeGet()); + + const regNumber offsetReg = initBlkNode->GetSingleTempReg(); + instGen_Set_Reg_To_Imm(EA_PTRSIZE, offsetReg, size - TARGET_POINTER_SIZE); + + // loop begin: + // *(dstReg + offsetReg) = 0 + GetEmitter()->emitIns_R_R_R(INS_stx_d, EA_PTRSIZE, REG_R0, dstReg, offsetReg); + // offsetReg = offsetReg - 8 + GetEmitter()->emitIns_R_R_I(INS_addi_d, EA_PTRSIZE, offsetReg, offsetReg, -8); + // if (offsetReg != 0) goto loop; + GetEmitter()->emitIns_R_I(INS_bnez, EA_8BYTE, offsetReg, -2 << 2); + + gcInfo.gcMarkRegSetNpt(genRegMask(dstReg)); + } +} + // Generate code for a load from some address + offset // base: tree node which can be either a local address or arbitrary node // offset: distance from the base from which to load @@ -7286,6 +7334,11 @@ void CodeGen::genCodeForStoreBlk(GenTreeBlk* blkOp) genCodeForCpObj(blkOp->AsBlk()); break; + case GenTreeBlk::BlkOpKindLoop: + assert(!isCopyBlk); + genCodeForInitBlkLoop(blkOp); + break; + case GenTreeBlk::BlkOpKindHelper: if (isCopyBlk) { diff --git a/src/coreclr/jit/codegenriscv64.cpp b/src/coreclr/jit/codegenriscv64.cpp index 39d25e316e529..df5b47554d8b9 100644 --- a/src/coreclr/jit/codegenriscv64.cpp +++ b/src/coreclr/jit/codegenriscv64.cpp @@ -5958,6 +5958,61 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* cpBlkNode) } } +//------------------------------------------------------------------------ +// genCodeForInitBlkLoop - Generate code for an InitBlk using an inlined for-loop. +// It's needed for cases when size is too big to unroll and we're not allowed +// to use memset call due to atomicity requirements. +// +// Arguments: +// initBlkNode - the GT_STORE_BLK node +// +void CodeGen::genCodeForInitBlkLoop(GenTreeBlk* initBlkNode) +{ + GenTree* const dstNode = initBlkNode->Addr(); + genConsumeReg(dstNode); + const regNumber dstReg = dstNode->GetRegNum(); + + if (initBlkNode->IsVolatile()) + { + // issue a full memory barrier before a volatile initBlock Operation + instGen_MemoryBarrier(); + } + + const unsigned size = initBlkNode->GetLayout()->GetSize(); + assert((size >= TARGET_POINTER_SIZE) && ((size % TARGET_POINTER_SIZE) == 0)); + + // The loop is reversed - it makes it smaller. + // Although, we zero the first pointer before the loop (the loop doesn't zero it) + // it works as a nullcheck, otherwise the first iteration would try to access + // "null + potentially large offset" and hit AV. + GetEmitter()->emitIns_R_R_I(INS_sd, EA_PTRSIZE, REG_R0, dstReg, 0); + if (size > TARGET_POINTER_SIZE) + { + // Extend liveness of dstReg in case if it gets killed by the store. 
+ gcInfo.gcMarkRegPtrVal(dstReg, dstNode->TypeGet()); + + const regNumber tempReg = initBlkNode->GetSingleTempReg(); + instGen_Set_Reg_To_Imm(EA_PTRSIZE, tempReg, size - TARGET_POINTER_SIZE); + + // tempReg = dstReg + tempReg (a new interior pointer, but in a nongc region) + GetEmitter()->emitIns_R_R_R(INS_add, EA_PTRSIZE, tempReg, dstReg, tempReg); + + BasicBlock* loop = genCreateTempLabel(); + genDefineTempLabel(loop); + GetEmitter()->emitDisableGC(); // TODO: add gcinfo to tempReg and remove nogc + + // *tempReg = 0 + GetEmitter()->emitIns_R_R_I(INS_sd, EA_PTRSIZE, REG_R0, tempReg, 0); + // tempReg = tempReg - 8 + GetEmitter()->emitIns_R_R_I(INS_addi, EA_PTRSIZE, tempReg, tempReg, -8); + // if (tempReg != dstReg) goto loop; + GetEmitter()->emitIns_J(INS_bne, loop, (int)tempReg | ((int)dstReg << 5)); + GetEmitter()->emitEnableGC(); + + gcInfo.gcMarkRegSetNpt(genRegMask(dstReg)); + } +} + //------------------------------------------------------------------------ // genCodeForInitBlkHelper - Generate code for an InitBlk node by the means of the VM memcpy helper call // @@ -6853,6 +6908,11 @@ void CodeGen::genCodeForStoreBlk(GenTreeBlk* blkOp) genCodeForCpObj(blkOp->AsBlk()); break; + case GenTreeBlk::BlkOpKindLoop: + assert(!isCopyBlk); + genCodeForInitBlkLoop(blkOp); + break; + case GenTreeBlk::BlkOpKindHelper: if (isCopyBlk) { diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index 5847b40ffd32e..12764affdd92e 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -3034,6 +3034,11 @@ void CodeGen::genCodeForStoreBlk(GenTreeBlk* storeBlkNode) genCodeForCpObj(storeBlkNode->AsBlk()); break; + case GenTreeBlk::BlkOpKindLoop: + assert(!isCopyBlk); + genCodeForInitBlkLoop(storeBlkNode); + break; + #ifdef TARGET_AMD64 case GenTreeBlk::BlkOpKindHelper: assert(!storeBlkNode->gtBlkOpGcUnsafe); @@ -3312,6 +3317,60 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node) #endif } +//------------------------------------------------------------------------ +// genCodeForInitBlkLoop - Generate code for an InitBlk using an inlined for-loop. +// It's needed for cases when size is too big to unroll and we're not allowed +// to use memset call due to atomicity requirements. +// +// Arguments: +// initBlkNode - the GT_STORE_BLK node +// +void CodeGen::genCodeForInitBlkLoop(GenTreeBlk* initBlkNode) +{ + GenTree* const dstNode = initBlkNode->Addr(); + GenTree* const zeroNode = initBlkNode->Data(); + + genConsumeReg(dstNode); + genConsumeReg(zeroNode); + + const regNumber dstReg = dstNode->GetRegNum(); + const regNumber zeroReg = zeroNode->GetRegNum(); + + // xor zeroReg, zeroReg + // mov qword ptr [dstReg], zeroReg + // mov offsetReg, + //.LOOP: + // mov qword ptr [dstReg + offsetReg], zeroReg + // sub offsetReg, 8 + // jne .LOOP + + const unsigned size = initBlkNode->GetLayout()->GetSize(); + assert((size >= TARGET_POINTER_SIZE) && ((size % TARGET_POINTER_SIZE) == 0)); + + // The loop is reversed - it makes it smaller. + // Although, we zero the first pointer before the loop (the loop doesn't zero it) + // it works as a nullcheck, otherwise the first iteration would try to access + // "null + potentially large offset" and hit AV. + GetEmitter()->emitIns_AR_R(INS_mov, EA_PTRSIZE, zeroReg, dstReg, 0); + if (size > TARGET_POINTER_SIZE) + { + // Extend liveness of dstReg in case if it gets killed by the store. 
+ gcInfo.gcMarkRegPtrVal(dstReg, dstNode->TypeGet()); + + const regNumber offsetReg = initBlkNode->GetSingleTempReg(); + instGen_Set_Reg_To_Imm(EA_PTRSIZE, offsetReg, size - TARGET_POINTER_SIZE); + + BasicBlock* loop = genCreateTempLabel(); + genDefineTempLabel(loop); + + GetEmitter()->emitIns_ARX_R(INS_mov, EA_PTRSIZE, zeroReg, dstReg, offsetReg, 1, 0); + GetEmitter()->emitIns_R_I(INS_sub, EA_PTRSIZE, offsetReg, TARGET_POINTER_SIZE); + inst_JMP(EJ_jne, loop); + + gcInfo.gcMarkRegSetNpt(genRegMask(dstReg)); + } +} + #ifdef TARGET_AMD64 //------------------------------------------------------------------------ // genCodeForInitBlkHelper - Generate code for an InitBlk node by the means of the VM memcpy helper call diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 8339d6d274f4f..dd23c48255915 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -9905,6 +9905,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX STRESS_MODE(IF_CONVERSION_COST) \ STRESS_MODE(IF_CONVERSION_INNER_LOOPS) \ STRESS_MODE(POISON_IMPLICIT_BYREFS) \ + STRESS_MODE(STORE_BLOCK_UNROLLING) \ STRESS_MODE(COUNT) enum compStressArea diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 7c34c51571e66..31be03bda15b7 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -12227,6 +12227,11 @@ void Compiler::gtDispTree(GenTree* tree, printf(" (Helper)"); break; #endif + + case GenTreeBlk::BlkOpKindLoop: + printf(" (Loop)"); + break; + default: unreached(); } diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h index 109da6a15c30d..5a06113fa8d71 100644 --- a/src/coreclr/jit/gentree.h +++ b/src/coreclr/jit/gentree.h @@ -7248,6 +7248,7 @@ struct GenTreeBlk : public GenTreeIndir #ifdef TARGET_XARCH BlkOpKindRepInstr, #endif + BlkOpKindLoop, BlkOpKindUnroll, BlkOpKindUnrollMemmove, } gtBlkOpKind; @@ -7256,12 +7257,20 @@ struct GenTreeBlk : public GenTreeIndir bool gtBlkOpGcUnsafe; #endif -#ifdef TARGET_XARCH + bool ContainsReferences() + { + return (m_layout != nullptr) && m_layout->HasGCPtr(); + } + bool IsOnHeapAndContainsReferences() { - return (m_layout != nullptr) && m_layout->HasGCPtr() && !Addr()->OperIs(GT_LCL_ADDR); + return ContainsReferences() && !Addr()->OperIs(GT_LCL_ADDR); + } + + bool IsZeroingGcPointersOnHeap() + { + return OperIs(GT_STORE_BLK) && Data()->IsIntegralConst(0) && IsOnHeapAndContainsReferences(); } -#endif GenTreeBlk(genTreeOps oper, var_types type, GenTree* addr, ClassLayout* layout) : GenTreeIndir(oper, type, addr, nullptr) @@ -7272,6 +7281,10 @@ struct GenTreeBlk : public GenTreeIndir GenTreeBlk(genTreeOps oper, var_types type, GenTree* addr, GenTree* data, ClassLayout* layout) : GenTreeIndir(oper, type, addr, data) { + if (data->IsIntegralConst(0)) + { + data->gtFlags |= GTF_DONT_CSE; + } Initialize(layout); } diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index 2e454e64c14eb..2231a044b04cd 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -7973,6 +7973,15 @@ void Lowering::LowerBlockStoreCommon(GenTreeBlk* blkNode) { assert(blkNode->OperIs(GT_STORE_BLK, GT_STORE_DYN_BLK)); + if (blkNode->ContainsReferences() && !blkNode->OperIsCopyBlkOp()) + { + // Make sure we don't use GT_STORE_DYN_BLK + assert(blkNode->OperIs(GT_STORE_BLK)); + + // and we only zero it (and that zero is better to be not hoisted/CSE'd) + assert(blkNode->Data()->IsIntegralConst(0)); + } + // Lose the type information stored in the source - we no longer 
need it. if (blkNode->Data()->OperIs(GT_BLK)) { diff --git a/src/coreclr/jit/lowerarmarch.cpp b/src/coreclr/jit/lowerarmarch.cpp index 2536d44aa00c5..c58456940bf5e 100644 --- a/src/coreclr/jit/lowerarmarch.cpp +++ b/src/coreclr/jit/lowerarmarch.cpp @@ -554,6 +554,16 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode) if (blkNode->OperIsInitBlkOp()) { +#ifdef DEBUG + // Use BlkOpKindLoop for more cases under stress mode + if (comp->compStressCompile(Compiler::STRESS_STORE_BLOCK_UNROLLING, 50) && blkNode->OperIs(GT_STORE_BLK) && + ((blkNode->GetLayout()->GetSize() % TARGET_POINTER_SIZE) == 0) && src->IsIntegralConst(0)) + { + blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindLoop; + return; + } +#endif + if (src->OperIs(GT_INIT_VAL)) { src->SetContained(); @@ -598,6 +608,15 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode) ContainBlockStoreAddress(blkNode, size, dstAddr, nullptr); } + else if (blkNode->IsZeroingGcPointersOnHeap()) + { + blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindLoop; +#ifdef TARGET_ARM64 + // On ARM64 we can just use REG_ZR instead of having to load + // the constant into a real register like on ARM32. + src->SetContained(); +#endif + } else { blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindHelper; diff --git a/src/coreclr/jit/lowerloongarch64.cpp b/src/coreclr/jit/lowerloongarch64.cpp index d89c8723e80f6..d522a85ab0f96 100644 --- a/src/coreclr/jit/lowerloongarch64.cpp +++ b/src/coreclr/jit/lowerloongarch64.cpp @@ -326,6 +326,12 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode) ContainBlockStoreAddress(blkNode, size, dstAddr, nullptr); } + else if (blkNode->IsZeroingGcPointersOnHeap()) + { + blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindLoop; + // We're going to use REG_R0 for zero + src->SetContained(); + } else { blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindHelper; diff --git a/src/coreclr/jit/lowerriscv64.cpp b/src/coreclr/jit/lowerriscv64.cpp index 37848fc5909de..0c0f0aafc34bc 100644 --- a/src/coreclr/jit/lowerriscv64.cpp +++ b/src/coreclr/jit/lowerriscv64.cpp @@ -280,6 +280,12 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode) ContainBlockStoreAddress(blkNode, size, dstAddr, nullptr); } + else if (blkNode->IsZeroingGcPointersOnHeap()) + { + blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindLoop; + // We're going to use REG_R0 for zero + src->SetContained(); + } else { blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindHelper; diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 150ad04a55d99..f9766fe8e7b34 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -310,6 +310,16 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode) if (blkNode->OperIsInitBlkOp()) { +#ifdef DEBUG + // Use BlkOpKindLoop for more cases under stress mode + if (comp->compStressCompile(Compiler::STRESS_STORE_BLOCK_UNROLLING, 50) && blkNode->OperIs(GT_STORE_BLK) && + ((blkNode->GetLayout()->GetSize() % TARGET_POINTER_SIZE) == 0) && src->IsIntegralConst(0)) + { + blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindLoop; + return; + } +#endif + if (src->OperIs(GT_INIT_VAL)) { src->SetContained(); @@ -375,10 +385,12 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode) { TOO_BIG_TO_UNROLL: #ifdef TARGET_AMD64 - blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindHelper; + blkNode->gtBlkOpKind = + blkNode->IsZeroingGcPointersOnHeap() ? 
GenTreeBlk::BlkOpKindLoop : GenTreeBlk::BlkOpKindHelper; #else // TODO-X86-CQ: Investigate whether a helper call would be beneficial on x86 - blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindRepInstr; + blkNode->gtBlkOpKind = + blkNode->IsZeroingGcPointersOnHeap() ? GenTreeBlk::BlkOpKindLoop : GenTreeBlk::BlkOpKindRepInstr; #endif } } diff --git a/src/coreclr/jit/lsraarmarch.cpp b/src/coreclr/jit/lsraarmarch.cpp index e72ff23df3375..b87e0f80539fd 100644 --- a/src/coreclr/jit/lsraarmarch.cpp +++ b/src/coreclr/jit/lsraarmarch.cpp @@ -631,6 +631,11 @@ int LinearScan::BuildBlockStore(GenTreeBlk* blkNode) #endif // TARGET_ARM64 break; + case GenTreeBlk::BlkOpKindLoop: + // Needed for offsetReg + buildInternalIntRegisterDefForNode(blkNode, availableIntRegs); + break; + case GenTreeBlk::BlkOpKindHelper: assert(!src->isContained()); dstAddrRegMask = RBM_ARG_0; diff --git a/src/coreclr/jit/lsrabuild.cpp b/src/coreclr/jit/lsrabuild.cpp index 8c9025f61b703..22efbf53b82cb 100644 --- a/src/coreclr/jit/lsrabuild.cpp +++ b/src/coreclr/jit/lsrabuild.cpp @@ -968,6 +968,7 @@ regMaskTP LinearScan::getKillSetForBlockStore(GenTreeBlk* blkNode) #endif case GenTreeBlk::BlkOpKindUnrollMemmove: case GenTreeBlk::BlkOpKindUnroll: + case GenTreeBlk::BlkOpKindLoop: case GenTreeBlk::BlkOpKindInvalid: // for these 'gtBlkOpKind' kinds, we leave 'killMask' = RBM_NONE break; diff --git a/src/coreclr/jit/lsraloongarch64.cpp b/src/coreclr/jit/lsraloongarch64.cpp index 7ab333630c046..0312342a38914 100644 --- a/src/coreclr/jit/lsraloongarch64.cpp +++ b/src/coreclr/jit/lsraloongarch64.cpp @@ -1114,6 +1114,11 @@ int LinearScan::BuildBlockStore(GenTreeBlk* blkNode) } break; + case GenTreeBlk::BlkOpKindLoop: + // Needed for offsetReg + buildInternalIntRegisterDefForNode(blkNode, availableIntRegs); + break; + case GenTreeBlk::BlkOpKindHelper: assert(!src->isContained()); dstAddrRegMask = RBM_ARG_0; diff --git a/src/coreclr/jit/lsrariscv64.cpp b/src/coreclr/jit/lsrariscv64.cpp index 4be7a391d26a7..09d8a8daf905a 100644 --- a/src/coreclr/jit/lsrariscv64.cpp +++ b/src/coreclr/jit/lsrariscv64.cpp @@ -1129,6 +1129,11 @@ int LinearScan::BuildBlockStore(GenTreeBlk* blkNode) } break; + case GenTreeBlk::BlkOpKindLoop: + // Needed for tempReg + buildInternalIntRegisterDefForNode(blkNode, availableIntRegs); + break; + case GenTreeBlk::BlkOpKindHelper: assert(!src->isContained()); dstAddrRegMask = RBM_ARG_0; diff --git a/src/coreclr/jit/lsraxarch.cpp b/src/coreclr/jit/lsraxarch.cpp index 5d54c08ebb790..678aa6bd596bb 100644 --- a/src/coreclr/jit/lsraxarch.cpp +++ b/src/coreclr/jit/lsraxarch.cpp @@ -1406,6 +1406,11 @@ int LinearScan::BuildBlockStore(GenTreeBlk* blkNode) sizeRegMask = RBM_RCX; break; + case GenTreeBlk::BlkOpKindLoop: + // Needed for offsetReg + buildInternalIntRegisterDefForNode(blkNode, availableIntRegs); + break; + #ifdef TARGET_AMD64 case GenTreeBlk::BlkOpKindHelper: dstAddrRegMask = RBM_ARG_0; diff --git a/src/tests/JIT/opt/Structs/StructWithGC_Zeroing.cs b/src/tests/JIT/opt/Structs/StructWithGC_Zeroing.cs new file mode 100644 index 0000000000000..0a8719a2430a3 --- /dev/null +++ b/src/tests/JIT/opt/Structs/StructWithGC_Zeroing.cs @@ -0,0 +1,56 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. 
+ +using System; +using System.Runtime.CompilerServices; +using Xunit; + +public class StructWithGC_Zeroing +{ + [Fact] + public static void StructZeroingShouldNotUseMemset() + { + LargeStructWithGC ls1 = default; + ls1.str = "hello1"; + ls1.z2 = long.MaxValue; + ZeroIt(ref ls1); + + LargeStructWithGC2 ls2 = default; + ls2.str = "hello2"; + ls2.z1 = long.MinValue; + ZeroIt2(ref ls2); + + if (ls1.str != null || ls2.str != null || ls1.z2 != 0 || ls2.z1 != 0) + throw new InvalidOperationException("should be zeroed"); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private static void ZeroIt(ref LargeStructWithGC s) + { + // X64-NOT: CORINFO_HELP_MEMSET + // ARM64-NOT: CORINFO_HELP_MEMSET + s = default; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private static void ZeroIt2(ref LargeStructWithGC2 s) + { + // X64-NOT: CORINFO_HELP_MEMSET + // ARM64-NOT: CORINFO_HELP_MEMSET + s = default; + } + + struct LargeStructWithGC // 360 bytes (64-bit) + { + public string str; + public long b1, c1, d1, e1, f1, g1, h1, i1, j1, k1, l1, m1, n1, o1, p1, r1, s1, t1, u1, v1, w1, z1; + public long b2, c2, d2, e2, f2, g2, h2, i2, j2, k2, l2, m2, n2, o2, p2, r2, s2, t2, u2, v2, w2, z2; + } + + unsafe struct LargeStructWithGC2 // 4184 bytes (64-bit) + { + public fixed byte data[4000]; + public string str; + public long b1, c1, d1, e1, f1, g1, h1, i1, j1, k1, l1, m1, n1, o1, p1, r1, s1, t1, u1, v1, w1, z1; + } +} diff --git a/src/tests/JIT/opt/Structs/StructWithGC_Zeroing.csproj b/src/tests/JIT/opt/Structs/StructWithGC_Zeroing.csproj new file mode 100644 index 0000000000000..7aa59749804e4 --- /dev/null +++ b/src/tests/JIT/opt/Structs/StructWithGC_Zeroing.csproj @@ -0,0 +1,13 @@ + + + true + true + None + True + + + + true + + +
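
As a rough illustration of what the new `BlkOpKindLoop` expansion does, here is the same zeroing strategy sketched in unsafe C#. The `InitBlkLoopSketch`/`ZeroBlock` names and the C# framing are illustrative only - the real implementation is the reversed store loop emitted by `genCodeForInitBlkLoop` in the targets above.

```cs
// Illustrative sketch only - not the JIT's actual implementation or API.
public static unsafe class InitBlkLoopSketch
{
    // 'size' is assumed to be a multiple of the pointer size and at least one
    // pointer wide, mirroring the assert in genCodeForInitBlkLoop.
    public static void ZeroBlock(void* dst, uint size)
    {
        // Zero the first slot before the loop: for a null 'dst' this faults immediately
        // instead of faulting at "null + potentially large offset".
        *(nint*)dst = 0;

        // Reversed loop, one pointer-sized (and pointer-aligned) store per iteration,
        // so GC references inside the block are never observed torn.
        for (uint offset = size - (uint)sizeof(nint); offset != 0; offset -= (uint)sizeof(nint))
        {
            *(nint*)((byte*)dst + offset) = 0;
        }
    }
}
```

Because every store is pointer-sized and pointer-aligned, GC references in the block are never torn, which is what lets lowering pick `BlkOpKindLoop` via `IsZeroingGcPointersOnHeap()` instead of calling `CORINFO_HELP_MEMSET` - the helper call the new `StructWithGC_Zeroing` test asserts must not appear in the generated code.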