Skip to content

Commit 9efd6cb

Browse files
PawelJurek and igcbot
authored and committed
Add "Evict L1" option to memfence intrinsic and enable it on Release semantics fences
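Normally, evicts should not be needed, because by default the runtime sets the L1 cache to WriteBypass mode. However, this is only an unwritten contract between the runtime and the compiler. For the kernel to be correct without relying on that contract, the compiler needs to insert L1 evicts on fences whose semantics include a release (Release, AcquireRelease, or SequentiallyConsistent).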
1 parent 26c5dad commit 9efd6cb

File tree

9 files changed

+42
-26
lines changed

9 files changed

+42
-26
lines changed

IGC/BiFModule/Implementation/IGCBiF_Intrinsics.cl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,7 @@ local uchar* __builtin_IB_AllocLocalMemPool(bool allocAllWorkgroups, uint numAdd
103103

104104
// Memory fences
105105
// See GenISAIntrinsics.td for documentation
106-
void __builtin_IB_memfence(bool commitEnable, bool flushRW, bool flushConstant, bool flushTexture, bool flushIcache, bool isGlobal, bool invalidateL1);
106+
void __builtin_IB_memfence(bool commitEnable, bool flushRW, bool flushConstant, bool flushTexture, bool flushIcache, bool isGlobal, bool invalidateL1, bool evictL1);
107107
void __builtin_IB_flush_sampler_cache(void);
108108
void __builtin_IB_typedmemfence(bool invalidateCache);
109109

IGC/BiFModule/Implementation/atomics.cl

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ extern __constant int __UseNativeFP64GlobalAtomicAdd;
2626
__local int* __builtin_IB_get_local_lock();
2727
__global int* __builtin_IB_get_global_lock();
2828
void __builtin_IB_eu_thread_pause(uint value);
29-
void __intel_memfence_handler(bool flushRW, bool isGlobal, bool invalidateL1);
29+
void __intel_memfence_handler(bool flushRW, bool isGlobal, bool invalidateL1, bool evictL1);
3030

3131
#define LOCAL_SPINLOCK_START() \
3232
{ \
@@ -56,14 +56,14 @@ extern __constant int __UseNativeFP64GlobalAtomicAdd;
5656
if( ( (Semantics) & ( SEMANTICS_PRE_OP_NEED_FENCE ) ) > 0 ) \
5757
{ \
5858
bool flushL3 = (isGlobal) && ((Scope) == Device || (Scope) == CrossDevice); \
59-
__intel_memfence_handler(flushL3, isGlobal, false); \
59+
__intel_memfence_handler(flushL3, isGlobal, false, false); \
6060
}
6161

6262
#define FENCE_POST_OP(Scope, Semantics, isGlobal) \
6363
if( ( (Semantics) & ( SEMANTICS_POST_OP_NEEDS_FENCE ) ) > 0 ) \
6464
{ \
6565
bool flushL3 = (isGlobal) && ((Scope) == Device || (Scope) == CrossDevice); \
66-
__intel_memfence_handler(flushL3, isGlobal, false); \
66+
__intel_memfence_handler(flushL3, isGlobal, false, false); \
6767
}
6868

6969
// This fencing scheme allows us to obey the memory model when coherency is

IGC/BiFModule/Implementation/barrier.cl

Lines changed: 17 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -18,37 +18,39 @@ extern __constant int __OptDisable;
1818

1919
// MEMFENCE IMPLEMENTATION
2020

21-
void __attribute__((optnone)) __intel_memfence_optnone(bool flushRW, bool isGlobal, bool invalidateL1)
21+
void __attribute__((optnone)) __intel_memfence_optnone(bool flushRW, bool isGlobal, bool invalidateL1, bool evictL1)
2222
{
23-
#define MEMFENCE_IF(V1, V5, V6) \
24-
if (flushRW == V1 && isGlobal == V5 && invalidateL1 == V6) \
25-
{ \
26-
__builtin_IB_memfence(true, V1, false, false, false, V5, V6); \
23+
#define MEMFENCE_IF(V1, V5, V6, V7) \
24+
if (flushRW == V1 && isGlobal == V5 && invalidateL1 == V6 && evictL1 == V7) \
25+
{ \
26+
__builtin_IB_memfence(true, V1, false, false, false, V5, V6, V7); \
2727
} else
2828

2929
// Generate combinations for all MEMFENCE_IF cases, e.g.:
3030
// true, true, true, true
3131
// true, true, true, false etc.
32+
#define MF_L3(...) MF_L2(__VA_ARGS__,false) MF_L2(__VA_ARGS__,true)
3233
#define MF_L2(...) MF_L1(__VA_ARGS__,false) MF_L1(__VA_ARGS__,true)
3334
#define MF_L1(...) MEMFENCE_IF(__VA_ARGS__,false) MEMFENCE_IF(__VA_ARGS__,true)
34-
MF_L2(false)
35-
MF_L2(true) {}
35+
MF_L3(false)
36+
MF_L3(true) {}
3637

3738
#undef MEMFENCE_IF
39+
#undef MF_L3
3840
#undef MF_L2
3941
#undef MF_L1
4042
}
41-
void __intel_memfence(bool flushRW, bool isGlobal, bool invalidateL1)
43+
void __intel_memfence(bool flushRW, bool isGlobal, bool invalidateL1, bool evictL1)
4244
{
43-
__builtin_IB_memfence(true, flushRW, false, false, false, isGlobal, invalidateL1);
45+
__builtin_IB_memfence(true, flushRW, false, false, false, isGlobal, invalidateL1, evictL1);
4446
}
4547

46-
void __intel_memfence_handler(bool flushRW, bool isGlobal, bool invalidateL1)
48+
void __intel_memfence_handler(bool flushRW, bool isGlobal, bool invalidateL1, bool evictL1)
4749
{
4850
if (__OptDisable)
49-
__intel_memfence_optnone(flushRW, isGlobal, invalidateL1);
51+
__intel_memfence_optnone(flushRW, isGlobal, invalidateL1, evictL1);
5052
else
51-
__intel_memfence(flushRW, isGlobal, invalidateL1);
53+
__intel_memfence(flushRW, isGlobal, invalidateL1, evictL1);
5254
}
5355

5456
// TYPEDMEMFENCE IMPLEMENTATION
@@ -81,6 +83,7 @@ static void __intel_atomic_work_item_fence( Scope_t Memory, uint Semantics )
8183
bool fence = Semantics & ( Acquire | Release | AcquireRelease | SequentiallyConsistent );
8284

8385
bool invalidateL1 = Semantics & ( Acquire | AcquireRelease | SequentiallyConsistent );
86+
bool evictL1 = Semantics & ( Release | AcquireRelease | SequentiallyConsistent );
8487

8588
// We always need to 'fence' image memory (aka, flush caches, drain pipelines)
8689
fence |= ( Semantics & ImageMemory );
@@ -97,12 +100,12 @@ static void __intel_atomic_work_item_fence( Scope_t Memory, uint Semantics )
97100
// although on some platforms they may be elided; platform-specific checks are performed in codegen
98101
if (Semantics & WorkgroupMemory)
99102
{
100-
__intel_memfence_handler(false, false, false);
103+
__intel_memfence_handler(false, false, false, false);
101104
}
102105
if (Semantics & CrossWorkgroupMemory)
103106
{
104107
bool flushL3 = Memory == Device || Memory == CrossDevice;
105-
__intel_memfence_handler(flushL3, true, invalidateL1);
108+
__intel_memfence_handler(flushL3, true, invalidateL1, evictL1);
106109
}
107110
}
108111
}

IGC/Compiler/CISACodeGen/EmitVISAPass.cpp

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15551,14 +15551,19 @@ void EmitPass::emitThreadGroupBarrier(llvm::Instruction* inst)
1555115551
}
1555215552
}
1555315553

15554-
LSC_FENCE_OP EmitPass::getLSCMemoryFenceOp(bool IsGlobalMemFence, bool InvalidateL1) const
15554+
LSC_FENCE_OP EmitPass::getLSCMemoryFenceOp(bool IsGlobalMemFence, bool InvalidateL1, bool EvictL1) const
1555515555
{
1555615556
LSC_FENCE_OP op = LSC_FENCE_OP_NONE;
1555715557
if (InvalidateL1 || (IsGlobalMemFence && m_currShader->m_Platform->getWATable().Wa_14012437816))
1555815558
{
1555915559
op = LSC_FENCE_OP_INVALIDATE;
1556015560
}
1556115561

15562+
if (EvictL1)
15563+
{
15564+
op = LSC_FENCE_OP_EVICT;
15565+
}
15566+
1556215567
// For experiment on XeHP SDV
1556315568
if (op == LSC_FENCE_OP_NONE && IsGlobalMemFence && IGC_IS_FLAG_ENABLED(EnableL3FlushForGlobal))
1556415569
{
@@ -15569,7 +15574,7 @@ LSC_FENCE_OP EmitPass::getLSCMemoryFenceOp(bool IsGlobalMemFence, bool Invalidat
1556915574

1557015575
void EmitPass::emitMemoryFence(llvm::Instruction* inst)
1557115576
{
15572-
static constexpr int ExpectedNumberOfArguments = 7;
15577+
static constexpr int ExpectedNumberOfArguments = 8;
1557315578
IGC_ASSERT(cast<CallInst>(inst)->getNumArgOperands() == ExpectedNumberOfArguments);
1557415579
CodeGenContext* ctx = getAnalysis<CodeGenContextWrapper>().getCodeGenContext();
1557515580

@@ -15583,6 +15588,9 @@ void EmitPass::emitMemoryFence(llvm::Instruction* inst)
1558315588
bool L3_Flush_Instructions = true;
1558415589
bool Global_Mem_Fence = true;
1558515590
bool L1_Invalidate = ctx->platform.hasL1ReadOnlyCache();
15591+
// Note: this flag is respected only for LSC case currently.
15592+
// TODO: add support for non-LSC and typed fence.
15593+
bool L1_Evict = true;
1558615594

1558715595
std::array<reference_wrapper<bool>, ExpectedNumberOfArguments> MemFenceArguments{
1558815596
CommitEnable,
@@ -15592,6 +15600,7 @@ void EmitPass::emitMemoryFence(llvm::Instruction* inst)
1559215600
L3_Flush_Instructions,
1559315601
Global_Mem_Fence,
1559415602
L1_Invalidate,
15603+
L1_Evict
1559515604
};
1559615605

1559715606
for (size_t i = 0; i < MemFenceArguments.size(); ++i) {
@@ -15639,7 +15648,7 @@ void EmitPass::emitMemoryFence(llvm::Instruction* inst)
1563915648
{
1564015649
scope = LSC_SCOPE_TILE;
1564115650
}
15642-
LSC_FENCE_OP op = getLSCMemoryFenceOp(Global_Mem_Fence, L1_Invalidate);
15651+
LSC_FENCE_OP op = getLSCMemoryFenceOp(Global_Mem_Fence, L1_Invalidate, L1_Evict);
1564315652
if (inst->getMetadata("forceFlushNone"))
1564415653
{
1564515654
op = LSC_FENCE_OP_NONE;
@@ -15695,7 +15704,7 @@ void EmitPass::emitTypedMemoryFence(llvm::Instruction* inst)
1569515704

1569615705
if (shouldGenerateLSC())
1569715706
{
15698-
auto flushOpt = m_currShader->m_Platform->hasSamplerSupport() ? LSC_FENCE_OP_EVICT : getLSCMemoryFenceOp(true, L1_Invalidate);
15707+
auto flushOpt = getLSCMemoryFenceOp(true, L1_Invalidate, m_currShader->m_Platform->hasSamplerSupport());
1569915708
LSC_SCOPE scope = LSC_SCOPE_GPU;
1570015709
if (!m_currShader->m_Platform->hasMultiTile() &&
1570115710
m_currShader->m_Platform->hasL3FlushOnGPUScopeInvalidate() &&
@@ -21724,7 +21733,7 @@ void EmitPass::emitSystemMemoryFence(llvm::GenIntrinsicInst* inst)
2172421733
if (fenceTGM)
2172521734
{
2172621735
// first fence TGM with GPU scope
21727-
auto flushOpt = m_currShader->m_Platform->hasSamplerSupport() ? LSC_FENCE_OP_EVICT : getLSCMemoryFenceOp(true, false);
21736+
auto flushOpt = getLSCMemoryFenceOp(true, false, m_currShader->m_Platform->hasSamplerSupport());
2172821737
m_encoder->LSC_Fence(LSC_TGM, LSC_SCOPE_GPU, flushOpt);
2172921738
m_encoder->Push();
2173021739
// then emit the regular UGM fence

IGC/Compiler/CISACodeGen/EmitVISAPass.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -886,7 +886,7 @@ class EmitPass : public llvm::FunctionPass
886886
LSC_CACHE_OPTS cacheOpts, bool UseA32,
887887
ResourceDescriptor& Resource, CVariable* StoreVar, CVariable* Offset, int ImmOffset,
888888
uint32_t NumElts, uint32_t EltBytes, int Align);
889-
LSC_FENCE_OP getLSCMemoryFenceOp(bool IsGlobalMemFence, bool InvalidateL1) const;
889+
LSC_FENCE_OP getLSCMemoryFenceOp(bool IsGlobalMemFence, bool InvalidateL1, bool EvictL1) const;
890890
bool m_isDuplicate;
891891
CVariable* m_tmpDest = nullptr;
892892

IGC/Compiler/CISACodeGen/PixelShaderLowering.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -571,6 +571,7 @@ void PixelShaderLowering::EmitMemoryFence(IRBuilder<>& builder, bool forceFlushN
571571
falseValue,
572572
trueValue,
573573
falseValue,
574+
falseValue,
574575
};
575576

576577
CallInst* memFence = GenIntrinsicInst::Create(GenISAIntrinsic::getDeclaration(m_module, GenISAIntrinsic::GenISA_memoryfence),

IGC/Compiler/Optimizer/OpenCLPasses/Atomics/ResolveOCLAtomics.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -358,7 +358,7 @@ void ResolveOCLAtomics::generateLockInitilization(Function* F)
358358
m_builder->CreateStore(m_localLock->getInitializer(), m_localLock);
359359
m_builder->CreateBr(initSpinLockEndBB);
360360

361-
// insert call void @llvm.genx.GenISA.memoryfence(i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true)
361+
// insert call void @llvm.genx.GenISA.memoryfence(i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 false)
362362
// call void @llvm.genx.GenISA.threadgroupbarrier()
363363
// to guarantee synchronization in accessing spin lock variable
364364
Value* trueValue = m_builder->getTrue();
@@ -372,6 +372,7 @@ void ResolveOCLAtomics::generateLockInitilization(Function* F)
372372
falseValue,
373373
falseValue,
374374
trueValue,
375+
falseValue,
375376
};
376377
m_builder->SetInsertPoint(initSpinLockEndBB, initSpinLockEndBB->getFirstInsertionPt());
377378
Function* localMemFence = GenISAIntrinsic::getDeclaration(m_pModule, GenISAIntrinsic::GenISA_memoryfence);

IGC/Compiler/Optimizer/OpenCLPasses/NamedBarriers/NamedBarriersResolution.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -241,6 +241,7 @@ void NamedBarriersResolution::HandleNamedBarrierSyncPVC(CallInst& NBarrierSyncCa
241241
falseValue, // bool flushIcache
242242
isGlobal, // bool isGlobal
243243
falseValue, // bool invalidateL1
244+
falseValue, // bool evictL1
244245
},
245246
"",
246247
&(NBarrierSyncCall));

IGC/GenISAIntrinsics/Intrinsic_definitions.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1410,7 +1410,8 @@
14101410
("bool", "L3_Flush_Texture_Data"),
14111411
("bool", "L3_Flush_Instructions"),
14121412
("bool", "Fence has global effect"),
1413-
("bool", "L1 Invalidate")],
1413+
("bool", "L1 Invalidate"),
1414+
("bool", "L1 Evict")],
14141415
"Convergent"]],
14151416
####################################################################################################
14161417
"GenISA_mov_identity": ["",

0 commit comments

Comments
 (0)