Skip to content
This repository has been archived by the owner on Jan 23, 2023. It is now read-only.

Perform PhysicalMemoryLimit check for workstation GC, refactor GetLargestOnDieCacheSize into GetCacheSizePerLogicalCpu #15975

Merged
merged 2 commits into from
Jan 29, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 2 additions & 5 deletions src/gc/env/gcenv.os.h
Original file line number Diff line number Diff line change
Expand Up @@ -282,16 +282,13 @@ class GCToOSInterface
// Processor topology
//

// Get number of logical processors
static uint32_t GetLogicalCpuCount();

// Get size of the largest cache on the processor die
// Get size of the on die cache per logical processor
// Parameters:
// trueSize - true to return true cache size, false to return scaled up size based on
// the processor architecture
// Return:
// Size of the cache
static size_t GetLargestOnDieCacheSize(bool trueSize = true);
static size_t GetCacheSizePerLogicalCpu(bool trueSize = true);

// Get number of processors assigned to the current process
// Return:
Expand Down
24 changes: 14 additions & 10 deletions src/gc/gc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15760,7 +15760,7 @@ void gc_heap::gc1()
size_t min_gc_size = dd_min_gc_size(dd);
// if min GC size larger than true on die cache, then don't bother
// limiting the desired size
if ((min_gc_size <= GCToOSInterface::GetLargestOnDieCacheSize(TRUE) / GCToOSInterface::GetLogicalCpuCount()) &&
if ((min_gc_size <= GCToOSInterface::GetCacheSizePerLogicalCpu(TRUE)) &&
desired_per_heap <= 2*min_gc_size)
{
desired_per_heap = min_gc_size;
Expand Down Expand Up @@ -35527,19 +35527,26 @@ size_t GCHeap::GetValidGen0MaxSize(size_t seg_size)
#ifdef SERVER_GC
// performance data seems to indicate halving the size results
// in optimal perf. Ask for adjusted gen0 size.
gen0size = max(GCToOSInterface::GetLargestOnDieCacheSize(FALSE)/GCToOSInterface::GetLogicalCpuCount(),(256*1024));
gen0size = max(GCToOSInterface::GetCacheSizePerLogicalCpu(FALSE),(256*1024));

// if gen0 size is too large given the available memory, reduce it.
// Get true cache size, as we don't want to reduce below this.
size_t trueSize = max(GCToOSInterface::GetLargestOnDieCacheSize(TRUE)/GCToOSInterface::GetLogicalCpuCount(),(256*1024));
size_t trueSize = max(GCToOSInterface::GetCacheSizePerLogicalCpu(TRUE),(256*1024));
dprintf (2, ("cache: %Id-%Id, cpu: %Id",
GCToOSInterface::GetLargestOnDieCacheSize(FALSE),
GCToOSInterface::GetLargestOnDieCacheSize(TRUE),
GCToOSInterface::GetLogicalCpuCount()));
GCToOSInterface::GetCacheSizePerLogicalCpu(FALSE),
GCToOSInterface::GetCacheSizePerLogicalCpu(TRUE)));

int n_heaps = gc_heap::n_heaps;
#else //SERVER_GC
size_t trueSize = GCToOSInterface::GetCacheSizePerLogicalCpu(TRUE);
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@Maoni0 @swgillespie I wonder whether, for workstation GC, we want to multiply the per-logical-CPU cache size here by GCToOSInterface::GetCurrentProcessCpuCount. What do you think?

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll leave that to @Maoni0. I don't have any thoughts on it, as I'm not very familiar with this math.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We don't want to do that, because we don't assume that processes using workstation GC will own the machine. Unless you specifically affinitize your process to fewer CPUs, GetCurrentProcessCpuCount will return all CPUs.

gen0size = max((4*trueSize/5),(256*1024));
trueSize = max(trueSize, (256*1024));
int n_heaps = 1;
#endif //SERVER_GC

// if the total min GC across heaps will exceed 1/6th of available memory,
// then reduce the min GC size until it either fits or has been reduced to cache size.
while ((gen0size * gc_heap::n_heaps) > GCToOSInterface::GetPhysicalMemoryLimit() / 6)
while ((gen0size * n_heaps) > GCToOSInterface::GetPhysicalMemoryLimit() / 6)
{
gen0size = gen0size / 2;
if (gen0size <= trueSize)
Expand All @@ -35548,9 +35555,6 @@ size_t GCHeap::GetValidGen0MaxSize(size_t seg_size)
break;
}
}
#else //SERVER_GC
gen0size = max((4*GCToOSInterface::GetLargestOnDieCacheSize(TRUE)/5),(256*1024));
#endif //SERVER_GC
}

// Generation 0 must never be more than 1/2 the segment size.
Expand Down
8 changes: 1 addition & 7 deletions src/gc/unix/gcenv.unix.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -221,12 +221,6 @@ void GCToOSInterface::DebugBreak()
#endif
}

// Get number of logical processors
// Return:
//  The current value of g_logicalCpuCount. This function performs no
//  detection itself; it only reports whatever that variable holds.
uint32_t GCToOSInterface::GetLogicalCpuCount()
{
return g_logicalCpuCount;
}

// Causes the calling thread to sleep for the specified number of milliseconds
// Parameters:
// sleepMSec - time to sleep before switching to another thread
Expand Down Expand Up @@ -403,7 +397,7 @@ bool GCToOSInterface::GetWriteWatch(bool resetState, void* address, size_t size,
// the processor architecture
// Return:
// Size of the cache
size_t GCToOSInterface::GetLargestOnDieCacheSize(bool trueSize)
size_t GCToOSInterface::GetCacheSizePerLogicalCpu(bool trueSize)
{
// TODO(segilles) processor detection
return 0;
Expand Down
9 changes: 1 addition & 8 deletions src/gc/windows/gcenv.windows.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -228,13 +228,6 @@ void GCToOSInterface::DebugBreak()
::DebugBreak();
}

// Get number of logical processors
// Return:
//  Always 1 in this implementation: processor detection has not been
//  written yet (see the TODO below), so the value is a placeholder.
uint32_t GCToOSInterface::GetLogicalCpuCount()
{
// TODO(segilles) processor detection
return 1;
}

// Causes the calling thread to sleep for the specified number of milliseconds
// Parameters:
// sleepMSec - time to sleep before switching to another thread
Expand Down Expand Up @@ -381,7 +374,7 @@ bool GCToOSInterface::GetWriteWatch(bool resetState, void* address, size_t size,
// the processor architecture
// Return:
// Size of the cache
size_t GCToOSInterface::GetLargestOnDieCacheSize(bool trueSize)
size_t GCToOSInterface::GetCacheSizePerLogicalCpu(bool trueSize)
{
// TODO(segilles) processor detection (see src/vm/util.cpp:1935)
return 0;
Expand Down
1 change: 0 additions & 1 deletion src/vm/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -474,7 +474,6 @@ elseif(CLR_CMAKE_TARGET_ARCH_ARM)
)
elseif(CLR_CMAKE_TARGET_ARCH_ARM64)
set(VM_SOURCES_DAC_AND_WKS_ARCH
${ARCH_SOURCES_DIR}/cgenarm64.cpp
${ARCH_SOURCES_DIR}/stubs.cpp
exceptionhandling.cpp
gcinfodecoder.cpp
Expand Down
83 changes: 0 additions & 83 deletions src/vm/amd64/cgenamd64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -458,89 +458,6 @@ BOOL GetAnyThunkTarget (CONTEXT *pctx, TADDR *pTarget, TADDR *pTargetMethodDesc)
// determine the number of logical cpus, or the machine is not populated uniformly with the same
// type of processors, this function returns 1.

extern "C" DWORD __stdcall getcpuid(DWORD arg, unsigned char result[16]);

// fix this if/when AMD does multicore or SMT
// Determine the logical CPU count used by callers of this function.
//
// Flow:
//  1. If a previous call stored a non-zero result in the function-local
//     static 'val', return it immediately.
//  2. Otherwise start from a default answer of 1 (param.retVal) and, inside a
//     PAL_TRY region, query CPUID leaf 0 via getcpuid().
//  3. Only when the vendor string DWORDs spell "GenuineIntel" AND the maximum
//     CPUID leaf is <= 3 (or >= 0x80000000) are GetLogicalCpuCountFromOS()
//     and then GetLogicalCpuCountFallback() consulted.
//  4. Every other path (non-Intel vendor, newer Intel parts, or an exception
//     swallowed by the empty handler) keeps the default of 1.
//
// Return:
//  param.retVal - either the default 1 or the non-zero count obtained in (3).
DWORD GetLogicalCpuCount()
{
// No CONTRACT possible because GetLogicalCpuCount uses SEH

STATIC_CONTRACT_THROWS;
STATIC_CONTRACT_GC_NOTRIGGER;

// Persisted across calls; 0 means "not computed yet".
static DWORD val = 0;

// cache value for later re-use
if (val)
{
return val;
}

// State handed to the PAL_TRY body and the exception filter.
struct Param : DefaultCatchFilterParam
{
DWORD retVal;
} param;
param.pv = COMPLUS_EXCEPTION_EXECUTE_HANDLER;
// Default answer, used whenever detection is skipped or fails.
param.retVal = 1;

PAL_TRY(Param *, pParam, &param)
{

unsigned char buffer[16];
// The return value of getcpuid(0, ...) is treated as the highest supported CPUID leaf.
DWORD maxCpuId = getcpuid(0, buffer);
DWORD* dwBuffer = (DWORD*)buffer;

if (maxCpuId < 1)
goto qExit;

// The three multi-character constants below are the byte-reversed pieces of
// "GenuineIntel" ("Genu", "ineI", "ntel").
// NOTE(review): presumably dwBuffer[1..3] hold the CPUID vendor-string registers - confirm against getcpuid.
if (dwBuffer[1] == 'uneG') {
if (dwBuffer[3] == 'Ieni') {
if (dwBuffer[2] == 'letn') { // get SMT/multicore enumeration for Intel EM64T


// TODO: Currently GetLogicalCpuCountFromOS() and GetLogicalCpuCountFallback() are broken on
// multi-core processor, but we never call into those two functions since we don't halve the
// gen0size when it's prescott and above processor. We keep the old version here for earlier
// generation system(Northwood based), perf data suggests on those systems, halve gen0 size
// still boost the performance(ex:Biztalk boosts about 17%). So on earlier systems(Northwood)
// based, we still go ahead and halve gen0 size. The logic in GetLogicalCpuCountFromOS()
// and GetLogicalCpuCountFallback() works fine for those earlier generation systems.
// If it's a Prescott and above processor or Multi-core, perf data suggests not to halve gen0
// size at all gives us overall better performance.
// This is going to be fixed with a new version in orcas time frame.

// Max leaf in (3, 0x80000000): skip detection and keep the default of 1.
if( (maxCpuId > 3) && (maxCpuId < 0x80000000) )
goto qExit;

val = GetLogicalCpuCountFromOS(); //try to obtain HT enumeration from OS API
if (val )
{
pParam->retVal = val; // OS API HT enumeration successful, we are Done
goto qExit;
}

val = GetLogicalCpuCountFallback(); // Fallback to HT enumeration using CPUID
if( val )
pParam->retVal = val;
}
}
}
qExit: ;
}

PAL_EXCEPT_FILTER(DefaultCatchFilter)
{
// Intentionally empty: on an exception param.retVal keeps whatever value it had.
}
PAL_ENDTRY

// Nothing was stored in the cache above, so remember the answer being returned.
if (val == 0)
{
val = param.retVal;
}

return param.retVal;
}

void EncodeLoadAndJumpThunk (LPBYTE pBuffer, LPVOID pv, LPVOID pTarget)
{
CONTRACTL
Expand Down
7 changes: 0 additions & 7 deletions src/vm/arm/stubs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3374,13 +3374,6 @@ void emitCOMStubCall (ComCallMethodDesc *pCOMMethod, PCODE target)

#ifndef CROSSGEN_COMPILE

// Returns the logical CPU count on ARM by delegating entirely to
// GetLogicalCpuCountFromOS(); the value is passed through unchanged.
DWORD GetLogicalCpuCount()
{
// Just use the OS to return this information (the APIs used exist on all versions of Windows which
// support ARM).
return GetLogicalCpuCountFromOS();
}

#ifdef FEATURE_READYTORUN

//
Expand Down
38 changes: 0 additions & 38 deletions src/vm/arm64/cgenarm64.cpp

This file was deleted.

6 changes: 1 addition & 5 deletions src/vm/cgensys.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,10 +34,6 @@ int CallJitEHFilter (CrawlFrame* pCf, BYTE* startPC, EE_ILEXCEPTION_CLAUSE *EHC
void CallJitEHFinally(CrawlFrame* pCf, BYTE* startPC, EE_ILEXCEPTION_CLAUSE *EHClausePtr, DWORD nestingLevel);
#endif // _TARGET_X86_


// get number of logical to physical processors. Returns 1 on failure or non-intel x86 processors.
DWORD GetLogicalCpuCount();

//These are in util.cpp
extern size_t GetLogicalProcessorCacheSizeFromOS();
extern size_t GetIntelDeterministicCacheEnum();
Expand All @@ -47,7 +43,7 @@ extern DWORD GetLogicalCpuCountFallback();


// Try to determine the largest last-level cache size of the machine - return 0 if unknown or no L2/L3 cache
size_t GetLargestOnDieCacheSize(BOOL bTrueSize = TRUE);
size_t GetCacheSizePerLogicalCpu(BOOL bTrueSize = TRUE);


#ifdef FEATURE_COMINTEROP
Expand Down
11 changes: 2 additions & 9 deletions src/vm/gcenv.os.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -145,13 +145,6 @@ void GCToOSInterface::DebugBreak()
::DebugBreak();
}

// Get number of logical processors
// Return:
//  The value returned by the global-namespace GetLogicalCpuCount()
uint32_t GCToOSInterface::GetLogicalCpuCount()
{
LIMITED_METHOD_CONTRACT;
// The leading :: selects the global function of the same name rather than
// this member, so the call does not recurse.
return ::GetLogicalCpuCount();
}

// Causes the calling thread to sleep for the specified number of milliseconds
// Parameters:
// sleepMSec - time to sleep before switching to another thread
Expand Down Expand Up @@ -322,11 +315,11 @@ bool GCToOSInterface::GetWriteWatch(bool resetState, void* address, size_t size,
// the processor architecture
// Return:
// Size of the cache
size_t GCToOSInterface::GetLargestOnDieCacheSize(bool trueSize)
size_t GCToOSInterface::GetCacheSizePerLogicalCpu(bool trueSize)
{
LIMITED_METHOD_CONTRACT;

return ::GetLargestOnDieCacheSize(trueSize);
return ::GetCacheSizePerLogicalCpu(trueSize);
}

// Sets the calling thread's affinity to only run on the processor specified
Expand Down
Loading