diff --git a/src/gc/env/gcenv.os.h b/src/gc/env/gcenv.os.h index 1707f0dabec7..41e46f8f0f98 100644 --- a/src/gc/env/gcenv.os.h +++ b/src/gc/env/gcenv.os.h @@ -282,16 +282,13 @@ class GCToOSInterface // Processor topology // - // Get number of logical processors - static uint32_t GetLogicalCpuCount(); - - // Get size of the largest cache on the processor die + // Get size of the on die cache per logical processor // Parameters: // trueSize - true to return true cache size, false to return scaled up size based on // the processor architecture // Return: // Size of the cache - static size_t GetLargestOnDieCacheSize(bool trueSize = true); + static size_t GetCacheSizePerLogicalCpu(bool trueSize = true); // Get number of processors assigned to the current process // Return: diff --git a/src/gc/gc.cpp b/src/gc/gc.cpp index 445812cd4081..52d13f57a645 100644 --- a/src/gc/gc.cpp +++ b/src/gc/gc.cpp @@ -15760,7 +15760,7 @@ void gc_heap::gc1() size_t min_gc_size = dd_min_gc_size(dd); // if min GC size larger than true on die cache, then don't bother // limiting the desired size - if ((min_gc_size <= GCToOSInterface::GetLargestOnDieCacheSize(TRUE) / GCToOSInterface::GetLogicalCpuCount()) && + if ((min_gc_size <= GCToOSInterface::GetCacheSizePerLogicalCpu(TRUE)) && desired_per_heap <= 2*min_gc_size) { desired_per_heap = min_gc_size; @@ -35527,19 +35527,26 @@ size_t GCHeap::GetValidGen0MaxSize(size_t seg_size) #ifdef SERVER_GC // performance data seems to indicate halving the size results // in optimal perf. Ask for adjusted gen0 size. - gen0size = max(GCToOSInterface::GetLargestOnDieCacheSize(FALSE)/GCToOSInterface::GetLogicalCpuCount(),(256*1024)); + gen0size = max(GCToOSInterface::GetCacheSizePerLogicalCpu(FALSE),(256*1024)); // if gen0 size is too large given the available memory, reduce it. // Get true cache size, as we don't want to reduce below this. 
- size_t trueSize = max(GCToOSInterface::GetLargestOnDieCacheSize(TRUE)/GCToOSInterface::GetLogicalCpuCount(),(256*1024)); + size_t trueSize = max(GCToOSInterface::GetCacheSizePerLogicalCpu(TRUE),(256*1024)); - dprintf (2, ("cache: %Id-%Id, cpu: %Id", - GCToOSInterface::GetLargestOnDieCacheSize(FALSE), - GCToOSInterface::GetLargestOnDieCacheSize(TRUE), - GCToOSInterface::GetLogicalCpuCount())); + dprintf (2, ("cache: %Id-%Id", + GCToOSInterface::GetCacheSizePerLogicalCpu(FALSE), + GCToOSInterface::GetCacheSizePerLogicalCpu(TRUE))); + + int n_heaps = gc_heap::n_heaps; +#else //SERVER_GC + size_t trueSize = GCToOSInterface::GetCacheSizePerLogicalCpu(TRUE); + gen0size = max((4*trueSize/5),(256*1024)); + trueSize = max(trueSize, (256*1024)); + int n_heaps = 1; +#endif //SERVER_GC // if the total min GC across heaps will exceed 1/6th of available memory, // then reduce the min GC size until it either fits or has been reduced to cache size. - while ((gen0size * gc_heap::n_heaps) > GCToOSInterface::GetPhysicalMemoryLimit() / 6) + while ((gen0size * n_heaps) > GCToOSInterface::GetPhysicalMemoryLimit() / 6) { gen0size = gen0size / 2; if (gen0size <= trueSize) @@ -35548,9 +35555,6 @@ size_t GCHeap::GetValidGen0MaxSize(size_t seg_size) break; } } -#else //SERVER_GC - gen0size = max((4*GCToOSInterface::GetLargestOnDieCacheSize(TRUE)/5),(256*1024)); -#endif //SERVER_GC } // Generation 0 must never be more than 1/2 the segment size. 
diff --git a/src/gc/unix/gcenv.unix.cpp b/src/gc/unix/gcenv.unix.cpp index 7bc6a370684b..737c5efcf032 100644 --- a/src/gc/unix/gcenv.unix.cpp +++ b/src/gc/unix/gcenv.unix.cpp @@ -221,12 +221,6 @@ void GCToOSInterface::DebugBreak() #endif } -// Get number of logical processors -uint32_t GCToOSInterface::GetLogicalCpuCount() -{ - return g_logicalCpuCount; -} - // Causes the calling thread to sleep for the specified number of milliseconds // Parameters: // sleepMSec - time to sleep before switching to another thread @@ -403,7 +397,7 @@ bool GCToOSInterface::GetWriteWatch(bool resetState, void* address, size_t size, // the processor architecture // Return: // Size of the cache -size_t GCToOSInterface::GetLargestOnDieCacheSize(bool trueSize) +size_t GCToOSInterface::GetCacheSizePerLogicalCpu(bool trueSize) { // TODO(segilles) processor detection return 0; diff --git a/src/gc/windows/gcenv.windows.cpp b/src/gc/windows/gcenv.windows.cpp index 69e5d7273aab..e258834abc2c 100644 --- a/src/gc/windows/gcenv.windows.cpp +++ b/src/gc/windows/gcenv.windows.cpp @@ -228,13 +228,6 @@ void GCToOSInterface::DebugBreak() ::DebugBreak(); } -// Get number of logical processors -uint32_t GCToOSInterface::GetLogicalCpuCount() -{ - // TODO(segilles) processor detection - return 1; -} - // Causes the calling thread to sleep for the specified number of milliseconds // Parameters: // sleepMSec - time to sleep before switching to another thread @@ -381,7 +374,7 @@ bool GCToOSInterface::GetWriteWatch(bool resetState, void* address, size_t size, // the processor architecture // Return: // Size of the cache -size_t GCToOSInterface::GetLargestOnDieCacheSize(bool trueSize) +size_t GCToOSInterface::GetCacheSizePerLogicalCpu(bool trueSize) { // TODO(segilles) processor detection (see src/vm/util.cpp:1935) return 0; diff --git a/src/vm/CMakeLists.txt b/src/vm/CMakeLists.txt index 97ab656f811f..fc354089c41e 100644 --- a/src/vm/CMakeLists.txt +++ b/src/vm/CMakeLists.txt @@ -474,7 +474,6 @@ 
elseif(CLR_CMAKE_TARGET_ARCH_ARM) ) elseif(CLR_CMAKE_TARGET_ARCH_ARM64) set(VM_SOURCES_DAC_AND_WKS_ARCH - ${ARCH_SOURCES_DIR}/cgenarm64.cpp ${ARCH_SOURCES_DIR}/stubs.cpp exceptionhandling.cpp gcinfodecoder.cpp diff --git a/src/vm/amd64/cgenamd64.cpp b/src/vm/amd64/cgenamd64.cpp index 60751349438b..56e3bfa73829 100644 --- a/src/vm/amd64/cgenamd64.cpp +++ b/src/vm/amd64/cgenamd64.cpp @@ -458,89 +458,6 @@ BOOL GetAnyThunkTarget (CONTEXT *pctx, TADDR *pTarget, TADDR *pTargetMethodDesc) // determine the number of logical cpus, or the machine is not populated uniformly with the same // type of processors, this function returns 1. -extern "C" DWORD __stdcall getcpuid(DWORD arg, unsigned char result[16]); - -// fix this if/when AMD does multicore or SMT -DWORD GetLogicalCpuCount() -{ - // No CONTRACT possible because GetLogicalCpuCount uses SEH - - STATIC_CONTRACT_THROWS; - STATIC_CONTRACT_GC_NOTRIGGER; - - static DWORD val = 0; - - // cache value for later re-use - if (val) - { - return val; - } - - struct Param : DefaultCatchFilterParam - { - DWORD retVal; - } param; - param.pv = COMPLUS_EXCEPTION_EXECUTE_HANDLER; - param.retVal = 1; - - PAL_TRY(Param *, pParam, &param) - { - - unsigned char buffer[16]; - DWORD maxCpuId = getcpuid(0, buffer); - DWORD* dwBuffer = (DWORD*)buffer; - - if (maxCpuId < 1) - goto qExit; - - if (dwBuffer[1] == 'uneG') { - if (dwBuffer[3] == 'Ieni') { - if (dwBuffer[2] == 'letn') { // get SMT/multicore enumeration for Intel EM64T - - - // TODO: Currently GetLogicalCpuCountFromOS() and GetLogicalCpuCountFallback() are broken on - // multi-core processor, but we never call into those two functions since we don't halve the - // gen0size when it's prescott and above processor. We keep the old version here for earlier - // generation system(Northwood based), perf data suggests on those systems, halve gen0 size - // still boost the performance(ex:Biztalk boosts about 17%). So on earlier systems(Northwood) - // based, we still go ahead and halve gen0 size. 
The logic in GetLogicalCpuCountFromOS() - // and GetLogicalCpuCountFallback() works fine for those earlier generation systems. - // If it's a Prescott and above processor or Multi-core, perf data suggests not to halve gen0 - // size at all gives us overall better performance. - // This is going to be fixed with a new version in orcas time frame. - - if( (maxCpuId > 3) && (maxCpuId < 0x80000000) ) - goto qExit; - - val = GetLogicalCpuCountFromOS(); //try to obtain HT enumeration from OS API - if (val ) - { - pParam->retVal = val; // OS API HT enumeration successful, we are Done - goto qExit; - } - - val = GetLogicalCpuCountFallback(); // Fallback to HT enumeration using CPUID - if( val ) - pParam->retVal = val; - } - } - } -qExit: ; - } - - PAL_EXCEPT_FILTER(DefaultCatchFilter) - { - } - PAL_ENDTRY - - if (val == 0) - { - val = param.retVal; - } - - return param.retVal; -} - void EncodeLoadAndJumpThunk (LPBYTE pBuffer, LPVOID pv, LPVOID pTarget) { CONTRACTL diff --git a/src/vm/arm/stubs.cpp b/src/vm/arm/stubs.cpp index 7e5b58c54b99..9d1b443ffb5d 100644 --- a/src/vm/arm/stubs.cpp +++ b/src/vm/arm/stubs.cpp @@ -3374,13 +3374,6 @@ void emitCOMStubCall (ComCallMethodDesc *pCOMMethod, PCODE target) #ifndef CROSSGEN_COMPILE -DWORD GetLogicalCpuCount() -{ - // Just use the OS to return this information (the APIs used exist on all versions of Windows which - // support ARM). - return GetLogicalCpuCountFromOS(); -} - #ifdef FEATURE_READYTORUN // diff --git a/src/vm/arm64/cgenarm64.cpp b/src/vm/arm64/cgenarm64.cpp deleted file mode 100644 index 59905bf09899..000000000000 --- a/src/vm/arm64/cgenarm64.cpp +++ /dev/null @@ -1,38 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. -// -// Various helper routines for generating AMD64 assembly code. 
-// - -// Precompiled Header - -#include "common.h" - -#include "stublink.h" -#include "cgensys.h" -#include "siginfo.hpp" -#include "excep.h" -#include "ecall.h" -#include "dllimport.h" -#include "dllimportcallback.h" -#include "dbginterface.h" -#include "fcall.h" -#include "array.h" -#include "virtualcallstub.h" - -#ifndef DACCESS_COMPILE - -// Note: This is only used on server GC on Windows. - -DWORD GetLogicalCpuCount() -{ - LIMITED_METHOD_CONTRACT; - - // The contact with any callers of this function is that if we're unable to determine - // the processor count, or the number of processors is not distributed evenly, then - // we should return 1. - return 1; -} - -#endif // DACCESS_COMPILE diff --git a/src/vm/cgensys.h b/src/vm/cgensys.h index d55d15dd7dd9..b5158b722ce7 100644 --- a/src/vm/cgensys.h +++ b/src/vm/cgensys.h @@ -34,10 +34,6 @@ int CallJitEHFilter (CrawlFrame* pCf, BYTE* startPC, EE_ILEXCEPTION_CLAUSE *EHC void CallJitEHFinally(CrawlFrame* pCf, BYTE* startPC, EE_ILEXCEPTION_CLAUSE *EHClausePtr, DWORD nestingLevel); #endif // _TARGET_X86_ - -// get number of logical to physical processors. Returns 1 on failure or non-intel x86 processors. 
-DWORD GetLogicalCpuCount(); - //These are in util.cpp extern size_t GetLogicalProcessorCacheSizeFromOS(); extern size_t GetIntelDeterministicCacheEnum(); @@ -47,7 +43,7 @@ extern DWORD GetLogicalCpuCountFallback(); // Try to determine the largest last-level cache size of the machine - return 0 if unknown or no L2/L3 cache -size_t GetLargestOnDieCacheSize(BOOL bTrueSize = TRUE); +size_t GetCacheSizePerLogicalCpu(BOOL bTrueSize = TRUE); #ifdef FEATURE_COMINTEROP diff --git a/src/vm/gcenv.os.cpp b/src/vm/gcenv.os.cpp index 78670b0af3d3..8d8630ec6208 100644 --- a/src/vm/gcenv.os.cpp +++ b/src/vm/gcenv.os.cpp @@ -145,13 +145,6 @@ void GCToOSInterface::DebugBreak() ::DebugBreak(); } -// Get number of logical processors -uint32_t GCToOSInterface::GetLogicalCpuCount() -{ - LIMITED_METHOD_CONTRACT; - return ::GetLogicalCpuCount(); -} - // Causes the calling thread to sleep for the specified number of milliseconds // Parameters: // sleepMSec - time to sleep before switching to another thread @@ -322,11 +315,11 @@ bool GCToOSInterface::GetWriteWatch(bool resetState, void* address, size_t size, // the processor architecture // Return: // Size of the cache -size_t GCToOSInterface::GetLargestOnDieCacheSize(bool trueSize) +size_t GCToOSInterface::GetCacheSizePerLogicalCpu(bool trueSize) { LIMITED_METHOD_CONTRACT; - return ::GetLargestOnDieCacheSize(trueSize); + return ::GetCacheSizePerLogicalCpu(trueSize); } // Sets the calling thread's affinity to only run on the processor specified diff --git a/src/vm/i386/cgenx86.cpp b/src/vm/i386/cgenx86.cpp index 7071d27928dd..32929d406f29 100644 --- a/src/vm/i386/cgenx86.cpp +++ b/src/vm/i386/cgenx86.cpp @@ -1511,89 +1511,6 @@ extern "C" DWORD __stdcall xmmYmmStateSupport() #endif // !FEATURE_PAL -// This function returns the number of logical processors on a given physical chip. If it cannot -// determine the number of logical cpus, or the machine is not populated uniformly with the same -// type of processors, this function returns 1. 
-DWORD GetLogicalCpuCount() -{ - // No CONTRACT possible because GetLogicalCpuCount uses SEH - - STATIC_CONTRACT_THROWS; - STATIC_CONTRACT_GC_NOTRIGGER; - - static DWORD val = 0; - - // cache value for later re-use - if (val) - { - return val; - } - - struct Param : DefaultCatchFilterParam - { - DWORD retVal; - } param; - param.pv = COMPLUS_EXCEPTION_EXECUTE_HANDLER; - param.retVal = 1; - - PAL_TRY(Param *, pParam, &param) - { - unsigned char buffer[16]; - DWORD* dwBuffer = NULL; - - DWORD maxCpuId = getcpuid(0, buffer); - - if (maxCpuId < 1) - goto lDone; - - dwBuffer = (DWORD*)buffer; - - if (dwBuffer[1] == 'uneG') { - if (dwBuffer[3] == 'Ieni') { - if (dwBuffer[2] == 'letn') { // get SMT/multicore enumeration for Intel EM64T - - // TODO: Currently GetLogicalCpuCountFromOS() and GetLogicalCpuCountFallback() are broken on - // multi-core processor, but we never call into those two functions since we don't halve the - // gen0size when it's prescott and above processor. We keep the old version here for earlier - // generation system(Northwood based), perf data suggests on those systems, halve gen0 size - // still boost the performance(ex:Biztalk boosts about 17%). So on earlier systems(Northwood) - // based, we still go ahead and halve gen0 size. The logic in GetLogicalCpuCountFromOS() - // and GetLogicalCpuCountFallback() works fine for those earlier generation systems. - // If it's a Prescott and above processor or Multi-core, perf data suggests not to halve gen0 - // size at all gives us overall better performance. - // This is going to be fixed with a new version in orcas time frame. 
- - if( (maxCpuId > 3) && (maxCpuId < 0x80000000) ) - goto lDone; - - val = GetLogicalCpuCountFromOS(); //try to obtain HT enumeration from OS API - if (val ) - { - pParam->retVal = val; // OS API HT enumeration successful, we are Done - goto lDone; - } - - val = GetLogicalCpuCountFallback(); // OS API failed, Fallback to HT enumeration using CPUID - if( val ) - pParam->retVal = val; - } - } - } -lDone: ; - } - PAL_EXCEPT_FILTER(DefaultCatchFilter) - { - } - PAL_ENDTRY - - if (val == 0) - { - val = param.retVal; - } - - return param.retVal; -} - void UMEntryThunkCode::Encode(BYTE* pTargetCode, void* pvSecretParam) { LIMITED_METHOD_CONTRACT; diff --git a/src/vm/util.cpp b/src/vm/util.cpp index 692b72fc39fb..b9448dadbe1a 100644 --- a/src/vm/util.cpp +++ b/src/vm/util.cpp @@ -1854,9 +1854,10 @@ DWORD GetLogicalCpuCountFallback() #endif // _TARGET_X86_ || _TARGET_AMD64_ -size_t GetLargestOnDieCacheSize(BOOL bTrueSize) +// fix this if/when AMD does multicore or SMT +size_t GetCacheSizePerLogicalCpu(BOOL bTrueSize) { - // No CONTRACT possible because GetLargestOnDieCacheSize uses SEH + // No CONTRACT possible because GetCacheSizePerLogicalCpu uses SEH STATIC_CONTRACT_NOTHROW; STATIC_CONTRACT_GC_NOTRIGGER; @@ -1911,6 +1912,31 @@ size_t GetLargestOnDieCacheSize(BOOL bTrueSize) } } + // TODO: Currently GetLogicalCpuCountFromOS() and GetLogicalCpuCountFallback() are broken on + // multi-core processor, but we never call into those two functions since we don't halve the + // gen0size when it's prescott and above processor. We keep the old version here for earlier + // generation system(Northwood based), perf data suggests on those systems, halve gen0 size + // still boost the performance(ex:Biztalk boosts about 17%). So on earlier systems(Northwood) + // based, we still go ahead and halve gen0 size. The logic in GetLogicalCpuCountFromOS() + // and GetLogicalCpuCountFallback() works fine for those earlier generation systems. 
+ // If it's a Prescott and above processor or Multi-core, perf data suggests not to halve gen0 + // size at all gives us overall better performance. + // This is going to be fixed with a new version in orcas time frame. + if (maxCpuId >= 2 && !((maxCpuId > 3) && (maxCpuId < 0x80000000))) + { + DWORD logicalProcessorCount = GetLogicalCpuCountFromOS(); //try to obtain HT enumeration from OS API + + if (!logicalProcessorCount) + { + logicalProcessorCount = GetLogicalCpuCountFallback(); // OS API failed, Fallback to HT enumeration using CPUID + } + + if (logicalProcessorCount) + { + tempSize = tempSize / logicalProcessorCount; + } + } + // update maxSize once with final value maxTrueSize = tempSize; @@ -2009,7 +2035,7 @@ size_t GetLargestOnDieCacheSize(BOOL bTrueSize) maxSize = maxTrueSize * 3; #endif - // printf("GetLargestOnDieCacheSize returns %d, adjusted size %d\n", maxSize, maxTrueSize); + // printf("GetCacheSizePerLogicalCpu returns %d, adjusted size %d\n", maxSize, maxTrueSize); if (bTrueSize) return maxTrueSize; else