diff --git a/sycl/CMakeLists.txt b/sycl/CMakeLists.txt index ab5443aed18f1..40314c6f5fc9a 100644 --- a/sycl/CMakeLists.txt +++ b/sycl/CMakeLists.txt @@ -5,6 +5,7 @@ project(sycl-solution) set(CMAKE_CXX_STANDARD 11) set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_EXTENSIONS OFF) +set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON) if(MSVC) set_property(GLOBAL PROPERTY USE_FOLDERS ON) @@ -135,6 +136,7 @@ add_library("${SYCLLibrary}" SHARED "${sourceRootPath}/detail/program_manager/program_manager.cpp" "${sourceRootPath}/detail/queue_impl.cpp" "${sourceRootPath}/detail/os_util.cpp" + "${sourceRootPath}/detail/platform_util.cpp" "${sourceRootPath}/detail/sampler_impl.cpp" "${sourceRootPath}/detail/scheduler/commands.cpp" "${sourceRootPath}/detail/scheduler/commands2.cpp" diff --git a/sycl/include/CL/sycl/detail/buffer_impl.hpp b/sycl/include/CL/sycl/detail/buffer_impl.hpp index 7cbef555f6e2b..009d83e0bbc39 100644 --- a/sycl/include/CL/sycl/detail/buffer_impl.hpp +++ b/sycl/include/CL/sycl/detail/buffer_impl.hpp @@ -42,7 +42,7 @@ class accessor; template class buffer; class handler; class queue; -template class id; +template struct id; template class range; using buffer_allocator = aligned_allocator; namespace detail { diff --git a/sycl/include/CL/sycl/detail/helpers.hpp b/sycl/include/CL/sycl/detail/helpers.hpp index 03a151a861171..024cad59b8bb2 100644 --- a/sycl/include/CL/sycl/detail/helpers.hpp +++ b/sycl/include/CL/sycl/detail/helpers.hpp @@ -19,10 +19,10 @@ namespace cl { namespace sycl { class context; class event; -template class item; +template struct item; template class group; template class range; -template class id; +template struct id; template class nd_item; namespace detail { class context_impl; diff --git a/sycl/include/CL/sycl/detail/os_util.hpp b/sycl/include/CL/sycl/detail/os_util.hpp index c00ad73032db0..131be5f90717b 100644 --- a/sycl/include/CL/sycl/detail/os_util.hpp +++ b/sycl/include/CL/sycl/detail/os_util.hpp @@ -10,6 +10,8 @@ #pragma once +#include + #ifdef _WIN32 #define SYCL_RT_OS_WINDOWS // Windows platform @@ -48,6 +50,9 @@ class OSUtil { /// Module handle for the executable module - it is assumed there is always /// single one at most. static const OSModuleHandle ExeModuleHandle; + + /// Returns the amount of RAM available for the operating system. + static size_t getOSMemSize(); }; } // namespace detail diff --git a/sycl/include/CL/sycl/detail/platform_util.hpp b/sycl/include/CL/sycl/detail/platform_util.hpp new file mode 100644 index 0000000000000..72100d8fd1b28 --- /dev/null +++ b/sycl/include/CL/sycl/detail/platform_util.hpp @@ -0,0 +1,40 @@ +//===-- platform_util.hpp - platform utilities ----------------*- C++ -*--===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include + +namespace cl { +namespace sycl { +namespace detail { + +struct PlatformUtil { + enum class TypeIndex : unsigned int { + Char = 0, + Short = 1, + Int = 2, + Long = 3, + Float = 4, + Double = 5, + Half = 6 + }; + + /// Returns the maximum vector width counted in elements of the given type. + static uint32_t getNativeVectorWidth(TypeIndex Index); + + static uint32_t getMaxClockFrequency(); + + static uint32_t getMemCacheLineSize(); + + static uint64_t getMemCacheSize(); +}; + +} // namespace detail +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/group.hpp b/sycl/include/CL/sycl/group.hpp index 21a0bde0fc15c..dcd8a69b5ecdd 100644 --- a/sycl/include/CL/sycl/group.hpp +++ b/sycl/include/CL/sycl/group.hpp @@ -19,7 +19,7 @@ namespace cl { namespace sycl { namespace detail { -class Builder; +struct Builder; } // namespace detail template class group { diff --git a/sycl/include/CL/sycl/id.hpp b/sycl/include/CL/sycl/id.hpp index 54c551c8f2d5f..a6f942ac2d076 100644 --- a/sycl/include/CL/sycl/id.hpp +++ b/sycl/include/CL/sycl/id.hpp @@ -16,8 +16,9 @@ namespace cl { namespace sycl { template class range; template struct id : public detail::array { -public: +private: using base = detail::array; +public: id() = default; /* The following constructor is only available in the id struct diff --git a/sycl/include/CL/sycl/item.hpp b/sycl/include/CL/sycl/item.hpp index 30ff70b90d716..d5715e2efda1f 100644 --- a/sycl/include/CL/sycl/item.hpp +++ b/sycl/include/CL/sycl/item.hpp @@ -16,10 +16,10 @@ namespace cl { namespace sycl { namespace detail { -class Builder; +struct Builder; } template struct id; -template struct range; +template class range; template struct item { item() = delete; @@ -86,8 +86,9 @@ template struct item { protected: // For call constructor inside conversion operator - friend class item; - friend class detail::Builder; + friend struct item; + friend struct item; + friend struct detail::Builder; template item(typename std::enable_if<(W == true), const range>::type &R, diff --git a/sycl/include/CL/sycl/nd_item.hpp b/sycl/include/CL/sycl/nd_item.hpp index cebaa1596e3f3..9e9fa9d1011cc 100644 --- a/sycl/include/CL/sycl/nd_item.hpp +++ b/sycl/include/CL/sycl/nd_item.hpp @@ -22,7 +22,7 @@ namespace cl { namespace sycl { namespace detail { -class Builder; +struct Builder; } template struct nd_item { diff --git a/sycl/include/CL/sycl/range.hpp b/sycl/include/CL/sycl/range.hpp index 0359b648f66ea..d1300fc8b8b7c 100644 --- a/sycl/include/CL/sycl/range.hpp +++ b/sycl/include/CL/sycl/range.hpp @@ -16,8 +16,8 @@ namespace sycl { template struct id; template class range : public detail::array { -public: using base = detail::array; +public: /* The following constructor is only available in the range class specialization where: dimensions==1 */ template diff --git a/sycl/source/detail/device_info.cpp b/sycl/source/detail/device_info.cpp index c578a255be2a2..2718efffab5c2 100644 --- a/sycl/source/detail/device_info.cpp +++ b/sycl/source/detail/device_info.cpp @@ -7,9 +7,10 @@ //===----------------------------------------------------------------------===// #include +#include +#include #include #include -#include #include #ifdef __GNUG__ @@ -21,22 +22,6 @@ namespace cl { namespace sycl { namespace detail { -// Used by methods that duplicate OpenCL behaviour in order to get CPU info -// TODO add Windows support -// TODO add support for x86-64 ABI selected using ifdef. -static void cpuid(unsigned int cpuid_info[], unsigned int type) { - unsigned int eax, ebx, ecx, edx; - __asm__ __volatile__("mov %%ebx, %%edi\n\r" - "cpuid\n\r" - "xchg %%edi, %%ebx\n\r" - : "=a"(eax), "=D"(ebx), "=c"(ecx), "=d"(edx) - : "a"(type)); - cpuid_info[0] = eax; - cpuid_info[1] = ebx; - cpuid_info[2] = ecx; - cpuid_info[3] = edx; -} - vector_class read_fp_bitfield(cl_device_fp_config bits) { vector_class result; if (bits & CL_FP_DENORM) @@ -156,99 +141,43 @@ cl_uint get_device_info_host() { return 0; } -// SSE4.2 has 16 byte (XMM) registers -static const cl_uint NATIVE_VECTOR_WIDTH_SSE42[] = {16, 8, 4, 2, 4, 2, 0}; -// AVX supports 32 byte (YMM) registers only for floats and doubles -static const cl_uint NATIVE_VECTOR_WIDTH_AVX[] = {16, 8, 4, 2, 8, 4, 0}; -// AVX2 has a full set of 32 byte (YMM) registers -static const cl_uint NATIVE_VECTOR_WIDTH_AVX2[] = {32, 16, 8, 4, 8, 4, 0}; -// AVX512 has 64 byte (ZMM) registers -static const cl_uint NATIVE_VECTOR_WIDTH_AVX512[] = {64, 32, 16, 8, 16, 8, 0}; - -cl_uint get_native_vector_width(size_t idx) { -#if (__GNUG__ && GCC_VERSION > 40900) - if (__builtin_cpu_supports("avx512f")) { - return NATIVE_VECTOR_WIDTH_AVX512[idx]; - } -#endif - - if (__builtin_cpu_supports("avx2")) { - return NATIVE_VECTOR_WIDTH_AVX2[idx]; - } - if (__builtin_cpu_supports("avx")) { - return NATIVE_VECTOR_WIDTH_AVX[idx]; - } - return NATIVE_VECTOR_WIDTH_SSE42[idx]; -} - template <> cl_uint get_device_info_host() { - return get_native_vector_width(0); + return PlatformUtil::getNativeVectorWidth(PlatformUtil::TypeIndex::Char); } template <> cl_uint get_device_info_host() { - return get_native_vector_width(1); + return PlatformUtil::getNativeVectorWidth(PlatformUtil::TypeIndex::Short); } template <> cl_uint get_device_info_host() { - return get_native_vector_width(2); + return PlatformUtil::getNativeVectorWidth(PlatformUtil::TypeIndex::Int); } template <> cl_uint get_device_info_host() { - return get_native_vector_width(3); + return PlatformUtil::getNativeVectorWidth(PlatformUtil::TypeIndex::Long); } template <> cl_uint get_device_info_host() { - return get_native_vector_width(4); + return PlatformUtil::getNativeVectorWidth(PlatformUtil::TypeIndex::Float); } template <> cl_uint get_device_info_host() { - return get_native_vector_width(5); + return PlatformUtil::getNativeVectorWidth(PlatformUtil::TypeIndex::Double); } template <> cl_uint get_device_info_host() { - return get_native_vector_width(6); + return PlatformUtil::getNativeVectorWidth(PlatformUtil::TypeIndex::Half); } template <> cl_uint get_device_info_host() { - throw runtime_error( - "max_clock_frequency parameter is not supported for host device"); - unsigned int cpuInfo[4] = {0 - 1u}; - string_class buff(sizeof(cpuInfo) * 3 + 1, 0); - size_t offset = 0; - - for (unsigned int i = 0x80000002; i <= 0x80000004; i++) { - cpuid(cpuInfo, i); - std::copy(reinterpret_cast(cpuInfo), - reinterpret_cast(cpuInfo) + sizeof(cpuInfo), - buff.begin() + offset); - offset += sizeof(cpuInfo); - } - std::size_t found = buff.rfind("Hz"); - // Bail out if frequency is not found in CPUID string - if (found == std::string::npos) - return 0; - - buff = buff.substr(0, found); - - cl_uint freq = 0; - switch (buff[buff.size() - 1]) { - case 'M': - freq = 1; - break; - case 'G': - freq = 1000; - break; - } - buff = buff.substr(buff.rfind(' '), buff.length()); - freq *= std::stod(buff); - return freq; + return PlatformUtil::getMaxClockFrequency(); } template <> cl_uint get_device_info_host() { @@ -256,9 +185,7 @@ template <> cl_uint get_device_info_host() { } template <> cl_ulong get_device_info_host() { - struct sysinfo meminfo; - sysinfo(&meminfo); - return meminfo.totalram * meminfo.mem_unit; + return static_cast(OSUtil::getOSMemSize()); } template <> cl_ulong get_device_info_host() { @@ -362,16 +289,12 @@ get_device_info_host() { template <> cl_uint get_device_info_host() { - unsigned int viCPUInfo[4] = {(unsigned int)-1}; - cpuid(viCPUInfo, 0x80000006); - return viCPUInfo[2] & 0xff; + return PlatformUtil::getMemCacheLineSize(); } template <> cl_ulong get_device_info_host() { - unsigned int viCPUInfo[4] = {(unsigned int)-1}; - cpuid(viCPUInfo, 0x80000006); - return ((viCPUInfo[2] >> 16) & 0xffff) * 1024; + return PlatformUtil::getMemCacheSize(); } template <> diff --git a/sycl/source/detail/os_util.cpp b/sycl/source/detail/os_util.cpp index 5786aea7bab22..2d38788e82cd0 100644 --- a/sycl/source/detail/os_util.cpp +++ b/sycl/source/detail/os_util.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include +#include #if defined(SYCL_RT_OS_LINUX) @@ -16,9 +17,13 @@ #include #include -#include +#include -#endif // SYCL_RT_OS_LINUX +#elif defined(SYCL_RT_OS_WINDOWS) + +#include + +#endif namespace cl { namespace sycl { @@ -35,23 +40,22 @@ struct ModuleInfo { const char *Name; // out }; -static int callback(struct dl_phdr_info *info, size_t size, void *data) { - unsigned char *Base = reinterpret_cast(info->dlpi_addr); - ModuleInfo *MI = (ModuleInfo *)data; +static int callback(struct dl_phdr_info *Info, size_t Size, void *Data) { + auto Base = reinterpret_cast(Info->dlpi_addr); + auto MI = reinterpret_cast(Data); + auto TestAddr = reinterpret_cast(MI->VirtAddr); - for (int i = 0; i < info->dlpi_phnum; ++i) { - unsigned char *SegStart = Base + info->dlpi_phdr[i].p_vaddr; - unsigned char *SegEnd = SegStart + info->dlpi_phdr[i].p_memsz; - const unsigned char *TestAddr = - reinterpret_cast(MI->VirtAddr); + for (int i = 0; i < Info->dlpi_phnum; ++i) { + unsigned char *SegStart = Base + Info->dlpi_phdr[i].p_vaddr; + unsigned char *SegEnd = SegStart + Info->dlpi_phdr[i].p_memsz; // check if the tested address is within current segment if (TestAddr >= SegStart && TestAddr < SegEnd) { // ... it is - belongs to the module then // dlpi_addr is zero for the executable, replace it - void *H = (void *)info->dlpi_addr; + auto H = reinterpret_cast(Info->dlpi_addr); MI->Handle = H ? H : OSUtil::ExeModuleHandle; - MI->Name = info->dlpi_name; + MI->Name = Info->dlpi_name; return 1; // non-zero tells to finish iteration via modules } } @@ -66,9 +70,25 @@ OSModuleHandle OSUtil::getOSModuleHandle(const void *VirtAddr) { } #elif defined(SYCL_RT_OS_WINDOWS) +// TODO: implement this function for Windows probably by using // GetModuleHandleEx(GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT,...) -// to implement getOSModuleHandle -#endif // SYCL_RT_OS_LINUX +OSModuleHandle OSUtil::getOSModuleHandle(const void *VirtAddr) { + throw runtime_error("OSUtil::getOSModuleHandle() is not implemented yet"); +} +#endif // SYCL_RT_OS_WINDOWS + +size_t OSUtil::getOSMemSize() { +#if defined(SYCL_RT_OS_LINUX) + struct sysinfo MemInfo; + sysinfo(&MemInfo); + return static_cast(MemInfo.totalram * MemInfo.mem_unit); +#elif defined(SYCL_RT_OS_WINDOWS) + MEMORYSTATUSEX MemInfo; + MemInfo.dwLength = sizeof(MemInfo); + GlobalMemoryStatusEx(&MemInfo); + return static_cast(MemInfo.ullTotalPhys); +#endif +} } // namespace detail } // namespace sycl diff --git a/sycl/source/detail/platform_util.cpp b/sycl/source/detail/platform_util.cpp new file mode 100644 index 0000000000000..eeead0c38cce9 --- /dev/null +++ b/sycl/source/detail/platform_util.cpp @@ -0,0 +1,125 @@ +//===-- platform_util.cpp - Platform utilities implementation --*- C++ -*--===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include +#include +#include + +#if defined(SYCL_RT_OS_LINUX) +#include +#elif defined(SYCL_RT_OS_WINDOWS) +#include +#endif + +namespace cl { +namespace sycl { +namespace detail { + +// Used by methods that duplicate OpenCL behaviour in order to get CPU info +static void cpuid(uint32_t *CPUInfo, uint32_t Type, uint32_t SubType = 0) { +#if defined(SYCL_RT_OS_LINUX) + __cpuid_count(Type, SubType, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]); +#elif defined(SYCL_RT_OS_WINDOWS) + __cpuidex(reinterpret_cast(CPUInfo), Type, SubType); +#endif +} + +uint32_t PlatformUtil::getMaxClockFrequency() { + throw runtime_error( + "max_clock_frequency parameter is not supported for host device"); + uint32_t CPUInfo[4]; + string_class Buff(sizeof(CPUInfo) * 3 + 1, 0); + size_t Offset = 0; + + for (uint32_t i = 0x80000002; i <= 0x80000004; i++) { + cpuid(CPUInfo, i); + std::copy(reinterpret_cast(CPUInfo), + reinterpret_cast(CPUInfo) + sizeof(CPUInfo), + Buff.begin() + Offset); + Offset += sizeof(CPUInfo); + } + std::size_t Found = Buff.rfind("Hz"); + // Bail out if frequency is not found in CPUID string + if (Found == std::string::npos) + return 0; + + Buff = Buff.substr(0, Found); + uint32_t Freq = 0; + switch (Buff[Buff.size() - 1]) { + case 'M': + Freq = 1; + break; + case 'G': + Freq = 1000; + break; + } + Buff = Buff.substr(Buff.rfind(' '), Buff.length()); + Freq *= std::stod(Buff); + return Freq; +} + +uint32_t PlatformUtil::getMemCacheLineSize() { + uint32_t CPUInfo[4]; + cpuid(CPUInfo, 0x80000006); + return CPUInfo[2] & 0xff; +} + +uint64_t PlatformUtil::getMemCacheSize() { + uint32_t CPUInfo[4]; + cpuid(CPUInfo, 0x80000006); + return static_cast(CPUInfo[2] >> 16) * 1024; +} + +uint32_t PlatformUtil::getNativeVectorWidth(PlatformUtil::TypeIndex TIndex) { + // SSE4.2 has 16 byte (XMM) registers + static constexpr uint32_t VECTOR_WIDTH_SSE42[] = {16, 8, 4, 2, 4, 2, 0}; + // AVX supports 32 byte (YMM) registers only for floats and doubles + static constexpr uint32_t VECTOR_WIDTH_AVX[] = {16, 8, 4, 2, 8, 4, 0}; + // AVX2 has a full set of 32 byte (YMM) registers + static constexpr uint32_t VECTOR_WIDTH_AVX2[] = {32, 16, 8, 4, 8, 4, 0}; + // AVX512 has 64 byte (ZMM) registers + static constexpr uint32_t VECTOR_WIDTH_AVX512[] = {64, 32, 16, 8, 16, 8, 0}; + + uint32_t Index = static_cast(TIndex); + +#if defined(SYCL_RT_OS_LINUX) + if (__builtin_cpu_supports("avx512f")) + return VECTOR_WIDTH_AVX512[Index]; + if (__builtin_cpu_supports("avx2")) + return VECTOR_WIDTH_AVX2[Index]; + if (__builtin_cpu_supports("avx")) + return VECTOR_WIDTH_AVX[Index]; +#elif defined(SYCL_RT_OS_WINDOWS) + + uint32_t Info[4]; + + // Check that CPUID func number 7 is available. + cpuid(Info, 0); + if (Info[0] >= 7) { + // avx512f = CPUID.7.EBX[16] + cpuid(Info, 7); + if (Info[1] & (1 << 16)) + return VECTOR_WIDTH_AVX512[Index]; + + // avx2 = CPUID.7.EBX[5] + if (Info[1] & (1 << 5)) + return VECTOR_WIDTH_AVX2[Index]; + } + // It is assumed that CPUID func number 1 is always available. + // avx = CPUID.1.ECX[28] + cpuid(Info, 1); + if (Info[2] & (1 << 28)) + return VECTOR_WIDTH_AVX[Index]; +#endif + + return VECTOR_WIDTH_SSE42[Index]; +} + +} // namespace detail +} // namespace sycl +} // namespace cl