Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[alpaka] Add support for the SYCL back-end #407

Open
wants to merge 13 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
90 changes: 69 additions & 21 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -132,38 +132,52 @@ OCLOC_IDS := pvc # tgllp acm_g10 pvc

ifdef SYCL_USE_INTEL_ONEAPI
ONEAPI_BASE := /opt/intel/oneapi
endif

ifeq ($(wildcard $(ONEAPI_BASE)),)
$(warning Cannot find an Intel oneAPI installation at $(ONEAPI_BASE))
endif
ifeq ($(wildcard $(ONEAPI_BASE)),)
# Intel oneAPI not available
$(warning Cannot find an Intel oneAPI installation at $(ONEAPI_BASE))
SYCL_USE_INTEL_ONEAPI :=
ONEAPI_BASE :=
endif

INTELOCLCPU_BASE := $(BASE_DIR)/external/oclcpuexp
INTELOCLCPU_ICD := $(INTELOCLCPU_BASE)/x64/libintelocl.so

ifdef SYCL_USE_INTEL_ONEAPI
# Intel oneTBB
TBB_BASE := $(ONEAPI_BASE)/tbb/latest
TBB_LIBDIR := $(TBB_BASE)/lib/intel64/gcc4.8
TBB_BASE := $(ONEAPI_BASE)/tbb/latest
TBB_LIBDIR := $(TBB_BASE)/lib/intel64/gcc4.8

# Intel debugger
GDB_ONEAPI_BASE := $(ONEAPI_BASE)/debugger/latest

# use Intel oneAPI DPC++/C++ Compiler
SYCL_BASE := $(ONEAPI_BASE)/compiler/latest/linux
SYCL_PATH := $(SYCL_BASE)/bin:$(SYCL_BASE)/bin-llvm
SYCL_LDPATH := $(SYCL_BASE)/lib:$(SYCL_BASE)/lib/x64:$(SYCL_BASE)/compiler/lib/intel64_lin
SYCL_LIBDIR := $(SYCL_BASE)/lib
SYCL_BASE := $(ONEAPI_BASE)/compiler/latest/linux
SYCL_PATH := $(SYCL_BASE)/bin:$(SYCL_BASE)/bin-llvm
SYCL_LDPATH := $(SYCL_BASE)/lib:$(SYCL_BASE)/lib/x64:$(SYCL_BASE)/compiler/lib/intel64_lin
SYCL_LIBDIR := $(SYCL_BASE)/lib
# use ICPX: $(SYCL_BASE)/bin/icpx
# use clang++: $(SYCL_BASE)/bin-llvm/clang++
SYCL_CXX := $(SYCL_BASE)/bin/icpx
SYCL_CXX := $(SYCL_BASE)/bin/icpx

# use the oneAPI CPU OpenCL runtime
export OCL_ICD_FILENAMES := $(SYCL_BASE)/lib/x64/libintelocl.so
INTELOCLCPU_ICD := $(SYCL_BASE)/lib/x64/libintelocl.so

# override the CPU OpenCL runtime with the last known working version (2022.14.8.0.04)
INTELOCLCPU_ICD := $(INTELOCLCPU_BASE)/x64/libintelocl.so
else
# use clang++
# latest release: /cvmfs/patatrack.cern.ch/externals/x86_64/rhel8/intel/sycl/release/2022-12
# latest nightly: /cvmfs/patatrack.cern.ch/externals/x86_64/rhel8/intel/sycl/nightly/20230708
SYCL_BASE := /cvmfs/patatrack.cern.ch/externals/x86_64/rhel8/intel/sycl/nightly/20230805_160000
SYCL_PATH := $(SYCL_BASE)/bin
SYCL_LDPATH := $(SYCL_BASE)/lib:$(SYCL_BASE)/lib64
SYCL_LIBDIR := $(SYCL_BASE)/lib
SYCL_CXX := $(SYCL_BASE)/bin/clang++
SYCL_BASE := /cvmfs/patatrack.cern.ch/externals/x86_64/rhel8/intel/sycl/nightly/20230805_160000
SYCL_PATH := $(SYCL_BASE)/bin
SYCL_LDPATH := $(SYCL_BASE)/lib:$(SYCL_BASE)/lib64
SYCL_LIBDIR := $(SYCL_BASE)/lib
SYCL_CXX := $(SYCL_BASE)/bin/clang++

# use the latest CPU OpenCL runtime (2023.16.6.0.28)
export OCL_ICD_FILENAMES := /cvmfs/patatrack.cern.ch/externals/x86_64/rhel8/intel/sycl/runtime/intel/oclcpuexp_2023.16.6.0.28/x64/libintelocl.so
# use the last known working CPU OpenCL runtime (2022.14.8.0.04)
INTELOCLCPU_ICD := /cvmfs/patatrack.cern.ch/externals/x86_64/rhel8/intel/sycl/runtime/intel/oclcpuexp_2022.14.8.0.04/x64/libintelocl.so
endif

ifneq ($(wildcard $(SYCL_BASE)),)
Expand All @@ -189,6 +203,8 @@ ifneq ($(wildcard $(SYCL_BASE)),)
SYCL_FLAGS := -fsycl -fsycl-targets=$(SYCL_TARGETS)
SYCL_LDFLAGS := -fsycl-fp32-prec-sqrt -fsycl-link-huge-device-code $(JIT_FLAGS) $(AOT_CPU_FLAGS) $(AOT_INTEL_FLAGS) $(AOT_CUDA_FLAGS) $(AOT_ROCM_FLAGS)

export OCL_ICD_FILENAMES := $(INTELOCLCPU_ICD)

# other SYCL options
# -fsycl-device-code-split=per_kernel
# build one binary image per kernel (very slow), to allow kernel built for subgroup sizes not supported by all devices;
Expand Down Expand Up @@ -234,6 +250,15 @@ ifneq ($(wildcard $(SYCL_BASE)),)
export SYCL_CXXFLAGS := $(filter-out $(LLVM_UNSUPPORTED_CXXFLAGS),$(CXXFLAGS)) $(SYCL_FLAGS) $(USER_SYCLFLAGS)
export SYCL_LDFLAGS

# alpaka SYCL build settings; every variable marked `export` is passed to the
# environment of the commands run by make
export SYCL_BASE
# common compile flags: -fsycl, the host CXXFLAGS without the entries listed in
# LLVM_UNSUPPORTED_CXXFLAGS, the user-provided SYCL flags, and warning suppressions
export ALPAKA_SYCL_CXXFLAGS := -fsycl $(filter-out $(LLVM_UNSUPPORTED_CXXFLAGS),$(CXXFLAGS)) $(USER_SYCLFLAGS) -Wno-unused-const-variable -Wno-constant-conversion -Wno-tautological-constant-compare
# common link flags
export ALPAKA_SYCL_LDFLAGS := -fsycl-fp32-prec-sqrt -fsycl-link-huge-device-code -fsycl-max-parallel-link-jobs=8
# CPU back-end: target selection and (currently empty) extra flags
export ALPAKA_SYCL_CPU_TARGETS := -fsycl-targets=spir64_x86_64
export ALPAKA_SYCL_CPU_FLAGS :=
# GPU back-end: one intel_gpu_<id> target for each entry of OCLOC_IDS.
# $(foreach) joins its results with spaces, while -fsycl-targets takes a single
# comma-separated list, so the separator is converted explicitly; with a single
# entry in OCLOC_IDS the resulting value is unchanged.
ALPAKA_SYCL_COMMA := ,
ALPAKA_SYCL_EMPTY :=
ALPAKA_SYCL_SPACE := $(ALPAKA_SYCL_EMPTY) $(ALPAKA_SYCL_EMPTY)
export ALPAKA_SYCL_GPU_TARGETS := -fsycl-targets=$(subst $(ALPAKA_SYCL_SPACE),$(ALPAKA_SYCL_COMMA),$(strip $(foreach ARCH,$(OCLOC_IDS),intel_gpu_$(ARCH))))
# NOTE(review): the back-end options below are hard-coded for intel_gpu_pvc and do
# not follow OCLOC_IDS - confirm they are still appropriate if that list changes
export ALPAKA_SYCL_GPU_FLAGS := -Xsycl-target-backend=intel_gpu_pvc '-q -options -ze-intel-enable-auto-large-GRF-mode'

# add the SYCL paths to the PATH and LD_LIBRARY_PATH
export PATH := $(SYCL_PATH):$(PATH)
export LD_LIBRARY_PATH := $(SYCL_LDPATH):$(TBB_LIBDIR):$(LD_LIBRARY_PATH)
Expand All @@ -253,6 +278,10 @@ DATA_TAR_GZ := $(DATA_BASE)/data.tar.gz
# External definitions
EXTERNAL_BASE := $(BASE_DIR)/external

export INTELOCLCPU_DEPS := $(INTELOCLCPU_ICD)
INTELOCLCPU_CXXFLAGS :=
INTELOCLCPU_LDFLAGS :=

HWLOC_BASE := $(EXTERNAL_BASE)/hwloc
export HWLOC_DEPS := $(HWLOC_BASE)
HWLOC_CXXFLAGS := -isystem $(HWLOC_BASE)/include
Expand Down Expand Up @@ -320,7 +349,7 @@ export BACKTRACE_SYCL_CXXFLAGS :=

ALPAKA_BASE := $(EXTERNAL_BASE)/alpaka
export ALPAKA_DEPS := $(ALPAKA_BASE)
export ALPAKA_CXXFLAGS := -isystem $(ALPAKA_BASE)/include
export ALPAKA_CXXFLAGS := -isystem $(ALPAKA_BASE)/include -DALPAKA_DISABLE_VENDOR_RNG

KOKKOS_BASE := $(EXTERNAL_BASE)/kokkos
KOKKOS_SRC := $(KOKKOS_BASE)/source
Expand Down Expand Up @@ -526,7 +555,8 @@ test_auto: $(TEST_AUTO_TARGETS)
.PHONY: test_auto $(TEST_AUTO_TARGETS)
.PHONY: format $(patsubst %,format_%,$(TARGETS_ALL))
.PHONY: environment print_targets clean distclean dataclean
.PHONY: external_tbb external_cub external_eigen external_kokkos external_kokkos_clean
.PHONY: external_oclcpu external_tbb external_eigen external_boost external_libbacktrace external_hwloc external_alpaka external_kokkos external_kokkos_clean


environment: env.sh
env.sh: Makefile
Expand Down Expand Up @@ -556,6 +586,9 @@ endif
@echo -n '$(KOKKOS_LIBDIR):' >> $@
ifneq ($(SYCL_BASE),)
@echo -n '$(SYCL_LDPATH):' >> $@
ifneq ($(SYCL_USE_INTEL_ONEAPI),)
@echo -n '$(GDB_ONEAPI_BASE)/gdb/intel64/lib:' >> $@
endif
endif
@echo '$$LD_LIBRARY_PATH' >> $@
@# set the PATH
Expand All @@ -568,9 +601,15 @@ ifdef ROCM_BASE
endif
ifneq ($(SYCL_BASE),)
@echo -n '$(SYCL_PATH):' >> $@
ifneq ($(SYCL_USE_INTEL_ONEAPI),)
@echo -n '$(GDB_ONEAPI_BASE)/gdb/intel64/bin:' >> $@
endif
endif
@echo '$$PATH' >> $@
ifneq ($(SYCL_BASE),)
ifneq ($(SYCL_USE_INTEL_ONEAPI),)
@echo 'export INTEL_PYTHONHOME=$(GDB_ONEAPI_BASE)/dep' >> $@
endif
@# load the CPU OpenCL runtime
@echo 'export OCL_ICD_FILENAMES=$(OCL_ICD_FILENAMES)' >> $@
@# enable double precision floating point emulation for Intel GPUs
Expand Down Expand Up @@ -667,6 +706,15 @@ $(DATA_TAR_GZ): | $(DATA_BASE)/url.txt
$(EXTERNAL_BASE):
mkdir -p $@

# OpenCL CPU runtime
# The download rule is only defined when INTELOCLCPU_ICD contains the
# $(EXTERNAL_BASE) path; otherwise the file is expected to exist already.
external_oclcpu: $(INTELOCLCPU_ICD)

ifneq ($(findstring $(EXTERNAL_BASE),$(INTELOCLCPU_ICD)),)
$(INTELOCLCPU_ICD):
	mkdir -p $(INTELOCLCPU_BASE)
	curl --location --silent --show-error https://github.com/intel/llvm/releases/download/2022-WW33/oclcpuexp-2022.14.8.0.04_rel.tar.gz | tar -x -z -C $(INTELOCLCPU_BASE)
	chmod -R +rwX $(INTELOCLCPU_BASE)
endif

# TBB
external_tbb: $(TBB_LIB)

Expand Down Expand Up @@ -748,7 +796,7 @@ external_alpaka: $(ALPAKA_BASE)

# Clone alpaka's develop branch and pin the working tree to a fixed commit.
# Only one checkout is needed: an earlier pin (bb74c912) would be overridden
# immediately by this one, so it is not checked out first.
$(ALPAKA_BASE):
	git clone https://github.com/alpaka-group/alpaka.git -b develop $@
	cd $@ && git checkout 819974ddc5b2eb4b33e709bd317701793cdb7d15

# Kokkos
external_kokkos: $(KOKKOS_LIB)
Expand Down
20 changes: 20 additions & 0 deletions src/alpaka/AlpakaCore/AllocatorPolicy.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,26 @@ namespace cms::alpakatools {
#endif
#endif // ALPAKA_ACC_GPU_HIP_ENABLED

#ifdef ALPAKA_SYCL_ONEAPI_CPU
  // Allocation policy for the SYCL CPU device: Caching, unless
  // ALPAKA_DISABLE_CACHING_ALLOCATOR is defined, in which case Synchronous.
#ifdef ALPAKA_DISABLE_CACHING_ALLOCATOR
  template <>
  inline constexpr AllocatorPolicy allocator_policy<alpaka::DevCpuSycl> = AllocatorPolicy::Synchronous;
#else
  template <>
  inline constexpr AllocatorPolicy allocator_policy<alpaka::DevCpuSycl> = AllocatorPolicy::Caching;
#endif
#endif  // ALPAKA_SYCL_ONEAPI_CPU

#ifdef ALPAKA_SYCL_ONEAPI_GPU
  // Allocation policy for the SYCL Intel GPU device: Caching, unless
  // ALPAKA_DISABLE_CACHING_ALLOCATOR is defined, in which case Synchronous.
#ifdef ALPAKA_DISABLE_CACHING_ALLOCATOR
  template <>
  inline constexpr AllocatorPolicy allocator_policy<alpaka::DevGpuSyclIntel> = AllocatorPolicy::Synchronous;
#else
  template <>
  inline constexpr AllocatorPolicy allocator_policy<alpaka::DevGpuSyclIntel> = AllocatorPolicy::Caching;
#endif
#endif  // ALPAKA_SYCL_ONEAPI_GPU

} // namespace cms::alpakatools

#endif // AlpakaCore_AllocatorPolicy_h
105 changes: 105 additions & 0 deletions src/alpaka/AlpakaCore/CachedBufAlloc.h
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,111 @@ namespace cms::alpakatools {

#endif // ALPAKA_ACC_GPU_HIP_ENABLED

#ifdef ALPAKA_SYCL_ONEAPI_CPU

//! Caching-allocator specialisation for pinned host memory (alpaka::DevCpu buffers)
//! requested through a non-blocking SYCL CPU queue.
template <typename TElem, typename TDim, typename TIdx>
struct CachedBufAlloc<TElem, TDim, TIdx, alpaka::DevCpu, alpaka::QueueCpuSyclNonBlocking, void> {
  //! Obtain a block large enough for `extent` elements of TElem from the host caching
  //! allocator and wrap it in a BufCpu; destroying the buffer hands the block back to
  //! the allocator instead of releasing the memory directly.
  template <typename TExtent>
  ALPAKA_FN_HOST static alpaka::BufCpu<TElem, TDim, TIdx> allocCachedBuf(alpaka::DevCpu const& dev,
                                                                        alpaka::QueueCpuSyclNonBlocking queue,
                                                                        TExtent const& extent) {
    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;

    using Buffer = alpaka::BufCpu<TElem, TDim, TIdx>;

    auto& cache = getHostCachingAllocator<alpaka::QueueCpuSyclNonBlocking>();

    // FIXME: BufCpu apparently has no notion of pitch, so a flat block is requested
    auto const numElements = static_cast<size_t>(alpaka::getExtentProduct(extent));
    size_t numBytes = numElements * sizeof(TElem);
    void* rawMemory = cache.allocate(numBytes, queue);

    // the buffer's deleter returns the block to the caching allocator
    auto giveBack = [pool = &cache](TElem* ptr) { pool->free(ptr); };

    return Buffer(dev, static_cast<TElem*>(rawMemory), std::move(giveBack), extent);
  }
};

//! Caching-allocator specialisation for buffers that live on the SYCL CPU device,
//! for any queue type TQueue.
template <typename TElem, typename TDim, typename TIdx, typename TQueue>
struct CachedBufAlloc<TElem, TDim, TIdx, alpaka::DevCpuSycl, TQueue, void> {
  //! Obtain a block large enough for `extent` elements of TElem from the caching
  //! allocator of `dev` and wrap it in a BufCpuSycl; destroying the buffer hands the
  //! block back to the allocator instead of releasing the memory directly.
  template <typename TExtent>
  ALPAKA_FN_HOST static alpaka::BufCpuSycl<TElem, TDim, TIdx> allocCachedBuf(alpaka::DevCpuSycl const& dev,
                                                                            TQueue queue,
                                                                            TExtent const& extent) {
    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;

    using Buffer = alpaka::BufCpuSycl<TElem, TDim, TIdx>;

    auto& cache = getDeviceCachingAllocator<alpaka::DevCpuSycl, TQueue>(dev);

    // TODO: pitched allocations are not implemented for SYCL; the block is sized as
    // the plain product of the extents times the element size
    auto const numElements = static_cast<size_t>(alpaka::getExtentProduct(extent));
    size_t numBytes = numElements * sizeof(TElem);
    void* rawMemory = cache.allocate(numBytes, queue);

    // the buffer's deleter returns the block to the caching allocator
    auto giveBack = [pool = &cache](TElem* ptr) { pool->free(ptr); };

    return Buffer(dev, static_cast<TElem*>(rawMemory), std::move(giveBack), extent);
  }
};

#endif // ALPAKA_SYCL_ONEAPI_CPU

#ifdef ALPAKA_SYCL_ONEAPI_GPU

//! Caching-allocator specialisation for pinned host memory (alpaka::DevCpu buffers)
//! requested through a non-blocking SYCL Intel GPU queue.
template <typename TElem, typename TDim, typename TIdx>
struct CachedBufAlloc<TElem, TDim, TIdx, alpaka::DevCpu, alpaka::QueueGpuSyclIntelNonBlocking, void> {
  //! Obtain a block large enough for `extent` elements of TElem from the host caching
  //! allocator and wrap it in a BufCpu; destroying the buffer hands the block back to
  //! the allocator instead of releasing the memory directly.
  template <typename TExtent>
  ALPAKA_FN_HOST static alpaka::BufCpu<TElem, TDim, TIdx> allocCachedBuf(alpaka::DevCpu const& dev,
                                                                        alpaka::QueueGpuSyclIntelNonBlocking queue,
                                                                        TExtent const& extent) {
    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;

    using Buffer = alpaka::BufCpu<TElem, TDim, TIdx>;

    auto& cache = getHostCachingAllocator<alpaka::QueueGpuSyclIntelNonBlocking>();

    // FIXME: BufCpu apparently has no notion of pitch, so a flat block is requested
    auto const numElements = static_cast<size_t>(alpaka::getExtentProduct(extent));
    size_t numBytes = numElements * sizeof(TElem);
    void* rawMemory = cache.allocate(numBytes, queue);

    // the buffer's deleter returns the block to the caching allocator
    auto giveBack = [pool = &cache](TElem* ptr) { pool->free(ptr); };

    return Buffer(dev, static_cast<TElem*>(rawMemory), std::move(giveBack), extent);
  }
};

//! Caching-allocator specialisation for buffers that live on the SYCL Intel GPU
//! device, for any queue type TQueue.
template <typename TElem, typename TDim, typename TIdx, typename TQueue>
struct CachedBufAlloc<TElem, TDim, TIdx, alpaka::DevGpuSyclIntel, TQueue, void> {
  //! Obtain a block large enough for `extent` elements of TElem from the caching
  //! allocator of `dev` and wrap it in a BufGpuSyclIntel; destroying the buffer hands
  //! the block back to the allocator instead of releasing the memory directly.
  template <typename TExtent>
  ALPAKA_FN_HOST static alpaka::BufGpuSyclIntel<TElem, TDim, TIdx> allocCachedBuf(alpaka::DevGpuSyclIntel const& dev,
                                                                                 TQueue queue,
                                                                                 TExtent const& extent) {
    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;

    using Buffer = alpaka::BufGpuSyclIntel<TElem, TDim, TIdx>;

    auto& cache = getDeviceCachingAllocator<alpaka::DevGpuSyclIntel, TQueue>(dev);

    // TODO: pitched allocations are not implemented for SYCL; the block is sized as
    // the plain product of the extents times the element size
    auto const numElements = static_cast<size_t>(alpaka::getExtentProduct(extent));
    size_t numBytes = numElements * sizeof(TElem);
    void* rawMemory = cache.allocate(numBytes, queue);

    // the buffer's deleter returns the block to the caching allocator
    auto giveBack = [pool = &cache](TElem* ptr) { pool->free(ptr); };

    return Buffer(dev, static_cast<TElem*>(rawMemory), std::move(giveBack), extent);
  }
};

#endif // ALPAKA_SYCL_ONEAPI_GPU

} // namespace traits

template <typename TElem, typename TIdx, typename TExtent, typename TQueue, typename TDev>
Expand Down
39 changes: 31 additions & 8 deletions src/alpaka/AlpakaCore/CachingAllocator.h
Original file line number Diff line number Diff line change
Expand Up @@ -201,9 +201,13 @@ namespace cms::alpakatools {
std::ostringstream out;
out << "\t" << deviceType_ << " " << alpaka::getName(device_) << " returned " << block.bytes << " bytes at "
<< ptr << " from associated queue " << block.queue->m_spQueueImpl.get() << " , event "
<< block.event->m_spEventImpl.get() << " .\n\t\t " << cachedBlocks_.size() << " available blocks cached ("
<< cachedBytes_.free << " bytes), " << liveBlocks_.size() << " live blocks (" << cachedBytes_.live
<< " bytes) outstanding." << std::endl;
#if ALPAKA_ACC_SYCL_ENABLED
<< " is not a shared pointer in SYCL"
#else
<< block.event->m_spEventImpl.get()
#endif
<< " .\n\t\t " << cachedBlocks_.size() << " available blocks cached (" << cachedBytes_.free << " bytes), "
<< liveBlocks_.size() << " live blocks (" << cachedBytes_.live << " bytes) outstanding." << std::endl;
std::cout << out.str() << std::endl;
}
} else {
Expand All @@ -212,9 +216,13 @@ namespace cms::alpakatools {
std::ostringstream out;
out << "\t" << deviceType_ << " " << alpaka::getName(device_) << " freed " << block.bytes << " bytes at "
<< ptr << " from associated queue " << block.queue->m_spQueueImpl.get() << ", event "
<< block.event->m_spEventImpl.get() << " .\n\t\t " << cachedBlocks_.size() << " available blocks cached ("
<< cachedBytes_.free << " bytes), " << liveBlocks_.size() << " live blocks (" << cachedBytes_.live
<< " bytes) outstanding." << std::endl;
#if ALPAKA_ACC_SYCL_ENABLED
<< " is not a shared pointer in SYCL"
#else
<< block.event->m_spEventImpl.get()
#endif
<< " .\n\t\t " << cachedBlocks_.size() << " available blocks cached (" << cachedBytes_.free << " bytes), "
<< liveBlocks_.size() << " live blocks (" << cachedBytes_.live << " bytes) outstanding." << std::endl;
std::cout << out.str() << std::endl;
}
}
Expand Down Expand Up @@ -301,9 +309,18 @@ namespace cms::alpakatools {
std::ostringstream out;
out << "\t" << deviceType_ << " " << alpaka::getName(device_) << " reused cached block at "
<< block.buffer->data() << " (" << block.bytes << " bytes) for queue "
<< block.queue->m_spQueueImpl.get() << ", event " << block.event->m_spEventImpl.get()
<< block.queue->m_spQueueImpl.get() << ", event "
#if ALPAKA_ACC_SYCL_ENABLED
<< " is not a shared pointer in SYCL"
#else
<< block.event->m_spEventImpl.get()
#endif
<< " (previously associated with stream " << iBlock->second.queue->m_spQueueImpl.get() << " , event "
#if ALPAKA_ACC_SYCL_ENABLED
<< " is not a shared pointer in SYCL)." << std::endl;
#else
<< iBlock->second.event->m_spEventImpl.get() << ")." << std::endl;
#endif
std::cout << out.str() << std::endl;
}

Expand Down Expand Up @@ -366,7 +383,13 @@ namespace cms::alpakatools {
std::ostringstream out;
out << "\t" << deviceType_ << " " << alpaka::getName(device_) << " allocated new block at "
<< block.buffer->data() << " (" << block.bytes << " bytes associated with queue "
<< block.queue->m_spQueueImpl.get() << ", event " << block.event->m_spEventImpl.get() << "." << std::endl;
<< block.queue->m_spQueueImpl.get() << ", event "
#if ALPAKA_ACC_SYCL_ENABLED
<< " is not a shared pointer in SYCL"
#else
<< block.event->m_spEventImpl.get()
#endif
<< "." << std::endl;
std::cout << out.str() << std::endl;
}
}
Expand Down
7 changes: 6 additions & 1 deletion src/alpaka/AlpakaCore/ESProduct.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,12 @@ namespace cms::alpakatools {

// Create one per-device slot for every device of the Platform and give each slot
// its own event.
ESProduct() : gpuDataPerDevice_(devices<Platform>().size()) {
  for (size_t i = 0; i < gpuDataPerDevice_.size(); ++i) {
    // Assign the event exactly once per slot: an unconditional event-cache lookup
    // ahead of this guarded choice would fetch an event only to drop it again, and
    // would run even in SYCL builds.
#if !defined(ALPAKA_ACC_SYCL_ENABLED)
    gpuDataPerDevice_[i].m_event = getEventCache<Event>().get(devices<Platform>()[i]);
#else
    // NOTE(review): SYCL builds construct the event directly instead of using the
    // event cache - presumably the cache is not usable with this back-end; confirm.
    gpuDataPerDevice_[i].m_event = std::make_shared<Event>(devices<Platform>()[i]);
#endif
  }
}

Expand Down
Loading