Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SYCL 2020][USM] Add USM queue shortcuts, memcpy, memset, fill, prefetch, mem_advise stub, test cases #323

Merged
merged 5 commits into from
Oct 2, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion include/hipSYCL/runtime/cuda/cuda_queue.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,8 @@ class cuda_queue : public inorder_queue

virtual result submit_memcpy(const memcpy_operation&) override;
virtual result submit_kernel(const kernel_operation&) override;
virtual result submit_prefetch(const prefetch_operation&) override;
virtual result submit_prefetch(const prefetch_operation &) override;
virtual result submit_memset(const memset_operation&) override;

/// Causes the queue to wait until an event on another queue has occured.
/// the other queue must be from the same backend
Expand Down
7 changes: 5 additions & 2 deletions include/hipSYCL/runtime/dag_builder.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -63,8 +63,11 @@ class dag_builder
const requirements_list& requirements,
const execution_hints& hints = {});
dag_node_ptr add_prefetch(std::unique_ptr<operation> op,
const requirements_list& requirements,
const execution_hints& hints = {});
const requirements_list &requirements,
const execution_hints &hints = {});
dag_node_ptr add_memset(std::unique_ptr<operation> op,
const requirements_list &requirements,
const execution_hints &hints = {});
dag_node_ptr
add_explicit_mem_requirement(std::unique_ptr<operation> req,
const requirements_list &requirements,
Expand Down
3 changes: 2 additions & 1 deletion include/hipSYCL/runtime/hip/hip_queue.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,8 @@ class hip_queue : public inorder_queue

virtual result submit_memcpy(const memcpy_operation&) override;
virtual result submit_kernel(const kernel_operation&) override;
virtual result submit_prefetch(const prefetch_operation&) override;
virtual result submit_prefetch(const prefetch_operation &) override;
virtual result submit_memset(const memset_operation&) override;

/// Causes the queue to wait until an event on another queue has occured.
/// the other queue must be from the same backend
Expand Down
3 changes: 2 additions & 1 deletion include/hipSYCL/runtime/inorder_queue.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,8 @@ class inorder_queue

virtual result submit_memcpy(const memcpy_operation&) = 0;
virtual result submit_kernel(const kernel_operation&) = 0;
virtual result submit_prefetch(const prefetch_operation&) = 0;
virtual result submit_prefetch(const prefetch_operation &) = 0;
virtual result submit_memset(const memset_operation&) = 0;

/// Causes the queue to wait until an event on another queue has occured.
/// the other queue must be from the same backend
Expand Down
3 changes: 2 additions & 1 deletion include/hipSYCL/runtime/omp/omp_queue.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,8 @@ class omp_queue : public inorder_queue

virtual result submit_memcpy(const memcpy_operation&) override;
virtual result submit_kernel(const kernel_operation&) override;
virtual result submit_prefetch(const prefetch_operation&) override;
virtual result submit_prefetch(const prefetch_operation &) override;
virtual result submit_memset(const memset_operation&) override;

/// Causes the queue to wait until an event on another queue has occured.
/// the other queue must be from the same backend
Expand Down
49 changes: 43 additions & 6 deletions include/hipSYCL/runtime/operations.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -61,13 +61,15 @@ using dag_node_ptr = std::shared_ptr<dag_node>;
class kernel_operation;
class memcpy_operation;
class prefetch_operation;
class memset_operation;

class operation_dispatcher
{
public:
virtual result dispatch_kernel(kernel_operation* op) = 0;
virtual result dispatch_memcpy(memcpy_operation* op) = 0;
virtual result dispatch_prefetch(prefetch_operation* op) = 0;
virtual result dispatch_prefetch(prefetch_operation *op) = 0;
virtual result dispatch_memset(memset_operation* op) = 0;
virtual ~operation_dispatcher(){}
};

Expand Down Expand Up @@ -260,6 +262,7 @@ class buffer_memory_requirement : public memory_requirement

class requirements_list;


class kernel_operation : public operation
{
public:
Expand All @@ -270,7 +273,7 @@ class kernel_operation : public operation
kernel_launcher& get_launcher();
const kernel_launcher& get_launcher() const;

const std::vector<memory_requirement*>& get_requirements() const;
const std::vector<memory_requirement*>& get_memory_requirements() const;

void dump(std::ostream & ostr, int indentation=0) const override;

Expand Down Expand Up @@ -369,10 +372,44 @@ class memcpy_operation : public operation
range<3> _num_elements;
};

/// A prefetch operation on SVM/USM memory
class prefetch_operation : public operation
{
// TBD
/// USM prefetch
class prefetch_operation : public operation {
public:
prefetch_operation(const void *ptr, std::size_t num_bytes)
: _ptr{ptr}, _num_bytes{num_bytes} {}

result dispatch(operation_dispatcher* dispatcher) final override {
return dispatcher->dispatch_prefetch(this);
}

const void *get_pointer() const { return _ptr; }
std::size_t get_num_bytes() const { return _num_bytes; }

void dump(std::ostream&, int = 0) const override;
private:
const void *_ptr;
std::size_t _num_bytes;
};

/// USM memset
class memset_operation : public operation {
public:
memset_operation(void *ptr, unsigned char pattern, std::size_t num_bytes)
: _ptr{ptr}, _pattern{pattern}, _num_bytes{num_bytes} {}

result dispatch(operation_dispatcher* dispatcher) final override {
return dispatcher->dispatch_memset(this);
}

void *get_pointer() const { return _ptr; }
unsigned char get_pattern() const { return _pattern; }
std::size_t get_num_bytes() const { return _num_bytes; }

void dump(std::ostream&, int = 0) const override;
private:
void *_ptr;
unsigned char _pattern;
std::size_t _num_bytes;
};


Expand Down
156 changes: 126 additions & 30 deletions include/hipSYCL/sycl/handler.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,11 +34,14 @@

#include "exception.hpp"
#include "access.hpp"
#include "context.hpp"
#include "libkernel/backend.hpp"
#include "device.hpp"
#include "event.hpp"
#include "types.hpp"
#include "usm_query.hpp"
#include "libkernel/accessor.hpp"
#include "libkernel/builtin_kernels.hpp"
#include "libkernel/id.hpp"
#include "libkernel/range.hpp"
#include "libkernel/nd_range.hpp"
Expand All @@ -63,9 +66,8 @@ namespace sycl {

class queue;

class handler
{
friend class sycl::queue;
class handler {
friend class queue;
public:
~handler()
{
Expand Down Expand Up @@ -333,35 +335,127 @@ class handler
static_assert(tgt != access::target::host_image,
"host_image targets are unsupported");

// Use a function object instead of lambda to avoid
// requiring a unique kernel name for each fill call
// TODO: hipSYCL rt currently does not have a dedicated operation
// for fills - implement for the ability to implement fill using
// backend functionality instead of a kernel
class fill_kernel
{
public:
fill_kernel(accessor<T, dim, mode, tgt> dest,
const T& src)
: _dest{dest}, _src{src}
{}

void operator()(sycl::id<dim> tid)
{
_dest[tid] = _src;
}

private:
accessor<T, dim, mode, tgt> _dest;
T _src;
};

this->submit_kernel<class _unnamed_kernel, rt::kernel_type::basic_parallel_for>(
dest.get_offset(), dest.get_range(),
dest.get_range() /*local range unused for basic pf*/,
fill_kernel{dest, src});
detail::kernels::fill_kernel{dest, src});
}

// ------ USM functions ------

void memcpy(void *dest, const void *src, std::size_t num_bytes) {

rt::dag_build_guard build{rt::application::dag()};

if(!_execution_hints.has_hint<rt::hints::bind_to_device>())
throw invalid_parameter_error{"handler: explicit memcpy() is unsupported "
"for queues not bound to devices"};

rt::device_id queue_dev =
_execution_hints.get_hint<rt::hints::bind_to_device>()->get_device_id();


auto determine_ptr_device = [&, this](const void *ptr) {
usm::alloc alloc_type = get_pointer_type(ptr, _ctx);
// For shared allocations, be optimistic and assume that data is
// already on target device
if (alloc_type == usm::alloc::shared)
return queue_dev;

if (alloc_type == usm::alloc::host ||
alloc_type == usm::alloc::unknown)
return detail::get_host_device();

if(alloc_type == usm::alloc::device)
// we are dealing with a device allocation
return detail::extract_rt_device(get_pointer_device(ptr, _ctx));

throw invalid_parameter_error{"Invalid allocation type"};
};

rt::device_id src_dev = determine_ptr_device(src);
rt::device_id dest_dev = determine_ptr_device(dest);

rt::memory_location source_location{
src_dev, extract_ptr(src), rt::id<3>{},
rt::embed_in_range3(range<1>{num_bytes}), 1};

rt::memory_location dest_location{
dest_dev, extract_ptr(dest), rt::id<3>{},
rt::embed_in_range3(range<1>{num_bytes}), 1};

auto op = rt::make_operation<rt::memcpy_operation>(
source_location, dest_location, rt::embed_in_range3(range<1>{num_bytes}));

rt::dag_node_ptr node = build.builder()->add_memcpy(
std::move(op), _requirements, _execution_hints);

_command_group_nodes.push_back(node);
}


template <class T> void fill(void *ptr, const T &pattern, std::size_t count) {
// For special cases we can map this to a potentially more low-level memset
if (sizeof(T) == 1) {
unsigned char val = *reinterpret_cast<const unsigned char*>(&pattern);

memset(ptr, static_cast<int>(val), count);
} else {
T *typed_ptr = static_cast<T *>(ptr);

if (!_execution_hints.has_hint<rt::hints::bind_to_device>())
throw invalid_parameter_error{"handler: USM fill() is unsupported "
"for queues not bound to devices"};

this->submit_kernel<class _unnamed_kernel,
rt::kernel_type::basic_parallel_for>(
sycl::id<1>{}, sycl::range<1>{count},
sycl::range<1>{count} /*local range unused for basic pf*/,
detail::kernels::fill_kernel_usm{typed_ptr, pattern});
}
}

void memset(void *ptr, int value, std::size_t num_bytes) {

rt::dag_build_guard build{rt::application::dag()};

if(!_execution_hints.has_hint<rt::hints::bind_to_device>())
throw invalid_parameter_error{"handler: explicit memset() is unsupported "
"for queues not bound to devices"};

auto op = rt::make_operation<rt::memset_operation>(
ptr, static_cast<unsigned char>(value), num_bytes);

rt::dag_node_ptr node = build.builder()->add_memcpy(
std::move(op), _requirements, _execution_hints);

_command_group_nodes.push_back(node);
}

void prefetch(const void *ptr, std::size_t num_bytes) {

rt::dag_build_guard build{rt::application::dag()};

if(!_execution_hints.has_hint<rt::hints::bind_to_device>())
throw invalid_parameter_error{"handler: explicit prefetch() is unsupported "
"for queues not bound to devices"};

auto op = rt::make_operation<rt::prefetch_operation>(
ptr, num_bytes);

rt::dag_node_ptr node = build.builder()->add_prefetch(
std::move(op), _requirements, _execution_hints);

_command_group_nodes.push_back(node);
}

void mem_advise(const void *addr, std::size_t num_bytes, int advice) {
throw feature_not_supported{"mem_advise() is not yet supported"};
}



detail::local_memory_allocator& get_local_memory_allocator()
{
return _local_mem_allocator;
Expand Down Expand Up @@ -517,10 +611,12 @@ class handler
const std::vector<rt::dag_node_ptr>& get_cg_nodes() const
{ return _command_group_nodes; }

// defined in queue.hpp
handler(const queue& q, async_handler handler, const rt::execution_hints& hints);

const queue* _queue;
handler(const context &ctx, async_handler handler,
const rt::execution_hints &hints)
: _ctx{ctx}, _handler{handler},
_execution_hints{hints} {}

const context& _ctx;
detail::local_memory_allocator _local_mem_allocator;
async_handler _handler;

Expand Down
Loading