diff --git a/sycl/doc/extensions/DataFlowPipes/data_flow_pipes_rev4_proposed.asciidoc b/sycl/doc/extensions/DataFlowPipes/data_flow_pipes_rev4_proposed.asciidoc index d8eb67072d4ed..c0587f4e81a55 100755 --- a/sycl/doc/extensions/DataFlowPipes/data_flow_pipes_rev4_proposed.asciidoc +++ b/sycl/doc/extensions/DataFlowPipes/data_flow_pipes_rev4_proposed.asciidoc @@ -170,20 +170,82 @@ The pipe class exposes static member functions for writing a data word to a pipe Blocking and non-blocking forms of the read and write members are defined, with the form chosen based on overload resolution. +Read/write methods take an ext::oneapi::properties argument, which can contain the latency control properties `latency_anchor_id` and/or `latency_constraint`. + +* `sycl::ext::intel::latency_anchor_id`, where `N` is an integer: An ID to associate with the current read/write function call, which can then be referenced by other `latency_constraint` properties elsewhere in the program to define relative latency constaints. ID must be unique within the application, and a diagnostic is required if that condition is not met. +* `sycl::ext::intel::latency_constraint`: A tuple of three values which cause the current read/write function call to act as an endpoint of a latency constraint relative to a specified `latency_anchor_id` defined by a different instruction. +** `A` is an integer: The ID of the target anchor defined on a different instruction through a `latency_anchor_id` property. +** `B` is an enum value: The type of control from the set {`type::exact`, `type::min`, `type::max`}. +** `C` is an integer: The relative clock cycle difference between the target anchor and the current function call, that the constraint should infer subject to the type of the control (exact, min, max). + [source,c++,Read write members,linenums] ---- +namespace sycl::ext::intel { + +struct latency_anchor_id { + template + using value_t = property_value>; +}; + +struct latency_constraint { + enum class type { + none, + exact, + max, + min + }; + template + using value_t = property_value, + std::integral_constant, + std::integral_constant>; +}; + +template +struct property_value, + std::integral_constant, + std::integral_constant> { + static constexpr int target = Target; + static constexpr latency_constraint::type type = Type; + static constexpr int cycle = Cycle; +}; + template class pipe { // Blocking + static dataT read(); + + // Added in version 2 of this extension. + template + static dataT read( Properties p ); // p must be an ext::oneapi::properties + static void write( const dataT &data ); + // Added in version 2 of this extension. + template + static void write( const dataT &data, Properties p ); // p must be an ext::oneapi::properties + // Non-blocking + static dataT read( bool &success_code ); + + // Added in version 2 of this extension. + template + static dataT read( bool &success_code, Properties p ); // p must be an ext::oneapi::properties + static void write( const dataT &data, bool &success_code ); + + // Added in version 2 of this extension. + template + static void write( const dataT &data, bool &success_code, Properties p ); // p must be an ext::oneapi::properties } + +} // namespace sycl::ext::intel ---- The read and write member functions may be invoked within device code, or within a SYCL host program. Some interfaces may not be available on all devices/implementations, but the pipe definition itself does not gate availability. Instead, implementations should error if an unsupported pipe is used. See section <> for information on querying the availability of specific pipe features relative to a device. @@ -648,6 +710,7 @@ extension's APIs the implementation supports. |=== |Value |Description |1 |Initial extension version. Base features are supported. +|2 |Add latency control feature to member functions. |=== == Revision History @@ -660,6 +723,7 @@ extension's APIs the implementation supports. |1|2019-09-12|Michael Kinsner|*Initial public working draft* |2|2019-11-13|Michael Kinsner|Incorporate feedback |3|2020-04-27|Michael Kinsner|Clarify that pipe operations behave as-if they are relaxed atomic operations. Make SYCL2020 the baseline +|4|2021-11-08|Shuo Niu|Add latency control feature to member functions |======================================== //************************************************************************ diff --git a/sycl/doc/extensions/IntelFPGA/FPGALsu_rev2_proposed.md b/sycl/doc/extensions/IntelFPGA/FPGALsu_rev2_proposed.md index e79cafe7560e5..06a2a3872ff65 100644 --- a/sycl/doc/extensions/IntelFPGA/FPGALsu_rev2_proposed.md +++ b/sycl/doc/extensions/IntelFPGA/FPGALsu_rev2_proposed.md @@ -4,38 +4,55 @@ The Intel FPGA `lsu` class is implemented in `sycl/ext/intel/fpga_lsu.hpp` which is included in `sycl/ext/intel/fpga_extensions.hpp`. -The class `cl::sycl::ext::intel::lsu` allows users to explicitly request that the +The class `sycl::ext::intel::lsu` allows users to explicitly request that the implementation of a global memory access is configured in a certain way. The class has two member functions, `load()` and `store()` which allow loading from and storing to a `multi_ptr`, respectively, and is templated on the following -4 optional paremeters: +4 optional parameters: -1. **`cl::sycl::ext::intel::burst_coalesce`, where `B` is a boolean**: request, +1. **`sycl::ext::intel::burst_coalesce`, where `B` is a boolean**: request, to the extent possible, that a dynamic burst coalescer be implemented when `load` or `store` are called. The default value of this parameter is `false`. -2. **`cl::sycl::ext::intel::cache`, where `N` is an integer greater or equal to +2. **`sycl::ext::intel::cache`, where `N` is an integer greater or equal to 0**: request, to the extent possible, that a read-only cache of the specified size in bytes be implemented when when `load` is called. It is not allowed to use that parameter for `store`. The default value of this parameter is `0`. -3. **`cl::sycl::ext::intel::statically_coalesce`, where `B` is a boolean**: +3. **`sycl::ext::intel::statically_coalesce`, where `B` is a boolean**: request, to the extent possible, that `load` or `store` accesses, is allowed to be statically coalesced with other memory accesses at compile time. The default value of this parameter is `true`. -4. **`cl::sycl::ext::intel::prefetch`, where `B` is a boolean**: request, to the +4. **`sycl::ext::intel::prefetch`, where `B` is a boolean**: request, to the extent possible, that a prefetcher be implemented when `load` is called. It is not allowed to use that parameter for `store`. The default value of this parameter is `false`. Currently, not every combination of parameters is allowed due to limitations in the backend. The following rules apply: -1. For `store`, `cl::sycl::ext::intel::cache` must be `0` and -`cl::sycl::ext::intel::prefetch` must be `false`. -2. For `load`, if `cl::sycl::ext::intel::cache` is set to a value greater than `0`, -then `cl::sycl::ext::intel::burst_coalesce` must be set to `true`. -3. For `load`, exactly one of `cl::sycl::ext::intel::prefetch` and -`cl::sycl::ext::intel::burst_coalesce` is allowed to be `true`. -4. For `load`, exactly one of `cl::sycl::ext::intel::prefetch` and -`cl::sycl::ext::intel::cache` is allowed to be `true`. +1. For `store`, `sycl::ext::intel::cache` must be `0` and +`sycl::ext::intel::prefetch` must be `false`. +2. For `load`, if `sycl::ext::intel::cache` is set to a value greater than `0`, +then `sycl::ext::intel::burst_coalesce` must be set to `true`. +3. For `load`, exactly one of `sycl::ext::intel::prefetch` and +`sycl::ext::intel::burst_coalesce` is allowed to be `true`. +4. For `load`, exactly one of `sycl::ext::intel::prefetch` and +`sycl::ext::intel::cache` is allowed to be `true`. + +Member functions `load()` or `store()` can take in an ext::oneapi::properties as argument, +which contains the following two properties of latency control: + +1. **`sycl::ext::intel::latency_anchor_id`, where `N` is an integer**: +represents ID of the current function call when it performs as an anchor. The ID +must be unique within the application, with a diagnostic required if that +condition is not met. +2. **`sycl::ext::intel::latency_constraint`** contains control +parameters when the current function performs as a non-anchor, where: + - **`A` is an integer**: The ID of the target anchor defined on a different + instruction through a `latency_anchor_id` property. + - **`B` is an enum value**: The type of control from the set + {`type::exact`, `type::min`, `type::max`}. + - **`C` is an integer**: The relative clock cycle difference between the + target anchor and the current function call, that the constraint should + infer subject to the type of the control (exact, min, max). ## Implementation @@ -43,10 +60,38 @@ The implementation relies on the Clang built-in `__builtin_intel_fpga_mem` when parsing the SYCL device code. The built-in uses the LLVM `ptr.annotation` intrinsic under the hood to annotate the pointer that is being accessed. ```c++ +namespace sycl::ext::intel { +struct latency_anchor_id { + template + using value_t = property_value>; +}; +struct latency_constraint { + enum class type { + none, + exact, + max, + min + }; + template + using value_t = property_value, + std::integral_constant, + std::integral_constant>; +}; +template +struct property_value, + std::integral_constant, + std::integral_constant> { + static constexpr int target = Target; + static constexpr latency_constraint::type type = Type; + static constexpr int cycle = Cycle; +}; + template class lsu final { public: lsu() = delete; - template static _T load(sycl::multi_ptr<_T, _space> Ptr) { check_space<_space>(); @@ -54,13 +99,42 @@ public: #if defined(__SYCL_DEVICE_ONLY__) && __has_builtin(__builtin_intel_fpga_mem) return *__builtin_intel_fpga_mem((_T *)Ptr, _burst_coalesce | _cache | - _dont_statically_coalesce | _prefetch, - _cache_val); + _dont_statically_coalesce | _prefetch, + _cache_val, + _reserved_default_anchor_id, 0, 0, 0); +#else + return *Ptr; +#endif + } + // Added in version 2 of this extension. + template + static _T load(sycl::multi_ptr<_T, _space> Ptr, Properties p) { + check_space<_space>(); + check_load(); +#if defined(__SYCL_DEVICE_ONLY__) && __has_builtin(__builtin_intel_fpga_mem) + statuc constexpr int _target = p.get_property().target; + static constexpr sycl::ext::intel::latency_constraint::type _type_enum = p.get_property().type; + int _type; + if (_type_enum == sycl::ext::intel::latency_constraint::type::none) { + _type = 0; + } else if (_type_enum == sycl::ext::intel::latency_constraint::type::exact) { + _type = 1; + } else if (_type_enum == sycl::ext::intel::latency_constraint::type::max) { + _type = 2; + } else { // _type_enum == sycl::ext::intel::latency_constraint::type::min + _type = 3; + } + statuc constexpr int _cycle = p.get_property().cycle; + return *__builtin_intel_fpga_mem((_T *)Ptr, + _burst_coalesce | _cache | + _dont_statically_coalesce | _prefetch, + _cache_val, + p.get_property().value, + _target, _type, _cycle); #else return *Ptr; #endif } - template static void store(sycl::multi_ptr<_T, _space> Ptr, _T Val) { check_space<_space>(); @@ -68,14 +142,45 @@ public: #if defined(__SYCL_DEVICE_ONLY__) && __has_builtin(__builtin_intel_fpga_mem) *__builtin_intel_fpga_mem((_T *)Ptr, _burst_coalesce | _cache | - _dont_statically_coalesce | _prefetch, - _cache_val) = Val; + _dont_statically_coalesce | _prefetch, + _cache_val, + _reserved_default_anchor_id, 0, 0, 0) = Val; +#else + *Ptr = Val; +#endif + } + // Added in version 2 of this extension. + template + static void store(sycl::multi_ptr<_T, _space> Ptr, _T Val, Properties p) { + check_space<_space>(); + check_store(); +#if defined(__SYCL_DEVICE_ONLY__) && __has_builtin(__builtin_intel_fpga_mem) + statuc constexpr int _target = p.get_property().target; + static constexpr sycl::ext::intel::latency_constraint::type _type_enum = p.get_property().type; + int _type; + if (_type_enum == sycl::ext::intel::latency_constraint::type::none) { + _type = 0; + } else if (_type_enum == sycl::ext::intel::latency_constraint::type::exact) { + _type = 1; + } else if (_type_enum == sycl::ext::intel::latency_constraint::type::max) { + _type = 2; + } else { // _type_enum == sycl::ext::intel::latency_constraint::type::min + _type = 3; + } + statuc constexpr int _cycle = p.get_property().cycle; + *__builtin_intel_fpga_mem((_T *)Ptr, + _burst_coalesce | _cache | + _dont_statically_coalesce | _prefetch, + _cache_val, + p.get_property().value, + _target, _type, _cycle) = Val; #else *Ptr = Val; #endif } ... } +} // namespace sycl::ext::intel ``` ## Usage @@ -83,37 +188,38 @@ public: ```c++ #include ... -cl::sycl::buffer output_buffer(output_data, 1); -cl::sycl::buffer input_buffer(input_data, 1); - -Queue.submit([&](cl::sycl::handler &cgh) { - auto output_accessor = output_buffer.get_access(cgh); - auto input_accessor = input_buffer.get_access(cgh); - +sycl::buffer output_buffer(output_data, 1); +sycl::buffer input_buffer(input_data, 1); +Queue.submit([&](sycl::handler &cgh) { + auto output_accessor = output_buffer.get_access(cgh); + auto input_accessor = input_buffer.get_access(cgh); cgh.single_task([=] { auto input_ptr = input_accessor.get_pointer(); auto output_ptr = output_accessor.get_pointer(); - using PrefetchingLSU = - cl::sycl::ext::intel::lsu, - cl::sycl::ext::intel::statically_coalesce>; - + sycl::ext::intel::lsu, + sycl::ext::intel::statically_coalesce>; using BurstCoalescedLSU = - cl::sycl::ext::intel::lsu, - cl::sycl::ext::intel::statically_coalesce>; - + sycl::ext::intel::lsu, + sycl::ext::intel::statically_coalesce>; using CachingLSU = - cl::sycl::ext::intel::lsu, - cl::sycl::ext::intel::cache<1024>, - cl::sycl::ext::intel::statically_coalesce>; - - using PipelinedLSU = cl::sycl::ext::intel::lsu<>; - + sycl::ext::intel::lsu, + sycl::ext::intel::cache<1024>, + sycl::ext::intel::statically_coalesce>; + using PipelinedLSU = sycl::ext::intel::lsu<>; int X = PrefetchingLSU::load(input_ptr); // int X = input_ptr[0] int Y = CachingLSU::load(input_ptr + 1); // int Y = input_ptr[1] - BurstCoalescedLSU::store(output_ptr, X); // output_ptr[0] = X PipelinedLSU::store(output_ptr + 1, Y); // output_ptr[1] = Y + // Latency controls. Added in version 2 of this extension. + // Load is anchor 1 + int Z = PrefetchingLSU::load(input_ptr + 2, + sycl::ext::oneapi::properties{latency_anchor_id<1>}); + // Store occurs 5 cycles after the anchor 1 load + BurstCoalescedLSU::store(output_ptr + 2, Z, + sycl::ext::oneapi::properties{latency_constraint<1, + sycl::ext::intel::latency_constraint::type::exact, + 5>}); }); }); ... @@ -132,3 +238,4 @@ extension |Value |Description| |:---- |:---------:| |1 |Initial extension version. Base features are supported.| +|2 |Add latency control feature to member functions.|