diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/alt_ui.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/alt_ui.hpp index fe5ee68022b56..e6d571f32c848 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/alt_ui.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/alt_ui.hpp @@ -18,8 +18,8 @@ namespace intel { namespace experimental { namespace esimd { +/// @addtogroup sycl_esimd_misc /// @{ -/// @ingroup sycl_esimd_misc /// "Merges" elements of the input vectors according to the merge mask. /// @param a the first vector diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/common.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/common.hpp index 81d3944bde558..e1dabe5dea48e 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/common.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/common.hpp @@ -14,6 +14,8 @@ #include // for uint* types +/// @cond ESIMD_DETAIL + #ifdef __SYCL_DEVICE_ONLY__ #define SYCL_ESIMD_KERNEL __attribute__((sycl_explicit_simd)) #define SYCL_ESIMD_FUNCTION __attribute__((sycl_explicit_simd)) @@ -63,6 +65,8 @@ #define __ESIMD_DEPRECATED(new_api) \ __SYCL_DEPRECATED("use " __ESIMD_NS_QUOTED "::" __ESIMD_QUOTE(new_api)) +/// @endcond ESIMD_DETAIL + __SYCL_INLINE_NAMESPACE(cl) { namespace sycl { namespace ext { @@ -70,8 +74,8 @@ namespace intel { namespace experimental { namespace esimd { +/// @addtogroup sycl_esimd_core /// @{ -/// @ingroup sycl_esimd_core using uchar = unsigned char; using ushort = unsigned short; diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/detail/simd_obj_impl.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/detail/simd_obj_impl.hpp index 995e6d1a14421..71059654250a3 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/detail/simd_obj_impl.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/detail/simd_obj_impl.hpp @@ -28,22 +28,39 @@ namespace esimd { /// @addtogroup sycl_esimd_core /// @{ -/// @defgroup sycl_esimd_core_align Alignment control +/// @defgroup sycl_esimd_core_align Alignment control. /// Alignment type tags and related APIs for use with ESIMD memory access -/// operations. +/// operations. The basic restrictions for memory location specified as +/// parameters for memory access APIs supporting alignment control are as +/// follows: +/// - If alignment control parameter is \c element_aligned_tag, then the +/// location must be aligned by alignof(T), where \c T is element +/// type. +/// - If it is \c vector_aligned_tag, the location must be aligned by +/// alignof(VT), where \c VT is the raw vector type of the +/// accessed \c simd_obj_impl derivative class object. +/// - If it is overaligned_tag, the location must be aligned by +/// \c N. +/// +/// Program not meeting alignment requirements results in undefined +/// behavior. + +/// @} /// @addtogroup sycl_esimd_core_align /// @{ -/// element_aligned_tag type. Flag of this type should be used in load and store -/// operations when memory address is aligned by simd object's element type. + +/// \c element_aligned_tag type. Flag of this type should be used in load and +/// store operations when memory address is aligned by simd object's element +/// type. struct element_aligned_tag { template > static constexpr unsigned alignment = alignof(ET); }; -/// vector_aligned_tag type. Flag of this type should be used in load and store -/// operations when memory address is guaranteed to be aligned by simd object's -/// vector type. +/// \c vector_aligned_tag type. Flag of this type should be used in load and +/// store operations when memory address is guaranteed to be aligned by simd +/// object's vector type. struct vector_aligned_tag { template static constexpr unsigned alignment = alignof(VT); }; @@ -51,7 +68,7 @@ struct vector_aligned_tag { /// overaligned_tag type. Flag of this type should be used in load and store /// operations when memory address is aligned by the user-provided alignment /// value N. -/// \tparam N is the alignment value. N must be a power of two. +/// @tparam N is the alignment value. N must be a power of two. template struct overaligned_tag { static_assert( detail::isPowerOf2(N), @@ -80,7 +97,7 @@ struct is_simd_flag_type> : std::true_type {}; template static inline constexpr bool is_simd_flag_type_v = is_simd_flag_type::value; -/// @} alignment tags +/// @} sycl_esimd_core_align /// @cond ESIMD_DETAIL @@ -112,6 +129,9 @@ constexpr vector_type_t make_vector(T Base, T Stride) { /// @endcond ESIMD_DETAIL +/// @addtogroup sycl_esimd_core +/// @{ + /// This is a base class for all ESIMD simd classes with real storage (simd, /// simd_mask_impl). It wraps a clang vector as the storage for the elements. /// Additionally this class supports region operations that map to Intel GPU @@ -119,30 +139,37 @@ constexpr vector_type_t make_vector(T Base, T Stride) { /// simd_view type, which models a "window" into this object's storage and can /// used to read and modify it. /// -/// \tparam RawTy raw (storage) element type -/// \tparam N number of elements -/// \tparam Derived - a class derived from this one; this class and its -/// derivatives must follow the 'curiously recurring template' pattern. -/// Note that for some element types, the element type in the \c Derived -/// type and this type may differ - for example, half type. -/// \tparam SFINAE - defaults to 'void' in the forward declarion within -/// types.hpp, used to disable invalid specializations. +/// This class and its derivatives must follow the +/// 'curiously recurring template' design pattern. +/// +/// @tparam RawTy Raw (storage) element type +/// @tparam N Number of elements +/// @tparam Derived - A class derived from this one. Pure \c simd_obj_impl +/// objects are never supposed to be constructed directly neither by user nor +/// by ESIMD library code, instead they should always be enclosed into objects +/// of some derived class - \c simd or \c simd_mask. This derived class is +/// captured by this template parameter. +/// Note that for some element types, the element type in the \c Derived +/// type and this type may differ - for example, \c half type. +/// @tparam SFINAE - defaults to 'void' in the forward declarion within +/// types.hpp, used to disable invalid specializations. /// -// For the is_simd_obj_impl_derivative helper to work correctly, all derived -// classes must be templated by element type and number of elements. If fewer -// template arguments are needed, template aliases can be used -// (simd_mask_type). -// template class simd_obj_impl { + // For the is_simd_obj_impl_derivative helper to work correctly, all derived + // classes must be templated by element type and number of elements. If fewer + // template arguments are needed, template aliases can be used + // (simd_mask_type). + // template friend class simd_view; + template friend class simd_view_impl; template friend class simd; template friend class simd_mask_impl; +public: + /// Element type of the derived (user) class. using element_type = get_vector_element_type; - using Ty = element_type; -public: /// The underlying raw storage vector data type. using raw_vector_type = vector_type_t; @@ -153,6 +180,8 @@ class simd_obj_impl { static constexpr int length = N; protected: + using Ty = element_type; + template void init_from_array(const Ty (&&Arr)[N]) noexcept { raw_vector_type tmp; @@ -171,6 +200,11 @@ class simd_obj_impl { } } + explicit operator raw_vector_type() const { + __esimd_dbg_print(explicit operator raw_vector_type()); + return data(); + } + private: Derived &cast_this_to_derived() { return reinterpret_cast(*this); } const Derived &cast_this_to_derived() const { @@ -178,15 +212,23 @@ class simd_obj_impl { } public: + /// Default constructor. Values of the constructed object's elements are + /// undefined. simd_obj_impl() = default; /// Copy constructor. + /// @param other The other object to bitwise-copy elements from. simd_obj_impl(const simd_obj_impl &other) { __esimd_dbg_print(simd_obj_impl(const simd_obj_impl &other)); set(other.data()); } /// Implicit conversion constructor from another \c simd_obj_impl object. + /// Elements of the of the other object are type-converted to \c element_type + /// to obtain elements of this object. + /// @tparam Ty1 Raw element type of the other object. + /// @tparam Derived1 The actual type of the other object. + /// @param other The other object. template simd_obj_impl(const simd_obj_impl &other) { __esimd_dbg_print(simd_obj_impl(const simd_obj_impl... > &other)); @@ -194,12 +236,18 @@ class simd_obj_impl { } /// Implicit conversion constructor from a raw vector object. + /// @param Val the raw vector to convert from. simd_obj_impl(const raw_vector_type &Val) { __esimd_dbg_print(simd_obj_impl(const raw_vector_type &Val)); set(Val); } - /// Initialize a simd_obj_impl object with an initial value and step. + /// Arithmetic progression constructor. Consecutive elements of this object + /// are initialized with the arithmetic progression defined by the arguments. + /// For example, simd x(1, 3) will initialize x to the + /// {1, 4, 7, 10} sequence. + /// @param Val The start of the progression. + /// @param Step The step of the progression. simd_obj_impl(Ty Val, Ty Step) noexcept { __esimd_dbg_print(simd_obj_impl(Ty Val, Ty Step)); if constexpr (is_wrapper_elem_type_v || !std::is_integral_v) { @@ -212,7 +260,11 @@ class simd_obj_impl { } } - /// Broadcast constructor + /// Broadcast constructor. Given value is type-converted to the + /// \c element_type and resulting bit representation is broadcast to all lanes + /// of the underlying vector. + /// @tparam T1 Type of the value. + /// @param Val The value to broadcast. template >> simd_obj_impl(T1 Val) noexcept { @@ -220,7 +272,10 @@ class simd_obj_impl { M_data = bitcast_to_raw_type(detail::convert_scalar(Val)); } - /// Construct from an array. To allow e.g. simd_mask_type m({1,0,0,1,...}). + /// Rvalue array-based constructor. Used for in-place initialization like + /// simd x({1,0,0,1,...}). + /// + /// @param Arr Rvalue reference to an array of size @ref N to initialize from. template > simd_obj_impl(const Ty (&&Arr)[N1]) noexcept { __esimd_dbg_print(simd_obj_impl(const Ty(&&Arr)[N1])); @@ -233,7 +288,12 @@ class simd_obj_impl { // default ctor can be used for them } - /// Load constructor. + /// Pointer-based load constructor. Initializes this object from values stored + /// in memory. For example: + /// simd x(ptr, overaligned_tag<16>{});. + /// @tparam Flags Specifies memory address alignment. Affects efficiency of + /// the generated code. + /// @param ptr The memory address to read from. template >> simd_obj_impl(const Ty *ptr, Flags = {}) noexcept { @@ -241,7 +301,16 @@ class simd_obj_impl { copy_from(ptr, Flags{}); } - /// Accessor-based load constructor. + /// Accessor-based load constructor. Initializes constructed object from + /// values stored in memory represented by an accessor and an offset. For + /// example: + /// simd x(acc, 128, overaligned_tag<16>{});. + /// @tparam AccessorT the type of the accessor. Auto-deduced. + /// @tparam Flags Specifies memory address alignment. Affects efficiency of + /// the generated code. Auto-deduced from the unnamed alignment tag + /// argument. + /// @param acc The accessor to read from. + /// @param offset 32-bit offset in bytes of the first element. template std::enable_if_t copy_from(const Ty (&&Arr)[N1]) { __esimd_dbg_print(copy_from(const Ty(&&Arr)[N1])); init_from_array(std::move(Arr)); } - explicit operator raw_vector_type() const { - __esimd_dbg_print(explicit operator raw_vector_type()); - return data(); - } - /// Type conversion into a scalar: - /// simd_obj_impl> to Ty. + /// > to \c Ty. template > operator Ty() const { @@ -273,6 +339,7 @@ class simd_obj_impl { return bitcast_to_wrapper_type(data()[0]); } + /// @return The value of the underlying raw vector. raw_vector_type data() const { __esimd_dbg_print(raw_vector_type data()); #ifndef __SYCL_DEVICE_ONLY__ @@ -282,34 +349,59 @@ class simd_obj_impl { #endif } - /// Whole region read. + /// @return Newly constructed (from the underlying data) object of the Derived + /// type. Derived read() const { return Derived{data()}; } - /// Whole region write. + /// Replaces the underlying data with the one taken from another object. + /// @return This object. Derived &write(const Derived &Val) { set(Val.data()); return cast_this_to_derived(); } - /// Whole region update with predicates. + /// "Merges" this object's value with another object: + /// replaces part of the underlying data with the one taken from the other + /// object according to a mask. Only elements in lanes where corresponding + /// mask's value is non-zero are replaced. + /// @param Val The object to take new values from. + /// @param Mask The mask. void merge(const Derived &Val, const simd_mask_type &Mask) { set(__esimd_wrregion(data(), Val.data(), 0, Mask.data())); } + /// Merges given two objects with a mask and writes resulting data into this + /// object. + /// @param Val1 The first object, provides elements for lanes with zero + /// corresponding predicates. + /// @param Val2 The second object, provides elements for lanes with non-zero + /// corresponding predicates. + /// @param Mask The mask. void merge(const Derived &Val1, Derived Val2, const simd_mask_type &Mask) { Val2.merge(Val1, Mask); set(Val2.data()); } - /// View this simd_obj_impl object in a different element type. + /// View this \c simd_obj_impl object in a different element type and + /// potentially a different number of elements, if the new element type size + /// is different. + /// @tparam EltTy The new element type. + /// @return A simd_view object providing the alternative view of entire + /// \c this object. template auto bit_cast_view() &[[clang::lifetimebound]] { using TopRegionTy = compute_format_type_t; using RetTy = simd_view; return RetTy{cast_this_to_derived(), TopRegionTy{0}}; } - /// View as a 2-dimensional simd_view. + /// Create a 2-dimensional view (\c simd_view object) of this object. + /// sizeof(EltTy)*Height*Width must be equal to the byte size of + /// this object. + /// @tparam ElTy Element type of the view. Can mismatch current type. + /// @tparam Height Height of the view in rows. + /// @tparam Width Width of the view in elements. + /// @return The 2D view. template auto bit_cast_view() &[[clang::lifetimebound]] { using TopRegionTy = compute_format_type_2d_t; @@ -317,12 +409,13 @@ class simd_obj_impl { return RetTy{cast_this_to_derived(), TopRegionTy{0, 0}}; } - /// 1D region select, apply a region on top of this LValue object. + /// Select elements of this object into a subregion and create a 1D view for + /// for it. Used when \c this is an lvalue. /// - /// \tparam Size is the number of elements to be selected. - /// \tparam Stride is the element distance between two consecutive elements. - /// \param Offset is the starting element offset. - /// \return the representing region object. + /// @tparam Size The number of elements selected for the subregion. + /// @tparam Stride A distance in elements between two consecutive elements. + /// @param Offset The starting element's offset. + /// @return A view of the subregion. template simd_view> select(uint16_t Offset = 0) &[[clang::lifetimebound]] { @@ -332,12 +425,13 @@ class simd_obj_impl { return {cast_this_to_derived(), std::move(Reg)}; } - /// 1D region select, apply a region on top of this RValue object. + /// Select and extract a subregion of this object's elements and return it as + /// a new vector object. Used when \c this is an rvalue. /// - /// \tparam Size is the number of elements to be selected. - /// \tparam Stride is the element distance between two consecutive elements. - /// \param Offset is the starting element offset. - /// \return the value this region object refers to. + /// @tparam Size The number of elements selected for the subregion. + /// @tparam Stride A distance in elements between two consecutive elements. + /// @param Offset The starting element's offset. + /// @return Extracted subregion as a new vector object. template resize_a_simd_type_t select(uint16_t Offset = 0) && { static_assert(Size > 1 || Stride == 1, @@ -347,33 +441,45 @@ class simd_obj_impl { Offset); } - /// Read single element, return value only (not reference). + /// Get value of this vector's element. + /// @param i Element index. + /// @return Value of i'th element. Ty operator[](int i) const { return bitcast_to_wrapper_type(data()[i]); } /// Return writable view of a single element. + /// @param i Element index. + /// @return View of i'th element. simd_view> operator[](int i) [[clang::lifetimebound]] { return select<1, 1>(i); } - // TODO ESIMD_EXPERIMENTAL - /// Read multiple elements by their indices in vector + /// Indirect select - select and extract multiple elements with given + /// variable indices. + /// @tparam Size The number of elements to select. + /// @param Indices Indices of element to select. + /// @return Vector of extracted elements. template resize_a_simd_type_t iselect(const simd &Indices) { vector_type_t Offsets = Indices.data() * sizeof(RawTy); return __esimd_rdindirect(data(), Offsets); } - // TODO ESIMD_EXPERIMENTAL - /// update single element + + /// Update single element with variable index. + /// @param Index Element index. + /// @param V New value. void iupdate(ushort Index, Ty V) { auto Val = data(); Val[Index] = bitcast_to_raw_type(V); set(Val); } - // TODO ESIMD_EXPERIMENTAL - /// update multiple elements by their indices in vector + /// Indirect update - update multiple elements with given variable indices. + /// @tparam Size The number of elements to update. + /// @param Indices Indices of element to update. + /// @param Val New values. + /// @param Mask Operation mask. 1 - update, 0 - not. template void iupdate(const simd &Indices, const resize_a_simd_type_t &Val, @@ -383,68 +489,113 @@ class simd_obj_impl { Mask.data())); } - /// \name Replicate - /// Replicate simd_obj_impl instance given a region. - /// @{ - /// - - /// \tparam Rep is number of times region has to be replicated. - /// \return replicated simd_obj_impl instance. + /// Replicates contents of this vector a number of times into a new vector. + /// @tparam Rep The number of times this vector has to be replicated. + /// @return Replicated simd_obj_impl instance. template resize_a_simd_type_t replicate() const { return replicate_w(0); } - /// \tparam Rep is number of times region has to be replicated. - /// \tparam W is width of src region to replicate. - /// \param Offset is offset in number of elements in src region. - /// \return replicated simd_obj_impl instance. + /// Shortcut to \c replicate_vs_w_hs with \c VS=0 and \c HS=1 to replicate a + /// single "dense" (w/o gaps between elements) block \c Rep times. + /// @tparam Rep The number of times to replicate the block. + /// @tparam W Width - number of elements in the block. + /// @param Offset Offset of the block's first element. + /// @return Vector of size Rep*W consisting of replicated + /// elements of \c this object. template resize_a_simd_type_t replicate_w(uint16_t Offset) const { return replicate_vs_w_hs(Offset); } - /// \tparam Rep is number of times region has to be replicated. - /// \tparam VS vertical stride of src region to replicate. - /// \tparam W width of src region to replicate. - /// \param Offset offset in number of elements in src region. - /// \return replicated simd_obj_impl instance. + /// Shortcut to \c replicate_vs_w_hs with \c HS=1 to replicate dense blocks. + /// @tparam Rep Number of blocks to select for replication. + /// @tparam VS Vertical stride - distance between first elements of + /// consecutive blocks. If \c VS=0, then the same block will be + /// replicated \c Rep times in the result. + /// @tparam W Width - number of elements in a block. + /// @param Offset The offset of the first element of the first block. + /// @return Vector of size Rep*W consisting of replicated + /// elements of \c this object. template resize_a_simd_type_t replicate_vs_w(uint16_t Offset) const { return replicate_vs_w_hs(Offset); } - /// \tparam Rep is number of times region has to be replicated. - /// \tparam VS vertical stride of src region to replicate. - /// \tparam W is width of src region to replicate. - /// \tparam HS horizontal stride of src region to replicate. - /// \param Offset is offset in number of elements in src region. - /// \return replicated simd_obj_impl instance. + /// This function "replicates" a portion of this object's elements into a new + /// object. The source elements to replicate are \c Rep number of blocks each + /// of size \c W elements. Starting elements of consecutive blocks are \c VS + /// elements apart and i'th block starts from + /// ith_block_start_ind = Offset + i*VS + /// index. Consecutive elements within a block are \c HS elements apart and + /// j'th element in the block has ith_block_start_ind + j*HS + /// index. Thus total of Rep*W elements are returned. Note that + /// depending on \c VS, \c W and \c HS, blocks' elements may overlap and in + /// this case the elements where the overlap happens may participate 2 or more + /// times in the result. + /// + /// *Example 1*. Source object has 32 elements, \c Rep is 2, \c VS is 17, \c W + /// is 3 and \c HS is 4. Selected elements are depicted with their index + /// (mathing their values) /// instead of a dot: + /// @code + /// simd Source(0/*Base*/, 1/*Step*/); + /// simd Result = Source.replicate_vs_w_hs<2,17,3,4>(1); + /// // |<-------------- VS=17 ------------->| + /// // v-------v-------v W=3 + /// // . 1 . . . 5 . . . 9 . . . . . . . \ Rep=2 + /// // . 18 . . . 22 . . . 26 . . . . . / + /// // |<- HS=4->| + /// // The Result is a vector of 6 source elements {1,5,9,18,22,26}. + /// @endcode + /// + /// *Example 2*. AOS 7x3 => SOA 3x7 conversion. + /// \c Rep is 3, \c VS is 1, \c W is 7 and \c HS is 3. + /// @code + /// simd Source = getSource(); + /// simd Result = Source.replicate_vs_w_hs<3,1,7,3>(0); + /// // Source: + /// // x0 y0 z0 x1 y1 z1 x2 y2 z2 x3 y3 z3 x4 y4 z4 x5 y5 z5 x6 y6 z6 + /// // Result: + /// // x0 x1 x2 x3 x4 x5 x6 y0 y1 y2 y3 y4 y5 y6 z0 z1 z2 z3 z4 z5 z6 + /// @endcode + /// + /// @tparam Rep Number of blocks to select for replication. + /// @tparam VS Vertical stride - distance between first elements of + /// consecutive blocks. If \c VS=0, then the same block will be + /// replicated \c Rep times in the result. + /// @tparam W Width - number of elements in a block. + /// @tparam HS Horizontal stride - distance between consecutive elements in a + /// block. + /// @param Offset The offset of the first element of the first block. + /// @return Vector of size Rep*W consisting of replicated + /// elements of \c this object. + /// template resize_a_simd_type_t replicate_vs_w_hs(uint16_t Offset) const { return __esimd_rdregion( data(), Offset * sizeof(RawTy)); } - ///@} - /// 'any' operation. + /// See if any element is non-zero. /// - /// \return 1 if any element is set, 0 otherwise. + /// @return 1 if any element is non-zero, 0 otherwise. template ::value>> uint16_t any() const { return __esimd_any(data()); } - /// 'all' operation. + /// See if all elements are non-zero. /// - /// \return 1 if all elements are set, 0 otherwise. + /// @return 1 if all elements are non-zero, 0 otherwise. template ::value>> uint16_t all() const { return __esimd_all(data()); } +protected: /// Write a simd_obj_impl-vector into a basic region of a simd_obj_impl /// object. template > @@ -526,24 +677,17 @@ class simd_obj_impl { } } - /// @name Memory operations - /// TODO NOTE: These APIs do not support cache hint specification yet, as this - /// is WIP. Later addition of hints is not expected to break code using these - /// APIs. - /// - /// @{ - +public: /// Copy a contiguous block of data from memory into this simd_obj_impl /// object. The amount of memory copied equals the total size of vector /// elements in this object. + /// None of the template parameters except documented ones can/should be + /// specified by callers. + /// + /// @tparam Flags Alignment control for the copy operation. + /// See @ref sycl_esimd_core_align for more info. /// @param addr the memory address to copy from. Must be a pointer to the /// global address space, otherwise behavior is undefined. - /// @param flags for the copy operation. If the template parameter Flags is - /// is element_aligned_tag, \p addr must be aligned by alignof(T). If Flags is - /// vector_aligned_tag, \p addr must be aligned by simd_obj_impl's - /// raw_vector_type alignment. If Flags is overaligned_tag, \p addr must be - /// aligned by N. Program not meeting alignment requirements results in - /// undefined behavior. template >> ESIMD_INLINE void copy_from(const Ty *addr, Flags = {}) SYCL_ESIMD_FUNCTION; @@ -552,14 +696,13 @@ class simd_obj_impl { /// object. The amount of memory copied equals the total size of vector /// elements in this object. Source memory location is represented via a /// global accessor and offset. + /// None of the template parameters except documented ones can/should be + /// specified by callers. + /// @tparam AccessorT Type of the accessor (auto-deduced). + /// @tparam Flags Alignment control for the copy operation. + /// See @ref sycl_esimd_core_align for more info. /// @param acc accessor to copy from. /// @param offset offset to copy from (in bytes). - /// @param flags for the copy operation. If the template parameter Flags is - /// is element_aligned_tag, offset must be aligned by alignof(T). If Flags is - /// vector_aligned_tag, offset must be aligned by simd_obj_impl's - /// raw_vector_type alignment. If Flags is overaligned_tag, offset must be - /// aligned by N. Program not meeting alignment requirements results in - /// undefined behavior. template >> @@ -568,14 +711,11 @@ class simd_obj_impl { copy_from(AccessorT acc, uint32_t offset, Flags = {}) SYCL_ESIMD_FUNCTION; /// Copy all vector elements of this object into a contiguous block in memory. + /// None of the template parameters should be be specified by callers. + /// @tparam Flags Alignment control for the copy operation. + /// See @ref sycl_esimd_core_align for more info. /// @param addr the memory address to copy to. Must be a pointer to the /// global address space, otherwise behavior is undefined. - /// @param flags for the copy operation. If the template parameter Flags is - /// is element_aligned_tag, \p addr must be aligned by alignof(T). If Flags is - /// vector_aligned_tag, \p addr must be aligned by simd_obj_impl's - /// raw_vector_type alignment. If Flags is overaligned_tag, \p addr must be - /// aligned by N. Program not meeting alignment requirements results in - /// undefined behavior. template >> ESIMD_INLINE void copy_to(Ty *addr, Flags = {}) const SYCL_ESIMD_FUNCTION; @@ -583,14 +723,12 @@ class simd_obj_impl { /// Copy all vector elements of this object into a contiguous block in memory. /// Destination memory location is represented via a global accessor and /// offset. + /// None of the template parameters should be be specified by callers. + /// @tparam AccessorT Type of the accessor (auto-deduced). + /// @tparam Flags Alignment control for the copy operation. + /// See @ref sycl_esimd_core_align for more info. /// @param acc accessor to copy from. /// @param offset offset to copy from. - /// @param flags for the copy operation. If the template parameter Flags is - /// is element_aligned_tag, offset must be aligned by alignof(T). If Flags is - /// vector_aligned_tag, offset must be aligned by simd_obj_impl's - /// raw_vector_type alignment. If Flags is overaligned_tag, offset must be - /// aligned by N. Program not meeting alignment requirements results in - /// undefined behavior. template >> @@ -598,21 +736,23 @@ class simd_obj_impl { sycl::access::target::global_buffer, void> copy_to(AccessorT acc, uint32_t offset, Flags = {}) const SYCL_ESIMD_FUNCTION; - /// @} // Memory operations - // Unary operations. - /// Bitwise inversion, available in all subclasses. + /// Per-element bitwise inversion, available in all subclasses, but only for + /// integral element types (\c simd_mask included). + /// @return Copy of this object with all elements bitwise inverted. template >> Derived operator~() const { return Derived{ detail::vector_unary_op(data())}; } - /// Unary logical negation operator, available in all subclasses. + /// Unary logical negation operator, available in all subclasses, but only for + /// integral element types (\c simd_mask included). /// Similarly to C++, where !x returns bool, !simd returns a simd_mask, where /// each element is a result of comparision with zero. - /// No need to implement via detail::vector_unary_op + /// @return A \c simd_mask instance where each element is a result of + /// comparison of the original element with zero. template >> simd_mask_type operator!() const { return *this == 0; diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/detail/simd_view_impl.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/detail/simd_view_impl.hpp index d33dab80c80ec..795c76917abda 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/detail/simd_view_impl.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/detail/simd_view_impl.hpp @@ -22,7 +22,7 @@ namespace experimental { namespace esimd { namespace detail { -/// @ingroup sycl_esimd_core +/// @addtogroup sycl_esimd_core /// @{ /// The simd_view base class. @@ -78,9 +78,6 @@ class simd_view_impl { /// Default move constructor. simd_view_impl(simd_view_impl &&Other) = default; - /// @name Implicit conversions. - /// @{ - /// Implicit conversion to simd type. template >> @@ -96,10 +93,6 @@ class simd_view_impl { inline operator simd_mask_type() const { return read(); } - /// @} - - /// @name Region accessors. - /// @{ /// Tells whether this view is 1-dimensional. static constexpr bool is1D() { return !ShapeTy::Is_2D; } @@ -125,7 +118,6 @@ class simd_view_impl { constexpr uint16_t getOffsetY() const { return getTopRegion(M_region).M_offset_y; } - /// @} /// Read the object. value_type read() const { @@ -384,10 +376,8 @@ class simd_view_impl { return select<1, 1>(i); } - /// @name Replicate. Create a new simd object from a subset of elements + /// Replicate. Create a new simd object from a subset of elements /// referred to by this \c simd_view_impl object. - /// @{ - /// \tparam Rep is number of times region has to be replicated. template get_simd_t replicate() { return read().template replicate(); @@ -460,7 +450,6 @@ class simd_view_impl { return read().template replicate_vs_w_hs(OffsetY * RowSize + OffsetX); } - /// @} /// 'any' operation. ///