@@ -232,7 +232,7 @@ class __SYCL_EXPORT handler {
232232 void saveCodeLoc (detail::code_location CodeLoc) { MCodeLoc = CodeLoc; }
233233
234234 // / Stores the given \param Event to the \param Queue.
235- // / Even thought MQueue is a field of handler, the method addEvent() of
235+ // / Even though MQueue is a field of handler, the method addEvent() of
236236 // / queue_impl class cannot be called inside this handler.hpp file
237237 // / as queue_impl is incomplete class for handler.
238238 static void addEventToQueue (shared_ptr_class<detail::queue_impl> Queue,
@@ -814,7 +814,7 @@ class __SYCL_EXPORT handler {
814814 // / user's lambda function \param KernelFunc and does one iteration of
815815 // / reduction of elements in each of work-groups.
816816 // / This version uses tree-reduction algorithm to reduce elements in each
817- // / of work-groups. At the end of each work-groups the partial sum is written
817+ // / of work-groups. At the end of each work-group the partial sum is written
818818 // / to a global buffer.
819819 // /
820820 // / Briefly: user's lambda, tree-reduction, CUSTOM types/ops.
@@ -827,21 +827,22 @@ class __SYCL_EXPORT handler {
827827 size_t NWorkGroups = Range.get_group_range ().size ();
828828
829829 bool IsUnderLoaded = (NWorkGroups * WGSize - NWorkItems) != 0 ;
830- size_t InefficientCase = ( IsUnderLoaded || ( WGSize & (WGSize - 1 ))) ? 1 : 0 ;
830+ bool IsEfficientCase = ! IsUnderLoaded && (( WGSize & (WGSize - 1 )) == 0 ) ;
831831
832832 bool IsUpdateOfUserAcc =
833833 Reduction::accessor_mode == access::mode::read_write &&
834834 NWorkGroups == 1 ;
835835
836836 // Use local memory to reduce elements in work-groups into 0-th element.
837837 // If WGSize is not power of two, then WGSize+1 elements are allocated.
838- // The additional last element is used to catch reduce elements that could
839- // otherwise be lost in the tree-reduction algorithm used in the kernel.
840- auto LocalReds = Redu.getReadWriteLocalAcc (WGSize + InefficientCase, *this );
838+ // The additional last element is used to catch elements that could
839+ // otherwise be lost in the tree-reduction algorithm.
840+ size_t NumLocalElements = WGSize + (IsEfficientCase ? 0 : 1 );
841+ auto LocalReds = Redu.getReadWriteLocalAcc (NumLocalElements, *this );
841842
842843 auto Out = Redu.getWriteAccForPartialReds (NWorkGroups, 0 , *this );
843844 auto ReduIdentity = Redu.getIdentity ();
844- if (!InefficientCase ) {
845+ if (IsEfficientCase ) {
845846 // Efficient case: work-groups are fully loaded and work-group size
846847 // is power of two.
847848 parallel_for<KernelName>(Range, [=](nd_item<Dims> NDIt) {
@@ -863,7 +864,7 @@ class __SYCL_EXPORT handler {
863864 NDIt.barrier ();
864865 }
865866
866- // Compute the the partial sum/reduction for the work-group.
867+ // Compute the partial sum/reduction for the work-group.
867868 if (LID == 0 )
868869 Out.get_pointer ().get ()[NDIt.get_group_linear_id ()] =
869870 IsUpdateOfUserAcc ? BOp (*(Out.get_pointer ()), LocalReds[0 ])
@@ -904,7 +905,7 @@ class __SYCL_EXPORT handler {
904905 PrevStep = CurStep;
905906 }
906907
907- // Compute the the partial sum/reduction for the work-group.
908+ // Compute the partial sum/reduction for the work-group.
908909 if (LID == 0 ) {
909910 auto GrID = NDIt.get_group_linear_id ();
910911 auto V = BOp (LocalReds[0 ], LocalReds[WGSize]);
@@ -918,7 +919,7 @@ class __SYCL_EXPORT handler {
918919 // / Implements a command group function that enqueues a kernel that does one
919920 // / iteration of reduction of elements in each of work-groups.
920921 // / This version uses tree-reduction algorithm to reduce elements in each
921- // / of work-groups. At the end of each work-groups the partial sum is written
922+ // / of work-groups. At the end of each work-group the partial sum is written
922923 // / to a global buffer.
923924 // /
924925 // / Briefly: aux kernel, tree-reduction, CUSTOM types/ops.
@@ -932,17 +933,18 @@ class __SYCL_EXPORT handler {
932933 // size may be not power of those. Those two cases considered inefficient
933934 // as they require additional code and checks in the kernel.
934935 bool IsUnderLoaded = NWorkGroups * WGSize != NWorkItems;
935- size_t InefficientCase = ( IsUnderLoaded || (WGSize & (WGSize - 1 ))) ? 1 : 0 ;
936+ bool IsEfficientCase = ! IsUnderLoaded && (WGSize & (WGSize - 1 )) == 0 ;
936937
937938 bool IsUpdateOfUserAcc =
938939 Reduction::accessor_mode == access::mode::read_write &&
939940 NWorkGroups == 1 ;
940941
941942 // Use local memory to reduce elements in work-groups into 0-th element.
942943 // If WGSize is not power of two, then WGSize+1 elements are allocated.
943- // The additional last element is used to catch reduce elements that
944- // could otherwise be lost in the tree-reduction algorithm.
945- auto LocalReds = Redu.getReadWriteLocalAcc (WGSize + InefficientCase, *this );
944+ // The additional last element is used to catch elements that could
945+ // otherwise be lost in the tree-reduction algorithm.
946+ size_t NumLocalElements = WGSize + (IsEfficientCase ? 0 : 1 );
947+ auto LocalReds = Redu.getReadWriteLocalAcc (NumLocalElements, *this );
946948
947949 // Get read accessor to the buffer that was used as output
948950 // in the previous kernel. After that create new output buffer if needed
@@ -951,7 +953,7 @@ class __SYCL_EXPORT handler {
951953 auto In = Redu.getReadAccToPreviousPartialReds (*this );
952954 auto Out = Redu.getWriteAccForPartialReds (NWorkGroups, KernelRun, *this );
953955
954- if (!InefficientCase ) {
956+ if (IsEfficientCase ) {
955957 // Efficient case: work-groups are fully loaded and work-group size
956958 // is power of two.
957959 using AuxName = typename detail::get_reduction_aux_1st_kernel_name_t <
@@ -972,7 +974,7 @@ class __SYCL_EXPORT handler {
972974 NDIt.barrier ();
973975 }
974976
975- // Compute the the partial sum/reduction for the work-group.
977+ // Compute the partial sum/reduction for the work-group.
976978 if (LID == 0 )
977979 Out.get_pointer ().get ()[NDIt.get_group_linear_id ()] =
978980 IsUpdateOfUserAcc ? BOp (*(Out.get_pointer ()), LocalReds[0 ])
@@ -1010,7 +1012,7 @@ class __SYCL_EXPORT handler {
10101012 PrevStep = CurStep;
10111013 }
10121014
1013- // Compute the the partial sum/reduction for the work-group.
1015+ // Compute the partial sum/reduction for the work-group.
10141016 if (LID == 0 ) {
10151017 auto GrID = NDIt.get_group_linear_id ();
10161018 auto V = BOp (LocalReds[0 ], LocalReds[WGSize]);
@@ -1096,7 +1098,7 @@ class __SYCL_EXPORT handler {
10961098 handler AuxHandler (QueueCopy, MIsHost);
10971099 AuxHandler.saveCodeLoc (MCodeLoc);
10981100
1099- // The last kernel DOES write to reductions 's accessor.
1101+ // The last kernel DOES write to reduction 's accessor.
11001102 // Associate it with handler manually.
11011103 if (NWorkGroups == 1 )
11021104 AuxHandler.associateWithHandler (Redu.MAcc );
0 commit comments