From 33dadc8849d8a0ea4b8696e96ec9681cc8cfada3 Mon Sep 17 00:00:00 2001
From: Vlad Romanov <vlad.romanov@intel.com>
Date: Thu, 23 Jul 2020 20:22:41 +0300
Subject: [PATCH 1/7] [SYCL][NFC] Move stream_impl.hpp to source dir

---
 sycl/CMakeLists.txt                         |   2 +-
 sycl/include/CL/sycl/detail/stream_impl.hpp | 677 --------------------
 sycl/include/CL/sycl/stream.hpp             | 609 +++++++++++++++++-
 sycl/source/detail/scheduler/commands.cpp   |   2 +-
 sycl/source/detail/stream_impl.cpp          |   2 +-
 sycl/source/detail/stream_impl.hpp          |  81 +++
 sycl/source/stream.cpp                      |   1 +
 sycl/test/abi/symbol_size.cpp               |   2 -
 8 files changed, 690 insertions(+), 686 deletions(-)
 delete mode 100644 sycl/include/CL/sycl/detail/stream_impl.hpp
 create mode 100644 sycl/source/detail/stream_impl.hpp
diff --git a/sycl/CMakeLists.txt b/sycl/CMakeLists.txt
index ef41c2fc7c2ca..c01fc9ef44b63 100644
--- a/sycl/CMakeLists.txt
+++ b/sycl/CMakeLists.txt
@@ -14,7 +14,7 @@ include(AddSYCLExecutable)
 set(SYCL_MAJOR_VERSION 2)
 set(SYCL_MINOR_VERSION 1)
 set(SYCL_PATCH_VERSION 0)
-set(SYCL_DEV_ABI_VERSION 2)
+set(SYCL_DEV_ABI_VERSION 3)
 if (SYCL_ADD_DEV_VERSION_POSTFIX)
   set(SYCL_VERSION_POSTFIX "-${SYCL_DEV_ABI_VERSION}")
 endif()
diff --git a/sycl/include/CL/sycl/detail/stream_impl.hpp b/sycl/include/CL/sycl/detail/stream_impl.hpp
deleted file mode 100644
index f31990e627a55..0000000000000
--- a/sycl/include/CL/sycl/detail/stream_impl.hpp
+++ /dev/null
@@ -1,677 +0,0 @@
-//==----------------- stream_impl.hpp - SYCL standard header file ----------==//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#pragma once
-
-#include <CL/sycl/accessor.hpp>
-#include <CL/sycl/builtins.hpp>
-#include <CL/sycl/detail/array.hpp>
-#include <CL/sycl/detail/export.hpp>
-#include <CL/sycl/device_selector.hpp>
-#include <CL/sycl/queue.hpp>
-
-__SYCL_INLINE_NAMESPACE(cl) {
-namespace sycl {
-
-namespace detail {
-
-using FmtFlags = unsigned int;
-
-// Mapping from stream_manipulator to FmtFlags. Each manipulator corresponds
-// to the bit in FmtFlags.
-static constexpr FmtFlags Dec = 0x0001;
-static constexpr FmtFlags Hex = 0x0002;
-static constexpr FmtFlags Oct = 0x0004;
-static constexpr FmtFlags ShowBase = 0x0008;
-static constexpr FmtFlags ShowPos = 0x0010;
-static constexpr FmtFlags Fixed = 0x0020;
-static constexpr FmtFlags Scientific = 0x0040;
-
-// Bitmask made of the combination of the base flags. Base flags are mutually
-// exclusive, this mask is used to clean base field before setting the new
-// base flag.
-static constexpr FmtFlags BaseField = Dec | Hex | Oct;
-
-// Bitmask made of the combination of the floating point value format flags.
-// Thease flags are mutually exclusive, this mask is used to clean float field
-// before setting the new float flag.
-static constexpr FmtFlags FloatField = Scientific | Fixed;
-
-constexpr size_t MAX_FLOATING_POINT_DIGITS = 24;
-constexpr size_t MAX_INTEGRAL_DIGITS = 23;
-constexpr const char *VEC_ELEMENT_DELIMITER = ", ";
-constexpr char VEC_OPEN_BRACE = '{';
-constexpr char VEC_CLOSE_BRACE = '}';
-
-constexpr size_t MAX_DIMENSIONS = 3;
-
-// Space for integrals (up to 3), comma and space between the
-// integrals and enclosing braces.
-constexpr size_t MAX_ARRAY_SIZE =
-    MAX_INTEGRAL_DIGITS * MAX_DIMENSIONS + 2 * (MAX_DIMENSIONS - 1) + 2;
-
-template <class F, class T = void>
-using EnableIfFP = typename std::enable_if<std::is_same<F, float>::value ||
-                                               std::is_same<F, double>::value ||
-                                               std::is_same<F, half>::value,
-                                           T>::type;
-
-template <typename> struct IsSwizzleOp : std::false_type {};
-
-template <typename VecT, typename OperationLeftT, typename OperationRightT,
-          template <typename> class OperationCurrentT, int... Indexes>
-struct IsSwizzleOp<cl::sycl::detail::SwizzleOp<
-    VecT, OperationLeftT, OperationRightT, OperationCurrentT, Indexes...>>
-    : std::true_type {
-  using T = typename VecT::element_type;
-  using Type = typename cl::sycl::vec<T, (sizeof...(Indexes))>;
-};
-
-template <typename T>
-using EnableIfSwizzleVec =
-    typename std::enable_if<IsSwizzleOp<T>::value,
-                            typename IsSwizzleOp<T>::Type>::type;
-
-class __SYCL_EXPORT stream_impl {
-public:
-  using GlobalBufAccessorT =
-      accessor<char, 1, cl::sycl::access::mode::read_write,
-               cl::sycl::access::target::global_buffer,
-               cl::sycl::access::placeholder::false_t>;
-
-  using GlobalOffsetAccessorT =
-      accessor<unsigned, 1, cl::sycl::access::mode::atomic,
-               cl::sycl::access::target::global_buffer,
-               cl::sycl::access::placeholder::false_t>;
-
-  stream_impl(size_t BufferSize, size_t MaxStatementSize, handler &CGH);
-
-  // Method to provide an access to the global stream buffer
-  GlobalBufAccessorT accessGlobalBuf(handler &CGH) {
-    return Buf.get_access<cl::sycl::access::mode::read_write>(
-        CGH, range<1>(BufferSize_), id<1>(OffsetSize));
-  }
-
-  // Method to provide an accessor to the global flush buffer
-  GlobalBufAccessorT accessGlobalFlushBuf(handler &CGH) {
-    return FlushBuf.get_access<cl::sycl::access::mode::read_write>(
-        CGH, range<1>(MaxStatementSize_), id<1>(0));
-  }
-
-  // Method to provide an atomic access to the offset in the global stream
-  // buffer
-  GlobalOffsetAccessorT accessGlobalOffset(handler &CGH) {
-    auto OffsetSubBuf = buffer<char, 1>(Buf, id<1>(0), range<1>(OffsetSize));
-    auto ReinterpretedBuf = OffsetSubBuf.reinterpret<unsigned, 1>(range<1>(1));
-    return ReinterpretedBuf.get_access<cl::sycl::access::mode::atomic>(
-        CGH, range<1>(1), id<1>(0));
-  }
-
-  // Copy stream buffer to the host and print the contents
-  void flush();
-
-  size_t get_size() const;
-
-  size_t get_max_statement_size() const;
-
-private:
-  // Size of the stream buffer
-  size_t BufferSize_;
-
-  // Maximum number of symbols which could be streamed from the beginning of a
-  // statement till the semicolon
-  unsigned MaxStatementSize_;
-
-  // Size of the variable which is used as an offset in the stream buffer.
-  // Additinonal memory is allocated in the beginning of the stream buffer for
-  // this variable.
-  static const size_t OffsetSize = sizeof(unsigned);
-
-  // Vector on the host side which is used to initialize the stream buffer
-  std::vector<char> Data;
-
-  // Stream buffer
-  buffer<char, 1> Buf;
-
-  // Global flush buffer
-  buffer<char, 1> FlushBuf;
-};
-
-template <typename T>
-inline typename std::make_unsigned<T>::type getAbsVal(const T Val,
-                                                      const int Base) {
-  return ((Base == 10) && (Val < 0)) ? -Val : Val;
-}
-
-inline char digitToChar(const int Digit) {
-  if (Digit < 10) {
-    return '0' + Digit;
-  } else {
-    return 'a' + Digit - 10;
-  }
-}
-
-template <typename T>
-inline typename std::enable_if<std::is_integral<T>::value, unsigned>::type
-integralToBase(T Val, int Base, char *Digits) {
-  unsigned NumDigits = 0;
-
-  do {
-    Digits[NumDigits++] = digitToChar(Val % Base);
-    Val /= Base;
-  } while (Val);
-
-  return NumDigits;
-}
-
-template <typename T>
-EnableIfFP<T, unsigned> floatingPointToDecStr(T AbsVal, char *Digits,
-                                              int Precision, bool IsSci) {
-  int Exp = 0;
-
-  // For the case that the value is larger than 10.0
-  while (AbsVal >= 10.0) {
-    ++Exp;
-    AbsVal /= 10.0;
-  }
-  // For the case that the value is less than 1.0
-  while (AbsVal > 0.0 && AbsVal < 1.0) {
-    --Exp;
-    AbsVal *= 10.0;
-  }
-
-  auto IntegralPart = static_cast<int>(AbsVal);
-  auto FractionPart = AbsVal - IntegralPart;
-
-  int FractionDigits[MAX_FLOATING_POINT_DIGITS] = {0};
-
-  // Exponent
-  int P = Precision > 0 ? Precision : 4;
-  size_t FractionLength = Exp + P;
-
-  // After normalization integral part contains 1 symbol, also there could be
-  // '.', 'e', sign of the exponent and sign of the number, overall 5 symbols.
-  // So, clamp fraction length if required according to maximum size of the
-  // buffer for floating point number.
-  if (FractionLength > MAX_FLOATING_POINT_DIGITS - 5)
-    FractionLength = MAX_FLOATING_POINT_DIGITS - 5;
-
-  for (unsigned I = 0; I < FractionLength; ++I) {
-    FractionPart *= 10.0;
-    FractionDigits[I] = static_cast<int>(FractionPart);
-    FractionPart -= static_cast<int>(FractionPart);
-  }
-
-  int Carry = FractionPart > static_cast<T>(0.5) ? 1 : 0;
-
-  // Propagate the Carry
-  for (int I = FractionLength - 1; I >= 0 && Carry; --I) {
-    auto Digit = FractionDigits[I] + Carry;
-    FractionDigits[I] = Digit % 10;
-    Carry = Digit / 10;
-  }
-
-  // Carry from the fraction part is propagated to integral part
-  IntegralPart += Carry;
-  if (IntegralPart == 10) {
-    IntegralPart = 1;
-    ++Exp;
-  }
-
-  unsigned Offset = 0;
-
-  // Assemble the final string correspondingly
-  if (IsSci) { // scientific mode
-    // Append the integral part
-    Digits[Offset++] = digitToChar(IntegralPart);
-    Digits[Offset++] = '.';
-
-    // Append all fraction
-    for (unsigned I = 0; I < FractionLength; ++I)
-      Digits[Offset++] = digitToChar(FractionDigits[I]);
-
-    // Exponent part
-    Digits[Offset++] = 'e';
-    Digits[Offset++] = Exp >= 0 ? '+' : '-';
-    Digits[Offset++] = digitToChar(abs(Exp) / 10);
-    Digits[Offset++] = digitToChar(abs(Exp) % 10);
-  } else { // normal mode
-    if (Exp < 0) {
-      Digits[Offset++] = '0';
-      Digits[Offset++] = '.';
-      while (++Exp)
-        Digits[Offset++] = '0';
-
-      // Append the integral part
-      Digits[Offset++] = digitToChar(IntegralPart);
-
-      // Append all fraction
-      for (unsigned I = 0; I < FractionLength; ++I)
-        Digits[Offset++] = digitToChar(FractionDigits[I]);
-    } else {
-      // Append the integral part
-      Digits[Offset++] = digitToChar(IntegralPart);
-      unsigned I = 0;
-      // Append the integral part first
-      for (; I < FractionLength && Exp--; ++I)
-        Digits[Offset++] = digitToChar(FractionDigits[I]);
-
-      // Put the dot
-      Digits[Offset++] = '.';
-
-      // Append the rest of fraction part, or the real fraction part
-      for (; I < FractionLength; ++I)
-        Digits[Offset++] = digitToChar(FractionDigits[I]);
-    }
-    // The normal mode requires no tailing zero digit, then we need to first
-    // find the first non-zero digit
-    while (Digits[Offset - 1] == '0')
-      Offset--;
-
-    // If dot is the last digit, it should be stripped off as well
-    if (Digits[Offset - 1] == '.')
-      Offset--;
-  }
-  return Offset;
-}
-
-// Helper method to update offset in the global buffer atomically according to
-// the provided size of the data in the flush buffer. Return true if offset is
-// updated and false in case of overflow.
-inline bool updateOffset(stream_impl::GlobalOffsetAccessorT &GlobalOffset,
-                         stream_impl::GlobalBufAccessorT &GlobalBuf,
-                         unsigned Size, unsigned &Cur) {
-  unsigned New;
-  Cur = GlobalOffset[0].load();
-  do {
-    if (GlobalBuf.get_range().size() - Cur < Size)
-      // Overflow
-      return false;
-    New = Cur + Size;
-  } while (!GlobalOffset[0].compare_exchange_strong(Cur, New));
-  return true;
-}
-
-inline void flushBuffer(stream_impl::GlobalOffsetAccessorT &GlobalOffset,
-                        stream_impl::GlobalBufAccessorT &GlobalBuf,
-                        stream_impl::GlobalBufAccessorT &GlobalFlushBuf,
-                        unsigned &WIOffset, unsigned &Offset) {
-
-  unsigned Cur = 0;
-  if (!updateOffset(GlobalOffset, GlobalBuf, Offset, Cur))
-    return;
-
-  for (unsigned I = WIOffset; I < WIOffset + Offset; I++) {
-    GlobalBuf[Cur++] = GlobalFlushBuf[I];
-  }
-  // Reset the offset in the flush buffer
-  Offset = 0;
-}
-
-inline void write(stream_impl::GlobalBufAccessorT &GlobalFlushBuf,
-                  size_t FlushBufferSize, unsigned WIOffset, unsigned &Offset,
-                  const char *Str, unsigned Len, unsigned Padding = 0) {
-  if ((FlushBufferSize - Offset < Len + Padding) ||
-      (WIOffset + Offset + Len + Padding > GlobalFlushBuf.get_count()))
-    // TODO: flush here
-    return;
-
-  // Write padding
-  for (size_t I = 0; I < Padding; ++I, ++Offset)
-    GlobalFlushBuf[WIOffset + Offset] = ' ';
-
-  for (size_t I = 0; I < Len; ++I, ++Offset) {
-    GlobalFlushBuf[WIOffset + Offset] = Str[I];
-  }
-}
-
-inline void reverseBuf(char *Buf, unsigned Len) {
-  int I = Len - 1;
-  int J = 0;
-  while (I > J) {
-    int Temp = Buf[I];
-    Buf[I] = Buf[J];
-    Buf[J] = Temp;
-    I--;
-    J++;
-  }
-}
-
-inline unsigned append(char *Dst, const char *Src) {
-  unsigned Len = 0;
-  for (; Src[Len] != '\0'; ++Len)
-    ;
-
-  for (unsigned I = 0; I < Len; ++I)
-    Dst[I] = Src[I];
-  return Len;
-}
-
-template <typename T>
-inline typename std::enable_if<std::is_same<T, half>::value, unsigned>::type
-checkForInfNan(char *Buf, T Val) {
-  if (Val != Val)
-    return append(Buf, "nan");
-
-  // Extract the sign from the bits
-  const uint16_t Sign = reinterpret_cast<uint16_t &>(Val) & 0x8000;
-  // Extract the exponent from the bits
-  const uint16_t Exp16 = (reinterpret_cast<uint16_t &>(Val) & 0x7c00) >> 10;
-
-  if (Exp16 == 0x1f) {
-    if (Sign)
-      return append(Buf, "-inf");
-    return append(Buf, "inf");
-  }
-  return 0;
-}
-
-template <typename T>
-inline typename std::enable_if<std::is_same<T, float>::value ||
-                                   std::is_same<T, double>::value,
-                               unsigned>::type
-checkForInfNan(char *Buf, T Val) {
-  if (isnan(Val))
-    return append(Buf, "nan");
-  if (isinf(Val)) {
-    if (signbit(Val))
-      return append(Buf, "-inf");
-    return append(Buf, "inf");
-  }
-  return 0;
-}
-
-// Returns number of symbols written to the buffer
-template <typename T>
-inline EnableIfFP<T, unsigned>
-ScalarToStr(const T &Val, char *Buf, unsigned Flags, int, int Precision = -1) {
-  unsigned Offset = checkForInfNan(Buf, Val);
-  if (Offset)
-    return Offset;
-
-  T Neg = -Val;
-  auto AbsVal = Val < 0 ? Neg : Val;
-
-  if (Val < 0) {
-    Buf[Offset++] = '-';
-  } else if (Flags & ShowPos) {
-    Buf[Offset++] = '+';
-  }
-
-  bool IsSci = false;
-  if (Flags & detail::Scientific)
-    IsSci = true;
-
-  // TODO: manipulators for floating-point output - hexfloat, fixed
-  Offset += floatingPointToDecStr(AbsVal, Buf + Offset, Precision, IsSci);
-
-  return Offset;
-}
-
-// Returns number of symbols written to the buffer
-template <typename T>
-inline typename std::enable_if<std::is_integral<T>::value, unsigned>::type
-ScalarToStr(const T &Val, char *Buf, unsigned Flags, int, int Precision = -1) {
-  (void)Precision;
-  int Base = 10;
-
-  // append base manipulator
-  switch (Flags & BaseField) {
-  case Dec:
-    Base = 10;
-    break;
-  case Hex:
-    Base = 16;
-    break;
-  case Oct:
-    Base = 8;
-    break;
-  default:
-    // default value is 10
-    break;
-  }
-
-  unsigned Offset = 0;
-
-  // write '+' to the stream if the base is 10 and the value is non-negative
-  // or write '-' to stream if base is 10 and the value is negative
-  if (Base == 10) {
-    if ((Flags & ShowPos) && Val >= 0)
-      Buf[Offset++] = '+';
-    else if (Val < 0)
-      Buf[Offset++] = '-';
-  }
-
-  // write 0 or 0x to the stream if base is not 10 and the manipulator is set
-  if (Base != 10 && (Flags & ShowBase)) {
-    Buf[Offset++] = '0';
-    if (Base == 16)
-      Buf[Offset++] = 'x';
-  }
-
-  auto AbsVal = getAbsVal(Val, Base);
-
-  const unsigned NumBuf = integralToBase(AbsVal, Base, Buf + Offset);
-
-  reverseBuf(Buf + Offset, NumBuf);
-  return Offset + NumBuf;
-}
-
-template <typename T>
-inline typename std::enable_if<std::is_integral<T>::value>::type
-writeIntegral(stream_impl::GlobalBufAccessorT &GlobalFlushBuf,
-              size_t FlushBufferSize, unsigned WIOffset, unsigned &Offset,
-              unsigned Flags, int Width, const T &Val) {
-  char Digits[MAX_INTEGRAL_DIGITS] = {0};
-  unsigned Len = ScalarToStr(Val, Digits, Flags, Width);
-  write(GlobalFlushBuf, FlushBufferSize, WIOffset, Offset, Digits, Len,
-        (Width > 0 && static_cast<unsigned>(Width) > Len)
-            ? static_cast<unsigned>(Width) - Len
-            : 0);
-}
-
-template <typename T>
-inline EnableIfFP<T>
-writeFloatingPoint(stream_impl::GlobalBufAccessorT &GlobalFlushBuf,
-                   size_t FlushBufferSize, unsigned WIOffset, unsigned &Offset,
-                   unsigned Flags, int Width, int Precision, const T &Val) {
-  char Digits[MAX_FLOATING_POINT_DIGITS] = {0};
-  unsigned Len = ScalarToStr(Val, Digits, Flags, Width, Precision);
-  write(GlobalFlushBuf, FlushBufferSize, WIOffset, Offset, Digits, Len,
-        (Width > 0 && static_cast<unsigned>(Width) > Len)
-            ? static_cast<unsigned>(Width) - Len
-            : 0);
-}
-
-template <typename T, int VecLength>
-typename std::enable_if<(VecLength == 1), unsigned>::type
-VecToStr(const vec<T, VecLength> &Vec, char *VecStr, unsigned Flags, int Width,
-         int Precision) {
-  return ScalarToStr(static_cast<T>(Vec.x()), VecStr, Flags, Width, Precision);
-}
-
-template <typename T, int VecLength>
-typename std::enable_if<(VecLength == 2 || VecLength == 4 || VecLength == 8 ||
-                         VecLength == 16),
-                        unsigned>::type
-VecToStr(const vec<T, VecLength> &Vec, char *VecStr, unsigned Flags, int Width,
-         int Precision) {
-  unsigned Len =
-      VecToStr<T, VecLength / 2>(Vec.lo(), VecStr, Flags, Width, Precision);
-  Len += append(VecStr + Len, VEC_ELEMENT_DELIMITER);
-  Len += VecToStr<T, VecLength / 2>(Vec.hi(), VecStr + Len, Flags, Width,
-                                    Precision);
-  return Len;
-}
-
-template <typename T, int VecLength>
-typename std::enable_if<(VecLength == 3), unsigned>::type
-VecToStr(const vec<T, VecLength> &Vec, char *VecStr, unsigned Flags, int Width,
-         int Precision) {
-  unsigned Len = VecToStr<T, 2>(Vec.lo(), VecStr, Flags, Width, Precision);
-  Len += append(VecStr + Len, VEC_ELEMENT_DELIMITER);
-  Len += VecToStr<T, 1>(Vec.z(), VecStr + Len, Flags, Width, Precision);
-  return Len;
-}
-
-template <typename T, int VecLength>
-inline void writeVec(stream_impl::GlobalBufAccessorT &GlobalFlushBuf,
-                     size_t FlushBufferSize, unsigned WIOffset,
-                     unsigned &Offset, unsigned Flags, int Width, int Precision,
-                     const vec<T, VecLength> &Vec) {
-  // Reserve space for vector elements and delimiters
-  constexpr size_t MAX_VEC_SIZE =
-      MAX_FLOATING_POINT_DIGITS * VecLength + (VecLength - 1) * 2;
-  char VecStr[MAX_VEC_SIZE] = {0};
-  unsigned Len = VecToStr<T, VecLength>(Vec, VecStr, Flags, Width, Precision);
-  write(GlobalFlushBuf, FlushBufferSize, WIOffset, Offset, VecStr, Len,
-        (Width > 0 && Width > Len) ? Width - Len : 0);
-}
-
-template <int ArrayLength>
-inline unsigned ArrayToStr(char *Buf, const array<ArrayLength> &Arr) {
-  unsigned Len = 0;
-  Buf[Len++] = VEC_OPEN_BRACE;
-
-  for (int I = 0; I < ArrayLength; ++I) {
-    Len += ScalarToStr(Arr[I], Buf + Len, 0 /* No flags */, -1, -1);
-    if (I != ArrayLength - 1)
-      Len += append(Buf + Len, VEC_ELEMENT_DELIMITER);
-  }
-
-  Buf[Len++] = VEC_CLOSE_BRACE;
-
-  return Len;
-}
-
-template <int ArrayLength>
-inline void writeArray(stream_impl::GlobalBufAccessorT &GlobalFlushBuf,
-                       size_t FlushBufferSize, unsigned WIOffset,
-                       unsigned &Offset, const array<ArrayLength> &Arr) {
-  char Buf[MAX_ARRAY_SIZE];
-  unsigned Len = ArrayToStr(Buf, Arr);
-  write(GlobalFlushBuf, FlushBufferSize, WIOffset, Offset, Buf, Len);
-}
-
-template <int Dimensions>
-inline void writeItem(stream_impl::GlobalBufAccessorT &GlobalFlushBuf,
-                      size_t FlushBufferSize, unsigned WIOffset,
-                      unsigned &Offset, const item<Dimensions> &Item) {
-  // Reserve space for 3 arrays and additional place (40 symbols) for printing
-  // the text
-  char Buf[3 * MAX_ARRAY_SIZE + 40];
-  unsigned Len = 0;
-  Len += append(Buf, "item(");
-  Len += append(Buf + Len, "range: ");
-  Len += ArrayToStr(Buf + Len, Item.get_range());
-  Len += append(Buf + Len, ", id: ");
-  Len += ArrayToStr(Buf + Len, Item.get_id());
-  Len += append(Buf + Len, ", offset: ");
-  Len += ArrayToStr(Buf + Len, Item.get_offset());
-  Buf[Len++] = ')';
-  write(GlobalFlushBuf, FlushBufferSize, WIOffset, Offset, Buf, Len);
-}
-
-template <int Dimensions>
-inline void writeNDRange(stream_impl::GlobalBufAccessorT &GlobalFlushBuf,
-                         size_t FlushBufferSize, unsigned WIOffset,
-                         unsigned &Offset,
-                         const nd_range<Dimensions> &ND_Range) {
-  // Reserve space for 3 arrays and additional place (50 symbols) for printing
-  // the text
-  char Buf[3 * MAX_ARRAY_SIZE + 50];
-  unsigned Len = 0;
-  Len += append(Buf, "nd_range(");
-  Len += append(Buf + Len, "global_range: ");
-  Len += ArrayToStr(Buf + Len, ND_Range.get_global_range());
-  Len += append(Buf + Len, ", local_range: ");
-  Len += ArrayToStr(Buf + Len, ND_Range.get_local_range());
-  Len += append(Buf + Len, ", offset: ");
-  Len += ArrayToStr(Buf + Len, ND_Range.get_offset());
-  Buf[Len++] = ')';
-  write(GlobalFlushBuf, FlushBufferSize, WIOffset, Offset, Buf, Len);
-}
-
-template <int Dimensions>
-inline void writeNDItem(stream_impl::GlobalBufAccessorT &GlobalFlushBuf,
-                        size_t FlushBufferSize, unsigned WIOffset,
-                        unsigned &Offset, const nd_item<Dimensions> &ND_Item) {
-  // Reserve space for 2 arrays and additional place (40 symbols) for printing
-  // the text
-  char Buf[2 * MAX_ARRAY_SIZE + 40];
-  unsigned Len = 0;
-  Len += append(Buf, "nd_item(");
-  Len += append(Buf + Len, "global_id: ");
-  Len += ArrayToStr(Buf + Len, ND_Item.get_global_id());
-  Len += append(Buf + Len, ", local_id: ");
-  Len += ArrayToStr(Buf + Len, ND_Item.get_local_id());
-  Buf[Len++] = ')';
-  write(GlobalFlushBuf, FlushBufferSize, WIOffset, Offset, Buf, Len);
-}
-
-template <int Dimensions>
-inline void writeGroup(stream_impl::GlobalBufAccessorT &GlobalFlushBuf,
-                       size_t FlushBufferSize, unsigned WIOffset,
-                       unsigned &Offset, const group<Dimensions> &Group) {
-  // Reserve space for 4 arrays and additional place (60 symbols) for printing
-  // the text
-  char Buf[4 * MAX_ARRAY_SIZE + 60];
-  unsigned Len = 0;
-  Len += append(Buf, "group(");
-  Len += append(Buf + Len, "id: ");
-  Len += ArrayToStr(Buf + Len, Group.get_id());
-  Len += append(Buf + Len, ", global_range: ");
-  Len += ArrayToStr(Buf + Len, Group.get_global_range());
-  Len += append(Buf + Len, ", local_range: ");
-  Len += ArrayToStr(Buf + Len, Group.get_local_range());
-  Len += append(Buf + Len, ", group_range: ");
-  Len += ArrayToStr(Buf + Len, Group.get_group_range());
-  Buf[Len++] = ')';
-  write(GlobalFlushBuf, FlushBufferSize, WIOffset, Offset, Buf, Len);
-}
-
-// Space for 2 arrays and additional place (20 symbols) for printing
-// the text
-constexpr size_t MAX_ITEM_SIZE = 2 * MAX_ARRAY_SIZE + 20;
-
-template <int Dimensions>
-inline unsigned ItemToStr(char *Buf, const item<Dimensions, false> &Item) {
-  unsigned Len = 0;
-  Len += append(Buf, "item(");
-  for (int I = 0; I < 2; ++I) {
-    Len += append(Buf + Len, I == 0 ? "range: " : ", id: ");
-    Len += ArrayToStr(Buf + Len, I == 0 ? Item.get_range() : Item.get_id());
-  }
-  Buf[Len++] = ')';
-  return Len;
-}
-
-template <int Dimensions>
-inline void writeHItem(stream_impl::GlobalBufAccessorT &GlobalFlushBuf,
-                       size_t FlushBufferSize, unsigned WIOffset,
-                       unsigned &Offset, const h_item<Dimensions> &HItem) {
-  // Reserve space for 3 items and additional place (60 symbols) for printing
-  // the text
-  char Buf[3 * MAX_ITEM_SIZE + 60];
-  unsigned Len = 0;
-  Len += append(Buf, "h_item(");
-  for (int I = 0; I < 3; ++I) {
-    Len += append(Buf + Len, I == 0 ? "\n  global "
-                                    : I == 1 ? "\n  logical local "
-                                             : "\n  physical local ");
-    Len += ItemToStr(Buf + Len, I == 0 ? HItem.get_global()
-                                       : I == 1 ? HItem.get_logical_local()
-                                                : HItem.get_physical_local());
-  }
-  Len += append(Buf + Len, "\n)");
-  write(GlobalFlushBuf, FlushBufferSize, WIOffset, Offset, Buf, Len);
-}
-
-} // namespace detail
-} // namespace sycl
-} // __SYCL_INLINE_NAMESPACE(cl)
diff --git a/sycl/include/CL/sycl/stream.hpp b/sycl/include/CL/sycl/stream.hpp
index 58f038980545d..da3151ad38cbe 100644
--- a/sycl/include/CL/sycl/stream.hpp
+++ b/sycl/include/CL/sycl/stream.hpp
@@ -8,12 +8,613 @@
 
 #pragma once
 
+#include <CL/sycl/detail/defines.hpp>
 #include <CL/sycl/detail/export.hpp>
-#include <CL/sycl/detail/stream_impl.hpp>
+#include <CL/sycl/handler.hpp>
+#include <CL/sycl/builtins.hpp>
 
 __SYCL_INLINE_NAMESPACE(cl) {
 namespace sycl {
 
+namespace detail {
+
+using FmtFlags = unsigned int;
+
+// Mapping from stream_manipulator to FmtFlags. Each manipulator corresponds
+// to the bit in FmtFlags.
+static constexpr FmtFlags Dec = 0x0001;
+static constexpr FmtFlags Hex = 0x0002;
+static constexpr FmtFlags Oct = 0x0004;
+static constexpr FmtFlags ShowBase = 0x0008;
+static constexpr FmtFlags ShowPos = 0x0010;
+static constexpr FmtFlags Fixed = 0x0020;
+static constexpr FmtFlags Scientific = 0x0040;
+
+// Bitmask made of the combination of the base flags. Base flags are mutually
+// exclusive, this mask is used to clean base field before setting the new
+// base flag.
+static constexpr FmtFlags BaseField = Dec | Hex | Oct;
+
+// Bitmask made of the combination of the floating point value format flags.
+// Thease flags are mutually exclusive, this mask is used to clean float field
+// before setting the new float flag.
+static constexpr FmtFlags FloatField = Scientific | Fixed;
+
+constexpr size_t MAX_FLOATING_POINT_DIGITS = 24;
+constexpr size_t MAX_INTEGRAL_DIGITS = 23;
+constexpr const char *VEC_ELEMENT_DELIMITER = ", ";
+constexpr char VEC_OPEN_BRACE = '{';
+constexpr char VEC_CLOSE_BRACE = '}';
+
+constexpr size_t MAX_DIMENSIONS = 3;
+
+// Space for integrals (up to 3), comma and space between the
+// integrals and enclosing braces.
+constexpr size_t MAX_ARRAY_SIZE =
+    MAX_INTEGRAL_DIGITS * MAX_DIMENSIONS + 2 * (MAX_DIMENSIONS - 1) + 2;
+
+template <class F, class T = void>
+using EnableIfFP = typename std::enable_if<std::is_same<F, float>::value ||
+                                               std::is_same<F, double>::value ||
+                                               std::is_same<F, half>::value,
+                                           T>::type;
+
+using GlobalBufAccessorT = accessor<char, 1, cl::sycl::access::mode::read_write,
+                                    cl::sycl::access::target::global_buffer,
+                                    cl::sycl::access::placeholder::false_t>;
+
+using GlobalOffsetAccessorT =
+    accessor<unsigned, 1, cl::sycl::access::mode::atomic,
+             cl::sycl::access::target::global_buffer,
+             cl::sycl::access::placeholder::false_t>;
+
+inline void write(GlobalBufAccessorT &GlobalFlushBuf, size_t FlushBufferSize,
+                  unsigned WIOffset, unsigned &Offset, const char *Str,
+                  unsigned Len, unsigned Padding = 0) {
+  if ((FlushBufferSize - Offset < Len + Padding) ||
+      (WIOffset + Offset + Len + Padding > GlobalFlushBuf.get_count()))
+    // TODO: flush here
+    return;
+
+  // Write padding
+  for (size_t I = 0; I < Padding; ++I, ++Offset)
+    GlobalFlushBuf[WIOffset + Offset] = ' ';
+
+  for (size_t I = 0; I < Len; ++I, ++Offset) {
+    GlobalFlushBuf[WIOffset + Offset] = Str[I];
+  }
+}
+
+inline void reverseBuf(char *Buf, unsigned Len) {
+  int I = Len - 1;
+  int J = 0;
+  while (I > J) {
+    int Temp = Buf[I];
+    Buf[I] = Buf[J];
+    Buf[J] = Temp;
+    I--;
+    J++;
+  }
+}
+
+template <typename T>
+inline typename std::make_unsigned<T>::type getAbsVal(const T Val,
+                                                      const int Base) {
+  return ((Base == 10) && (Val < 0)) ? -Val : Val;
+}
+
+inline char digitToChar(const int Digit) {
+  if (Digit < 10) {
+    return '0' + Digit;
+  } else {
+    return 'a' + Digit - 10;
+  }
+}
+
+template <typename T>
+inline typename std::enable_if<std::is_integral<T>::value, unsigned>::type
+integralToBase(T Val, int Base, char *Digits) {
+  unsigned NumDigits = 0;
+
+  do {
+    Digits[NumDigits++] = digitToChar(Val % Base);
+    Val /= Base;
+  } while (Val);
+
+  return NumDigits;
+}
+
+// Returns number of symbols written to the buffer
+template <typename T>
+inline typename std::enable_if<std::is_integral<T>::value, unsigned>::type
+ScalarToStr(const T &Val, char *Buf, unsigned Flags, int, int Precision = -1) {
+  (void)Precision;
+  int Base = 10;
+
+  // append base manipulator
+  switch (Flags & BaseField) {
+  case Dec:
+    Base = 10;
+    break;
+  case Hex:
+    Base = 16;
+    break;
+  case Oct:
+    Base = 8;
+    break;
+  default:
+    // default value is 10
+    break;
+  }
+
+  unsigned Offset = 0;
+
+  // write '+' to the stream if the base is 10 and the value is non-negative
+  // or write '-' to stream if base is 10 and the value is negative
+  if (Base == 10) {
+    if ((Flags & ShowPos) && Val >= 0)
+      Buf[Offset++] = '+';
+    else if (Val < 0)
+      Buf[Offset++] = '-';
+  }
+
+  // write 0 or 0x to the stream if base is not 10 and the manipulator is set
+  if (Base != 10 && (Flags & ShowBase)) {
+    Buf[Offset++] = '0';
+    if (Base == 16)
+      Buf[Offset++] = 'x';
+  }
+
+  auto AbsVal = getAbsVal(Val, Base);
+
+  const unsigned NumBuf = integralToBase(AbsVal, Base, Buf + Offset);
+
+  reverseBuf(Buf + Offset, NumBuf);
+  return Offset + NumBuf;
+}
+
+inline unsigned append(char *Dst, const char *Src) {
+  unsigned Len = 0;
+  for (; Src[Len] != '\0'; ++Len)
+    ;
+
+  for (unsigned I = 0; I < Len; ++I)
+    Dst[I] = Src[I];
+  return Len;
+}
+
+template <typename T>
+inline typename std::enable_if<std::is_same<T, float>::value ||
+                                   std::is_same<T, double>::value,
+                               unsigned>::type
+checkForInfNan(char *Buf, T Val) {
+  if (isnan(Val))
+    return append(Buf, "nan");
+  if (isinf(Val)) {
+    if (signbit(Val))
+      return append(Buf, "-inf");
+    return append(Buf, "inf");
+  }
+  return 0;
+}
+
+template <typename T>
+inline typename std::enable_if<std::is_same<T, half>::value, unsigned>::type
+checkForInfNan(char *Buf, T Val) {
+  if (Val != Val)
+    return append(Buf, "nan");
+
+  // Extract the sign from the bits
+  const uint16_t Sign = reinterpret_cast<uint16_t &>(Val) & 0x8000;
+  // Extract the exponent from the bits
+  const uint16_t Exp16 = (reinterpret_cast<uint16_t &>(Val) & 0x7c00) >> 10;
+
+  if (Exp16 == 0x1f) {
+    if (Sign)
+      return append(Buf, "-inf");
+    return append(Buf, "inf");
+  }
+  return 0;
+}
+
+template <typename T>
+EnableIfFP<T, unsigned> floatingPointToDecStr(T AbsVal, char *Digits,
+                                              int Precision, bool IsSci) {
+  int Exp = 0;
+
+  // For the case that the value is larger than 10.0
+  while (AbsVal >= 10.0) {
+    ++Exp;
+    AbsVal /= 10.0;
+  }
+  // For the case that the value is less than 1.0
+  while (AbsVal > 0.0 && AbsVal < 1.0) {
+    --Exp;
+    AbsVal *= 10.0;
+  }
+
+  auto IntegralPart = static_cast<int>(AbsVal);
+  auto FractionPart = AbsVal - IntegralPart;
+
+  int FractionDigits[MAX_FLOATING_POINT_DIGITS] = {0};
+
+  // Exponent
+  int P = Precision > 0 ? Precision : 4;
+  size_t FractionLength = Exp + P;
+
+  // After normalization integral part contains 1 symbol, also there could be
+  // '.', 'e', sign of the exponent and sign of the number, overall 5 symbols.
+  // So, clamp fraction length if required according to maximum size of the
+  // buffer for floating point number.
+  if (FractionLength > MAX_FLOATING_POINT_DIGITS - 5)
+    FractionLength = MAX_FLOATING_POINT_DIGITS - 5;
+
+  for (unsigned I = 0; I < FractionLength; ++I) {
+    FractionPart *= 10.0;
+    FractionDigits[I] = static_cast<int>(FractionPart);
+    FractionPart -= static_cast<int>(FractionPart);
+  }
+
+  int Carry = FractionPart > static_cast<T>(0.5) ? 1 : 0;
+
+  // Propagate the Carry
+  for (int I = FractionLength - 1; I >= 0 && Carry; --I) {
+    auto Digit = FractionDigits[I] + Carry;
+    FractionDigits[I] = Digit % 10;
+    Carry = Digit / 10;
+  }
+
+  // Carry from the fraction part is propagated to integral part
+  IntegralPart += Carry;
+  if (IntegralPart == 10) {
+    IntegralPart = 1;
+    ++Exp;
+  }
+
+  unsigned Offset = 0;
+
+  // Assemble the final string correspondingly
+  if (IsSci) { // scientific mode
+    // Append the integral part
+    Digits[Offset++] = digitToChar(IntegralPart);
+    Digits[Offset++] = '.';
+
+    // Append all fraction
+    for (unsigned I = 0; I < FractionLength; ++I)
+      Digits[Offset++] = digitToChar(FractionDigits[I]);
+
+    // Exponent part
+    Digits[Offset++] = 'e';
+    Digits[Offset++] = Exp >= 0 ? '+' : '-';
+    Digits[Offset++] = digitToChar(abs(Exp) / 10);
+    Digits[Offset++] = digitToChar(abs(Exp) % 10);
+  } else { // normal mode
+    if (Exp < 0) {
+      Digits[Offset++] = '0';
+      Digits[Offset++] = '.';
+      while (++Exp)
+        Digits[Offset++] = '0';
+
+      // Append the integral part
+      Digits[Offset++] = digitToChar(IntegralPart);
+
+      // Append all fraction
+      for (unsigned I = 0; I < FractionLength; ++I)
+        Digits[Offset++] = digitToChar(FractionDigits[I]);
+    } else {
+      // Append the integral part
+      Digits[Offset++] = digitToChar(IntegralPart);
+      unsigned I = 0;
+      // Append the integral part first
+      for (; I < FractionLength && Exp--; ++I)
+        Digits[Offset++] = digitToChar(FractionDigits[I]);
+
+      // Put the dot
+      Digits[Offset++] = '.';
+
+      // Append the rest of fraction part, or the real fraction part
+      for (; I < FractionLength; ++I)
+        Digits[Offset++] = digitToChar(FractionDigits[I]);
+    }
+    // The normal mode requires no tailing zero digit, then we need to first
+    // find the first non-zero digit
+    while (Digits[Offset - 1] == '0')
+      Offset--;
+
+    // If dot is the last digit, it should be stripped off as well
+    if (Digits[Offset - 1] == '.')
+      Offset--;
+  }
+  return Offset;
+}
+
+// Returns number of symbols written to the buffer
+template <typename T>
+inline EnableIfFP<T, unsigned>
+ScalarToStr(const T &Val, char *Buf, unsigned Flags, int, int Precision = -1) {
+  unsigned Offset = checkForInfNan(Buf, Val);
+  if (Offset)
+    return Offset;
+
+  T Neg = -Val;
+  auto AbsVal = Val < 0 ? Neg : Val;
+
+  if (Val < 0) {
+    Buf[Offset++] = '-';
+  } else if (Flags & ShowPos) {
+    Buf[Offset++] = '+';
+  }
+
+  bool IsSci = false;
+  if (Flags & detail::Scientific)
+    IsSci = true;
+
+  // TODO: manipulators for floating-point output - hexfloat, fixed
+  Offset += floatingPointToDecStr(AbsVal, Buf + Offset, Precision, IsSci);
+
+  return Offset;
+}
+
+template <typename T>
+inline typename std::enable_if<std::is_integral<T>::value>::type
+writeIntegral(GlobalBufAccessorT &GlobalFlushBuf, size_t FlushBufferSize,
+              unsigned WIOffset, unsigned &Offset, unsigned Flags, int Width,
+              const T &Val) {
+  char Digits[MAX_INTEGRAL_DIGITS] = {0};
+  unsigned Len = ScalarToStr(Val, Digits, Flags, Width);
+  write(GlobalFlushBuf, FlushBufferSize, WIOffset, Offset, Digits, Len,
+        (Width > 0 && static_cast<unsigned>(Width) > Len)
+            ? static_cast<unsigned>(Width) - Len
+            : 0);
+}
+
+template <typename T>
+inline EnableIfFP<T>
+writeFloatingPoint(GlobalBufAccessorT &GlobalFlushBuf, size_t FlushBufferSize,
+                   unsigned WIOffset, unsigned &Offset, unsigned Flags,
+                   int Width, int Precision, const T &Val) {
+  char Digits[MAX_FLOATING_POINT_DIGITS] = {0};
+  unsigned Len = ScalarToStr(Val, Digits, Flags, Width, Precision);
+  write(GlobalFlushBuf, FlushBufferSize, WIOffset, Offset, Digits, Len,
+        (Width > 0 && static_cast<unsigned>(Width) > Len)
+            ? static_cast<unsigned>(Width) - Len
+            : 0);
+}
+
+// Helper method to update offset in the global buffer atomically according to
+// the provided size of the data in the flush buffer. Return true if offset is
+// updated and false in case of overflow.
+inline bool updateOffset(GlobalOffsetAccessorT &GlobalOffset,
+                         GlobalBufAccessorT &GlobalBuf, unsigned Size,
+                         unsigned &Cur) {
+  unsigned New;
+  Cur = GlobalOffset[0].load();
+  do {
+    if (GlobalBuf.get_range().size() - Cur < Size)
+      // Overflow
+      return false;
+    New = Cur + Size;
+  } while (!GlobalOffset[0].compare_exchange_strong(Cur, New));
+  return true;
+}
+
+inline void flushBuffer(GlobalOffsetAccessorT &GlobalOffset,
+                        GlobalBufAccessorT &GlobalBuf,
+                        GlobalBufAccessorT &GlobalFlushBuf, unsigned &WIOffset,
+                        unsigned &Offset) {
+
+  unsigned Cur = 0;
+  if (!updateOffset(GlobalOffset, GlobalBuf, Offset, Cur))
+    return;
+
+  for (unsigned I = WIOffset; I < WIOffset + Offset; I++) {
+    GlobalBuf[Cur++] = GlobalFlushBuf[I];
+  }
+  // Reset the offset in the flush buffer
+  Offset = 0;
+}
+
+template <typename T, int VecLength>
+typename std::enable_if<(VecLength == 1), unsigned>::type
+VecToStr(const vec<T, VecLength> &Vec, char *VecStr, unsigned Flags, int Width,
+         int Precision) {
+  return ScalarToStr(static_cast<T>(Vec.x()), VecStr, Flags, Width, Precision);
+}
+
+template <typename T, int VecLength>
+typename std::enable_if<(VecLength == 2 || VecLength == 4 || VecLength == 8 ||
+                         VecLength == 16),
+                        unsigned>::type
+VecToStr(const vec<T, VecLength> &Vec, char *VecStr, unsigned Flags, int Width,
+         int Precision) {
+  unsigned Len =
+      VecToStr<T, VecLength / 2>(Vec.lo(), VecStr, Flags, Width, Precision);
+  Len += append(VecStr + Len, VEC_ELEMENT_DELIMITER);
+  Len += VecToStr<T, VecLength / 2>(Vec.hi(), VecStr + Len, Flags, Width,
+                                    Precision);
+  return Len;
+}
+
+template <typename T, int VecLength>
+typename std::enable_if<(VecLength == 3), unsigned>::type
+VecToStr(const vec<T, VecLength> &Vec, char *VecStr, unsigned Flags, int Width,
+         int Precision) {
+  unsigned Len = VecToStr<T, 2>(Vec.lo(), VecStr, Flags, Width, Precision);
+  Len += append(VecStr + Len, VEC_ELEMENT_DELIMITER);
+  Len += VecToStr<T, 1>(Vec.z(), VecStr + Len, Flags, Width, Precision);
+  return Len;
+}
+
+template <typename T, int VecLength>
+inline void writeVec(GlobalBufAccessorT &GlobalFlushBuf, size_t FlushBufferSize,
+                     unsigned WIOffset, unsigned &Offset, unsigned Flags,
+                     int Width, int Precision, const vec<T, VecLength> &Vec) {
+  // Reserve space for vector elements and delimiters
+  constexpr size_t MAX_VEC_SIZE =
+      MAX_FLOATING_POINT_DIGITS * VecLength + (VecLength - 1) * 2;
+  char VecStr[MAX_VEC_SIZE] = {0};
+  unsigned Len = VecToStr<T, VecLength>(Vec, VecStr, Flags, Width, Precision);
+  write(GlobalFlushBuf, FlushBufferSize, WIOffset, Offset, VecStr, Len,
+        (Width > 0 && Width > Len) ? Width - Len : 0);
+}
+
+template <int ArrayLength>
+inline unsigned ArrayToStr(char *Buf, const array<ArrayLength> &Arr) {
+  unsigned Len = 0;
+  Buf[Len++] = VEC_OPEN_BRACE;
+
+  for (int I = 0; I < ArrayLength; ++I) {
+    Len += ScalarToStr(Arr[I], Buf + Len, 0 /* No flags */, -1, -1);
+    if (I != ArrayLength - 1)
+      Len += append(Buf + Len, VEC_ELEMENT_DELIMITER);
+  }
+
+  Buf[Len++] = VEC_CLOSE_BRACE;
+
+  return Len;
+}
+
+template <int ArrayLength>
+inline void writeArray(GlobalBufAccessorT &GlobalFlushBuf,
+                       size_t FlushBufferSize, unsigned WIOffset,
+                       unsigned &Offset, const array<ArrayLength> &Arr) {
+  char Buf[MAX_ARRAY_SIZE];
+  unsigned Len = ArrayToStr(Buf, Arr);
+  write(GlobalFlushBuf, FlushBufferSize, WIOffset, Offset, Buf, Len);
+}
+
+template <int Dimensions>
+inline void writeItem(GlobalBufAccessorT &GlobalFlushBuf,
+                      size_t FlushBufferSize, unsigned WIOffset,
+                      unsigned &Offset, const item<Dimensions> &Item) {
+  // Reserve space for 3 arrays and additional place (40 symbols) for printing
+  // the text
+  char Buf[3 * MAX_ARRAY_SIZE + 40];
+  unsigned Len = 0;
+  Len += append(Buf, "item(");
+  Len += append(Buf + Len, "range: ");
+  Len += ArrayToStr(Buf + Len, Item.get_range());
+  Len += append(Buf + Len, ", id: ");
+  Len += ArrayToStr(Buf + Len, Item.get_id());
+  Len += append(Buf + Len, ", offset: ");
+  Len += ArrayToStr(Buf + Len, Item.get_offset());
+  Buf[Len++] = ')';
+  write(GlobalFlushBuf, FlushBufferSize, WIOffset, Offset, Buf, Len);
+}
+
+template <int Dimensions>
+inline void writeNDRange(GlobalBufAccessorT &GlobalFlushBuf,
+                         size_t FlushBufferSize, unsigned WIOffset,
+                         unsigned &Offset,
+                         const nd_range<Dimensions> &ND_Range) {
+  // Reserve space for 3 arrays and additional place (50 symbols) for printing
+  // the text
+  char Buf[3 * MAX_ARRAY_SIZE + 50];
+  unsigned Len = 0;
+  Len += append(Buf, "nd_range(");
+  Len += append(Buf + Len, "global_range: ");
+  Len += ArrayToStr(Buf + Len, ND_Range.get_global_range());
+  Len += append(Buf + Len, ", local_range: ");
+  Len += ArrayToStr(Buf + Len, ND_Range.get_local_range());
+  Len += append(Buf + Len, ", offset: ");
+  Len += ArrayToStr(Buf + Len, ND_Range.get_offset());
+  Buf[Len++] = ')';
+  write(GlobalFlushBuf, FlushBufferSize, WIOffset, Offset, Buf, Len);
+}
+
+template <int Dimensions>
+inline void writeNDItem(GlobalBufAccessorT &GlobalFlushBuf,
+                        size_t FlushBufferSize, unsigned WIOffset,
+                        unsigned &Offset, const nd_item<Dimensions> &ND_Item) {
+  // Reserve space for 2 arrays and additional place (40 symbols) for printing
+  // the text
+  char Buf[2 * MAX_ARRAY_SIZE + 40];
+  unsigned Len = 0;
+  Len += append(Buf, "nd_item(");
+  Len += append(Buf + Len, "global_id: ");
+  Len += ArrayToStr(Buf + Len, ND_Item.get_global_id());
+  Len += append(Buf + Len, ", local_id: ");
+  Len += ArrayToStr(Buf + Len, ND_Item.get_local_id());
+  Buf[Len++] = ')';
+  write(GlobalFlushBuf, FlushBufferSize, WIOffset, Offset, Buf, Len);
+}
+
+template <int Dimensions>
+inline void writeGroup(GlobalBufAccessorT &GlobalFlushBuf,
+                       size_t FlushBufferSize, unsigned WIOffset,
+                       unsigned &Offset, const group<Dimensions> &Group) {
+  // Reserve space for 4 arrays and additional place (60 symbols) for printing
+  // the text
+  char Buf[4 * MAX_ARRAY_SIZE + 60];
+  unsigned Len = 0;
+  Len += append(Buf, "group(");
+  Len += append(Buf + Len, "id: ");
+  Len += ArrayToStr(Buf + Len, Group.get_id());
+  Len += append(Buf + Len, ", global_range: ");
+  Len += ArrayToStr(Buf + Len, Group.get_global_range());
+  Len += append(Buf + Len, ", local_range: ");
+  Len += ArrayToStr(Buf + Len, Group.get_local_range());
+  Len += append(Buf + Len, ", group_range: ");
+  Len += ArrayToStr(Buf + Len, Group.get_group_range());
+  Buf[Len++] = ')';
+  write(GlobalFlushBuf, FlushBufferSize, WIOffset, Offset, Buf, Len);
+}
+
+// Space for 2 arrays and additional place (20 symbols) for printing
+// the text
+constexpr size_t MAX_ITEM_SIZE = 2 * MAX_ARRAY_SIZE + 20;
+
+template <int Dimensions>
+inline unsigned ItemToStr(char *Buf, const item<Dimensions, false> &Item) {
+  unsigned Len = 0;
+  Len += append(Buf, "item(");
+  for (int I = 0; I < 2; ++I) {
+    Len += append(Buf + Len, I == 0 ? "range: " : ", id: ");
+    Len += ArrayToStr(Buf + Len, I == 0 ? Item.get_range() : Item.get_id());
+  }
+  Buf[Len++] = ')';
+  return Len;
+}
+
+template <int Dimensions>
+inline void writeHItem(GlobalBufAccessorT &GlobalFlushBuf,
+                       size_t FlushBufferSize, unsigned WIOffset,
+                       unsigned &Offset, const h_item<Dimensions> &HItem) {
+  // Reserve space for 3 items and additional place (60 symbols) for printing
+  // the text
+  char Buf[3 * MAX_ITEM_SIZE + 60];
+  unsigned Len = 0;
+  Len += append(Buf, "h_item(");
+  for (int I = 0; I < 3; ++I) {
+    Len += append(Buf + Len, I == 0   ? "\n  global "
+                             : I == 1 ? "\n  logical local "
+                                      : "\n  physical local ");
+    Len += ItemToStr(Buf + Len, I == 0   ? HItem.get_global()
+                                : I == 1 ? HItem.get_logical_local()
+                                         : HItem.get_physical_local());
+  }
+  Len += append(Buf + Len, "\n)");
+  write(GlobalFlushBuf, FlushBufferSize, WIOffset, Offset, Buf, Len);
+}
+
+template <typename> struct IsSwizzleOp : std::false_type {};
+
+template <typename VecT, typename OperationLeftT, typename OperationRightT,
+          template <typename> class OperationCurrentT, int... Indexes>
+struct IsSwizzleOp<cl::sycl::detail::SwizzleOp<
+    VecT, OperationLeftT, OperationRightT, OperationCurrentT, Indexes...>>
+    : std::true_type {
+  using T = typename VecT::element_type;
+  using Type = typename cl::sycl::vec<T, (sizeof...(Indexes))>;
+};
+
+template <typename T>
+using EnableIfSwizzleVec =
+    typename std::enable_if<IsSwizzleOp<T>::value,
+                            typename IsSwizzleOp<T>::Type>::type;
+
+} // namespace detail
+
 enum class stream_manipulator {
   dec,
   hex,
@@ -123,16 +724,16 @@ class __SYCL_EXPORT stream {
 
   // Accessor to the global stream buffer. Global buffer contains all output
   // from the kernel.
-  mutable detail::stream_impl::GlobalBufAccessorT GlobalBuf;
+  mutable detail::GlobalBufAccessorT GlobalBuf;
 
   // Atomic accessor to the global offset variable. It represents an offset in
   // the global stream buffer. Since work items will flush data to global buffer
   // in parallel we need atomic access to this offset.
-  mutable detail::stream_impl::GlobalOffsetAccessorT GlobalOffset;
+  mutable detail::GlobalOffsetAccessorT GlobalOffset;
 
   // Accessor to the flush buffer. Each work item writes its
   // output to a designated section of the flush buffer.
-  mutable detail::stream_impl::GlobalBufAccessorT GlobalFlushBuf;
+  mutable detail::GlobalBufAccessorT GlobalFlushBuf;
 
   // Offset of the WI's flush buffer in the pool.
   mutable unsigned WIOffset = 0;
diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp
index 1490a38a0933c..c51635f462f60 100644
--- a/sycl/source/detail/scheduler/commands.cpp
+++ b/sycl/source/detail/scheduler/commands.cpp
@@ -14,7 +14,6 @@
 #include <CL/sycl/detail/cl.h>
 #include <CL/sycl/detail/kernel_desc.hpp>
 #include <CL/sycl/detail/memory_manager.hpp>
-#include <CL/sycl/detail/stream_impl.hpp>
 #include <CL/sycl/sampler.hpp>
 #include <detail/context_impl.hpp>
 #include <detail/event_impl.hpp>
@@ -25,6 +24,7 @@
 #include <detail/sampler_impl.hpp>
 #include <detail/scheduler/commands.hpp>
 #include <detail/scheduler/scheduler.hpp>
+#include <detail/stream_impl.hpp>
 
 #include <cassert>
 #include <string>
diff --git a/sycl/source/detail/stream_impl.cpp b/sycl/source/detail/stream_impl.cpp
index 76247c239728a..d041c88c8a629 100644
--- a/sycl/source/detail/stream_impl.cpp
+++ b/sycl/source/detail/stream_impl.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include <CL/sycl/detail/stream_impl.hpp>
+#include <detail/stream_impl.hpp>
 #include <detail/scheduler/scheduler.hpp>
 
 #include <cstdio>
diff --git a/sycl/source/detail/stream_impl.hpp b/sycl/source/detail/stream_impl.hpp
new file mode 100644
index 0000000000000..9091441ce4a4d
--- /dev/null
+++ b/sycl/source/detail/stream_impl.hpp
@@ -0,0 +1,81 @@
+//==----------------- stream_impl.hpp - SYCL standard header file ----------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <CL/sycl/accessor.hpp>
+#include <CL/sycl/buffer.hpp>
+#include <CL/sycl/detail/export.hpp>
+#include <CL/sycl/handler.hpp>
+#include <CL/sycl/range.hpp>
+#include <CL/sycl/stream.hpp>
+
+#include <vector>
+
+__SYCL_INLINE_NAMESPACE(cl) {
+namespace sycl {
+namespace detail {
+
+class __SYCL_EXPORT stream_impl {
+public:
+  stream_impl(size_t BufferSize, size_t MaxStatementSize, handler &CGH);
+
+  // Method to provide an access to the global stream buffer
+  GlobalBufAccessorT accessGlobalBuf(handler &CGH) {
+    return Buf.get_access<cl::sycl::access::mode::read_write>(
+        CGH, range<1>(BufferSize_), id<1>(OffsetSize));
+  }
+
+  // Method to provide an accessor to the global flush buffer
+  GlobalBufAccessorT accessGlobalFlushBuf(handler &CGH) {
+    return FlushBuf.get_access<cl::sycl::access::mode::read_write>(
+        CGH, range<1>(MaxStatementSize_), id<1>(0));
+  }
+
+  // Method to provide an atomic access to the offset in the global stream
+  // buffer
+  GlobalOffsetAccessorT accessGlobalOffset(handler &CGH) {
+    auto OffsetSubBuf = buffer<char, 1>(Buf, id<1>(0), range<1>(OffsetSize));
+    auto ReinterpretedBuf = OffsetSubBuf.reinterpret<unsigned, 1>(range<1>(1));
+    return ReinterpretedBuf.get_access<cl::sycl::access::mode::atomic>(
+        CGH, range<1>(1), id<1>(0));
+  }
+
+  // Copy stream buffer to the host and print the contents
+  void flush();
+
+  size_t get_size() const;
+
+  size_t get_max_statement_size() const;
+
+private:
+  // Size of the stream buffer
+  size_t BufferSize_;
+
+  // Maximum number of symbols which could be streamed from the beginning of a
+  // statement till the semicolon
+  unsigned MaxStatementSize_;
+
+  // Size of the variable which is used as an offset in the stream buffer.
+  // Additinonal memory is allocated in the beginning of the stream buffer for
+  // this variable.
+  static const size_t OffsetSize = sizeof(unsigned);
+
+  // Vector on the host side which is used to initialize the stream buffer
+  std::vector<char> Data;
+
+  // Stream buffer
+  buffer<char, 1> Buf;
+
+  // Global flush buffer
+  buffer<char, 1> FlushBuf;
+};
+
+} // namespace detail
+} // namespace sycl
+} // __SYCL_INLINE_NAMESPACE(cl)
diff --git a/sycl/source/stream.cpp b/sycl/source/stream.cpp
index d0378cbb3eb6c..88b332051c5db 100644
--- a/sycl/source/stream.cpp
+++ b/sycl/source/stream.cpp
@@ -8,6 +8,7 @@
 
 #include <CL/sycl/stream.hpp>
 #include <detail/queue_impl.hpp>
+#include <detail/stream_impl.hpp>
 
 __SYCL_INLINE_NAMESPACE(cl) {
 namespace sycl {
diff --git a/sycl/test/abi/symbol_size.cpp b/sycl/test/abi/symbol_size.cpp
index 0ecfe714a8f2d..40270190afc4f 100644
--- a/sycl/test/abi/symbol_size.cpp
+++ b/sycl/test/abi/symbol_size.cpp
@@ -8,7 +8,6 @@
 #include <CL/sycl/detail/accessor_impl.hpp>
 #include <CL/sycl/detail/buffer_impl.hpp>
 #include <CL/sycl/detail/image_impl.hpp>
-#include <CL/sycl/detail/stream_impl.hpp>
 #include <CL/sycl/device.hpp>
 #include <CL/sycl/device_event.hpp>
 #include <CL/sycl/device_selector.hpp>
@@ -73,7 +72,6 @@ int main() {
   check_size<range<1>, 8>();
   check_size<sampler, 16>();
   check_size<stream, 144>();
-  check_size<detail::stream_impl, 120>();
   check_size<queue, 16>();
 
   return 0;

From 3bfc38386c39f31aaf2bbe4cd17404f4808dfb90 Mon Sep 17 00:00:00 2001
From: Vlad Romanov <vlad.romanov@intel.com>
Date: Fri, 24 Jul 2020 02:05:22 +0300
Subject: [PATCH 2/7] adjust formatting

---
 sycl/include/CL/sycl/stream.hpp    | 12 ++++++------
 sycl/source/detail/stream_impl.cpp |  2 +-
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/sycl/include/CL/sycl/stream.hpp b/sycl/include/CL/sycl/stream.hpp
index da3151ad38cbe..ce01e6b233fe5 100644
--- a/sycl/include/CL/sycl/stream.hpp
+++ b/sycl/include/CL/sycl/stream.hpp
@@ -586,12 +586,12 @@ inline void writeHItem(GlobalBufAccessorT &GlobalFlushBuf,
   unsigned Len = 0;
   Len += append(Buf, "h_item(");
   for (int I = 0; I < 3; ++I) {
-    Len += append(Buf + Len, I == 0   ? "\n  global "
-                             : I == 1 ? "\n  logical local "
-                                      : "\n  physical local ");
-    Len += ItemToStr(Buf + Len, I == 0   ? HItem.get_global()
-                                : I == 1 ? HItem.get_logical_local()
-                                         : HItem.get_physical_local());
+    Len += append(Buf + Len, I == 0 ? "\n  global "
+                                    : I == 1 ? "\n  logical local "
+                                             : "\n  physical local ");
+    Len += ItemToStr(Buf + Len, I == 0 ? HItem.get_global()
+                                       : I == 1 ? HItem.get_logical_local()
+                                                : HItem.get_physical_local());
   }
   Len += append(Buf + Len, "\n)");
   write(GlobalFlushBuf, FlushBufferSize, WIOffset, Offset, Buf, Len);
diff --git a/sycl/source/detail/stream_impl.cpp b/sycl/source/detail/stream_impl.cpp
index d041c88c8a629..db79b83fd1408 100644
--- a/sycl/source/detail/stream_impl.cpp
+++ b/sycl/source/detail/stream_impl.cpp
@@ -6,8 +6,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include <detail/stream_impl.hpp>
 #include <detail/scheduler/scheduler.hpp>
+#include <detail/stream_impl.hpp>
 
 #include <cstdio>
 

From 869e1ef81f176c332f3cbb617d2c56c46f05bfab Mon Sep 17 00:00:00 2001
From: Sergey Semenov <sergey.semenov@intel.com>
Date: Wed, 22 Jul 2020 11:49:43 +0300
Subject: [PATCH 3/7] [SYCL] Fix undefined behaviour during graph cleanup

With the switch to using the command member field to mark nodes as
visited during graph traversal, deleting nodes during the traversal
leads to invalid memory access whenever there's a node with multiple
indirect dependencies on another node. This patch fixes the issue by
moving the deletion of the nodes to take place post-traversal.
---
 sycl/source/detail/scheduler/graph_builder.cpp | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp
index 8f4da1cb46a16..990def535c903 100644
--- a/sycl/source/detail/scheduler/graph_builder.cpp
+++ b/sycl/source/detail/scheduler/graph_builder.cpp
@@ -911,6 +911,7 @@ void Scheduler::GraphBuilder::cleanupCommandsForRecord(MemObjRecord *Record) {
 
 void Scheduler::GraphBuilder::cleanupFinishedCommands(Command *FinishedCmd) {
   std::queue<Command *> CmdsToVisit({FinishedCmd});
+  std::vector<Command *> CmdsToDelete;
   std::vector<Command *> Visited;
 
   // Traverse the graph using BFS
@@ -948,12 +949,16 @@ void Scheduler::GraphBuilder::cleanupFinishedCommands(Command *FinishedCmd) {
       Command *DepCmd = Dep.MDepCommand;
       DepCmd->MUsers.erase(Cmd);
     }
-    Cmd->getEvent()->setCommand(nullptr);
 
+    CmdsToDelete.push_back(Cmd);
     Visited.pop_back();
-    delete Cmd;
   }
   unmarkVisitedNodes(Visited);
+
+  for (Command *Cmd : CmdsToDelete) {
+    Cmd->getEvent()->setCommand(nullptr);
+    delete Cmd;
+  }
 }
 
 void Scheduler::GraphBuilder::removeRecordForMemObj(SYCLMemObjI *MemObject) {

From 08ba184e4c6547048287cb74be7708a4364ee137 Mon Sep 17 00:00:00 2001
From: Sergey Semenov <sergey.semenov@intel.com>
Date: Thu, 23 Jul 2020 10:28:24 +0300
Subject: [PATCH 4/7] Switch the solution to adding another mark to command
 nodes

This approach shows better performance with floating point reduction
which is cleanup-intensive.

Signed-off-by: Sergey Semenov <sergey.semenov@intel.com>
---
 sycl/source/detail/scheduler/commands.hpp     | 12 +++++-
 .../source/detail/scheduler/graph_builder.cpp | 41 +++++++++----------
 2 files changed, 29 insertions(+), 24 deletions(-)

diff --git a/sycl/source/detail/scheduler/commands.hpp b/sycl/source/detail/scheduler/commands.hpp
index 6c2330fed7061..01eb2beb08af9 100644
--- a/sycl/source/detail/scheduler/commands.hpp
+++ b/sycl/source/detail/scheduler/commands.hpp
@@ -219,8 +219,16 @@ class Command {
   bool MIsBlockable = false;
   /// Counts the number of memory objects this command is a leaf for.
   unsigned MLeafCounter = 0;
-  /// Used for marking the node as visited during graph traversal.
-  bool MVisited = false;
+
+  struct Marks {
+    /// Used for marking the node as visited during graph traversal.
+    bool MVisited = false;
+    /// Used for marking the node for deletion during cleanup.
+    bool MToBeDeleted = false;
+  };
+  /// Used for marking the node during graph traversal.
+  Marks MMarks;
+
 
   enum class BlockReason : int { HostAccessor = 0, HostTask };
 
diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp
index 990def535c903..2467471f827e6 100644
--- a/sycl/source/detail/scheduler/graph_builder.cpp
+++ b/sycl/source/detail/scheduler/graph_builder.cpp
@@ -93,16 +93,26 @@ Scheduler::GraphBuilder::GraphBuilder() {
 }
 
 static bool markNodeAsVisited(Command *Cmd, std::vector<Command *> &Visited) {
-  if (Cmd->MVisited)
+  if (Cmd->MMarks.MVisited)
     return false;
-  Cmd->MVisited = true;
+  Cmd->MMarks.MVisited = true;
   Visited.push_back(Cmd);
   return true;
 }
 
 static void unmarkVisitedNodes(std::vector<Command *> &Visited) {
   for (Command *Cmd : Visited)
-    Cmd->MVisited = false;
+    Cmd->MMarks.MVisited = false;
+}
+
+static void handleVisitedNodes(std::vector<Command *> &Visited) {
+  for (Command *Cmd : Visited) {
+    if (Cmd->MMarks.MToBeDeleted) {
+      Cmd->getEvent()->setCommand(nullptr);
+      delete Cmd;
+    }
+    Cmd->MMarks.MVisited = false;
+  }
 }
 
 static void printDotRecursive(std::fstream &Stream,
@@ -825,7 +835,6 @@ void Scheduler::GraphBuilder::cleanupCommandsForRecord(MemObjRecord *Record) {
 
   std::queue<Command *> ToVisit;
   std::vector<Command *> Visited;
-  std::vector<Command *> CmdsToDelete;
   // First, mark all allocas for deletion and their direct users for traversal
   // Dependencies of the users will be cleaned up during the traversal
   for (Command *AllocaCmd : AllocaCommands) {
@@ -839,7 +848,7 @@ void Scheduler::GraphBuilder::cleanupCommandsForRecord(MemObjRecord *Record) {
       else
         markNodeAsVisited(UserCmd, Visited);
 
-    CmdsToDelete.push_back(AllocaCmd);
+    AllocaCmd->MMarks.MToBeDeleted = true;
     // These commands will be deleted later, clear users now to avoid
     // updating them during edge removal
     AllocaCmd->MUsers.clear();
@@ -851,7 +860,7 @@ void Scheduler::GraphBuilder::cleanupCommandsForRecord(MemObjRecord *Record) {
     AllocaCommandBase *LinkedCmd = AllocaCmd->MLinkedAllocaCmd;
 
     if (LinkedCmd) {
-      assert(LinkedCmd->MVisited);
+      assert(LinkedCmd->MMarks.MVisited);
 
       for (DepDesc &Dep : AllocaCmd->MDeps)
         if (Dep.MDepCommand)
@@ -896,22 +905,16 @@ void Scheduler::GraphBuilder::cleanupCommandsForRecord(MemObjRecord *Record) {
     // If all dependencies have been removed this way, mark the command for
     // deletion
     if (Cmd->MDeps.empty()) {
-      CmdsToDelete.push_back(Cmd);
+      Cmd->MMarks.MToBeDeleted = true;
       Cmd->MUsers.clear();
     }
   }
 
-  unmarkVisitedNodes(Visited);
-
-  for (Command *Cmd : CmdsToDelete) {
-    Cmd->getEvent()->setCommand(nullptr);
-    delete Cmd;
-  }
+  handleVisitedNodes(Visited);
 }
 
 void Scheduler::GraphBuilder::cleanupFinishedCommands(Command *FinishedCmd) {
   std::queue<Command *> CmdsToVisit({FinishedCmd});
-  std::vector<Command *> CmdsToDelete;
   std::vector<Command *> Visited;
 
   // Traverse the graph using BFS
@@ -950,15 +953,9 @@ void Scheduler::GraphBuilder::cleanupFinishedCommands(Command *FinishedCmd) {
       DepCmd->MUsers.erase(Cmd);
     }
 
-    CmdsToDelete.push_back(Cmd);
-    Visited.pop_back();
-  }
-  unmarkVisitedNodes(Visited);
-
-  for (Command *Cmd : CmdsToDelete) {
-    Cmd->getEvent()->setCommand(nullptr);
-    delete Cmd;
+    Cmd->MMarks.MToBeDeleted = true;
   }
+  handleVisitedNodes(Visited);
 }
 
 void Scheduler::GraphBuilder::removeRecordForMemObj(SYCLMemObjI *MemObject) {

From 144321160f8b037f5c01d4dbeb2bbb3fe4071ef1 Mon Sep 17 00:00:00 2001
From: Sergey Semenov <sergey.semenov@intel.com>
Date: Thu, 23 Jul 2020 22:01:39 +0300
Subject: [PATCH 5/7] Fix handleVisitedNodes

Signed-off-by: Sergey Semenov <sergey.semenov@intel.com>
---
 sycl/source/detail/scheduler/graph_builder.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp
index 2467471f827e6..46bfdb212f6d4 100644
--- a/sycl/source/detail/scheduler/graph_builder.cpp
+++ b/sycl/source/detail/scheduler/graph_builder.cpp
@@ -110,8 +110,8 @@ static void handleVisitedNodes(std::vector<Command *> &Visited) {
     if (Cmd->MMarks.MToBeDeleted) {
       Cmd->getEvent()->setCommand(nullptr);
       delete Cmd;
-    }
-    Cmd->MMarks.MVisited = false;
+    } else
+      Cmd->MMarks.MVisited = false;
   }
 }
 

From 5f85af5957e1818e4915e271e2c19b2bf3714522 Mon Sep 17 00:00:00 2001
From: Sergey Semenov <sergey.semenov@intel.com>
Date: Fri, 24 Jul 2020 07:35:26 +0300
Subject: [PATCH 6/7] Appease clang-format check

Signed-off-by: Sergey Semenov <sergey.semenov@intel.com>
---
 sycl/source/detail/scheduler/commands.hpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/sycl/source/detail/scheduler/commands.hpp b/sycl/source/detail/scheduler/commands.hpp
index 01eb2beb08af9..984ba3222156e 100644
--- a/sycl/source/detail/scheduler/commands.hpp
+++ b/sycl/source/detail/scheduler/commands.hpp
@@ -229,7 +229,6 @@ class Command {
   /// Used for marking the node during graph traversal.
   Marks MMarks;
 
-
   enum class BlockReason : int { HostAccessor = 0, HostTask };
 
   // Only have reasonable value while MIsBlockable is true

From 3496f581a2b9c2cd6ab583abc317dab303e82d1c Mon Sep 17 00:00:00 2001
From: Vlad Romanov <vlad.romanov@intel.com>
Date: Fri, 24 Jul 2020 14:18:14 +0300
Subject: [PATCH 7/7] reorder includes

---
 sycl/include/CL/sycl/stream.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sycl/include/CL/sycl/stream.hpp b/sycl/include/CL/sycl/stream.hpp
index ce01e6b233fe5..20cdef064b2ef 100644
--- a/sycl/include/CL/sycl/stream.hpp
+++ b/sycl/include/CL/sycl/stream.hpp
@@ -8,10 +8,10 @@
 
 #pragma once
 
+#include <CL/sycl/builtins.hpp>
 #include <CL/sycl/detail/defines.hpp>
 #include <CL/sycl/detail/export.hpp>
 #include <CL/sycl/handler.hpp>
-#include <CL/sycl/builtins.hpp>
 
 __SYCL_INLINE_NAMESPACE(cl) {
 namespace sycl {