diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6769952d..ffa8a63d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -84,6 +84,7 @@ register_model(hip HIP HIPStream.cpp)
 register_model(cuda CUDA CUDAStream.cu)
 register_model(kokkos KOKKOS KokkosStream.cpp)
 register_model(sycl SYCL SYCLStream.cpp)
+register_model(syclusm SYCLUSM SYCLStream.cpp)
 register_model(sycl2020 SYCL2020 SYCLStream2020.cpp)
 register_model(acc ACC ACCStream.cpp)
 # defining RAJA collides with the RAJA namespace so USE_RAJA
diff --git a/src/cuda/CUDAStream.cu b/src/cuda/CUDAStream.cu
index 778a0445..a0d8f259 100644
--- a/src/cuda/CUDAStream.cu
+++ b/src/cuda/CUDAStream.cu
@@ -8,6 +8,8 @@
 
 #include "CUDAStream.h"
 
+int DOT_NUM_BLOCKS;
+
 void check_error(void)
 {
   cudaError_t err = cudaGetLastError();
@@ -38,10 +40,16 @@ CUDAStream<T>::CUDAStream(const int ARRAY_SIZE, const int device_index)
     throw std::runtime_error("Invalid device index");
   cudaSetDevice(device_index);
   check_error();
+  cudaDeviceProp prop;
+  cudaGetDeviceProperties(&prop, device_index);
+  check_error();
 
   // Print out device information
   std::cout << "Using CUDA device " << getDeviceName(device_index) << std::endl;
   std::cout << "Driver: " << getDeviceDriver(device_index) << std::endl;
+  DOT_NUM_BLOCKS = 4 * prop.multiProcessorCount;
+  std::cout << "dot " << DOT_NUM_BLOCKS << " " << TBSIZE << " " << array_size
+    << "\n";
 
   array_size = ARRAY_SIZE;
 
diff --git a/src/cuda/CUDAStream.h b/src/cuda/CUDAStream.h
index 83b8c665..2f19eeba 100644
--- a/src/cuda/CUDAStream.h
+++ b/src/cuda/CUDAStream.h
@@ -22,7 +22,7 @@
 #endif
 
 #define TBSIZE 1024
-#define DOT_NUM_BLOCKS 256
+extern int DOT_NUM_BLOCKS;
 
 template <class T>
 class CUDAStream : public Stream<T>
diff --git a/src/hip/HIPStream.cpp b/src/hip/HIPStream.cpp
index 6aed1ee1..44a20e15 100644
--- a/src/hip/HIPStream.cpp
+++ b/src/hip/HIPStream.cpp
@@ -8,8 +8,8 @@
 #include "HIPStream.h"
 #include "hip/hip_runtime.h"
 
-#define TBSIZE 1024
-#define DOT_NUM_BLOCKS 256
+#define TBSIZE 256
+int DOT_NUM_BLOCKS;
 
 void check_error(void)
 {
@@ -42,9 +42,16 @@ HIPStream<T>::HIPStream(const int ARRAY_SIZE, const int device_index)
   hipSetDevice(device_index);
   check_error();
 
+  // get properties
+  hipDeviceProp_t props;
+  hipGetDeviceProperties(&props, 0);
+  DOT_NUM_BLOCKS = props.multiProcessorCount * 4;
+
   // Print out device information
   std::cout << "Using HIP device " << getDeviceName(device_index) << std::endl;
   std::cout << "Driver: " << getDeviceDriver(device_index) << std::endl;
+  std::cout << "DOT_NUM_BLOCKS: " << DOT_NUM_BLOCKS << " TBSIZE " <<
+	  TBSIZE << "\n";
 
   array_size = ARRAY_SIZE;
 
@@ -52,9 +59,7 @@ HIPStream<T>::HIPStream(const int ARRAY_SIZE, const int device_index)
   sums = (T*)malloc(sizeof(T) * DOT_NUM_BLOCKS);
 
   // Check buffers fit on the device
-  hipDeviceProp_t props;
-  hipGetDeviceProperties(&props, 0);
-  if (props.totalGlobalMem < std::size_t{3}*ARRAY_SIZE*sizeof(T))
+  if (props.totalGlobalMem < 3*ARRAY_SIZE*sizeof(T))
     throw std::runtime_error("Device does not have enough memory for all 3 buffers");
 
   // Create device buffers
diff --git a/src/main.cpp b/src/main.cpp
index 3035da0c..a30969ba 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -47,6 +47,8 @@
 #include "SYCLStream.h"
 #elif defined(SYCL2020)
 #include "SYCLStream2020.h"
+#elif defined(SYCLUSM)
+#include "SYCLStreamUSM.h"
 #elif defined(OMP)
 #include "OMPStream.h"
 #endif
@@ -290,7 +292,7 @@ void run()
   // Use the OpenACC implementation
   stream = new ACCStream<T>(ARRAY_SIZE, deviceIndex);
 
-#elif defined(SYCL) || defined(SYCL2020)
+#elif defined(SYCL) || defined(SYCL2020) || defined(SYCLUSM)
   // Use the SYCL implementation
   stream = new SYCLStream<T>(ARRAY_SIZE, deviceIndex);
 
diff --git a/src/sycl/SYCLStream.cpp b/src/sycl/SYCLStream.cpp
index 00c043f9..76d18cbc 100644
--- a/src/sycl/SYCLStream.cpp
+++ b/src/sycl/SYCLStream.cpp
@@ -45,7 +45,7 @@ SYCLStream<T>::SYCLStream(const int ARRAY_SIZE, const int device_index)
   else
   {
     dot_num_groups = dev.get_info<info::device::max_compute_units>() * 4;
-    dot_wgsize     = dev.get_info<info::device::max_work_group_size>();
+    dot_wgsize     = 256;       // good for AMD, doesn't hurt elsewhere
   }
 
   // Print out device information
diff --git a/src/syclusm/SYCLStream.cpp b/src/syclusm/SYCLStream.cpp
new file mode 100644
index 00000000..6a209cfa
--- /dev/null
+++ b/src/syclusm/SYCLStream.cpp
@@ -0,0 +1,330 @@
+
+// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith,
+// University of Bristol HPC
+//
+// For full license terms please see the LICENSE file distributed with this
+// source code
+
+#include "SYCLStreamUSM.h"
+
+#include <iostream>
+
+using namespace cl::sycl;
+
+// Cache list of devices
+bool cached = false;
+std::vector<device> devices;
+void getDeviceList(void);
+
+template <class T>
+SYCLStream<T>::SYCLStream(const int ARRAY_SIZE, const int device_index)
+{
+  if (!cached)
+    getDeviceList();
+
+  array_size = ARRAY_SIZE;
+
+  if (device_index >= devices.size())
+    throw std::runtime_error("Invalid device index");
+  device dev = devices[device_index];
+
+  // Check device can support FP64 if needed
+  if (sizeof(T) == sizeof(double))
+  {
+    if (dev.get_info<info::device::double_fp_config>().size() == 0) {
+      throw std::runtime_error("Device does not support double precision, please use --float");
+    }
+  }
+
+  // Determine sensible dot kernel NDRange configuration
+  if (dev.is_cpu())
+  {
+    dot_num_groups = dev.get_info<info::device::max_compute_units>();
+    dot_wgsize     = dev.get_info<info::device::native_vector_width_double>() * 2;
+  }
+  else
+  {
+    dot_num_groups = dev.get_info<info::device::max_compute_units>() * 4;
+    //dot_wgsize     = dev.get_info<info::device::max_work_group_size>();
+    dot_wgsize     = 256;
+  }
+
+  // Print out device information
+  std::cout << "Using SYCL device " << getDeviceName(device_index) << std::endl;
+  std::cout << "Driver: " << getDeviceDriver(device_index) << std::endl;
+  std::cout << "Reduction kernel config: " << dot_num_groups << " groups of size " << dot_wgsize << std::endl;
+
+  queue = new cl::sycl::queue(dev, cl::sycl::async_handler{[&](cl::sycl::exception_list l)
+  {
+    bool error = false;
+    for(auto e: l)
+    {
+      try
+      {
+        std::rethrow_exception(e);
+      }
+      catch (cl::sycl::exception e)
+      {
+        std::cout << e.what();
+        error = true;
+      }
+    }
+    if(error)
+    {
+      throw std::runtime_error("SYCL errors detected");
+    }
+  }});
+  
+  // Create device memory
+  d_a = static_cast<T *>(malloc_device(array_size * sizeof(T), *queue));
+  d_b = static_cast<T *>(malloc_device(array_size * sizeof(T), *queue));
+  d_c = static_cast<T *>(malloc_device(array_size * sizeof(T), *queue));
+  d_sum = static_cast<T *>(malloc_device(dot_num_groups * sizeof(T), *queue));
+
+  // create host copy of partial sums
+  h_sum = static_cast<T *>(malloc(dot_num_groups * sizeof(T)));
+}
+
+template <class T>
+SYCLStream<T>::~SYCLStream()
+{
+  sycl::free(d_a, *queue);
+  sycl::free(d_b, *queue);
+  sycl::free(d_c, *queue);
+  sycl::free(d_sum, *queue);
+  free(h_sum);
+  delete queue;
+  devices.clear();
+}
+
+template <class T>
+void SYCLStream<T>::copy()
+{
+  T *l_c = d_c;
+  const T *l_a = d_a;
+  queue->submit([&](handler &cgh)
+  {
+    cgh.parallel_for<copy_kernel>(range<1>{array_size}, [=](id<1> idx)
+    {
+      l_c[idx] = l_a[idx];
+    });
+  });
+  queue->wait();
+}
+
+template <class T>
+void SYCLStream<T>::mul()
+{
+  const T scalar = startScalar;
+  T *l_b = d_b;
+  const T *l_c = d_c;
+  queue->submit([&](handler &cgh)
+  {
+    cgh.parallel_for<mul_kernel>(range<1>{array_size}, [=](id<1> idx)
+    {
+      l_b[idx] = scalar * l_c[idx];
+    });
+  });
+  queue->wait();
+}
+
+template <class T>
+void SYCLStream<T>::add()
+{
+  const T *l_a = d_a;
+  const T *l_b = d_b;
+  T *l_c = d_c;
+  queue->submit([&](handler &cgh)
+  {
+    cgh.parallel_for<add_kernel>(range<1>{array_size}, [=](id<1> idx)
+    {
+      l_c[idx] = l_a[idx] + l_b[idx];
+    });
+  });
+  queue->wait();
+}
+
+template <class T>
+void SYCLStream<T>::triad()
+{
+  const T scalar = startScalar;
+  T *l_a = d_a;
+  const T *l_b = d_b;
+  const T *l_c = d_c;
+  queue->submit([&](handler &cgh)
+  {
+    cgh.parallel_for<triad_kernel>(range<1>{array_size}, [=](id<1> idx)
+    {
+      l_a[idx] = l_b[idx] + scalar * l_c[idx];
+    });
+  });
+  queue->wait();
+}
+
+template <class T>
+void SYCLStream<T>::nstream()
+{
+  const T scalar = startScalar;
+  T *l_a = d_a;
+  const T *l_b = d_b;
+  const T *l_c = d_c;
+  queue->submit([&](handler &cgh)
+  {
+    cgh.parallel_for<nstream_kernel>(range<1>{array_size}, [=](id<1> idx)
+    {
+      l_a[idx] += l_b[idx] + scalar * l_c[idx];
+    });
+  });
+  queue->wait();
+}
+
+template <class T>
+T SYCLStream<T>::dot()
+{
+  const T *l_a = d_a;
+  const T *l_b = d_b;
+  T *l_sum = d_sum;
+  queue->submit([&](handler &cgh)
+  {
+    auto wg_sum = accessor<T, 1, 
+        access::mode::read_write,
+        access::target::local>(range<1>(dot_wgsize), cgh);
+
+    size_t N = array_size;
+    cgh.parallel_for<dot_kernel>(nd_range<1>(
+        dot_num_groups*dot_wgsize, dot_wgsize),
+        [=](nd_item<1> item)
+    {
+      size_t i = item.get_global_id(0);
+      size_t li = item.get_local_id(0);
+      size_t global_size = item.get_global_range()[0];
+
+      wg_sum[li] = 0.0;
+      for (; i < N; i += global_size)
+        wg_sum[li] += l_a[i] * l_b[i];
+
+      size_t local_size = item.get_local_range()[0];
+      for (int offset = local_size / 2; offset > 0; offset /= 2)
+      {
+        item.barrier(cl::sycl::access::fence_space::local_space);
+        if (li < offset)
+          wg_sum[li] += wg_sum[li + offset];
+      }
+
+      if (li == 0)
+        l_sum[item.get_group(0)] = wg_sum[0];
+    });
+  });
+
+  queue->memcpy(h_sum, l_sum, sizeof(T) * dot_num_groups);
+  queue->wait();
+
+  T sum = 0.0;
+  for (int i = 0; i < dot_num_groups; i++)
+  {
+    sum += h_sum[i];
+  }
+
+  return sum;
+}
+
+template <class T>
+void SYCLStream<T>::init_arrays(T initA, T initB, T initC)
+{
+  T *l_a = d_a;
+  T *l_b = d_b;
+  T *l_c = d_c;
+  queue->submit([&](handler &cgh)
+  {
+    cgh.parallel_for<init_kernel>(range<1>{array_size}, [=](item<1> item)
+    {
+      auto id = item.get_id(0);
+      l_a[id] = initA;
+      l_b[id] = initB;
+      l_c[id] = initC;
+    });
+  });
+  queue->wait();
+}
+
+template <class T>
+void SYCLStream<T>::read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c)
+{
+  const T *l_a = d_a;
+  const T *l_b = d_b;
+  const T *l_c = d_c;
+  queue->memcpy(a.data(), l_a, sizeof(T) * array_size);
+  queue->memcpy(b.data(), l_b, sizeof(T) * array_size);
+  queue->memcpy(c.data(), l_c, sizeof(T) * array_size);
+  queue->wait();
+}
+
+void getDeviceList(void)
+{
+  // Ask SYCL runtime for all devices in system
+  devices = cl::sycl::device::get_devices();
+  cached = true;
+}
+
+void listDevices(void)
+{
+  getDeviceList();
+
+  // Print device names
+  if (devices.size() == 0)
+  {
+    std::cerr << "No devices found." << std::endl;
+  }
+  else
+  {
+    std::cout << std::endl;
+    std::cout << "Devices:" << std::endl;
+    for (int i = 0; i < devices.size(); i++)
+    {
+      std::cout << i << ": " << getDeviceName(i) << std::endl;
+    }
+    std::cout << std::endl;
+  }
+}
+
+std::string getDeviceName(const int device)
+{
+  if (!cached)
+    getDeviceList();
+
+  std::string name;
+
+  if (device < devices.size())
+  {
+    name = devices[device].get_info<info::device::name>();
+  }
+  else
+  {
+    throw std::runtime_error("Error asking for name for non-existant device");
+  }
+
+  return name;
+}
+
+std::string getDeviceDriver(const int device)
+{
+  if (!cached)
+    getDeviceList();
+
+  std::string driver;
+
+  if (device < devices.size())
+  {
+    driver = devices[device].get_info<info::device::driver_version>();
+  }
+  else
+  {
+    throw std::runtime_error("Error asking for driver for non-existant device");
+  }
+
+  return driver;
+}
+
+// TODO: Fix kernel names to allow multiple template specializations
+template class SYCLStream<float>;
+template class SYCLStream<double>;
diff --git a/src/syclusm/SYCLStreamUSM.h b/src/syclusm/SYCLStreamUSM.h
new file mode 100644
index 00000000..5c7bfb1d
--- /dev/null
+++ b/src/syclusm/SYCLStreamUSM.h
@@ -0,0 +1,76 @@
+
+// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith,
+// University of Bristol HPC
+//
+// For full license terms please see the LICENSE file distributed with this
+// source code
+
+#pragma once
+
+#include <sstream>
+
+#include "Stream.h"
+#include "CL/sycl.hpp"
+
+#define IMPLEMENTATION_STRING "SYCLUSM"
+
+namespace sycl_kernels
+{
+  template <class T> class init;
+  template <class T> class copy;
+  template <class T> class mul;
+  template <class T> class add;
+  template <class T> class triad;
+  template <class T> class nstream;
+  template <class T> class dot;
+}
+
+template <class T>
+class SYCLStream : public Stream<T>
+{
+  protected:
+    // Size of arrays
+    size_t array_size;
+
+    // SYCL objects
+    cl::sycl::queue *queue;
+    T *d_a;
+    T *d_b;
+    T *d_c;
+    T *d_sum;
+
+    // host copy of partial sums
+    T *h_sum;
+
+    // SYCL kernel names
+    typedef sycl_kernels::init<T> init_kernel;
+    typedef sycl_kernels::copy<T> copy_kernel;
+    typedef sycl_kernels::mul<T> mul_kernel;
+    typedef sycl_kernels::add<T> add_kernel;
+    typedef sycl_kernels::triad<T> triad_kernel;
+    typedef sycl_kernels::nstream<T> nstream_kernel;
+    typedef sycl_kernels::dot<T> dot_kernel;
+
+    // NDRange configuration for the dot kernel
+    size_t dot_num_groups;
+    size_t dot_wgsize;
+
+  public:
+
+    SYCLStream(const int, const int);
+    ~SYCLStream();
+
+    virtual void copy() override;
+    virtual void add() override;
+    virtual void mul() override;
+    virtual void triad() override;
+    virtual void nstream() override;
+    virtual T    dot() override;
+
+    virtual void init_arrays(T initA, T initB, T initC) override;
+    virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
+
+};
+
+// Populate the devices list
+void getDeviceList(void);
diff --git a/src/syclusm/model.cmake b/src/syclusm/model.cmake
new file mode 100644
index 00000000..e7b5a1c5
--- /dev/null
+++ b/src/syclusm/model.cmake
@@ -0,0 +1,86 @@
+
+register_flag_optional(CMAKE_CXX_COMPILER
+        "Any CXX compiler that is supported by CMake detection, this is used for host compilation when required by the SYCL compiler"
+        "c++")
+
+register_flag_required(SYCL_COMPILER
+        "Compile using the specified SYCL compiler implementation
+        Supported values are
+           ONEAPI-DPCPP - dpc++ that is part of an oneAPI Base Toolkit distribution (https://software.intel.com/content/www/us/en/develop/tools/oneapi/base-toolkit.html)
+           DPCPP        - dpc++ as a standalone compiler (https://github.com/intel/llvm)
+           HIPSYCL      - hipSYCL compiler (https://github.com/illuhad/hipSYCL)
+           COMPUTECPP   - ComputeCpp compiler (https://developer.codeplay.com/products/computecpp/ce/home)")
+
+register_flag_optional(SYCL_COMPILER_DIR
+        "Absolute path to the selected SYCL compiler directory, most are packaged differently so set the path according to `SYCL_COMPILER`:
+           ONEAPI-DPCPP             - not required but `dpcpp` must be on PATH, load oneAPI as per documentation (i.e `source /opt/intel/oneapi/setvars.sh` first)
+           HIPSYCL|DPCPP|COMPUTECPP - set to the root of the binary distribution that contains at least `bin/`, `include/`, and `lib/`"
+        "")
+
+register_flag_optional(OpenCL_LIBRARY
+        "[ComputeCpp only] Path to OpenCL library, usually called libOpenCL.so"
+        "${OpenCL_LIBRARY}")
+
+macro(setup)
+    set(CMAKE_CXX_STANDARD 17)
+
+
+    if (${SYCL_COMPILER} STREQUAL "HIPSYCL")
+
+
+        set(hipSYCL_DIR ${SYCL_COMPILER_DIR}/lib/cmake/hipSYCL)
+
+        if (NOT EXISTS "${hipSYCL_DIR}")
+            message(WARNING "Falling back to hipSYCL < 0.9.0 CMake structure")
+            set(hipSYCL_DIR ${SYCL_COMPILER_DIR}/lib/cmake)
+        endif ()
+        if (NOT EXISTS "${hipSYCL_DIR}")
+            message(FATAL_ERROR "Can't find the appropriate CMake definitions for hipSYCL")
+        endif ()
+
+        # register_definitions(_GLIBCXX_USE_CXX11_ABI=0)
+        find_package(hipSYCL CONFIG REQUIRED)
+        message(STATUS "ok")
+
+    elseif (${SYCL_COMPILER} STREQUAL "COMPUTECPP")
+
+        list(APPEND CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/cmake/Modules)
+        set(ComputeCpp_DIR ${SYCL_COMPILER_DIR})
+
+        setup_opencl_header_includes()
+
+        register_definitions(CL_TARGET_OPENCL_VERSION=220 _GLIBCXX_USE_CXX11_ABI=0)
+        # ComputeCpp needs OpenCL
+        find_package(ComputeCpp REQUIRED)
+
+        # this must come after FindComputeCpp (!)
+        set(COMPUTECPP_USER_FLAGS -O3 -no-serial-memop)
+
+    elseif (${SYCL_COMPILER} STREQUAL "DPCPP")
+        set(CMAKE_CXX_COMPILER ${SYCL_COMPILER_DIR}/bin/clang++)
+        include_directories(${SYCL_COMPILER_DIR}/include/sycl)
+        register_definitions(CL_TARGET_OPENCL_VERSION=220)
+        register_append_cxx_flags(ANY -fsycl)
+        register_append_link_flags(-fsycl)
+    elseif (${SYCL_COMPILER} STREQUAL "ONEAPI-DPCPP")
+        set(CMAKE_CXX_COMPILER dpcpp)
+        register_definitions(CL_TARGET_OPENCL_VERSION=220)
+    else ()
+        message(FATAL_ERROR "SYCL_COMPILER=${SYCL_COMPILER} is unsupported")
+    endif ()
+
+endmacro()
+
+
+macro(setup_target NAME)
+    if (
+    (${SYCL_COMPILER} STREQUAL "COMPUTECPP") OR
+    (${SYCL_COMPILER} STREQUAL "HIPSYCL"))
+        # so ComputeCpp and hipSYCL has this weird (and bad) CMake usage where they append their
+        # own custom integration header flags AFTER the target has been specified
+        # hence this macro here
+        add_sycl_to_target(
+                TARGET ${NAME}
+                SOURCES ${IMPL_SOURCES})
+    endif ()
+endmacro()