diff --git a/CMakeLists.txt b/CMakeLists.txt
index c03f5c78..88f9b732 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -154,9 +154,7 @@ include(cmake/register_models.cmake)
 register_model(serial SERIAL SerialStream.cpp)
 register_model(omp OMP OMPStream.cpp)
 register_model(ocl OCL OCLStream.cpp)
-register_model(std-data STD_DATA STDDataStream.cpp)
-register_model(std-indices STD_INDICES STDIndicesStream.cpp)
-register_model(std-ranges STD_RANGES STDRangesStream.cpp)
+register_model(std STD STDStream.cpp)
 register_model(hip HIP HIPStream.cpp)
 register_model(cuda CUDA CUDAStream.cu)
 register_model(kokkos KOKKOS KokkosStream.cpp)
diff --git a/src/Stream.h b/src/Stream.h
index c8c6af1c..f233f54a 100644
--- a/src/Stream.h
+++ b/src/Stream.h
@@ -7,11 +7,14 @@
 
 #pragma once
 
+#include <cstdint>
 #include <array>
 #include <vector>
 #include <string>
 #include "benchmark.h"
 
+using std::intptr_t;
+
 template <class T>
 class Stream
 {
diff --git a/src/StreamModels.h b/src/StreamModels.h
index b13d5b34..820c08a4 100644
--- a/src/StreamModels.h
+++ b/src/StreamModels.h
@@ -3,12 +3,8 @@
 
 #if defined(CUDA)
 #include "CUDAStream.h"
-#elif defined(STD_DATA)
-#include "STDDataStream.h"
-#elif defined(STD_INDICES)
-#include "STDIndicesStream.h"
-#elif defined(STD_RANGES)
-#include "STDRangesStream.hpp"
+#elif defined(STD)
+#include "STDStream.h"
 #elif defined(TBB)
 #include "TBBStream.hpp"
 #elif defined(THRUST)
@@ -63,17 +59,9 @@ std::unique_ptr<Stream<T>> make_stream(Args... args) {
   // Use the Kokkos implementation
   return std::make_unique<KokkosStream<T>>(args...);
 
-#elif defined(STD_DATA)
+#elif defined(STD)
   // Use the C++ STD data-oriented implementation
-  return std::make_unique<STDDataStream<T>>(args...);
-
-#elif defined(STD_INDICES)
-  // Use the C++ STD index-oriented implementation
-  return std::make_unique<STDIndicesStream<T>>(args...);
-
-#elif defined(STD_RANGES)
-  // Use the C++ STD ranges implementation
-  return std::make_unique<STDRangesStream<T>>(args...);
+  return std::make_unique<STDStream<T>>(args...);
 
 #elif defined(TBB)
   // Use the C++20 implementation
diff --git a/src/ci-test-compile.sh b/src/ci-test-compile.sh
index 57b89afb..bb6d5fa0 100755
--- a/src/ci-test-compile.sh
+++ b/src/ci-test-compile.sh
@@ -152,9 +152,9 @@ build_gcc() {
       *)   dpl_conditional_flags="-DFETCH_ONEDPL=ON -DFETCH_TBB=ON -DUSE_TBB=ON -DCXX_EXTRA_FLAGS=-D_GLIBCXX_USE_TBB_PAR_BACKEND=0" ;;
     esac
     # some distributions like Ubuntu bionic implements std par with TBB, so conditionally link it here
-    run_build $name "${GCC_CXX:?}" std-data    "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl"
-    run_build $name "${GCC_CXX:?}" std-indices "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl"
-    run_build $name "${GCC_CXX:?}" std-ranges  "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl"
+    run_build $name "${GCC_CXX:?}" std    "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl -DSTDIMPL=DATA17"
+    run_build $name "${GCC_CXX:?}" std    "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl -DSTDIMPL=DATA20"
+    run_build $name "${GCC_CXX:?}" std    "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl -DSTDIMPL=INDICES"
   done
 
   run_build $name "${GCC_CXX:?}" tbb "$cxx -DONE_TBB_DIR=$TBB_LIB"
@@ -251,9 +251,10 @@ build_clang() {
       OFF) dpl_conditional_flags="-DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" ;;
       *)   dpl_conditional_flags="-DFETCH_ONEDPL=ON -DFETCH_TBB=ON -DUSE_TBB=ON -DCXX_EXTRA_FLAGS=-D_GLIBCXX_USE_TBB_PAR_BACKEND=0"  ;;
     esac
-    run_build $name "${CLANG_CXX:?}" std-data     "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl"
-    run_build $name "${CLANG_CXX:?}" std-indices  "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl"
-    # run_build $name "${CLANG_CXX:?}" std-ranges "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl" # not yet supported
+    run_build $name "${CLANG_CXX:?}" std  "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl -DSTDIMPL=DATA17"
+    # Requires GCC 14
+    # run_build $name "${CLANG_CXX:?}" std  "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl -DSTDIMPL=DATA20"
+    run_build $name "${CLANG_CXX:?}" std  "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl -DSTDIMPL=INDICES"    
   done
 
   run_build $name "${CLANG_CXX:?}" tbb "$cxx -DONE_TBB_DIR=$TBB_LIB"
@@ -270,8 +271,10 @@ build_clang() {
 build_nvhpc() {
   local name="nvhpc_build"
   local cxx="-DCMAKE_CXX_COMPILER=${NVHPC_NVCXX:?}"
-  run_build $name "${NVHPC_NVCXX:?}" std-data "$cxx -DNVHPC_OFFLOAD=$NV_ARCH_CCXY"
-  run_build $name "${NVHPC_NVCXX:?}" std-indices "$cxx -DNVHPC_OFFLOAD=$NV_ARCH_CCXY"
+  run_build $name "${NVHPC_NVCXX:?}" std "$cxx -DNVHPC_OFFLOAD=$NV_ARCH_CCXY -DSTDIMPL=DATA17"
+  # Requires GCC 14
+  # run_build $name "${NVHPC_NVCXX:?}" std "$cxx -DNVHPC_OFFLOAD=$NV_ARCH_CCXY -DSTDIMPL=DATA20"
+  run_build $name "${NVHPC_NVCXX:?}" std "$cxx -DNVHPC_OFFLOAD=$NV_ARCH_CCXY -DSTDIMPL=INDICES"  
 
   run_build $name "${NVHPC_NVCXX:?}" acc "$cxx -DTARGET_DEVICE=gpu -DTARGET_PROCESSOR=px -DCUDA_ARCH=$NV_ARCH_CCXY"
   run_build $name "${NVHPC_NVCXX:?}" acc "$cxx -DTARGET_DEVICE=multicore -DTARGET_PROCESSOR=zen"
diff --git a/src/dpl_shim.h b/src/dpl_shim.h
index 226693bd..9b8a7acc 100644
--- a/src/dpl_shim.h
+++ b/src/dpl_shim.h
@@ -29,12 +29,15 @@ T *alloc_raw(size_t size) { return sycl::malloc_shared<T>(size, exe_policy.queue
 template<typename T>
 void dealloc_raw(T *ptr) { sycl::free(ptr, exe_policy.queue()); }
 
+#define WORKAROUND
+
 #else
 
 // auto exe_policy = dpl::execution::seq;
 // auto exe_policy = dpl::execution::par;
 static constexpr auto exe_policy = dpl::execution::par_unseq;
 #define USE_STD_PTR_ALLOC_DEALLOC
+#define WORKAROUND
 
 #endif
 
diff --git a/src/std-data/STDDataStream.cpp b/src/std-data/STDDataStream.cpp
deleted file mode 100644
index 8c280f8a..00000000
--- a/src/std-data/STDDataStream.cpp
+++ /dev/null
@@ -1,117 +0,0 @@
-// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
-// Updated 2021 by University of Bristol
-//
-// For full license terms please see the LICENSE file distributed with this
-// source code
-
-#include "STDDataStream.h"
-
-template <class T>
-STDDataStream<T>::STDDataStream(BenchId bs, const intptr_t array_size, const int device_id,
-				T initA, T initB, T initC)
-  noexcept : array_size{array_size},
-  a(alloc_raw<T>(array_size)), b(alloc_raw<T>(array_size)), c(alloc_raw<T>(array_size))
-{
-    std::cout << "Backing storage typeid: " << typeid(a).name() << std::endl;
-#ifdef USE_ONEDPL
-    std::cout << "Using oneDPL backend: ";
-#if ONEDPL_USE_DPCPP_BACKEND
-    std::cout << "SYCL USM (device=" << exe_policy.queue().get_device().get_info<sycl::info::device::name>() << ")";
-#elif ONEDPL_USE_TBB_BACKEND
-    std::cout << "TBB " TBB_VERSION_STRING;
-#elif ONEDPL_USE_OPENMP_BACKEND
-    std::cout << "OpenMP";
-#else
-    std::cout << "Default";
-#endif
-    std::cout << std::endl;
-#endif
-    init_arrays(initA, initB, initC);
-}
-
-template<class T>
-STDDataStream<T>::~STDDataStream() {
-  dealloc_raw(a);
-  dealloc_raw(b);
-  dealloc_raw(c);
-}
-
-template <class T>
-void STDDataStream<T>::init_arrays(T initA, T initB, T initC)
-{
-  std::fill(exe_policy, a, a + array_size, initA);
-  std::fill(exe_policy, b, b + array_size, initB);
-  std::fill(exe_policy, c, c + array_size, initC);
-}
-
-template <class T>
-void STDDataStream<T>::get_arrays(T const*& h_a, T const*& h_b, T const*& h_c)
-{
-  h_a = a;
-  h_b = b;
-  h_c = c;
-}
-
-template <class T>
-void STDDataStream<T>::copy()
-{
-  // c[i] = a[i]
-  std::copy(exe_policy, a, a + array_size, c);
-}
-
-template <class T>
-void STDDataStream<T>::mul()
-{
-  //  b[i] = scalar * c[i];
-  std::transform(exe_policy, c, c + array_size, b, [scalar = startScalar](T ci){ return scalar*ci; });
-}
-
-template <class T>
-void STDDataStream<T>::add()
-{
-  //  c[i] = a[i] + b[i];
-  std::transform(exe_policy, a, a + array_size, b, c, std::plus<T>());
-}
-
-template <class T>
-void STDDataStream<T>::triad()
-{
-  //  a[i] = b[i] + scalar * c[i];
-  std::transform(exe_policy, b, b + array_size, c, a, [scalar = startScalar](T bi, T ci){ return bi+scalar*ci; });
-}
-
-template <class T>
-void STDDataStream<T>::nstream()
-{
-  //  a[i] += b[i] + scalar * c[i];
-  //  Need to do in two stages with C++11 STL.
-  //  1: a[i] += b[i]
-  //  2: a[i] += scalar * c[i];
-  std::transform(exe_policy, a, a + array_size, b, a, [](T ai, T bi){ return ai + bi; });
-  std::transform(exe_policy, a, a + array_size, c, a, [scalar = startScalar](T ai, T ci){ return ai + scalar*ci; });
-}
-   
-
-template <class T>
-T STDDataStream<T>::dot()
-{
-  // sum = 0; sum += a[i]*b[i]; return sum;
-  return std::transform_reduce(exe_policy, a, a + array_size, b, T{});
-}
-
-void listDevices(void)
-{
-  std::cout << "Listing devices is not supported by the Parallel STL" << std::endl;
-}
-
-std::string getDeviceName(const int)
-{
-  return std::string("Device name unavailable");
-}
-
-std::string getDeviceDriver(const int)
-{
-  return std::string("Device driver unavailable");
-}
-template class STDDataStream<float>;
-template class STDDataStream<double>;
diff --git a/src/std-indices/STDIndicesStream.cpp b/src/std-indices/STDIndicesStream.cpp
deleted file mode 100644
index 4f8efe20..00000000
--- a/src/std-indices/STDIndicesStream.cpp
+++ /dev/null
@@ -1,128 +0,0 @@
-// Copyright (c) 2021 Tom Deakin and Tom Lin
-// University of Bristol HPC
-//
-// For full license terms please see the LICENSE file distributed with this
-// source code
-
-#include "STDIndicesStream.h"
-
-#ifndef ALIGNMENT
-#define ALIGNMENT (2*1024*1024) // 2MB
-#endif
-
-template <class T>
-STDIndicesStream<T>::STDIndicesStream(BenchId bs, const intptr_t array_size, const int device_id,
-				      T initA, T initB, T initC)
-noexcept : array_size{array_size}, range(0, array_size),
-  a(alloc_raw<T>(array_size)), b(alloc_raw<T>(array_size)), c(alloc_raw<T>(array_size))
-{
-    std::cout << "Backing storage typeid: " << typeid(a).name() << std::endl;
-#ifdef USE_ONEDPL
-    std::cout << "Using oneDPL backend: ";
-#if ONEDPL_USE_DPCPP_BACKEND
-    std::cout << "SYCL USM (device=" << exe_policy.queue().get_device().get_info<sycl::info::device::name>() << ")";
-#elif ONEDPL_USE_TBB_BACKEND
-    std::cout << "TBB " TBB_VERSION_STRING;
-#elif ONEDPL_USE_OPENMP_BACKEND
-    std::cout << "OpenMP";
-#else
-    std::cout << "Default";
-#endif
-    std::cout << std::endl;
-#endif
-    init_arrays(initA, initB, initC);
-}
-
-template<class T>
-STDIndicesStream<T>::~STDIndicesStream() {
-  dealloc_raw(a);
-  dealloc_raw(b);
-  dealloc_raw(c);
-}
-
-template <class T>
-void STDIndicesStream<T>::init_arrays(T initA, T initB, T initC)
-{
-  std::fill(exe_policy, a, a + array_size, initA);
-  std::fill(exe_policy, b, b + array_size, initB);
-  std::fill(exe_policy, c, c + array_size, initC);
-}
-
-template <class T>
-void STDIndicesStream<T>::get_arrays(T const*& h_a, T const*& h_b, T const*& h_c)
-{
-  h_a = a;
-  h_b = b;
-  h_c = c;
-}
-
-template <class T>
-void STDIndicesStream<T>::copy()
-{
-  // c[i] = a[i]
-  std::copy(exe_policy, a, a + array_size, c);
-}
-
-template <class T>
-void STDIndicesStream<T>::mul()
-{
-  //  b[i] = scalar * c[i];
-  std::transform(exe_policy, range.begin(), range.end(), b, [c = this->c, scalar = startScalar](intptr_t i) {
-    return scalar * c[i];
-  });
-}
-
-template <class T>
-void STDIndicesStream<T>::add()
-{
-  //  c[i] = a[i] + b[i];
-  std::transform(exe_policy, range.begin(), range.end(), c, [a = this->a, b = this->b](intptr_t i) {
-    return a[i] + b[i];
-  });
-}
-
-template <class T>
-void STDIndicesStream<T>::triad()
-{
-  //  a[i] = b[i] + scalar * c[i];
-  std::transform(exe_policy, range.begin(), range.end(), a, [b = this->b, c = this->c, scalar = startScalar](intptr_t i) {
-    return b[i] + scalar * c[i];
-  });
-}
-
-template <class T>
-void STDIndicesStream<T>::nstream()
-{
-  //  a[i] += b[i] + scalar * c[i];
-  //  Need to do in two stages with C++11 STL.
-  //  1: a[i] += b[i]
-  //  2: a[i] += scalar * c[i];
-  std::transform(exe_policy, range.begin(), range.end(), a, [a = this->a, b = this->b, c = this->c, scalar = startScalar](intptr_t i) {
-    return a[i] + b[i] + scalar * c[i];
-  });
-}
-   
-
-template <class T>
-T STDIndicesStream<T>::dot()
-{
-  // sum = 0; sum += a[i]*b[i]; return sum;
-  return std::transform_reduce(exe_policy, a, a + array_size, b, T{});
-}
-
-void listDevices(void)
-{
-  std::cout << "Listing devices is not supported by the Parallel STL" << std::endl;
-}
-
-std::string getDeviceName(const int)
-{
-  return std::string("Device name unavailable");
-}
-
-std::string getDeviceDriver(const int)
-{
-  return std::string("Device driver unavailable");
-}
-template class STDIndicesStream<float>;
-template class STDIndicesStream<double>;
diff --git a/src/std-indices/STDIndicesStream.h b/src/std-indices/STDIndicesStream.h
deleted file mode 100644
index 7a43b1ec..00000000
--- a/src/std-indices/STDIndicesStream.h
+++ /dev/null
@@ -1,97 +0,0 @@
-// Copyright (c) 2021 Tom Deakin and Tom Lin
-// University of Bristol HPC
-//
-// For full license terms please see the LICENSE file distributed with this
-// source code
-
-#pragma once
-#include "dpl_shim.h"
-
-#include <iostream>
-#include <stdexcept>
-#include "Stream.h"
-
-#define IMPLEMENTATION_STRING "STD (index-oriented)"
-
-// A lightweight counting iterator which will be used by the STL algorithms
-// NB: C++ <= 17 doesn't have this built-in, and it's only added later in ranges-v3 (C++2a) which this
-// implementation doesn't target
-template <typename N>
-class ranged {
-public:
-  class iterator {
-    friend class ranged;
-    public:
-    using difference_type = N;
-    using value_type = N;
-    using pointer = const N*;
-    using reference = N;
-    using iterator_category = std::random_access_iterator_tag;
-
-    // XXX This is not part of the iterator spec, it gets picked up by oneDPL if enabled.
-    // Without this, the DPL SYCL backend collects the iterator data on the host and copies to the device.
-    // This type is unused for any nother STL impl.
-    using is_passed_directly = std::true_type;
-
-    reference operator *() const { return i_; }
-    iterator &operator ++() { ++i_; return *this; }
-    iterator operator ++(int) { iterator copy(*this); ++i_; return copy; }
-
-    iterator &operator --() { --i_; return *this; }
-    iterator operator --(int) { iterator copy(*this); --i_; return copy; }
-
-    iterator &operator +=(N by) { i_+=by; return *this; }
-
-    value_type operator[](const difference_type &i) const { return i_ + i; }
-
-    difference_type operator-(const iterator &it) const { return i_ - it.i_; }
-    iterator operator+(const value_type v) const { return iterator(i_ + v); }
-
-    bool operator ==(const iterator &other) const { return i_ == other.i_; }
-    bool operator !=(const iterator &other) const { return i_ != other.i_; }
-    bool operator < (const iterator &other) const { return i_ < other.i_; }
-
-    protected:
-      explicit iterator(N start) : i_ (start) {}
-
-    private:
-        N i_;
-  };
-
-  [[nodiscard]] iterator begin() const { return begin_; }
-  [[nodiscard]] iterator end() const { return end_; }
-  ranged(N begin, N end) : begin_(begin), end_(end) {}
-private:
-  iterator begin_;
-  iterator end_;
-};
-
-template <class T>
-class STDIndicesStream : public Stream<T>
-{
-  protected:
-    // Size of arrays
-    intptr_t array_size;
-
-    // induction range
-    ranged<intptr_t> range;
-
-    // Device side pointers
-    T *a, *b, *c;
-
-  public:
-    STDIndicesStream(BenchId bs, const intptr_t array_size, const int device_id,
-		     T initA, T initB, T initC) noexcept;
-    ~STDIndicesStream();
-
-    void copy() override;
-    void add() override;
-    void mul() override;
-    void triad() override;
-    void nstream() override;
-    T dot() override;
-
-    void get_arrays(T const*& a, T const*& b, T const*& c) override;
-    void init_arrays(T initA, T initB, T initC);
-};
-
diff --git a/src/std-indices/model.cmake b/src/std-indices/model.cmake
deleted file mode 100644
index 60ef575f..00000000
--- a/src/std-indices/model.cmake
+++ /dev/null
@@ -1,53 +0,0 @@
-
-register_flag_optional(CMAKE_CXX_COMPILER
-        "Any CXX compiler that is supported by CMake detection"
-        "c++")
-
-register_flag_optional(NVHPC_OFFLOAD
-        "Enable offloading support (via the non-standard `-stdpar`) for the new NVHPC SDK.
-         The values are Nvidia architectures in ccXY format will be passed in via `-gpu=` (e.g `cc70`)
-
-         Possible values are:
-           cc35  - Compile for compute capability 3.5
-           cc50  - Compile for compute capability 5.0
-           cc60  - Compile for compute capability 6.0
-           cc62  - Compile for compute capability 6.2
-           cc70  - Compile for compute capability 7.0
-           cc72  - Compile for compute capability 7.2
-           cc75  - Compile for compute capability 7.5
-           cc80  - Compile for compute capability 8.0
-           ccall - Compile for all supported compute capabilities"
-        "")
-
-register_flag_optional(USE_TBB
-        "Link against an in-tree oneTBB via FetchContent_Declare, see top level CMakeLists.txt for details."
-        "OFF")
-
-register_flag_optional(USE_ONEDPL
-        "Link oneDPL which implements C++17 executor policies (via execution_policy_tag) for different backends.
-
-        Possible values are:
-          OPENMP - Implements policies using OpenMP.
-                   CMake will handle any flags needed to enable OpenMP if the compiler supports it.
-          TBB    - Implements policies using TBB.
-                   TBB must be linked via USE_TBB or be available in LD_LIBRARY_PATH.
-          DPCPP  - Implements policies through SYCL2020.
-                   This requires the DPC++ compiler (other SYCL compilers are untested), required SYCL flags are added automatically."
-        "OFF")
-
-macro(setup)
-    set(CMAKE_CXX_STANDARD 17)
-    if (NVHPC_OFFLOAD)
-        set(NVHPC_FLAGS -stdpar -gpu=${NVHPC_OFFLOAD})
-        # propagate flags to linker so that it links with the gpu stuff as well
-        register_append_cxx_flags(ANY ${NVHPC_FLAGS})
-        register_append_link_flags(${NVHPC_FLAGS})
-    endif ()
-    if (USE_TBB)
-        register_link_library(TBB::tbb)
-    endif ()
-    if (USE_ONEDPL)
-        register_definitions(USE_ONEDPL)
-        register_link_library(oneDPL)
-    endif ()
-endmacro()
diff --git a/src/std-ranges/STDRangesStream.cpp b/src/std-ranges/STDRangesStream.cpp
deleted file mode 100644
index 02bd56b2..00000000
--- a/src/std-ranges/STDRangesStream.cpp
+++ /dev/null
@@ -1,158 +0,0 @@
-// Copyright (c) 2020 Tom Deakin
-// University of Bristol HPC
-//
-// For full license terms please see the LICENSE file distributed with this
-// source code
-
-#include "STDRangesStream.hpp"
-#include <ranges>
-
-#ifndef ALIGNMENT
-#define ALIGNMENT (2*1024*1024) // 2MB
-#endif
-
-template <class T>
-STDRangesStream<T>::STDRangesStream(BenchId bs, const intptr_t array_size, const int device_id,
-				    T initA, T initB, T initC)
-  noexcept : array_size{array_size},
-	     a(alloc_raw<T>(array_size)), b(alloc_raw<T>(array_size)), c(alloc_raw<T>(array_size))
-{
-    std::cout << "Backing storage typeid: " << typeid(a).name() << std::endl;
-#ifdef USE_ONEDPL
-    std::cout << "Using oneDPL backend: ";
-#if ONEDPL_USE_DPCPP_BACKEND
-    std::cout << "SYCL USM (device=" << exe_policy.queue().get_device().get_info<sycl::info::device::name>() << ")";
-#elif ONEDPL_USE_TBB_BACKEND
-    std::cout << "TBB " TBB_VERSION_STRING;
-#elif ONEDPL_USE_OPENMP_BACKEND
-    std::cout << "OpenMP";
-#else
-    std::cout << "Default";
-#endif
-    std::cout << std::endl;
-#endif
-    init_arrays(initA, initB, initC);
-}
-
-template<class T>
-STDRangesStream<T>::~STDRangesStream() {
-  dealloc_raw(a);
-  dealloc_raw(b);
-  dealloc_raw(c);
-}
-
-template <class T>
-void STDRangesStream<T>::init_arrays(T initA, T initB, T initC)
-{
-  std::for_each_n(
-    exe_policy,
-    std::views::iota((intptr_t)0).begin(), array_size, // loop range
-    [=, this] (intptr_t i) {
-      a[i] = initA;
-      b[i] = initB;
-      c[i] = initC;
-    }
-  );
-}
-
-template <class T>
-void STDRangesStream<T>::get_arrays(T const*& h_a, T const*& h_b, T const*& h_c)
-{
-  h_a = a;
-  h_b = b;
-  h_c = c;
-}
-
-template <class T>
-void STDRangesStream<T>::copy()
-{
-  std::for_each_n(
-    exe_policy,
-    std::views::iota((intptr_t)0).begin(), array_size,
-    [=, this] (intptr_t i) {
-      c[i] = a[i];
-    }
-  );
-}
-
-template <class T>
-void STDRangesStream<T>::mul()
-{
-  const T scalar = startScalar;
-
-  std::for_each_n(
-    exe_policy,
-    std::views::iota((intptr_t)0).begin(), array_size,
-    [=, this] (intptr_t i) {
-      b[i] = scalar * c[i];
-    }
-  );
-}
-
-template <class T>
-void STDRangesStream<T>::add()
-{
-  std::for_each_n(
-    exe_policy,
-    std::views::iota((intptr_t)0).begin(), array_size,
-    [=, this] (intptr_t i) {
-      c[i] = a[i] + b[i];
-    }
-  );
-}
-
-template <class T>
-void STDRangesStream<T>::triad()
-{
-  const T scalar = startScalar;
-
-  std::for_each_n(
-    exe_policy,
-    std::views::iota((intptr_t)0).begin(), array_size,
-    [=, this] (intptr_t i) {
-      a[i] = b[i] + scalar * c[i];
-    }
-  );
-}
-
-template <class T>
-void STDRangesStream<T>::nstream()
-{
-  const T scalar = startScalar;
-
-  std::for_each_n(
-    exe_policy,
-    std::views::iota((intptr_t)0).begin(), array_size,
-    [=, this] (intptr_t i) {
-      a[i] += b[i] + scalar * c[i];
-    }
-  );
-}
-
-template <class T>
-T STDRangesStream<T>::dot()
-{
-  // sum += a[i] * b[i];
-  return
-    std::transform_reduce(
-      exe_policy,
-      a, a + array_size, b, T{});
-}
-
-void listDevices(void)
-{
-  std::cout << "C++20 does not expose devices" << std::endl;
-}
-
-std::string getDeviceName(const int)
-{
-  return std::string("Device name unavailable");
-}
-
-std::string getDeviceDriver(const int)
-{
-  return std::string("Device driver unavailable");
-}
-
-template class STDRangesStream<float>;
-template class STDRangesStream<double>;
diff --git a/src/std-ranges/STDRangesStream.hpp b/src/std-ranges/STDRangesStream.hpp
deleted file mode 100644
index da04f1f4..00000000
--- a/src/std-ranges/STDRangesStream.hpp
+++ /dev/null
@@ -1,41 +0,0 @@
-// Copyright (c) 2020 Tom Deakin
-// University of Bristol HPC
-//
-// For full license terms please see the LICENSE file distributed with this
-// source code
-
-#pragma once
-#include "dpl_shim.h"
-
-#include <iostream>
-#include <stdexcept>
-#include "Stream.h"
-
-#define IMPLEMENTATION_STRING "STD C++ ranges"
-
-template <class T>
-class STDRangesStream : public Stream<T>
-{
-  protected:
-    // Size of arrays
-    intptr_t array_size;
-
-    // Device side pointers
-    T *a, *b, *c;
-
-  public:
-    STDRangesStream(BenchId bs, const intptr_t array_size, const int device_id,
-		    T initA, T initB, T initC) noexcept;
-    ~STDRangesStream();
-
-    void copy() override;
-    void add() override;
-    void mul() override;
-    void triad() override;
-    void nstream() override;
-    T dot() override;
-
-    void get_arrays(T const*& a, T const*& b, T const*& c) override;
-    void init_arrays(T initA, T initB, T initC);
-};
-
diff --git a/src/std-ranges/model.cmake b/src/std-ranges/model.cmake
deleted file mode 100644
index d7fd6a8b..00000000
--- a/src/std-ranges/model.cmake
+++ /dev/null
@@ -1,68 +0,0 @@
-
-register_flag_optional(CMAKE_CXX_COMPILER
-        "Any CXX compiler that is supported by CMake detection and supports C++20 Ranges"
-        "c++")
-
-register_flag_optional(NVHPC_OFFLOAD
-        "Enable offloading support (via the non-standard `-stdpar=gpu`) for the new NVHPC SDK.
-         The values are Nvidia architectures in ccXY format will be passed in via `-gpu=` (e.g `cc70`)
-
-         Possible values are:
-           cc35  - Compile for compute capability 3.5
-           cc50  - Compile for compute capability 5.0
-           cc60  - Compile for compute capability 6.0
-           cc62  - Compile for compute capability 6.2
-           cc70  - Compile for compute capability 7.0
-           cc72  - Compile for compute capability 7.2
-           cc75  - Compile for compute capability 7.5
-           cc80  - Compile for compute capability 8.0
-           ccall - Compile for all supported compute capabilities"
-        "")
-
-register_flag_optional(USE_TBB
-        "No-op if ONE_TBB_DIR is set. Link against an in-tree oneTBB via FetchContent_Declare, see top level CMakeLists.txt for details."
-        "OFF")
-
-register_flag_optional(USE_ONEDPL
-        "Link oneDPL which implements C++17 executor policies (via execution_policy_tag) for different backends.
-
-        Possible values are:
-          OPENMP - Implements policies using OpenMP.
-                   CMake will handle any flags needed to enable OpenMP if the compiler supports it.
-          TBB    - Implements policies using TBB.
-                   TBB must be linked via USE_TBB or be available in LD_LIBRARY_PATH.
-          DPCPP  - Implements policies through SYCL2020.
-                   This requires the DPC++ compiler (other SYCL compilers are untested), required SYCL flags are added automatically."
-        "OFF")
-
-macro(setup)
-
-    # TODO this needs to eventually be removed when CMake adds proper C++20 support or at least update the flag used here
-
-    # C++ 2a is too new, disable CMake's std flags completely:
-    set(CMAKE_CXX_EXTENSIONS OFF)
-    set(CMAKE_CXX_STANDARD_REQUIRED OFF)
-    unset(CMAKE_CXX_STANDARD) # drop any existing standard we have set by default
-    # and append our own:
-    register_append_cxx_flags(ANY -std=c++20)
-    if (NVHPC_OFFLOAD)
-        set(NVHPC_FLAGS -stdpar=gpu -gpu=${NVHPC_OFFLOAD})
-        # propagate flags to linker so that it links with the gpu stuff as well
-        register_append_cxx_flags(ANY ${NVHPC_FLAGS})
-        register_append_link_flags(${NVHPC_FLAGS})
-    endif ()
-    if (USE_TBB)
-        register_link_library(TBB::tbb)
-    endif ()
-    if (USE_ONEDPL)
-        register_definitions(USE_ONEDPL)
-        register_link_library(oneDPL)
-    endif ()
-endmacro()
-
-macro(setup_target NAME)
-    if (USE_ONEDPL)
-        target_compile_features(${NAME} INTERFACE cxx_std_20)
-        target_compile_features(oneDPL INTERFACE cxx_std_20)
-    endif ()
-endmacro()
diff --git a/src/std/STDStream.cpp b/src/std/STDStream.cpp
new file mode 100644
index 00000000..678457ae
--- /dev/null
+++ b/src/std/STDStream.cpp
@@ -0,0 +1,209 @@
+// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+// Updated 2021 by University of Bristol
+//
+// For full license terms please see the LICENSE file distributed with this
+// source code
+
+#include "STDStream.h"
+#include <algorithm>
+#include <execution>
+
+#if defined(DATA20) || defined(INDICES)
+#include <ranges>
+#endif
+
+ // OneDPL workaround; TODO: remove this eventually
+#include "dpl_shim.h"
+
+#ifdef INDICES
+// NVHPC workaround: TODO: remove this eventually 
+#if defined(__NVCOMPILER) && defined(_NVHPC_STDPAR_GPU)
+#define WORKAROUND   
+#include <thrust/iterator/counting_iterator.h>
+auto counting_iter(intptr_t i) { return thrust::counting_iterator<intptr_t>(i); }
+auto counting_range(intptr_t b, intptr_t e) {
+  struct R {
+    thrust::counting_iterator<intptr_t> b, e;
+    thrust::counting_iterator<intptr_t> begin() { return b; }
+    thrust::counting_iterator<intptr_t> end() { return e; }
+  };
+  return R { .b = counting_iter(b), .e = counting_iter(e) };
+}
+#else  // NVHPC Workaround
+auto counting_iter(intptr_t i) { return std::views::iota(i).begin(); }
+auto counting_range(intptr_t b, intptr_t e) { return std::views::iota(b, e); }
+#endif // NVHPC Workaround
+#endif // INDICES  
+
+template <class T>
+STDStream<T>::STDStream(BenchId bs, const intptr_t array_size, const int device_id,
+			      T initA, T initB, T initC)
+  noexcept : array_size{array_size},
+  a(alloc_raw<T>(array_size)), b(alloc_raw<T>(array_size)), c(alloc_raw<T>(array_size))
+{
+    std::cout << "Backing storage typeid: " << typeid(a).name() << std::endl;
+#ifdef USE_ONEDPL
+    std::cout << "Using oneDPL backend: ";
+#if ONEDPL_USE_DPCPP_BACKEND
+    std::cout << "SYCL USM (device=" << exe_policy.queue().get_device().get_info<sycl::info::device::name>() << ")";
+#elif ONEDPL_USE_TBB_BACKEND
+    std::cout << "TBB " TBB_VERSION_STRING;
+#elif ONEDPL_USE_OPENMP_BACKEND
+    std::cout << "OpenMP";
+#else
+    std::cout << "Default";
+#endif
+    std::cout << std::endl;
+#endif
+
+#ifdef WORKAROUND
+    std::cout << "Non-conforming implementation: requires non-portable workarounds to run STREAM" << std::endl;
+#endif      
+    init_arrays(initA, initB, initC);
+}
+
+template<class T>
+STDStream<T>::~STDStream() {
+  dealloc_raw(a);
+  dealloc_raw(b);
+  dealloc_raw(c);
+}
+
+template <class T>
+void STDStream<T>::init_arrays(T initA, T initB, T initC)
+{
+  std::fill_n(exe_policy, a, array_size, initA);
+  std::fill_n(exe_policy, b, array_size, initB);
+  std::fill_n(exe_policy, c, array_size, initC);
+}
+
+template <class T>
+void STDStream<T>::get_arrays(T const*& h_a, T const*& h_b, T const*& h_c)
+{
+  h_a = a;
+  h_b = b;
+  h_c = c;
+}
+
+template <class T>
+void STDStream<T>::copy()
+{
+  // c[i] = a[i]
+#if defined(DATA17) || defined(DATA20)
+  std::copy(exe_policy, a, a + array_size, c);
+#elif INDICES
+  std::for_each_n(exe_policy, counting_iter(0), array_size, [a=a,c=c](intptr_t i) {
+      c[i] = a[i];
+  });
+#else
+  #error unimplemented
+#endif  
+}
+
+template <class T>
+void STDStream<T>::mul()
+{
+  //  b[i] = scalar * c[i];
+#if defined(DATA17) || defined(DATA20)  
+  std::transform(exe_policy, c, c + array_size, b, [](T ci){ return startScalar*ci; });
+#elif INDICES
+  std::for_each_n(exe_policy, counting_iter(0), array_size, [b=b, c=c](intptr_t i) {
+    b[i] = startScalar * c[i];
+  });
+#else
+  #error unimplemented
+#endif  
+}
+
+template <class T>
+void STDStream<T>::add()
+{
+  //  c[i] = a[i] + b[i];
+#if defined(DATA17) || defined(DATA20)  
+  std::transform(exe_policy, a, a + array_size, b, c, std::plus<T>());
+#elif INDICES
+  std::for_each_n(exe_policy, counting_iter(0), array_size, [a=a, b=b, c=c](intptr_t i) {
+      c[i] = a[i] + b[i];
+  });
+#else
+  #error unimplemented
+#endif  
+}
+
+template <class T>
+void STDStream<T>::triad()
+{
+  //  a[i] = b[i] + scalar * c[i];
+#if defined(DATA17) || defined(DATA20)
+  std::transform(exe_policy, b, b + array_size, c, a, [scalar = startScalar](T bi, T ci){ return bi+scalar*ci; });
+#elif INDICES
+  std::for_each_n(exe_policy, counting_iter(0), array_size, [a=a, b=b, c=c](intptr_t i) {
+      a[i] = b[i] + startScalar * c[i];
+  });
+#else
+  #error unimplemented
+#endif  
+}
+
+template <class T>
+void STDStream<T>::nstream()
+{
+  //  a[i] += b[i] + scalar * c[i];
+#if defined(DATA17) || defined(DATA20) // Until we can require GCC 14.1
+  //  Need to do in two round-trips with C++17 STL.
+  //  1: a[i] += b[i]
+  //  2: a[i] += scalar * c[i];
+  std::transform(exe_policy, a, a + array_size, b, a, [](T ai, T bi){ return ai + bi; });
+  std::transform(exe_policy, a, a + array_size, c, a, [](T ai, T ci){ return ai + startScalar*ci; });
+#elif DATA20
+  // Requires GCC 14.1 (Ubuntu 24.04):
+  auto as = std::ranges::subrange(a, a + array_size);
+  auto bs = std::ranges::subrange(b, b + array_size);
+  auto cs = std::ranges::subrange(c, c + array_size);
+  auto r = std::views::zip(as, bs, cs);
+  std::transform(exe_policy, r.begin(), r.end(), a, [](auto vs) {
+      auto [a, b, c] = vs;
+      return a + b + startScalar * c;
+  });
+#elif INDICES
+  std::for_each_n(exe_policy, counting_iter(0), array_size, [a=a,b=b,c=c](intptr_t i) {
+    a[i] += b[i] + startScalar * c[i];
+  });
+#else
+  #error unimplemented
+#endif  
+}
+   
+
+template <class T>
+T STDStream<T>::dot()
+{
+#if defined(DATA17) || defined(DATA20)  
+  // sum = 0; sum += a[i] * b[i]; return sum;
+  return std::transform_reduce(exe_policy, a, a + array_size, b, T{0});
+#elif INDICES
+  auto r = counting_range(intptr_t(0), array_size);
+  return std::transform_reduce(exe_policy, r.begin(), r.end(), T{0}, std::plus<T>{}, [a=a, b=b](intptr_t i) {
+      return a[i] * b[i];
+  });
+#else
+  #error unimplemented
+#endif
+}
+
+void listDevices(void)
+{
+  std::cout << "Listing devices is not supported by the Parallel STL" << std::endl;
+}
+
+std::string getDeviceName(const int)
+{
+  return std::string("Device name unavailable");
+}
+
+std::string getDeviceDriver(const int)
+{
+  return std::string("Device driver unavailable");
+}
+template class STDStream<float>;
+template class STDStream<double>;
diff --git a/src/std-data/STDDataStream.h b/src/std/STDStream.h
similarity index 69%
rename from src/std-data/STDDataStream.h
rename to src/std/STDStream.h
index 6db998b2..aa26eb33 100644
--- a/src/std-data/STDDataStream.h
+++ b/src/std/STDStream.h
@@ -5,17 +5,26 @@
 // source code
 
 #pragma once
-#include "dpl_shim.h"
 
 #include <iostream>
 #include <stdexcept>
 #include "Stream.h"
 
-#define IMPLEMENTATION_STRING "STD (data-oriented)"
+#ifdef DATA17
+#define STDIMPL "DATA17"
+#elif DATA20
+#define STDIMPL "DATA20"
+#elif INDICES
+#define STDIMPL "INDICES"
+#else
+#error unimplemented
+#endif
+
+#define IMPLEMENTATION_STRING "STD (" STDIMPL ")"
 
 
 template <class T>
-class STDDataStream : public Stream<T>
+class STDStream : public Stream<T>
 {
   protected:
     // Size of arrays
@@ -25,9 +34,9 @@ class STDDataStream : public Stream<T>
     T *a, *b, *c;
 
   public:
-    STDDataStream(BenchId bs, const intptr_t array_size, const int device_id,
+    STDStream(BenchId bs, const intptr_t array_size, const int device_id,
 		  T initA, T initB, T initC) noexcept;
-    ~STDDataStream();
+    ~STDStream();
 
     void copy() override;
     void add() override;
diff --git a/src/std-data/model.cmake b/src/std/model.cmake
similarity index 78%
rename from src/std-data/model.cmake
rename to src/std/model.cmake
index 837d26bf..01e8bc7d 100644
--- a/src/std-data/model.cmake
+++ b/src/std/model.cmake
@@ -1,4 +1,3 @@
-
 register_flag_optional(CMAKE_CXX_COMPILER
         "Any CXX compiler that is supported by CMake detection"
         "c++")
@@ -16,8 +15,10 @@ register_flag_optional(NVHPC_OFFLOAD
            cc72  - Compile for compute capability 7.2
            cc75  - Compile for compute capability 7.5
            cc80  - Compile for compute capability 8.0
-           ccall - Compile for all supported compute capabilities"
-        "")
+           cc90  - Compile for compute capability 8.0
+           ccall - Compile for all supported compute capabilities
+           ccnative - Compiles for compute capability of current device"
+	"")
 
 register_flag_optional(USE_TBB
         "No-op if ONE_TBB_DIR is set. Link against an in-tree oneTBB via FetchContent_Declare, see top level CMakeLists.txt for details."
@@ -35,8 +36,21 @@ register_flag_optional(USE_ONEDPL
                    This requires the DPC++ compiler (other SYCL compilers are untested), required SYCL flags are added automatically."
         "OFF")
 
+register_flag_optional(STDIMPL
+  "Implementation strategy (default = DATA20):
+     DATA17    - Parallel algorithms over data (requires C++17).
+     DATA20    - (default) Parallel algorithms over data (requires C++20).
+     INDICES - Parallel algorithms over indices (requires C++20)."
+  "DATA20"
+)
+
 macro(setup)
-    set(CMAKE_CXX_STANDARD 17)
+    register_definitions(${STDIMPL})
+    if (${STDIMPL} STREQUAL "DATA17")
+      set(CMAKE_CXX_STANDARD 17)
+    else ()
+      set(CMAKE_CXX_STANDARD 20)
+    endif ()
     if (NVHPC_OFFLOAD)
         set(NVHPC_FLAGS -stdpar=gpu -gpu=${NVHPC_OFFLOAD})
         # propagate flags to linker so that it links with the gpu stuff as well