6 changes: 6 additions & 0 deletions thrust/examples/CMakeLists.txt
@@ -92,6 +92,12 @@ function(thrust_add_example target_name_var example_name example_src thrust_targ
thrust_configure_cuda_target(${example_target} RDC ${THRUST_FORCE_RDC})
endif()

# We do not want to explicitly include `host_device.h` when it is not needed, so force-include the file for non-CUDA targets
target_compile_options(${example_target} PRIVATE
Contributor

@alliepiper Is that fine? I believe we do not want to add all the noise of the host_device includes to all the examples

Contributor Author (@charan-003, Sep 4, 2025)

@miscco I thought something else was causing the CI test failures.

Contributor

I think it's good to show in the examples that users need to be mindful of __host__ __device__ annotations when targeting multiple backends. I'd almost prefer directly including the contents of host_device.h in each example, without the _CCCL macros, to show users how to implement this themselves.

It's noisy and repetitive, but might be worth doing with a good comment explaining why/when it's needed.
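
For concreteness, here is a minimal sketch (not the PR's code) of what such a per-example block could look like, assuming the only requirement is that __host__ and __device__ expand to nothing when a plain C++ compiler builds the example:

// Hypothetical per-example setup, mirroring the intent of host_device.h
// without the _CCCL macros: under a non-CUDA compiler the execution-space
// annotations must expand to nothing so that __host__ __device__ functors compile.
#ifndef __CUDACC__
#  ifndef __host__
#    define __host__
#  endif
#  ifndef __device__
#    define __device__
#  endif
#endif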

Contributor

Otherwise, what you have here is functional and would be fine from a technical standpoint. I just don't like force-including important user-setup bits in example code; hidden magic is bad for teaching, IMO.

Contributor

Then we should move the examples. What does device_vector even mean in the context of TBB and OpenMP?

Contributor

Move them?

what does device_vector even mean in the context of TBB and openMP?

It's backed by host memory when those backends are used.

Contributor

My point is that anything that uses a device_vector should live in examples/cuda and then use appropriately annotated functions.

But anything in examples should work out of the box for any backend without any magic intervention

Contributor

@miscco device_vector allocates memory on the current Thrust device system. If that is CUDA, it's CUDA device memory. If the device system is TBB, OMP or CPP, then a device_vector just behaves like a host vector. This is so Thrust can switch backends with the preprocessor.
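
As an aside (not part of this PR), that backend switch is driven entirely by a preprocessor definition. A minimal sketch, assuming an illustrative command line such as g++ -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_OMP -fopenmp ...:

// Sketch: the same device_vector code retargets to another backend purely via
// the preprocessor; with the OMP, TBB, or CPP device systems the vector below
// is backed by host memory.
#include <thrust/device_vector.h>
#include <thrust/reduce.h>

#include <iostream>

int main()
{
  thrust::device_vector<int> v{1, 2, 3, 4};
  std::cout << thrust::reduce(v.begin(), v.end()) << '\n'; // prints 10 on any backend
  return 0;
}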

$<$<COMPILE_LANG_AND_ID:CXX,MSVC>:/FI include/host_device.h>
$<$<COMPILE_LANGUAGE:CXX>:-include include/host_device.h>
)

# Add to the active configuration's meta target
add_dependencies(${config_meta_target} ${example_target})

23 changes: 6 additions & 17 deletions thrust/examples/arbitrary_transformation.cu
@@ -7,8 +7,6 @@

#include <iostream>

#include "include/host_device.h"

// This example shows how to implement an arbitrary transformation of
// the form output[i] = F(first[i], second[i], third[i], ... ).
// In this example, we use a function with 3 inputs and 1 output.
@@ -62,29 +60,20 @@ struct arbitrary_functor2

int main()
{
// allocate storage
thrust::device_vector<float> A(5);
thrust::device_vector<float> B(5);
thrust::device_vector<float> C(5);
// allocate and initialize
thrust::device_vector<float> A{3, 4, 0, 8, 2};
thrust::device_vector<float> B{6, 7, 2, 1, 8};
thrust::device_vector<float> C{2, 5, 7, 4, 3};
thrust::device_vector<float> D1(5);

// clang-format off
// initialize input vectors
A[0] = 3; B[0] = 6; C[0] = 2;
A[1] = 4; B[1] = 7; C[1] = 5;
A[2] = 0; B[2] = 2; C[2] = 7;
A[3] = 8; B[3] = 1; C[3] = 4;
A[4] = 2; B[4] = 8; C[4] = 3;
// clang-format on

// apply the transformation
thrust::for_each(thrust::make_zip_iterator(A.begin(), B.begin(), C.begin(), D1.begin()),
thrust::make_zip_iterator(A.end(), B.end(), C.end(), D1.end()),
arbitrary_functor1());

// print the output
std::cout << "Tuple functor" << std::endl;
for (int i = 0; i < 5; i++)
for (size_t i = 0; i < A.size(); i++)
Review comment:

Isn't the preferred form `for (size_t i = 0; i != size(A); ++i)`?

Also,

  • is it possible to use iterators? (My C++ has been rusting for 5 years now)
  • let's not use std::endl unless needed (cf here)
  • can we use std::format to our advantage?
  • free functions improve encapsulation (cf. here)
for (auto it = make_zip_iterator(make_tuple(begin(A), begin(B), begin(C), begin(D)));
     it != make_zip_iterator(make_tuple(end(A), end(B), end(C), end(D)));
     ++it)
{
  std::cout << std::format("{} + {} * {} = {}\n", *it);
}

Maybe the make_zip_iterator(make_tuple(begin(A), ...)) can be extracted into a generic helper somehow, along the lines of

auto zip_begin(auto&&... containers) {
  return make_zip_iterator(make_tuple(begin(containers)...));
}
auto zip_end(auto&&... containers) {
  return make_zip_iterator(make_tuple(end(containers)...));
}

In which case the above simplifies further to

for (auto it = zip_begin(A, B, C, D); it != zip_end(A, B, C, D); ++it)
{
  std::cout << std::format("{} + {} * {} = {}\n", *it);
}

Contributor

Thank you for the feedback! You are always free to create a PR yourself or start a discussion.

Isn't the preferred form for (size_t i = 0; i != size(A); ++i) ?

I have no preference here. The PR improved the situation by not using a magic number, which is good.

  • is it possible to use iterators? (My C++ has been rusting for 5 years now)

Yes, but iterating four ranges at the same time using a zip may also be a bit over-engineered. Using an index is fine here, IMO. Examples should be easy.

  • let's not use std::endl unless needed (cf here)

Correct. Feel free to propose a PR to replace them with '\n'.

  • can we use std::format to our advantage?

CCCL still supports C++17, but I don't see a blocker with using C++20 in examples only. I will start a discussion internally.

  • free functions improve encapsulation (cf. here)

Again, for example code I have no preference here. I agree with this when writing library code.

for (auto it = make_zip_iterator(make_tuple(begin(A), begin(B), begin(C), begin(D)));
     it != make_zip_iterator(make_tuple(end(A), end(B), end(C), end(D)));
     ++it)
{
  std::cout << std::format("{} + {} * {} = {}\n", *it);
}

I think this does not increase readability or clarity of the example.

Maybe the make_zip_iterator(make_tuple(begin(A), ...))) can be extracted into a generic somehow, along the lines of

We have that today: just construct the zip_iterator and let CTAD deduce the arguments:

zip_iterator(begin(A), begin(B), begin(C));

That should deduce zip_iterator<decltype(begin(A)), ...>. This only works with cuda::zip_iterator; for Thrust, you can at least skip the make_tuple, which we fixed some time ago.
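
A short sketch of the make_tuple-free Thrust form mentioned here (it mirrors what the updated example already does; the cuda::zip_iterator CTAD spelling above is left as stated):

// Sketch: thrust::make_zip_iterator accepts the iterators directly,
// no thrust::make_tuple wrapper is required.
#include <thrust/device_vector.h>
#include <thrust/iterator/zip_iterator.h>

void zip_demo(thrust::device_vector<float>& A, thrust::device_vector<float>& B)
{
  auto first = thrust::make_zip_iterator(A.begin(), B.begin());
  auto last  = thrust::make_zip_iterator(A.end(), B.end());
  (void) first; // (first, last) can now be passed to any Thrust algorithm
  (void) last;
}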

{
std::cout << A[i] << " + " << B[i] << " * " << C[i] << " = " << D1[i] << std::endl;
}
@@ -97,7 +86,7 @@ int main()

// print the output
std::cout << "N-ary functor" << std::endl;
for (int i = 0; i < 5; i++)
for (size_t i = 0; i < A.size(); i++)
{
std::cout << A[i] << " + " << B[i] << " * " << C[i] << " = " << D2[i] << std::endl;
}
10 changes: 2 additions & 8 deletions thrust/examples/basic_vector.cu
@@ -5,14 +5,8 @@

int main()
{
// H has storage for 4 integers
thrust::host_vector<int> H(4);

// initialize individual elements
H[0] = 14;
H[1] = 20;
H[2] = 38;
H[3] = 46;
// H holds 4 integers
thrust::host_vector<int> H{14, 20, 38, 46};

// H.size() returns the size of vector H
std::cout << "H has size " << H.size() << std::endl;
15 changes: 5 additions & 10 deletions thrust/examples/bounding_box.cu
@@ -4,8 +4,6 @@
#include <thrust/random.h>
#include <thrust/transform_reduce.h>

#include "include/host_device.h"

// This example shows how to compute a bounding box
// for a set of points in two dimensions.

@@ -54,7 +52,7 @@ struct bbox
};

// reduce a pair of bounding boxes (a,b) to a bounding box containing a and b
struct bbox_reduction
struct bbox_union
{
__host__ __device__ bbox operator()(bbox a, bbox b)
{
@@ -71,13 +69,13 @@
int main()
{
const size_t N = 40;
thrust::default_random_engine rng;
thrust::uniform_real_distribution<float> u01(0.0f, 1.0f);

// allocate storage for points
thrust::device_vector<point2d> points(N);

// generate some random points in the unit square
thrust::default_random_engine rng;
thrust::uniform_real_distribution<float> u01(0.0f, 1.0f);
for (size_t i = 0; i < N; i++)
{
float x = u01(rng);
@@ -86,13 +84,10 @@ int main()
}

// initial bounding box contains first point
bbox init = bbox(points[0], points[0]);

// binary reduction operation
bbox_reduction binary_op;
bbox init(points[0], points[0]);

// compute the bounding box for the point set
bbox result = thrust::reduce(points.begin(), points.end(), init, binary_op);
bbox result = thrust::reduce(points.begin(), points.end(), init, bbox_union{});

// print output
std::cout << "bounding box " << std::fixed;
2 changes: 0 additions & 2 deletions thrust/examples/bucket_sort2d.cu
@@ -9,8 +9,6 @@
#include <iomanip>
#include <iostream>

#include "include/host_device.h"

// define a 2d float vector
using vec2 = thrust::tuple<float, float>;

6 changes: 1 addition & 5 deletions thrust/examples/constant_iterator.cu
@@ -9,11 +9,7 @@

int main()
{
thrust::device_vector<int> data(4);
data[0] = 3;
data[1] = 7;
data[2] = 2;
data[3] = 5;
thrust::device_vector<int> data{3, 7, 2, 5};

// add 10 to all values in data
thrust::transform(
12 changes: 2 additions & 10 deletions thrust/examples/counting_iterator.cu
@@ -11,15 +11,7 @@ int main()
// this example computes indices for all the nonzero values in a sequence

// sequence of zero and nonzero values
thrust::device_vector<int> stencil(8);
stencil[0] = 0;
stencil[1] = 1;
stencil[2] = 1;
stencil[3] = 0;
stencil[4] = 0;
stencil[5] = 1;
stencil[6] = 0;
stencil[7] = 1;
thrust::device_vector<int> stencil{0, 1, 1, 0, 0, 1, 0, 1};

// storage for the nonzero indices
thrust::device_vector<int> indices(8);
@@ -35,7 +27,7 @@ int main()
// indices now contains [1,2,5,7]

// print result
std::cout << "found " << (indices_end - indices.begin()) << " nonzero values at indices:\n";
std::cout << "found " << cuda::std::distance(indices.begin(), indices_end) << " nonzero values at indices:\n";
thrust::copy(indices.begin(), indices_end, std::ostream_iterator<int>(std::cout, "\n"));

return 0;
2 changes: 1 addition & 1 deletion thrust/examples/device_ptr.cu
@@ -15,7 +15,7 @@ int main()
// device_ptr supports pointer arithmetic
thrust::device_ptr<int> first = d_ptr;
thrust::device_ptr<int> last = d_ptr + 10;
std::cout << "device array contains " << (last - first) << " values\n";
std::cout << "device array contains " << cuda::std::distance(first, last) << " values\n";

// algorithms work as expected
thrust::sequence(first, last);
7 changes: 3 additions & 4 deletions thrust/examples/discrete_voronoi.cu
@@ -10,7 +10,6 @@
#include <iomanip>
#include <iostream>

#include "include/host_device.h"
#include "include/timer.h"

// Compute an approximate Voronoi Diagram with a Jump Flooding Algorithm (JFA)
@@ -26,11 +25,11 @@
// Tuple = <seeds,seeds + k,seeds + m*k, seeds - k,
// seeds - m*k, seeds+ k+m*k,seeds + k-m*k,
// seeds- k+m*k,seeds - k+m*k, i>
struct minFunctor
struct voronoi_site_selector
{
int m, n, k;

__host__ __device__ minFunctor(int m, int n, int k)
__host__ __device__ voronoi_site_selector(int m, int n, int k)
: m(m)
, n(n)
, k(k)
@@ -199,7 +198,7 @@ void jfa(thrust::device_vector<int>& in, thrust::device_vector<int>& out, unsign
thrust::counting_iterator<int>(0))
+ n * m,
out.begin(),
minFunctor(m, n, k));
voronoi_site_selector(m, n, k));
}
/********************************************/

4 changes: 1 addition & 3 deletions thrust/examples/dot_products_with_zip.cu
@@ -5,8 +5,6 @@
#include <thrust/random.h>
#include <thrust/transform.h>

#include "include/host_device.h"

// This example shows how thrust::zip_iterator can be used to create a
// 'virtual' array of structures. In this case the structure is a 3d
// vector type (Float3) whose (x,y,z) components will be stored in
@@ -33,7 +31,7 @@ thrust::host_vector<float> random_vector(const size_t N, unsigned int seed = thr
thrust::default_random_engine rng(seed);
thrust::uniform_real_distribution<float> u01(0.0f, 1.0f);
thrust::host_vector<float> temp(N);
for (size_t i = 0; i < N; i++)
for (size_t i = 0; i < N; ++i)
{
temp[i] = u01(rng);
}
13 changes: 4 additions & 9 deletions thrust/examples/expand.cu
@@ -40,7 +40,7 @@ OutputIterator expand(InputIterator1 first1, InputIterator1 last1, InputIterator

// compute max-scan over the output indices, filling in the holes
thrust::inclusive_scan(
output_indices.begin(), output_indices.end(), output_indices.begin(), ::cuda::maximum<difference_type>());
output_indices.begin(), output_indices.end(), output_indices.begin(), cuda::maximum<difference_type>{});

// gather input values according to index array (output = first2[output_indices])
thrust::gather(output_indices.begin(), output_indices.end(), first2, output);
@@ -62,15 +62,10 @@

int main()
{
int counts[] = {3, 5, 2, 0, 1, 3, 4, 2, 4};
int values[] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
thrust::device_vector<int> d_counts = {3, 5, 2, 0, 1, 3, 4, 2, 4};
thrust::device_vector<int> d_values = {1, 2, 3, 4, 5, 6, 7, 8, 9};

size_t input_size = sizeof(counts) / sizeof(int);
size_t output_size = thrust::reduce(counts, counts + input_size);

// copy inputs to device
thrust::device_vector<int> d_counts(counts, counts + input_size);
thrust::device_vector<int> d_values(values, values + input_size);
const size_t output_size = thrust::reduce(d_counts.begin(), d_counts.end());
thrust::device_vector<int> d_output(output_size);

// expand values according to counts
2 changes: 2 additions & 0 deletions thrust/examples/include/host_device.h
@@ -16,6 +16,8 @@

#pragma once

#include <cuda/__cccl_config>

#if !_CCCL_HAS_CUDA_COMPILER()

# ifndef __host__
24 changes: 11 additions & 13 deletions thrust/examples/lambda.cu
@@ -4,8 +4,6 @@

#include <iostream>

#include "include/host_device.h"

// This example demonstrates the use of placeholders to implement
// the SAXPY operation (i.e. Y[i] = a * X[i] + Y[i]).
//
@@ -41,14 +39,14 @@ struct saxpy_functor
int main()
{
// input data
float a = 2.0f;
float x[4] = {1, 2, 3, 4};
float y[4] = {1, 1, 1, 1};
float a = 2.0f;
thrust::device_vector<float> x_data = {1, 2, 3, 4};
thrust::device_vector<float> y_data = {1, 1, 1, 1};

// SAXPY implemented with a functor (function object)
{
thrust::device_vector<float> X(x, x + 4);
thrust::device_vector<float> Y(y, y + 4);
thrust::device_vector<float> X = x_data;
thrust::device_vector<float> Y = y_data;

thrust::transform(
X.begin(),
@@ -58,16 +56,16 @@
saxpy_functor(a)); // functor

std::cout << "SAXPY (functor method)" << std::endl;
for (size_t i = 0; i < 4; i++)
for (size_t i = 0; i < Y.size(); i++)
{
std::cout << a << " * " << x[i] << " + " << y[i] << " = " << Y[i] << std::endl;
std::cout << a << " * " << x_data[i] << " + " << y_data[i] << " = " << Y[i] << std::endl;
}
}

// SAXPY implemented with a placeholders
{
thrust::device_vector<float> X(x, x + 4);
thrust::device_vector<float> Y(y, y + 4);
thrust::device_vector<float> X = x_data;
thrust::device_vector<float> Y = y_data;

thrust::transform(
X.begin(),
Expand All @@ -77,9 +75,9 @@ int main()
a * _1 + _2); // placeholder expression

std::cout << "SAXPY (placeholder method)" << std::endl;
for (size_t i = 0; i < 4; i++)
for (size_t i = 0; i < Y.size(); i++)
{
std::cout << a << " * " << x[i] << " + " << y[i] << " = " << Y[i] << std::endl;
std::cout << a << " * " << x_data[i] << " + " << y_data[i] << " = " << Y[i] << std::endl;
}
}

4 changes: 2 additions & 2 deletions thrust/examples/lexicographical_sort.cu
@@ -59,7 +59,7 @@ int main()
thrust::device_vector<int> lower = random_vector(N);

std::cout << "Unsorted Keys" << std::endl;
for (size_t i = 0; i < N; i++)
for (size_t i = 0; i < upper.size(); i++)
{
std::cout << "(" << upper[i] << "," << middle[i] << "," << lower[i] << ")" << std::endl;
}
@@ -82,7 +82,7 @@ int main()
apply_permutation(upper, permutation);

std::cout << "Sorted Keys" << std::endl;
for (size_t i = 0; i < N; i++)
for (size_t i = 0; i < upper.size(); i++)
{
std::cout << "(" << upper[i] << "," << middle[i] << "," << lower[i] << ")" << std::endl;
}
18 changes: 5 additions & 13 deletions thrust/examples/max_abs_diff.cu
@@ -2,10 +2,9 @@
#include <thrust/functional.h>
#include <thrust/inner_product.h>

#include <cmath>
#include <iostream>
#include <cuda/functional>

#include "include/host_device.h"
#include <iostream>

// this example computes the maximum absolute difference
// between the elements of two vectors
@@ -21,21 +20,14 @@

int main()
{
thrust::device_vector<float> d_a(4);
thrust::device_vector<float> d_b(4);

// clang-format off
d_a[0] = 1.0; d_b[0] = 2.0;
d_a[1] = 2.0; d_b[1] = 4.0;
d_a[2] = 3.0; d_b[2] = 3.0;
d_a[3] = 4.0; d_b[3] = 0.0;
// clang-format on
thrust::device_vector<float> d_a = {1.0, 2.0, 3.0, 4.0};
thrust::device_vector<float> d_b = {2.0, 4.0, 3.0, 0.0};

// initial value of the reduction
float init = 0;

// binary operations
::cuda::maximum<float> binary_op1;
cuda::maximum<float> binary_op1{};
abs_diff<float> binary_op2;

float max_abs_diff = thrust::inner_product(d_a.begin(), d_a.end(), d_b.begin(), init, binary_op1, binary_op2);