Cover par_nosync in new stream example.

Co-authored-by: Jake Hemstad <jhemstad@nvidia.com>
NVIDIA · May 5, 2022 · 4f652f2 · 4f652f2
1 parent 2027d89
commit 4f652f2
Showing 1 changed file with 24 additions and 7 deletions.
diff --git a/examples/cuda/explicit_cuda_stream.cu b/examples/cuda/explicit_cuda_stream.cu
@@ -10,7 +10,12 @@
 // This example shows how to execute a Thrust device algorithm on an explicit
 // CUDA stream. The simple program below fills a vector with the numbers
 // [0, 1000) (thrust::sequence) and then sums them (thrust::reduce), executing
-// both algorithms on the same custom CUDA stream.
+// both algorithms on the same custom CUDA stream using the execution policy. 
+// Thrust provides two execution policies that accept CUDA streams that differ in when/if they synchronize the stream:
+// 1. thrust::cuda::par.on(stream)
+//      - `stream` will *always* be synchronized before an algorithm returns
+// 2. thrust::cuda::par_nosync.on(stream)
+//      - `stream` will only be synchronized when necessary for correctness (e.g., returning a result from `thrust::reduce`)
 
 int main()
 {
@@ -25,15 +30,27 @@ int main()
     return 1;
   }
 
-  // Create a new execution policy with the custom stream:
-  auto exec_policy = thrust::device.on(custom_stream);
+  // Construct a new `nosync` execution policy with the custom stream
+  auto nosync_exec_policy = thrust::cuda::par_nosync.on(custom_stream);
 
   // Fill the vector with sequential data.
-  // This will execute using the custom stream.
-  thrust::sequence(exec_policy, d_vec.begin(), d_vec.end());
+  // This will execute using the custom stream and the stream will *not* be synchronized before the
+  // function returns, meaning asynchronous work may still be executing after returning and the contents of
+  // `d_vec` are undefined. Synchronization is not needed here because the following `inclusive_scan` is
+  // executed on the same stream and is therefore guaranteed to be ordered after the `sequence`
+  thrust::sequence(nosync_exec_policy, d_vec.begin(), d_vec.end());
 
-  // Sum the data in the vector. This also executes in the custom stream.
-  int sum = thrust::reduce(exec_policy, d_vec.cbegin(), d_vec.cend());
+   // Construct a new *synchronous* execution policy with the same custom stream
+   auto sync_exec_policy = thrust::cuda::par.on(stream);
+
+  // Compute inclusive sum scan of data in the vector.
+  // This also executes in the custom stream, but the execution policy ensures the stream is 
+  // synchronized before the algorithm returns. This guarantees there is no pending asynchronous work
+  // and the contents of `d_vec` are immediately accessible. 
+  thrust::inclusive_scan(sync_exec_policy, d_vec.cbegin(), d_vec.cend(), d_vec.begin());
+
+  // This access is only valid because the stream has been synchronized
+  int sum = d_vec.back();
 
   // Free the stream:
   err = cudaStreamDestroy(custom_stream);