[Feature] Non-blocking for consolidated TD #1020

Merged · 4 commits · Oct 3, 2024
tensordict/base.py (27 changes: 21 additions & 6 deletions)
```diff
@@ -10328,17 +10328,20 @@ def to(self, *args, **kwargs) -> T:
         if device is not None and dtype is None and device == self.device:
             return result
 
+        if self.is_consolidated() and dtype is None:
+            return self._to_consolidated(
+                device=device,
+                pin_memory=non_blocking_pin,
+                num_threads=num_threads,
+                non_blocking=non_blocking,
+            )
+
         if non_blocking is None:
             sub_non_blocking = True
             non_blocking = False
         else:
             sub_non_blocking = non_blocking
 
-        if self.is_consolidated() and dtype is None:
-            return self._to_consolidated(
-                device=device, pin_memory=non_blocking_pin, num_threads=num_threads
-            )
-
         if convert_to_format is not None:
 
             def to(tensor):
```
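For context: the consolidated branch now runs before `non_blocking` is normalized to a bool, so `_to_consolidated` sees the caller's original value (`True`, `False`, or `None`) and can decide whether to insert a sync itself. A minimal usage sketch, assuming tensordict's public `consolidate()`/`to()` API (tensor names and sizes are illustrative):

```python
import torch
from tensordict import TensorDict

td = TensorDict(
    {"obs": torch.randn(128, 84, 84), "reward": torch.randn(128, 1)},
    batch_size=[128],
)

# consolidate() packs all leaves into one contiguous storage, so the
# transfer below is a single copy rather than one copy per leaf.
td_c = td.consolidate()

if torch.cuda.is_available():
    # With non_blocking=True the implicit sync added by this PR is skipped;
    # the caller must synchronize before reading the result.
    td_cuda = td_c.to("cuda", non_blocking=True)
    torch.cuda.synchronize()
```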
```diff
@@ -10388,7 +10391,7 @@ def to_pinmem(tensor, _to=to):
             self._sync_all()
         return result
 
-    def _to_consolidated(self, *, device, pin_memory, num_threads):
+    def _to_consolidated(self, *, device, pin_memory, num_threads, non_blocking):
         if num_threads is None:
             # unspecified num_threads should mean 0
             num_threads = 0
```
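The rest of `_to_consolidated` (unchanged, hence collapsed in the diff) casts the single consolidated storage instead of copying each leaf, which is where `storage_cast` below comes from. A rough sketch of that idea, under the assumption that the consolidated buffer is exposed as one flat tensor (the helper name is hypothetical, not tensordict's actual implementation):

```python
import torch

def copy_consolidated_buffer(
    flat: torch.Tensor,
    device: torch.device,
    *,
    pin_memory: bool = False,
    non_blocking: bool = False,
) -> torch.Tensor:
    # Hypothetical helper: move the single flat buffer that backs every
    # leaf in one transfer instead of issuing one copy per tensor.
    if pin_memory and flat.device.type == "cpu":
        flat = flat.pin_memory()  # pinned host memory enables async H2D copies
    return flat.to(device, non_blocking=non_blocking)
```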
```diff
@@ -10414,6 +10417,18 @@ def set_(x):
         result._consolidated = {"storage": storage_cast}
         if "metadata" in self._consolidated:
             result._consolidated["metadata"] = deepcopy(self._consolidated["metadata"])
+        if non_blocking in (False, None):
+            if device.type == "cuda" and non_blocking is False:
+                # sending to CUDA forces a sync
+                cuda_device = device
+            elif storage.device.type == "cuda":
+                # sending from CUDA: sync unless the caller explicitly opted out
+                cuda_device = storage.device.type
+            else:
+                cuda_device = None
+            if cuda_device is not None:
+                torch.cuda.current_stream(cuda_device).synchronize()
+
         return result
 
     def _sync_all(self):
```
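Restating the new sync rule outside the diff: a transfer is followed by a stream synchronization exactly when the caller did not explicitly pass `non_blocking=True` and a CUDA device sits on the side that forces the sync. A small sketch of that decision (the `needs_sync` helper is made up for illustration; its logic mirrors the branch above):

```python
import torch

def needs_sync(src: torch.device, dst: torch.device, non_blocking) -> bool:
    # Mirrors the logic in _to_consolidated: a sync is inserted whenever
    # the caller did not explicitly request an async transfer.
    if non_blocking is True:
        return False  # caller takes responsibility for synchronizing
    if dst.type == "cuda" and non_blocking is False:
        return True   # explicit non_blocking=False to CUDA: force a sync
    if src.type == "cuda":
        return True   # copying off a CUDA device: sync unless opted out
    return False

# Examples (assuming a CUDA build):
# needs_sync(torch.device("cpu"),  torch.device("cuda"), None)  -> False
# needs_sync(torch.device("cpu"),  torch.device("cuda"), False) -> True
# needs_sync(torch.device("cuda"), torch.device("cpu"),  None)  -> True
# needs_sync(torch.device("cuda"), torch.device("cpu"),  True)  -> False
```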