Fix convergence issue caused by TF32 (pytorch#1244)
* Fix convergence issue caused by TF32

* save
zasdfgbnm authored Nov 17, 2020
1 parent e6167a9 commit 9ecc44a
Showing 2 changed files with 18 additions and 2 deletions.
10 changes: 9 additions & 1 deletion beginner_source/examples_autograd/two_layer_net_autograd.py
@@ -18,7 +18,15 @@

dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0") # Uncomment this to run on GPU
# device = torch.device("cuda:0") # Uncomment this to run on GPU
# torch.backends.cuda.matmul.allow_tf32 = False # Uncomment this to disable TF32 on GPU

# Uncommenting the line above disables TensorFloat32 (TF32), a feature that
# lets networks run much faster at the cost of some precision.
# Although TF32 works well on most real models, for the toy model in this
# tutorial the reduced precision causes a convergence issue.
# For more information, see:
# https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
@@ -48,7 +48,15 @@ def backward(ctx, grad_output):

dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0") # Uncomment this to run on GPU
# device = torch.device("cuda:0") # Uncomment this to run on GPU
# torch.backends.cuda.matmul.allow_tf32 = False # Uncomment this to disable TF32 on GPU

# Uncommenting the line above disables TensorFloat32 (TF32), a feature that
# lets networks run much faster at the cost of some precision.
# Although TF32 works well on most real models, for the toy model in this
# tutorial the reduced precision causes a convergence issue.
# For more information, see:
# https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
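For context on the comments added in both files, the snippet below is a minimal illustrative sketch, not part of this commit, of how the TF32 switch can be toggled and how the precision gap shows up in practice. It assumes PyTorch 1.7 or later and a CUDA-capable Ampere (or newer) GPU; on older GPUs the two results are identical because TF32 is unavailable.

import torch

# Illustrative sketch only; assumes an Ampere (or newer) GPU with CUDA available.
if torch.cuda.is_available():
    a = torch.randn(1024, 1024, device="cuda")
    b = torch.randn(1024, 1024, device="cuda")

    # With TF32 enabled, matmul inputs are rounded to roughly 10 mantissa bits.
    torch.backends.cuda.matmul.allow_tf32 = True
    tf32_result = a @ b

    # Disabling TF32 restores full float32 precision at the cost of speed.
    torch.backends.cuda.matmul.allow_tf32 = False
    fp32_result = a @ b

    # On Ampere GPUs this difference is clearly nonzero, which is why the
    # tutorial's toy model fails to converge with TF32 enabled.
    print((tf32_result - fp32_result).abs().max())

    # cuDNN convolutions have a separate switch:
    # torch.backends.cudnn.allow_tf32 = False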
