 # pyre-strict
 
 import unittest
-from typing import Dict, List, Union
 from unittest.mock import MagicMock, patch
 
 import torch
 from torch.autograd import Variable
-from torch.distributed import ProcessGroup
-from torch.distributed.tensor import distribute_tensor, DTensor, init_device_mesh, Shard
-from torch.testing._internal.common_utils import (
-    instantiate_parametrized_tests,
-    parametrize,
-)
-from torch.testing._internal.distributed._tensor.common_dtensor import (
-    DTensorTestBase,
-    with_comms,
-)
 from torchrec.optim.clipping import GradientClipping, GradientClippingOptimizer
 from torchrec.optim.test_utils import DummyKeyedOptimizer
 
@@ -240,202 +229,3 @@ def test_clip_no_gradients_norm_meta_device(
         gradient_clipping_optimizer.step()
 
         mock_clip_grad_norm.assert_not_called()
-
-
-@unittest.skipIf(not torch.cuda.is_available(), "Skip when CUDA is not available")
-@instantiate_parametrized_tests
-class TestGradientClippingDTensor(DTensorTestBase):
-    """No tests for Replicated DTensors as handled prior to GradientClippingOptimizer"""
-
-    def _get_params_to_pg(
-        self, params: List[DTensor]
-    ) -> Dict[DTensor, List[ProcessGroup]]:
-        return {param: [param.device_mesh.get_group()] for param in params}
-
-    @with_comms
-    @parametrize("norm_type", ("inf", 1, 2))
-    def test_tensor_and_sharded_dtensor_clip_all_gradients_norm(
-        self, norm_type: Union[float, str]
-    ) -> None:
-        """
-        Test to ensure that the gradient clipping optimizer clips gradients
-        correctly with mixed sharded DTensor and tensor by comparing gradients to its
-        torch.tensor counterpart.
-
-        Note that clipping for DTensor may require communication.
-        """
-
-        # data for testing clipping
-        data_1 = torch.tensor([1.0, 2.0, 3.0], device=self.device_type)
-        data_2 = torch.tensor([4.0, 5.0, 6.0], device=self.device_type)
-        data_1_grad = torch.tensor([12.0, 15.0, 18.0], device=self.device_type)
-        data_2_grad = torch.tensor([20.0, 30.0, 15.0], device=self.device_type)
-
-        # create gradient clipping optimizer containing no dtensor for reference
-        ref_param_1 = torch.nn.Parameter(data_1.clone())
-        ref_param_2 = torch.nn.Parameter(data_2.clone())
-        ref_param_1.grad = data_1_grad.clone()
-        ref_param_2.grad = data_2_grad.clone()
-        ref_keyed_optimizer = DummyKeyedOptimizer(
-            params={"param_1": ref_param_1, "param_2": ref_param_2},
-            state={},
-            param_groups=[{"params": [ref_param_1, ref_param_2]}],
-        )
-        ref_gradient_clipping_optimizer = GradientClippingOptimizer(
-            optimizer=ref_keyed_optimizer,
-            clipping=GradientClipping.NORM,
-            max_gradient=10.0,
-            norm_type=norm_type,
-        )
-        ref_gradient_clipping_optimizer.step()
-
-        # create gradient clipping optimizer containing a DTensor and a tensor
-        device_mesh = init_device_mesh(self.device_type, (self.world_size,))
-        param_1 = distribute_tensor(
-            tensor=torch.tensor(
-                data_1.clone(), requires_grad=True, device=self.device_type
-            ),
-            device_mesh=device_mesh,
-            placements=[Shard(0)],
-        )
-        param_2 = torch.tensor(
-            data_2.clone(), requires_grad=True, device=self.device_type
-        )
-        param_1.grad = distribute_tensor(
-            tensor=data_1_grad.clone(),
-            device_mesh=device_mesh,
-            placements=[Shard(0)],
-        )
-        param_2.grad = data_2_grad.clone()
-        param_to_pgs = self._get_params_to_pg([param_1])
-        keyed_optimizer = DummyKeyedOptimizer(
-            params={"dtensor_param_1": param_1, "dtensor_param_2": param_2},
-            state={},
-            param_groups=[{"params": [param_1, param_2]}],
-        )
-        gradient_clipping_optimizer = GradientClippingOptimizer(
-            optimizer=keyed_optimizer,
-            clipping=GradientClipping.NORM,
-            max_gradient=10.0,
-            norm_type=norm_type,
-            enable_global_grad_clip=True,
-            param_to_pgs=param_to_pgs,  # pyre-ignore[6]
-        )
-        gradient_clipping_optimizer.step()
-
-        for param_group, ref_param_group in zip(
-            gradient_clipping_optimizer.param_groups,
-            ref_gradient_clipping_optimizer.param_groups,
-            strict=True,
-        ):
-            for param, ref_param in zip(
-                param_group["params"], ref_param_group["params"], strict=True
-            ):
-                param_grad = (
-                    param.grad.full_tensor()  # pyre-ignore[16]
-                    if isinstance(param, DTensor)
-                    else param.grad
-                )
-                self.assertEqual(
-                    param_grad,
-                    ref_param.grad,
-                    f"Expect gradient to be the same. However, found {param_grad=}, {ref_param.grad=}",
-                )
-
-    @with_comms
-    @parametrize("norm_type", ("inf", 1, 2))
-    def test_multiple_sharded_dtensors_clip_all_gradients_norm(
-        self, norm_type: Union[float, str]
-    ) -> None:
-        """
-        Test to ensure that the gradient clipping optimizer clips gradients
-        correctly with multiple sharded DTensors by comparing gradients to their
-        torch.tensor counterpart.
-
-        Note that clipping for DTensor may require communication.
-        """
-
-        # data for testing clipping
-        data_1 = torch.tensor([1.0, 2.0, 3.0], device=self.device_type)
-        data_2 = torch.tensor([4.0, 5.0, 6.0], device=self.device_type)
-        data_1_grad = torch.tensor([12.0, 15.0, 18.0], device=self.device_type)
-        data_2_grad = torch.tensor([20.0, 30.0, 15.0], device=self.device_type)
-
-        # create gradient clipping optimizer containing no dtensor for reference
-        ref_param_1 = torch.nn.Parameter(data_1.clone())
-        ref_param_2 = torch.nn.Parameter(data_2.clone())
-        ref_param_1.grad = data_1_grad.clone()
-        ref_param_2.grad = data_2_grad.clone()
-        ref_keyed_optimizer = DummyKeyedOptimizer(
-            params={"param_1": ref_param_1, "param_2": ref_param_2},
-            state={},
-            param_groups=[{"params": [ref_param_1, ref_param_2]}],
-        )
-        ref_gradient_clipping_optimizer = GradientClippingOptimizer(
-            optimizer=ref_keyed_optimizer,
-            clipping=GradientClipping.NORM,
-            max_gradient=10.0,
-            norm_type=norm_type,
-        )
-        ref_gradient_clipping_optimizer.step()
-
-        # create gradient clipping optimizer containing 2 DTensors
-        device_mesh = init_device_mesh(self.device_type, (self.world_size,))
-        param_1 = distribute_tensor(
-            tensor=torch.tensor(
-                data_1.clone(), requires_grad=True, device=self.device_type
-            ),
-            device_mesh=device_mesh,
-            placements=[Shard(0)],
-        )
-        param_2 = distribute_tensor(
-            tensor=torch.tensor(
-                data_2.clone(), requires_grad=True, device=self.device_type
-            ),
-            device_mesh=device_mesh,
-            placements=[Shard(0)],
-        )
-        param_1.grad = distribute_tensor(
-            tensor=data_1_grad.clone(),
-            device_mesh=device_mesh,
-            placements=[Shard(0)],
-        )
-        param_2.grad = distribute_tensor(
-            tensor=data_2_grad.clone(),
-            device_mesh=device_mesh,
-            placements=[Shard(0)],
-        )
-        param_to_pgs = self._get_params_to_pg([param_1, param_2])
-        keyed_optimizer = DummyKeyedOptimizer(
-            params={"dtensor_param_1": param_1, "dtensor_param_2": param_2},
-            state={},
-            param_groups=[{"params": [param_1, param_2]}],
-        )
-        gradient_clipping_optimizer = GradientClippingOptimizer(
-            optimizer=keyed_optimizer,
-            clipping=GradientClipping.NORM,
-            max_gradient=10.0,
-            norm_type=norm_type,
-            enable_global_grad_clip=True,
-            param_to_pgs=param_to_pgs,  # pyre-ignore[6]
-        )
-        gradient_clipping_optimizer.step()
-
-        for param_group, ref_param_group in zip(
-            gradient_clipping_optimizer.param_groups,
-            ref_gradient_clipping_optimizer.param_groups,
-            strict=True,
-        ):
-            for param, ref_param in zip(
-                param_group["params"], ref_param_group["params"], strict=True
-            ):
-                param_grad = (
-                    param.grad.full_tensor()  # pyre-ignore[16]
-                    if isinstance(param, DTensor)
-                    else param.grad
-                )
-                self.assertEqual(
-                    param_grad,
-                    ref_param.grad,
-                    f"Expect gradient to be the same. However, found {param_grad=}, {ref_param.grad=}",
-                )
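For context on what the removed tests verified: both compared clipped DTensor gradients against a plain torch.tensor reference with max_gradient=10.0. The sketch below is a hypothetical standalone script, not part of this diff, that reproduces that reference computation on a single process, assuming GradientClippingOptimizer's NORM clipping follows torch.nn.utils.clip_grad_norm_ semantics.

import torch

# Gradients used by the removed tests, gathered onto one device.
grads = [torch.tensor([12.0, 15.0, 18.0]), torch.tensor([20.0, 30.0, 15.0])]
max_gradient = 10.0  # same max_gradient as in the tests

for norm_type in (float("inf"), 1.0, 2.0):  # mirrors @parametrize("norm_type", ...)
    # Total norm over all parameters: the p-norm of the per-parameter p-norms.
    per_param = torch.stack([torch.linalg.vector_norm(g, norm_type) for g in grads])
    total_norm = torch.linalg.vector_norm(per_param, norm_type)
    # Scale all gradients uniformly when the total norm exceeds max_gradient.
    clip_coef = torch.clamp(max_gradient / (total_norm + 1e-6), max=1.0)
    clipped = [g * clip_coef for g in grads]
    print(norm_type, total_norm.item(), [c.tolist() for c in clipped])

With sharded DTensors the per-parameter norms live on different ranks, so the optimizer has to combine them across each parameter's process group before computing the scale factor, which is why the removed docstrings note that clipping for DTensor may require communication.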