 from unittest.mock import MagicMock, patch
 
 import torch
+
+from parameterized import parameterized
 from torch.autograd import Variable
 from torch.distributed import ProcessGroup
 from torch.distributed.tensor import distribute_tensor, DTensor, init_device_mesh, Shard
-from torch.testing._internal.common_utils import (
-    instantiate_parametrized_tests,
-    parametrize,
-)
 from torch.testing._internal.distributed._tensor.common_dtensor import (
     DTensorTestBase,
     with_comms,
 )
+
+setattr(torch.backends, "__allow_nonbracketed_mutation_flag", True)  # noqa: B010
+
 from torchrec.optim.clipping import GradientClipping, GradientClippingOptimizer
 from torchrec.optim.test_utils import DummyKeyedOptimizer
 
@@ -243,17 +244,24 @@ def test_clip_no_gradients_norm_meta_device(
 
 
 @unittest.skipIf(not torch.cuda.is_available(), "Skip when CUDA is not available")
-@instantiate_parametrized_tests
 class TestGradientClippingDTensor(DTensorTestBase):
     """No tests for Replicated DTensors as handled prior to GradientClippingOptimizer"""
 
+    def setUp(self) -> None:
+        setattr(torch.backends, "__allow_nonbracketed_mutation_flag", False)  # noqa: B010
+        return super().setUp()
+
+    def tearDown(self) -> None:
+        setattr(torch.backends, "__allow_nonbracketed_mutation_flag", True)  # noqa: B010
+        return super().tearDown()
+
     def _get_params_to_pg(
         self, params: List[DTensor]
     ) -> Dict[DTensor, List[ProcessGroup]]:
         return {param: [param.device_mesh.get_group()] for param in params}
 
+    @parameterized.expand(["inf", 1, 2])
     @with_comms
-    @parametrize("norm_type", ("inf", 1, 2))
     def test_tensor_and_sharded_dtensor_clip_all_gradients_norm(
         self, norm_type: Union[float, str]
     ) -> None:
@@ -342,8 +350,8 @@ def test_tensor_and_sharded_dtensor_clip_all_gradients_norm(
                     f"Expect gradient to be the same. However, found {param_grad=}, {ref_param.grad=}",
                 )
 
+    @parameterized.expand(["inf", 1, 2])
     @with_comms
-    @parametrize("norm_type", ("inf", 1, 2))
     def test_multiple_sharded_dtensors_clip_all_gradients_norm(
         self, norm_type: Union[float, str]
     ) -> None:
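Note (not part of the diff): the removed `parametrize` decorator from PyTorch's internal test utilities needs a class-level `instantiate_parametrized_tests` decorator to generate the parametrized variants, while `parameterized.expand` from the external `parameterized` package generates one test method per argument on its own, which is why the class decorator is dropped above. A minimal sketch of the new style, assuming a hypothetical `ExampleNormTest` class and norm values chosen only for illustration:

```python
import unittest

import torch
from parameterized import parameterized


class ExampleNormTest(unittest.TestCase):
    """Hypothetical example: parameterized.expand generates one method per norm_type."""

    @parameterized.expand(["inf", 1, 2])
    def test_vector_norm(self, norm_type) -> None:
        # "inf" arrives as the string given in the decorator; convert it before use.
        p = float(norm_type)
        grads = torch.tensor([3.0, -4.0])
        # Hand-computed norms of [3, -4]: max-abs, L1, and L2.
        expected = {float("inf"): 4.0, 1.0: 7.0, 2.0: 5.0}[p]
        self.assertAlmostEqual(torch.linalg.vector_norm(grads, ord=p).item(), expected)


if __name__ == "__main__":
    unittest.main()
```

On the `setattr(torch.backends, "__allow_nonbracketed_mutation_flag", ...)` lines: inside a class body (as in `setUp`/`tearDown`), a plain attribute assignment to a dunder-prefixed name would be name-mangled by Python, so `setattr` with a string attribute name is used instead; the `# noqa: B010` comment suppresses flake8-bugbear's warning about calling `setattr` with a constant attribute.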