
Commit 3f920f3

IvanKobzarev authored and pytorchmergebot committed
[aotd] Support mutations of the same input in fw and bw (pytorch#155354)
Original issue: pytorch#154820

The issue occurs when the same input is mutated in both the forward AND the backward pass. AOTD emitted the copy_ after tracing the joint function, so that single fx node represented the side effects of both mutations (forward and backward). The partitioner was then free to place it in either the forward or the backward graph.

The fix:

1/ Introduce joint_function.handle, which allows setting a "post_forward" callback so the state of the inputs can be inspected after the forward pass. We do not want to apply a mutation after the joint trace if it was already applied in forward; for that we need the "mutation_counter" and must remember the counter value at which the forward mutation was applied.

2/ Expose mutation_counter to Python. We keep the invariant that copy_ nodes exist only at the end of the joint graph.

3/ Memorize the mutation_counter and the state of the inputs after forward via the post_forward handle, and emit the post-forward mutations once the joint graph is fully traced. These mutations get a "must_be_in_forward" tag (analogous to the existing "must_be_in_backward") so the partitioner keeps them in the forward graph.

4/ Ban recompute of the source of a mutation. Recompute could apply the same op (e.g. add) in both forward and backward, so the source of a forward mutation is marked MUST_SAVE.

proxy_tensor changes: by default, proxy tensor updates the tensor_tracker, which would chain the applied mutations. We want this copy_ to be independent and applied directly to the primals, so a context manager is introduced to disable tensor_tracker updates while adding the forward mutations.

Pull Request resolved: pytorch#155354
Approved by: https://github.com/bdhirsh
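The counter bookkeeping can be illustrated with a minimal sketch. This is not the actual AOTAutograd code; MutatedInput and plan_copy_nodes are hypothetical names introduced only for this illustration of the decision rule:

from dataclasses import dataclass


@dataclass
class MutatedInput:
    counter_start: int        # mutation counter when joint tracing began
    counter_after_fw: int     # snapshot taken by the post_forward callback
    counter_after_joint: int  # counter once the joint graph is fully traced


def plan_copy_nodes(inp):
    """Decide which copy_ nodes to emit for one input and how to tag them."""
    tags = []
    if inp.counter_after_fw > inp.counter_start:
        # The input was already mutated during forward: emit a dedicated copy_
        # tagged "must_be_in_forward" so the partitioner keeps it there.
        tags.append("copy_ (must_be_in_forward)")
    if inp.counter_after_joint > inp.counter_after_fw:
        # The remaining mutation happened in backward: the copy_ at the end of
        # the joint graph stays there, tagged "must_be_in_backward".
        tags.append("copy_ (must_be_in_backward)")
    return tags


# Input mutated once in forward and once in backward (the pytorch#154820 case):
print(plan_copy_nodes(MutatedInput(0, 1, 2)))
# ['copy_ (must_be_in_forward)', 'copy_ (must_be_in_backward)']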
1 parent c82a174 commit 3f920f3

File tree

10 files changed, +396 -106 lines changed


aten/src/ATen/FunctionalStorageImpl.h

Lines changed: 3 additions & 0 deletions

@@ -122,6 +122,9 @@ struct TORCH_API FunctionalStorageImpl : public c10::StorageImpl {
 
   ~FunctionalStorageImpl() override = default;
 
+  uint64_t mutation_counter() {
+    return mutation_counter_;
+  }
   void mark_mutation() {
     mutation_counter_++;
   }

aten/src/ATen/FunctionalTensorWrapper.h

Lines changed: 3 additions & 1 deletion

@@ -74,7 +74,9 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl {
   bool has_metadata_mutation() const {
     return has_metadata_mutation_;
   }
-
+  uint64_t mutation_counter() const {
+    return functional_storage_impl()->mutation_counter();
+  }
   void mark_mutation() {
     functional_storage_impl()->mark_mutation();
   }

benchmarks/dynamo/pr_time_benchmarks/expected_results.csv

Lines changed: 2 additions & 2 deletions

@@ -14,7 +14,7 @@ add_loop_inductor_dynamic_gpu,compile_time_instruction_count,39110000000,0.025
-add_loop_inductor_gpu,compile_time_instruction_count,26180000000,0.015
+add_loop_inductor_gpu,compile_time_instruction_count,25780000000,0.015

@@ -62,7 +62,7 @@ aotdispatcher_partitioner_cpu,compile_time_instruction_count,8844000000,0.015
-aotdispatcher_partitioner_cpu2,compile_time_instruction_count,1963000000,0.015
+aotdispatcher_partitioner_cpu2,compile_time_instruction_count,1917000000,0.015

test/functorch/test_aotdispatch.py

Lines changed: 47 additions & 0 deletions

@@ -7842,6 +7842,53 @@ def _inps():
         self.assertEqual(ref_inps_after_fw, inps_after_fw)
         self.assertEqual(ref_inps_after_bw, inps_after_bw)
 
+    def test_mutation_of_input_in_fw_and_bw(self):
+        class AF(torch.autograd.Function):
+            @staticmethod
+            def forward(ctx, dummy, inplace_tensor):
+                inplace_tensor.add_(1)
+
+                ctx.inplace_tensor = inplace_tensor
+                return dummy.clone()
+
+            @staticmethod
+            def backward(ctx, grad_output):
+                inplace_tensor = ctx.inplace_tensor
+                inplace_tensor.add_(1)
+                return grad_output, None, None
+
+        def fn(dummy, inplace_tensor):
+            return AF.apply(dummy, inplace_tensor)
+
+        def inps():
+            dummy = torch.randn((2,), requires_grad=True)
+            inplace_tensor = torch.zeros((2,), requires_grad=False)
+            return dummy, inplace_tensor
+
+        def sc_inps():
+            dummy = TwoTensor(
+                torch.randn((2,), requires_grad=True),
+                torch.randn((2,), requires_grad=True),
+            )
+            inplace_tensor = TwoTensor(
+                torch.zeros((2,), requires_grad=False),
+                torch.zeros((2,), requires_grad=False),
+            )
+            return dummy, inplace_tensor
+
+        for _inps in [inps, sc_inps]:
+            dummy, inplace = _inps()
+            y = fn(dummy, inplace)
+            ref0 = inplace.clone().detach()
+            y.sum().backward()
+            ref = inplace.clone().detach()
+
+            dummy, inplace = _inps()
+            y = torch.compile(fn, backend="aot_eager", fullgraph=True)(dummy, inplace)
+            self.assertEqual(ref0, inplace)
+            y.sum().backward()
+            self.assertEqual(ref, inplace)
+
 
 class MockFXGraphCache:
     """

tools/pyi/gen_pyi.py

Lines changed: 7 additions & 0 deletions

@@ -912,6 +912,13 @@ def gen_pyi(
             "None",
         )
     ],
+    "_functionalize_mutation_counter": [
+        defs(
+            "_functionalize_mutation_counter",
+            ["t: Tensor"],
+            "_int",
+        )
+    ],
     "_functionalize_are_all_mutations_hidden_from_autograd": [
         defs(
             "_functionalize_are_all_mutations_hidden_from_autograd",

torch/_functorch/_aot_autograd/dispatch_and_compile_graph.py

Lines changed: 2 additions & 0 deletions

@@ -265,13 +265,15 @@ def aot_dispatch_autograd_graph(
         fw_metadata,
     )
     joint_fn_to_trace = create_joint(fn_prepared_for_autograd, aot_config=aot_config)
+    joint_fn_handle = joint_fn_to_trace.handle
 
     joint_fn_to_trace, updated_joint_inputs = create_functionalized_fn(
         joint_fn_to_trace,
         joint_inputs,
         meta=fw_metadata,
         aot_config=aot_config,
         trace_joint=True,
+        joint_fn_handle=joint_fn_handle,
     )
 
     # TODO: replace with AOTDispatchSubclassWrapper once we refactor
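For illustration, a hypothetical sketch of what the handle enables. The real wiring lives in create_joint / create_functionalized_fn; the JointFnHandle class, make_joint_fn, fw and bw below are assumptions made for this sketch, not the actual API:

import torch


class JointFnHandle:
    def __init__(self):
        self.post_forward = None  # optional callback(primals) run after forward


def make_joint_fn(fw_fn, bw_fn, handle):
    def joint_fn(primals, tangents):
        fw_outs = fw_fn(*primals)
        if handle.post_forward is not None:
            # Lets the caller snapshot input state (e.g. functionalization
            # mutation counters) before the backward half is traced.
            handle.post_forward(primals)
        grads = bw_fn(fw_outs, tangents)
        return fw_outs, grads

    return joint_fn


handle = JointFnHandle()
snapshots = []
handle.post_forward = lambda primals: snapshots.append(
    [p.detach().clone() for p in primals]
)


def fw(x):
    x.add_(1)  # forward mutates its input in place
    return (x * 2,)


def bw(fw_outs, tangents):
    return tuple(2 * t for t in tangents)


joint = make_joint_fn(fw, bw, handle)
x = torch.zeros(2)
joint((x,), (torch.ones(2),))
print(snapshots[0][0])  # tensor([1., 1.]) -- input state right after forward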
