From 437ae82ad39f9154f1d649e277ba9398d1236ac0 Mon Sep 17 00:00:00 2001
From: Narine Kokhlikyan
Date: Mon, 27 Nov 2023 14:17:54 -0800
Subject: [PATCH] add influence gpu tests not using `DataParallel`

Summary:
Currently, when testing implementations of `TracInCPBase`, if the model to
be tested is on gpu, we always wrap it in `DataParallel`. However, it is
also worth testing when the model is on gpu, but is *not* wrapped in
`DataParallel`.

Whether the model is on gpu is currently specified by a `use_gpu` flag,
which is boolean. In this diff, we change `use_gpu` to have type
`Union[bool, str]`, with allowable values of `False` (model on cpu),
`'cuda'` (model on gpu, not using `DataParallel`), and
`'cuda_data_parallel'` (model on gpu, using `DataParallel`). This remains
backwards compatible with classes like `ExplicitDataset`, which move data
to gpu `if use_gpu`, since non-empty strings are truthy.

In further detail, the changes are as follows (a usage sketch follows the
list):
- for tests (`TestTracInSelfInfluence`, `TestTracInGetKMostInfluential`)
  that previously parametrized `use_gpu` with `True`, now parametrize it
  with `'cuda'` and `'cuda_data_parallel'` (in addition to `False`)
- in those tests, give the layer names the 'module' prefix only when
  `use_gpu='cuda_data_parallel'`
- change `get_random_model_and_data`, which is where the `use_gpu` flag is
  used to create model and data, to reflect the new logic
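
To illustrate the three `use_gpu` values (a sketch, not part of this diff;
it assumes a CUDA device is available and that the test helpers in
`tests/influence/_utils/common.py` are importable):

    import tempfile

    from tests.influence._utils.common import get_random_model_and_data

    with tempfile.TemporaryDirectory() as tmpdir:
        for use_gpu in [False, "cuda", "cuda_data_parallel"]:
            # `False`: model on cpu; `'cuda'`: model on gpu, without
            # `DataParallel`; `'cuda_data_parallel'`: `DataParallel` model
            # on gpu. With `return_test_data=False`, only the model and
            # training dataset are returned.
            net, dataset = get_random_model_and_data(
                tmpdir,
                unpack_inputs=False,
                return_test_data=False,
                use_gpu=use_gpu,
            )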

Reviewed By: vivekmig

Differential Revision: D47190429
---
 .../_core/test_tracin_k_most_influential.py | 10 ++++---
 .../_core/test_tracin_self_influence.py     | 12 +++++----
 tests/influence/_utils/common.py            | 27 ++++++++++++++++---
 3 files changed, 37 insertions(+), 12 deletions(-)

diff --git a/tests/influence/_core/test_tracin_k_most_influential.py b/tests/influence/_core/test_tracin_k_most_influential.py
index 3b4bc89d33..7709ef3c23 100644
--- a/tests/influence/_core/test_tracin_k_most_influential.py
+++ b/tests/influence/_core/test_tracin_k_most_influential.py
@@ -1,5 +1,5 @@
 import tempfile
-from typing import Callable
+from typing import Callable, Union
 
 import torch
 import torch.nn as nn
@@ -18,7 +18,7 @@ class TestTracInGetKMostInfluential(BaseTest):
 
     use_gpu_list = (
-        [True, False]
+        [False, "cuda", "cuda_data_parallel"]
         if torch.cuda.is_available() and torch.cuda.device_count() != 0
         else [False]
     )
 
@@ -48,7 +48,9 @@ class TestTracInGetKMostInfluential(BaseTest):
                 DataInfluenceConstructor(
                     TracInCP,
                     name="linear2",
-                    layers=["module.linear2"] if use_gpu else ["linear2"],
+                    layers=["module.linear2"]
+                    if use_gpu == "cuda_data_parallel"
+                    else ["linear2"],
                 ),
                 False,
             ),
@@ -83,7 +85,7 @@ def test_tracin_k_most_influential(
         proponents: bool,
         batch_size: int,
         k: int,
-        use_gpu: bool,
+        use_gpu: Union[bool, str],
         aggregate: bool,
     ) -> None:
         """
diff --git a/tests/influence/_core/test_tracin_self_influence.py b/tests/influence/_core/test_tracin_self_influence.py
index e79656e08f..767aed6b02 100644
--- a/tests/influence/_core/test_tracin_self_influence.py
+++ b/tests/influence/_core/test_tracin_self_influence.py
@@ -1,5 +1,5 @@
 import tempfile
-from typing import Callable
+from typing import Callable, Union
 
 import torch
 import torch.nn as nn
@@ -19,7 +19,7 @@ class TestTracInSelfInfluence(BaseTest):
 
     use_gpu_list = (
-        [True, False]
+        [False, "cuda", "cuda_data_parallel"]
         if torch.cuda.is_available() and torch.cuda.device_count() != 0
         else [False]
     )
 
@@ -37,7 +37,9 @@ class TestTracInSelfInfluence(BaseTest):
                 DataInfluenceConstructor(
                     TracInCP,
                     name="TracInCP_linear1",
-                    layers=["module.linear1"] if use_gpu else ["linear1"],
+                    layers=["module.linear1"]
+                    if use_gpu == "cuda_data_parallel"
+                    else ["linear1"],
                 ),
             ),
             (
@@ -46,7 +48,7 @@ class TestTracInSelfInfluence(BaseTest):
                     TracInCP,
                     name="TracInCP_linear1_linear2",
                     layers=["module.linear1", "module.linear2"]
-                    if use_gpu
+                    if use_gpu == "cuda_data_parallel"
                     else ["linear1", "linear2"],
                 ),
             ),
@@ -87,7 +89,7 @@ def test_tracin_self_influence(
         reduction: str,
         tracin_constructor: Callable,
         unpack_inputs: bool,
-        use_gpu: bool,
+        use_gpu: Union[bool, str],
     ) -> None:
         with tempfile.TemporaryDirectory() as tmpdir:
             (net, train_dataset,) = get_random_model_and_data(
diff --git a/tests/influence/_utils/common.py b/tests/influence/_utils/common.py
index f65f8b6b8d..17fe5b46cb 100644
--- a/tests/influence/_utils/common.py
+++ b/tests/influence/_utils/common.py
@@ -183,6 +183,16 @@ def forward(self, *inputs):
 def get_random_model_and_data(
     tmpdir, unpack_inputs, return_test_data=True, use_gpu=False
 ):
+    """
+    `use_gpu` can either be
+    - `False`: returned model is on cpu
+    - `'cuda'`: returned model is on gpu
+    - `'cuda_data_parallel'`: returned model is a `DataParallel` model, on gpu
+    The need to differentiate between `'cuda'` and `'cuda_data_parallel'`
+    is that sometimes we may want to test a model that is on gpu, but is *not*
+    wrapped in `DataParallel`.
+    """
+    assert use_gpu in [False, "cuda", "cuda_data_parallel"]
     in_features, hidden_nodes, out_features = 5, 4, 3
     num_inputs = 2
 
@@ -209,7 +219,11 @@ def get_random_model_and_data(
         if hasattr(net, "pre"):
             net.pre.weight.data = net.pre.weight.data.double()
         checkpoint_name = "-".join(["checkpoint-reg", str(i + 1) + ".pt"])
-        net_adjusted = _wrap_model_in_dataparallel(net) if use_gpu else net
+        net_adjusted = (
+            _wrap_model_in_dataparallel(net)
+            if use_gpu == "cuda_data_parallel"
+            else (net.to(device="cuda") if use_gpu == "cuda" else net)
+        )
         torch.save(net_adjusted.state_dict(), os.path.join(tmpdir, checkpoint_name))
 
     num_samples = 50
@@ -238,7 +252,9 @@ def get_random_model_and_data(
 
     if return_test_data:
         return (
-            _wrap_model_in_dataparallel(net) if use_gpu else net,
+            _wrap_model_in_dataparallel(net)
+            if use_gpu == "cuda_data_parallel"
+            else (net.to(device="cuda") if use_gpu == "cuda" else net),
             dataset,
             _move_sample_to_cuda(test_samples)
             if isinstance(test_samples, list) and use_gpu
@@ -248,7 +264,12 @@ def get_random_model_and_data(
             test_labels.cuda() if use_gpu else test_labels,
         )
     else:
-        return _wrap_model_in_dataparallel(net) if use_gpu else net, dataset
+        return (
+            _wrap_model_in_dataparallel(net)
+            if use_gpu == "cuda_data_parallel"
+            else (net.to(device="cuda") if use_gpu == "cuda" else net),
+            dataset,
+        )
 
 
 class DataInfluenceConstructor: