From 437ae82ad39f9154f1d649e277ba9398d1236ac0 Mon Sep 17 00:00:00 2001
From: Narine Kokhlikyan
Date: Mon, 27 Nov 2023 14:17:54 -0800
Subject: [PATCH] add influence gpu tests not using `DataParallel`

Summary:
Currently, when testing implementations of `TracInCPBase`, if the model to
be tested is on gpu, we always wrap it in `DataParallel`. However, it is
also worth testing when the model is on gpu, but is *not* wrapped in
`DataParallel`.

Whether the model is on gpu is currently specified by a `use_gpu` flag,
which is boolean. In this diff, we change `use_gpu` to have type
`Union[bool, str]`, with allowable values of `False` (model on cpu),
`'cuda'` (model on gpu, not using `DataParallel`), and
`'cuda_data_parallel'` (model on gpu, using `DataParallel`). This remains
backwards compatible with classes like `ExplicitDataset`, which move data
to gpu `if use_gpu`, since non-empty strings are truthy.

In further detail, the changes are as follows (a usage sketch follows the
list):
- for tests (`TestTracInSelfInfluence`, `TestTracInGetKMostInfluential`)
  that previously parametrized `use_gpu` with `True`, now parametrize it
  with `'cuda'` and `'cuda_data_parallel'` (in addition to `False`)
- in those tests, give the layer names the 'module' prefix only when
  `use_gpu='cuda_data_parallel'`
- change `get_random_model_and_data`, which is where the `use_gpu` flag is
  used to create model and data, to reflect the new logic
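
To illustrate the three `use_gpu` values (a sketch, not part of this diff;
it assumes a CUDA device is available and that the test helpers in
`tests/influence/_utils/common.py` are importable):

    import tempfile

    from tests.influence._utils.common import get_random_model_and_data

    with tempfile.TemporaryDirectory() as tmpdir:
        for use_gpu in [False, "cuda", "cuda_data_parallel"]:
            # `False`: model on cpu; `'cuda'`: model on gpu, without
            # `DataParallel`; `'cuda_data_parallel'`: `DataParallel` model
            # on gpu. With `return_test_data=False`, only the model and
            # training dataset are returned.
            net, dataset = get_random_model_and_data(
                tmpdir,
                unpack_inputs=False,
                return_test_data=False,
                use_gpu=use_gpu,
            )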

Reviewed By: vivekmig

Differential Revision: D47190429
---
 .../_core/test_tracin_k_most_influential.py | 10 ++++---
 .../_core/test_tracin_self_influence.py     | 12 +++++----
 tests/influence/_utils/common.py            | 27 ++++++++++++++++---
 3 files changed, 37 insertions(+), 12 deletions(-)

diff --git a/tests/influence/_core/test_tracin_k_most_influential.py b/tests/influence/_core/test_tracin_k_most_influential.py
index 3b4bc89d33..7709ef3c23 100644
--- a/tests/influence/_core/test_tracin_k_most_influential.py
+++ b/tests/influence/_core/test_tracin_k_most_influential.py
@@ -1,5 +1,5 @@
 import tempfile
-from typing import Callable
+from typing import Callable, Union
 
 import torch
 import torch.nn as nn
@@ -18,7 +18,7 @@ class TestTracInGetKMostInfluential(BaseTest):
 
     use_gpu_list = (
-        [True, False]
+        [False, "cuda", "cuda_data_parallel"]
         if torch.cuda.is_available() and torch.cuda.device_count() != 0
         else [False]
     )
 
@@ -48,7 +48,9 @@ class TestTracInGetKMostInfluential(BaseTest):
                 DataInfluenceConstructor(
                     TracInCP,
                     name="linear2",
-                    layers=["module.linear2"] if use_gpu else ["linear2"],
+                    layers=["module.linear2"]
+                    if use_gpu == "cuda_data_parallel"
+                    else ["linear2"],
                 ),
                 False,
             ),
@@ -83,7 +85,7 @@ def test_tracin_k_most_influential(
         proponents: bool,
         batch_size: int,
         k: int,
-        use_gpu: bool,
+        use_gpu: Union[bool, str],
         aggregate: bool,
     ) -> None:
         """
diff --git a/tests/influence/_core/test_tracin_self_influence.py b/tests/influence/_core/test_tracin_self_influence.py
index e79656e08f..767aed6b02 100644
--- a/tests/influence/_core/test_tracin_self_influence.py
+++ b/tests/influence/_core/test_tracin_self_influence.py
@@ -1,5 +1,5 @@
 import tempfile
-from typing import Callable
+from typing import Callable, Union
 
 import torch
 import torch.nn as nn
@@ -19,7 +19,7 @@ class TestTracInSelfInfluence(BaseTest):
 
     use_gpu_list = (
-        [True, False]
+        [False, "cuda", "cuda_data_parallel"]
         if torch.cuda.is_available() and torch.cuda.device_count() != 0
         else [False]
     )
 
@@ -37,7 +37,9 @@ class TestTracInSelfInfluence(BaseTest):
                 DataInfluenceConstructor(
                     TracInCP,
                     name="TracInCP_linear1",
-                    layers=["module.linear1"] if use_gpu else ["linear1"],
+                    layers=["module.linear1"]
+                    if use_gpu == "cuda_data_parallel"
+                    else ["linear1"],
                 ),
             ),
             (
@@ -46,7 +48,7 @@ class TestTracInSelfInfluence(BaseTest):
                     TracInCP,
                     name="TracInCP_linear1_linear2",
                     layers=["module.linear1", "module.linear2"]
-                    if use_gpu
+                    if use_gpu == "cuda_data_parallel"
                     else ["linear1", "linear2"],
                 ),
             ),
@@ -87,7 +89,7 @@ def test_tracin_self_influence(
         reduction: str,
         tracin_constructor: Callable,
         unpack_inputs: bool,
-        use_gpu: bool,
+        use_gpu: Union[bool, str],
     ) -> None:
         with tempfile.TemporaryDirectory() as tmpdir:
             (net, train_dataset,) = get_random_model_and_data(
diff --git a/tests/influence/_utils/common.py b/tests/influence/_utils/common.py
index f65f8b6b8d..17fe5b46cb 100644
--- a/tests/influence/_utils/common.py
+++ b/tests/influence/_utils/common.py
@@ -183,6 +183,16 @@ def forward(self, *inputs):
 def get_random_model_and_data(
     tmpdir, unpack_inputs, return_test_data=True, use_gpu=False
 ):
+    """
+    `use_gpu` can either be
+    - `False`: returned model is on cpu
+    - `'cuda'`: returned model is on gpu
+    - `'cuda_data_parallel'`: returned model is a `DataParallel` model, on gpu
+    The need to differentiate between `'cuda'` and `'cuda_data_parallel'`
+    is that sometimes we may want to test a model that is on gpu, but is *not*
+    wrapped in `DataParallel`.
+    """
+    assert use_gpu in [False, "cuda", "cuda_data_parallel"]
     in_features, hidden_nodes, out_features = 5, 4, 3
     num_inputs = 2
 
@@ -209,7 +219,11 @@ def get_random_model_and_data(
         if hasattr(net, "pre"):
             net.pre.weight.data = net.pre.weight.data.double()
         checkpoint_name = "-".join(["checkpoint-reg", str(i + 1) + ".pt"])
-        net_adjusted = _wrap_model_in_dataparallel(net) if use_gpu else net
+        net_adjusted = (
+            _wrap_model_in_dataparallel(net)
+            if use_gpu == "cuda_data_parallel"
+            else (net.to(device="cuda") if use_gpu == "cuda" else net)
+        )
         torch.save(net_adjusted.state_dict(), os.path.join(tmpdir, checkpoint_name))
 
     num_samples = 50
@@ -238,7 +252,9 @@ def get_random_model_and_data(
 
     if return_test_data:
         return (
-            _wrap_model_in_dataparallel(net) if use_gpu else net,
+            _wrap_model_in_dataparallel(net)
+            if use_gpu == "cuda_data_parallel"
+            else (net.to(device="cuda") if use_gpu == "cuda" else net),
             dataset,
             _move_sample_to_cuda(test_samples)
             if isinstance(test_samples, list) and use_gpu
@@ -248,7 +264,12 @@ def get_random_model_and_data(
             test_labels.cuda() if use_gpu else test_labels,
         )
     else:
-        return _wrap_model_in_dataparallel(net) if use_gpu else net, dataset
+        return (
+            _wrap_model_in_dataparallel(net)
+            if use_gpu == "cuda_data_parallel"
+            else (net.to(device="cuda") if use_gpu == "cuda" else net),
+            dataset,
+        )
 
 
 class DataInfluenceConstructor: