make more cuda-only tests device-agnostic #2876

Merged · 7 commits · Jul 3, 2024
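This PR replaces CUDA-only test decorators and hard-coded `cuda`/integer device references across the test suite with the device-agnostic helpers from `accelerate.test_utils` (`require_non_cpu`, `require_multi_device`, `torch_device`), so the same tests can run on XPU, NPU, and MLU backends as well.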
10 changes: 5 additions & 5 deletions tests/test_accelerator.py
@@ -298,7 +298,7 @@ def test_save_model_offload(self, use_safetensors):
         assert torch.allclose(expected, output, atol=1e-5)

     @parameterized.expand([True, False], name_func=parameterized_custom_name_func)
-    @require_cuda
+    @require_non_cpu
     def test_get_state_dict_from_offload(self, use_safetensors):
         accelerator = Accelerator()

@@ -312,18 +312,18 @@ def test_get_state_dict_from_offload(self, use_safetensors):
         cpu_onloaded_layer = get_state_dict_from_offload(
             model.linear2, "linear2.weight", {"linear2.weight": ""}, device_to_put_offload="cpu"
         )
-        cuda_onloaded_layer = get_state_dict_from_offload(
+        device_onloaded_layer = get_state_dict_from_offload(
             model.linear2, "linear2.weight", {"linear2.weight": ""}, device_to_put_offload=0
         )
         cpu_onloaded_layer_weight = cpu_onloaded_layer["linear2.weight"]
-        cuda_onloaded_layer_weight = cuda_onloaded_layer["linear2.weight"]
+        device_onloaded_layer_weight = device_onloaded_layer["linear2.weight"]

         assert torch.allclose(offloaded_layer_weight, cpu_onloaded_layer_weight)
         assert torch.allclose(
-            offloaded_layer_weight, cuda_onloaded_layer_weight.to("cpu")
+            offloaded_layer_weight, device_onloaded_layer_weight.to("cpu")
         )  # must be on the same device for torch.allclose()
         assert cpu_onloaded_layer_weight.device.type == "cpu"
-        assert cuda_onloaded_layer_weight.device.type == "cuda"
+        assert device_onloaded_layer_weight.device.type == torch_device

     @parameterized.expand([True, False], name_func=parameterized_custom_name_func)
     def test_save_load_model_with_hooks(self, use_safetensors):
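For context, the `require_non_cpu` / `torch_device` pair these hunks lean on boils down to detecting the best available backend once at import time. A minimal sketch, assuming only public PyTorch APIs (the real helpers in `accelerate.test_utils` cover more backends and edge cases):

```python
import unittest

import torch


def detect_torch_device() -> str:
    """Pick the active accelerator backend, falling back to CPU."""
    if torch.cuda.is_available():
        return "cuda"
    if hasattr(torch, "xpu") and torch.xpu.is_available():  # Intel GPUs
        return "xpu"
    return "cpu"


torch_device = detect_torch_device()


def require_non_cpu(test_case):
    """Skip the test unless some accelerator is available (any backend)."""
    return unittest.skipUnless(torch_device != "cpu", "test requires an accelerator")(test_case)
```

Note the last changed line of the hunk: `tensor.device.type` is a bare backend name (`"cuda"`, `"xpu"`, ...), which is why it can be compared against `torch_device` directly.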
4 changes: 2 additions & 2 deletions tests/test_big_modeling.py
@@ -655,7 +655,7 @@ def test_dispatch_model_move_offloaded_model(self):
         with self.assertRaises(RuntimeError):
             model.to(0)

-    @require_multi_gpu
+    @require_multi_device
     def test_dispatch_model_move_model_warning(self):
         model = ModelForTest()
         device_map = {"linear1": 0, "batchnorm": 0, "linear2": 1}
@@ -664,7 +664,7 @@ def test_dispatch_model_move_model_warning(self):
         with self.assertLogs("accelerate.big_modeling", level="WARNING"):
             model.to("cpu")
         with self.assertLogs("accelerate.big_modeling", level="WARNING"):
-            model.cuda(0)
+            model.to(torch_device)
         with self.assertRaises(RuntimeError):
             x = torch.randn(2, 3)
             model(x)
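The `model.cuda(0)` to `model.to(torch_device)` swap is the core trick of the whole PR: `.cuda()` is CUDA-only, while `.to()` accepts any device string. A hypothetical snippet (the `torch_device` fallback here is an assumption for the demo, not accelerate's detection logic):

```python
import torch
import torch.nn as nn

# assumption for this demo; accelerate.test_utils computes this per backend
torch_device = "cuda" if torch.cuda.is_available() else "cpu"

model = nn.Linear(4, 4)
model.to(torch_device)   # works for "cuda", "xpu", "npu", "mlu", or "cpu"
# model.cuda(0)          # equivalent on CUDA machines, but fails everywhere else
print(next(model.parameters()).device)
```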
7 changes: 4 additions & 3 deletions tests/test_examples.py
@@ -28,6 +28,7 @@
     TempDirTestCase,
     get_launch_command,
     require_huggingface_suite,
+    require_multi_device,
     require_multi_gpu,
     require_pippy,
     require_schedulefree,
@@ -253,17 +254,17 @@ def test_profiler(self):
         testargs = ["examples/by_feature/profiler.py"]
         run_command(self.launch_args + testargs)

-    @require_multi_gpu
+    @require_multi_device
     def test_ddp_comm_hook(self):
         testargs = ["examples/by_feature/ddp_comm_hook.py", "--ddp_comm_hook", "fp16"]
         run_command(self.launch_args + testargs)

-    @require_multi_gpu
+    @require_multi_device
     def test_distributed_inference_examples_stable_diffusion(self):
         testargs = ["examples/inference/distributed/stable_diffusion.py"]
         run_command(self.launch_args + testargs)

-    @require_multi_gpu
+    @require_multi_device
     def test_distributed_inference_examples_phi2(self):
         testargs = ["examples/inference/distributed/phi2.py"]
         run_command(self.launch_args + testargs)
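`require_multi_device` generalizes `require_multi_gpu` the same way `require_non_cpu` generalizes `require_cuda`: count devices on whatever backend is active and skip below two. A sketch under the same assumptions as above (not accelerate's exact code):

```python
import unittest

import torch


def device_count() -> int:
    """Number of devices on the active accelerator backend (0 on CPU-only hosts)."""
    if torch.cuda.is_available():
        return torch.cuda.device_count()
    if hasattr(torch, "xpu") and torch.xpu.is_available():
        return torch.xpu.device_count()
    return 0


def require_multi_device(test_case):
    return unittest.skipUnless(device_count() > 1, "test requires multiple accelerators")(test_case)
```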
51 changes: 30 additions & 21 deletions tests/test_modeling_utils.py
@@ -26,7 +26,13 @@
 from safetensors.torch import save_file

 from accelerate import init_empty_weights
-from accelerate.test_utils import require_cuda, require_huggingface_suite, require_multi_gpu
+from accelerate.test_utils import (
+    require_cuda,
+    require_huggingface_suite,
+    require_multi_device,
+    require_non_cpu,
+    torch_device,
+)
 from accelerate.utils.modeling import (
     check_device_map,
     clean_device_map,
@@ -44,6 +50,9 @@
 )


+torch_device = f"{torch_device}:0" if torch_device != "cpu" else "cpu"
+
+
 class ModelForTest(nn.Module):
     def __init__(self):
         super().__init__()
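The module-level reassignment of `torch_device` pins an explicit device index, which matters because tensors always report an indexed device. A worked example of the comparison the tests below depend on (values assume a CUDA host):

```python
import torch

torch_device = "cuda"  # what accelerate.test_utils exports on a CUDA host
torch_device = f"{torch_device}:0" if torch_device != "cpu" else "cpu"

t = torch.zeros(1, device=torch_device)
assert t.device == torch.device(torch_device)   # cuda:0 == cuda:0 -> True
assert t.device != torch.device("cuda")         # index None != index 0
second_device = torch_device.replace("0", "1")  # "cuda:1", used by the multi-device tests
```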
@@ -150,20 +159,20 @@ def test_set_module_tensor_to_meta_and_cpu(self):
         model = ModelForTest()
         self.check_set_module_tensor_for_device(model, "cpu", "meta")

-    @require_cuda
+    @require_non_cpu
     def test_set_module_tensor_to_cpu_and_gpu(self):
         model = ModelForTest()
-        self.check_set_module_tensor_for_device(model, "cpu", 0)
+        self.check_set_module_tensor_for_device(model, "cpu", torch_device)

-    @require_cuda
+    @require_non_cpu
     def test_set_module_tensor_to_meta_and_gpu(self):
-        model = ModelForTest().to(0)
-        self.check_set_module_tensor_for_device(model, 0, "meta")
+        model = ModelForTest().to(torch_device)
+        self.check_set_module_tensor_for_device(model, torch_device, "meta")

-    @require_multi_gpu
+    @require_multi_device
     def test_set_module_tensor_between_gpus(self):
-        model = ModelForTest().to(0)
-        self.check_set_module_tensor_for_device(model, 0, 1)
+        model = ModelForTest().to(torch_device)
+        self.check_set_module_tensor_for_device(model, torch_device, torch_device.replace("0", "1"))

     def test_set_module_tensor_sets_dtype(self):
         model = ModelForTest()
@@ -361,7 +370,7 @@ def test_load_checkpoint_in_model(self):
             self.shard_test_model(model, tmp_dir)
             load_checkpoint_in_model(model, tmp_dir)

-    @require_cuda
+    @require_non_cpu
     def test_load_checkpoint_in_model_one_gpu(self):
         device_map = {"linear1": 0, "batchnorm": "cpu", "linear2": "cpu"}

@@ -371,7 +380,7 @@ def test_load_checkpoint_in_model_one_gpu(self):
             fname = os.path.join(tmp_dir, "pt_model.bin")
             torch.save(model.state_dict(), fname)
             load_checkpoint_in_model(model, fname, device_map=device_map)
-        assert model.linear1.weight.device == torch.device(0)
+        assert model.linear1.weight.device == torch.device(torch_device)
         assert model.batchnorm.weight.device == torch.device("cpu")
         assert model.linear2.weight.device == torch.device("cpu")

@@ -382,7 +391,7 @@ def test_load_checkpoint_in_model_one_gpu(self):
             index_file = os.path.join(tmp_dir, "weight_map.index.json")
             load_checkpoint_in_model(model, index_file, device_map=device_map)

-        assert model.linear1.weight.device == torch.device(0)
+        assert model.linear1.weight.device == torch.device(torch_device)
         assert model.batchnorm.weight.device == torch.device("cpu")
         assert model.linear2.weight.device == torch.device("cpu")

@@ -392,11 +401,11 @@ def test_load_checkpoint_in_model_one_gpu(self):
             self.shard_test_model(model, tmp_dir)
             load_checkpoint_in_model(model, tmp_dir, device_map=device_map)

-        assert model.linear1.weight.device == torch.device(0)
+        assert model.linear1.weight.device == torch.device(torch_device)
         assert model.batchnorm.weight.device == torch.device("cpu")
         assert model.linear2.weight.device == torch.device("cpu")

-    @require_cuda
+    @require_non_cpu
     def test_load_checkpoint_in_model_disk_offload(self):
         device_map = {"linear1": "cpu", "batchnorm": "disk", "linear2": "cpu"}

@@ -421,7 +430,7 @@ def test_load_checkpoint_in_model_disk_offload(self):
         assert model.batchnorm.running_mean.device == torch.device("meta")
         assert model.linear2.weight.device == torch.device("cpu")

-    @require_multi_gpu
+    @require_multi_device
     def test_load_checkpoint_in_model_two_gpu(self):
         device_map = {"linear1": 0, "batchnorm": "cpu", "linear2": 1}

@@ -431,9 +440,9 @@ def test_load_checkpoint_in_model_two_gpu(self):
             fname = os.path.join(tmp_dir, "pt_model.bin")
             torch.save(model.state_dict(), fname)
             load_checkpoint_in_model(model, fname, device_map=device_map)
-        assert model.linear1.weight.device == torch.device(0)
+        assert model.linear1.weight.device == torch.device(torch_device)
         assert model.batchnorm.weight.device == torch.device("cpu")
-        assert model.linear2.weight.device == torch.device(1)
+        assert model.linear2.weight.device == torch.device(torch_device.replace("0", "1"))

         # Check with sharded index
         model = ModelForTest()
@@ -442,19 +451,19 @@ def test_load_checkpoint_in_model_two_gpu(self):
             index_file = os.path.join(tmp_dir, "weight_map.index.json")
             load_checkpoint_in_model(model, index_file, device_map=device_map)

-        assert model.linear1.weight.device == torch.device(0)
+        assert model.linear1.weight.device == torch.device(torch_device)
         assert model.batchnorm.weight.device == torch.device("cpu")
-        assert model.linear2.weight.device == torch.device(1)
+        assert model.linear2.weight.device == torch.device(torch_device.replace("0", "1"))

         # Check with sharded checkpoint
         model = ModelForTest()
         with tempfile.TemporaryDirectory() as tmp_dir:
             self.shard_test_model(model, tmp_dir)
             load_checkpoint_in_model(model, tmp_dir, device_map=device_map)

-        assert model.linear1.weight.device == torch.device(0)
+        assert model.linear1.weight.device == torch.device(torch_device)
         assert model.batchnorm.weight.device == torch.device("cpu")
-        assert model.linear2.weight.device == torch.device(1)
+        assert model.linear2.weight.device == torch.device(torch_device.replace("0", "1"))

     def test_load_checkpoint_in_model_dtype(self):
         with tempfile.NamedTemporaryFile(suffix=".pt") as tmpfile:
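Integer values in these `device_map`s are backend-agnostic device indices, which is what lets the same map drive CUDA and non-CUDA runs. A self-contained usage sketch (the model and checkpoint path are hypothetical stand-ins for the test fixtures, and the `0` entry needs an accelerator present):

```python
import torch
import torch.nn as nn

from accelerate.utils.modeling import load_checkpoint_in_model


class TinyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear1 = nn.Linear(3, 4)
        self.batchnorm = nn.BatchNorm1d(4)
        self.linear2 = nn.Linear(4, 5)


model = TinyModel()
torch.save(model.state_dict(), "pt_model.bin")

# 0 means "first device of the active backend"; strings pin modules to CPU or disk.
device_map = {"linear1": 0, "batchnorm": "cpu", "linear2": "cpu"}
load_checkpoint_in_model(model, "pt_model.bin", device_map=device_map)
assert model.batchnorm.weight.device == torch.device("cpu")
```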
14 changes: 12 additions & 2 deletions tests/test_multigpu.py
@@ -32,6 +32,7 @@
     require_non_torch_xla,
     require_pippy,
     require_torchvision,
+    torch_device,
 )
 from accelerate.utils import patch_environment

@@ -72,15 +73,24 @@ def test_multi_device_merge_fsdp_weights(self):
         execute_subprocess_async(cmd)

     @require_non_torch_xla
-    @require_multi_gpu
+    @require_multi_device
     def test_distributed_data_loop(self):
         """
         This TestCase checks the behaviour that occurs during distributed training or evaluation,
         when the batch size does not evenly divide the dataset size.
         """
         print(f"Found {device_count} devices, using 2 devices only")
         cmd = get_launch_command(num_processes=2) + [self.data_loop_file_path]
-        with patch_environment(omp_num_threads=1, cuda_visible_devices="0,1"):
+        env_kwargs = dict(omp_num_threads=1)
+        if torch_device == "xpu":
+            env_kwargs.update(ze_affinity_mask="0,1")
+        elif torch_device == "npu":
+            env_kwargs.update(ascend_rt_visible_devices="0,1")
+        elif torch_device == "mlu":
+            env_kwargs.update(mlu_visible_devices="0,1")
+        else:
+            env_kwargs.update(cuda_visible_devices="0,1")
+        with patch_environment(**env_kwargs):
             execute_subprocess_async(cmd)

     @require_multi_gpu
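Each backend masks its visible devices through a different environment variable, which is all the new `if`/`elif` chain encodes. The same mapping as a lookup table, in a hypothetical helper that is not part of accelerate (keys stay lowercase because accelerate's `patch_environment` uppercases keyword names before setting them):

```python
# Hypothetical helper: map the active backend to its device-masking
# environment variable and build kwargs for patch_environment.
VISIBLE_DEVICES_ENV = {
    "cuda": "cuda_visible_devices",
    "xpu": "ze_affinity_mask",
    "npu": "ascend_rt_visible_devices",
    "mlu": "mlu_visible_devices",
}


def visible_devices_kwargs(torch_device: str, ids: str = "0,1") -> dict:
    env_key = VISIBLE_DEVICES_ENV.get(torch_device, "cuda_visible_devices")
    return {"omp_num_threads": 1, env_key: ids}


# usage: with patch_environment(**visible_devices_kwargs(torch_device)): ...
```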