diff --git a/docs/source/testing.rst b/docs/source/testing.rst index aef3b7efc8126d..0a9d3d525bfa9e 100644 --- a/docs/source/testing.rst +++ b/docs/source/testing.rst @@ -450,7 +450,8 @@ Inside tests: .. code-block:: bash - torch.cuda.device_count() + from transformers.testing_utils import get_gpu_count + n_gpu = get_gpu_count() # works with torch and tf diff --git a/examples/seq2seq/test_finetune_trainer.py b/examples/seq2seq/test_finetune_trainer.py index 923ecf6d945831..6da0e240c41959 100644 --- a/examples/seq2seq/test_finetune_trainer.py +++ b/examples/seq2seq/test_finetune_trainer.py @@ -2,9 +2,9 @@ import sys from unittest.mock import patch -from transformers import BertTokenizer, EncoderDecoderModel, is_torch_available +from transformers import BertTokenizer, EncoderDecoderModel from transformers.file_utils import is_datasets_available -from transformers.testing_utils import TestCasePlus, execute_subprocess_async, slow +from transformers.testing_utils import TestCasePlus, execute_subprocess_async, get_gpu_count, slow from transformers.trainer_callback import TrainerState from transformers.trainer_utils import set_seed @@ -13,9 +13,6 @@ from .test_seq2seq_examples import MBART_TINY -if is_torch_available(): - import torch - set_seed(42) MARIAN_MODEL = "sshleifer/student_marian_en_ro_6_1" @@ -196,7 +193,7 @@ def run_trainer(self, eval_steps: int, max_len: str, model_name: str, num_train_ """.split() # --eval_beams 2 - n_gpu = torch.cuda.device_count() + n_gpu = get_gpu_count() if n_gpu > 1: distributed_args = f""" -m torch.distributed.launch diff --git a/examples/seq2seq/test_seq2seq_examples_multi_gpu.py b/examples/seq2seq/test_seq2seq_examples_multi_gpu.py index 463ad1e7d9b8c4..efc23b5681e040 100644 --- a/examples/seq2seq/test_seq2seq_examples_multi_gpu.py +++ b/examples/seq2seq/test_seq2seq_examples_multi_gpu.py @@ -3,7 +3,14 @@ import os import sys -from transformers.testing_utils import TestCasePlus, execute_subprocess_async, require_torch_multigpu +from transformers.testing_utils import ( + TestCasePlus, + execute_subprocess_async, + get_gpu_count, + require_torch_gpu, + require_torch_multigpu, + slow, +) from .test_seq2seq_examples import CHEAP_ARGS, make_test_data_dir from .utils import load_json @@ -80,3 +87,30 @@ def convert(k, v): self.assertEqual(len(metrics["test"]), 1) desired_n_evals = int(args_d["max_epochs"] * (1 / args_d["val_check_interval"]) / 2 + 1) self.assertEqual(len(metrics["val"]), desired_n_evals) + + @slow + @require_torch_gpu + def test_distributed_eval(self): + output_dir = self.get_auto_remove_tmp_dir() + args = f""" + --model_name Helsinki-NLP/opus-mt-en-ro + --save_dir {output_dir} + --data_dir test_data/wmt_en_ro + --num_beams 2 + --task translation + """.split() + + # we want this test to run even if there is only one GPU, but if there are more we use them all + n_gpu = get_gpu_count() + distributed_args = f""" + -m torch.distributed.launch + --nproc_per_node={n_gpu} + {self.test_file_dir}/run_distributed_eval.py + """.split() + cmd = [sys.executable] + distributed_args + args + execute_subprocess_async(cmd, env=self.get_env()) + + metrics_save_path = os.path.join(output_dir, "test_bleu.json") + metrics = load_json(metrics_save_path) + # print(metrics) + self.assertGreaterEqual(metrics["bleu"], 25) diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py index 8eb41ac85f8817..02998bcfd656b6 100644 --- a/src/transformers/testing_utils.py +++ b/src/transformers/testing_utils.py @@ -297,6 +297,22 @@ def require_ray(test_case): return test_case +def get_gpu_count(): + """ + Return the number of available gpus (regardless of whether torch or tf is used) + """ + if _torch_available: + import torch + + return torch.cuda.device_count() + elif _tf_available: + import tensorflow as tf + + return len(tf.config.list_physical_devices("GPU")) + else: + return 0 + + def get_tests_dir(append_path=None): """ Args: