From 32b1433f1c1846663e9e9913e6dca3f1b0145f61 Mon Sep 17 00:00:00 2001
From: Stas Bekman <stas@stason.org>
Date: Thu, 5 Nov 2020 13:07:32 -0800
Subject: [PATCH 1/7] add a multi-gpu job for all example tests

---
 .github/workflows/self-scheduled.yml | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/self-scheduled.yml b/.github/workflows/self-scheduled.yml
index decd61f4b09536..7022d6f362af61 100644
--- a/.github/workflows/self-scheduled.yml
+++ b/.github/workflows/self-scheduled.yml
@@ -227,7 +227,7 @@ jobs:
           python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
           python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
 
-      - name: Run all tests on GPU
+      - name: Run all tests on multi-GPU
         env:
           OMP_NUM_THREADS: 1
           RUN_SLOW: yes
@@ -238,8 +238,20 @@ jobs:
       - name: Failure short reports
         if: ${{ always() }}
         run: cat reports/tests_torch_multiple_gpu_failures_short.txt
-        
-      - name: Run all pipeline tests on GPU
+
+      - name: Run all examples tests on multi-GPU
+        env:
+          OMP_NUM_THREADS: 1
+          RUN_SLOW: yes
+        run: |
+          source .env/bin/activate
+          python -m pytest -n 1 --dist=loadfile -s --make-reports=examples_torch_multiple_gpu examples
+
+      - name: Failure short reports
+        if: ${{ always() }}
+        run: cat reports/examples_torch_multiple_gpu_failures_short.txt
+
+      - name: Run all pipeline tests on multi-GPU
         if: ${{ always() }}
         env:
           TF_FORCE_GPU_ALLOW_GROWTH: "true"
@@ -306,7 +318,7 @@ jobs:
           TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
           TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"
 
-      - name: Run all tests on GPU
+      - name: Run all tests on multi-GPU
         env:
           OMP_NUM_THREADS: 1
           RUN_SLOW: yes
@@ -318,7 +330,7 @@ jobs:
         if: ${{ always() }}
         run: cat reports/tests_tf_multiple_gpu_failures_short.txt
 
-      - name: Run all pipeline tests on GPU
+      - name: Run all pipeline tests on multi-GPU
         if: ${{ always() }}
         env:
           TF_FORCE_GPU_ALLOW_GROWTH: "true"

From ab42b86e7e7e2a4d4ac3695e00d2be781ddcd9ba Mon Sep 17 00:00:00 2001
From: Stas Bekman <stas@stason.org>
Date: Thu, 5 Nov 2020 15:28:24 -0800
Subject: [PATCH 2/7] run only ported tests

---
 .github/workflows/self-scheduled.yml | 4 ++--
 examples/passing-multigpu-tests.txt  | 4 ++++
 2 files changed, 6 insertions(+), 2 deletions(-)
 create mode 100644 examples/passing-multigpu-tests.txt

diff --git a/.github/workflows/self-scheduled.yml b/.github/workflows/self-scheduled.yml
index 7022d6f362af61..a67710d136ef7e 100644
--- a/.github/workflows/self-scheduled.yml
+++ b/.github/workflows/self-scheduled.yml
@@ -239,13 +239,13 @@ jobs:
         if: ${{ always() }}
         run: cat reports/tests_torch_multiple_gpu_failures_short.txt
 
-      - name: Run all examples tests on multi-GPU
+      - name: Run ported examples tests on multi-GPU
         env:
           OMP_NUM_THREADS: 1
           RUN_SLOW: yes
         run: |
           source .env/bin/activate
-          python -m pytest -n 1 --dist=loadfile -s --make-reports=examples_torch_multiple_gpu examples
+          python -m pytest -n 1 --dist=loadfile -s --make-reports=examples_torch_multiple_gpu $(tr '\n' ' ' < examples/passing-multigpu-tests.txt)
 
       - name: Failure short reports
         if: ${{ always() }}
diff --git a/examples/passing-multigpu-tests.txt b/examples/passing-multigpu-tests.txt
new file mode 100644
index 00000000000000..2ead41cbd9c12a
--- /dev/null
+++ b/examples/passing-multigpu-tests.txt
@@ -0,0 +1,4 @@
+examples/seq2seq/test_finetune_trainer.py::TestFinetuneTrainer::test_finetune_trainer
+examples/seq2seq/test_finetune_trainer.py::TestFinetuneTrainer::test_finetune_trainer_slow
+examples/seq2seq/test_seq2seq_examples_multi_gpu.py::TestSummarizationDistillerMultiGPU::test_distributed_eval
+examples/seq2seq/test_seq2seq_examples_multi_gpu.py::TestSummarizationDistillerMultiGPU::test_multigpu

From 5cc730857b55a11483754fc900af1c3de8f10410 Mon Sep 17 00:00:00 2001
From: Stas Bekman <stas@stason.org>
Date: Thu, 5 Nov 2020 15:31:23 -0800
Subject: [PATCH 3/7] rename

---
 .github/workflows/self-scheduled.yml                            | 2 +-
 .../{passing-multigpu-tests.txt => ported-multigpu-tests.txt}   | 0
 2 files changed, 1 insertion(+), 1 deletion(-)
 rename examples/{passing-multigpu-tests.txt => ported-multigpu-tests.txt} (100%)

diff --git a/.github/workflows/self-scheduled.yml b/.github/workflows/self-scheduled.yml
index a67710d136ef7e..53470096214440 100644
--- a/.github/workflows/self-scheduled.yml
+++ b/.github/workflows/self-scheduled.yml
@@ -245,7 +245,7 @@ jobs:
           RUN_SLOW: yes
         run: |
           source .env/bin/activate
-          python -m pytest -n 1 --dist=loadfile -s --make-reports=examples_torch_multiple_gpu $(tr '\n' ' ' < examples/passing-multigpu-tests.txt)
+          python -m pytest -n 1 --dist=loadfile -s --make-reports=examples_torch_multiple_gpu $(tr '\n' ' ' < examples/ported-multigpu-tests.txt)
 
       - name: Failure short reports
         if: ${{ always() }}
diff --git a/examples/passing-multigpu-tests.txt b/examples/ported-multigpu-tests.txt
similarity index 100%
rename from examples/passing-multigpu-tests.txt
rename to examples/ported-multigpu-tests.txt

From 28229d5aead8709476a9c787bb792d0adca16e2b Mon Sep 17 00:00:00 2001
From: Stas Bekman <stas@stason.org>
Date: Thu, 5 Nov 2020 15:49:19 -0800
Subject: [PATCH 4/7] explain why env is re-activated on each step

---
 .github/workflows/self-scheduled.yml | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/.github/workflows/self-scheduled.yml b/.github/workflows/self-scheduled.yml
index 53470096214440..760abb8b15ea8f 100644
--- a/.github/workflows/self-scheduled.yml
+++ b/.github/workflows/self-scheduled.yml
@@ -1,3 +1,8 @@
+# configuration notes:
+#
+# - `source .env/bin/activate` must currently be run first in each step; otherwise the step uses the
+#   system-wide python interpreter.
+
 name: Self-hosted runner (scheduled)
 
 on:

From 1e2d201b659a8b79ac68de9008e6a597fb4373fe Mon Sep 17 00:00:00 2001
From: Stas Bekman <stas@stason.org>
Date: Sat, 7 Nov 2020 17:41:34 -0800
Subject: [PATCH 5/7] mark all unported/checked tests with
 @require_torch_non_multigpu_but_fix_me

---
 .github/workflows/self-scheduled.yml          |  2 +-
 .../test_run_glue_with_pabee.py               |  3 ++-
 examples/deebert/test_glue_deebert.py         |  3 ++-
 examples/ported-multigpu-tests.txt            |  4 ----
 examples/rag/test_distributed_retriever.py    |  4 ++++
 examples/seq2seq/test_bash_script.py          |  5 ++++-
 examples/seq2seq/test_datasets.py             |  9 +++++++-
 examples/seq2seq/test_fsmt_bleu_score.py      |  9 +++++++-
 examples/seq2seq/test_make_student.py         |  7 ++++++-
 examples/seq2seq/test_seq2seq_examples.py     | 21 ++++++++++++++++++-
 examples/seq2seq/test_tatoeba_conversion.py   |  4 +++-
 examples/test_examples.py                     |  8 ++++++-
 examples/test_xla_examples.py                 |  4 +++-
 .../token-classification/test_ner_examples.py |  4 +++-
 src/transformers/testing_utils.py             |  6 ++++++
 15 files changed, 77 insertions(+), 16 deletions(-)
 delete mode 100644 examples/ported-multigpu-tests.txt

diff --git a/.github/workflows/self-scheduled.yml b/.github/workflows/self-scheduled.yml
index 760abb8b15ea8f..1771dd95b586e3 100644
--- a/.github/workflows/self-scheduled.yml
+++ b/.github/workflows/self-scheduled.yml
@@ -250,7 +250,7 @@ jobs:
           RUN_SLOW: yes
         run: |
           source .env/bin/activate
-          python -m pytest -n 1 --dist=loadfile -s --make-reports=examples_torch_multiple_gpu $(tr '\n' ' ' < examples/ported-multigpu-tests.txt)
+          python -m pytest -n 1 --dist=loadfile -s --make-reports=examples_torch_multiple_gpu examples
 
       - name: Failure short reports
         if: ${{ always() }}
diff --git a/examples/bert-loses-patience/test_run_glue_with_pabee.py b/examples/bert-loses-patience/test_run_glue_with_pabee.py
index 22c6f4de06f430..eaac5329379c11 100644
--- a/examples/bert-loses-patience/test_run_glue_with_pabee.py
+++ b/examples/bert-loses-patience/test_run_glue_with_pabee.py
@@ -4,7 +4,7 @@
 from unittest.mock import patch
 
 import run_glue_with_pabee
-from transformers.testing_utils import TestCasePlus
+from transformers.testing_utils import TestCasePlus, require_torch_non_multigpu_but_fix_me
 
 
 logging.basicConfig(level=logging.DEBUG)
@@ -20,6 +20,7 @@ def get_setup_file():
 
 
 class PabeeTests(TestCasePlus):
+    @require_torch_non_multigpu_but_fix_me
     def test_run_glue(self):
         stream_handler = logging.StreamHandler(sys.stdout)
         logger.addHandler(stream_handler)
diff --git a/examples/deebert/test_glue_deebert.py b/examples/deebert/test_glue_deebert.py
index 59f7f58024f4e9..66faa557c0d0ba 100644
--- a/examples/deebert/test_glue_deebert.py
+++ b/examples/deebert/test_glue_deebert.py
@@ -5,7 +5,7 @@
 from unittest.mock import patch
 
 import run_glue_deebert
-from transformers.testing_utils import slow
+from transformers.testing_utils import require_torch_non_multigpu_but_fix_me, slow
 
 
 logging.basicConfig(level=logging.DEBUG)
@@ -26,6 +26,7 @@ def setup(self) -> None:
         logger.addHandler(stream_handler)
 
     @slow
+    @require_torch_non_multigpu_but_fix_me
     def test_glue_deebert_train(self):
 
         train_args = """
diff --git a/examples/ported-multigpu-tests.txt b/examples/ported-multigpu-tests.txt
deleted file mode 100644
index 2ead41cbd9c12a..00000000000000
--- a/examples/ported-multigpu-tests.txt
+++ /dev/null
@@ -1,4 +0,0 @@
-examples/seq2seq/test_finetune_trainer.py::TestFinetuneTrainer::test_finetune_trainer
-examples/seq2seq/test_finetune_trainer.py::TestFinetuneTrainer::test_finetune_trainer_slow
-examples/seq2seq/test_seq2seq_examples_multi_gpu.py::TestSummarizationDistillerMultiGPU::test_distributed_eval
-examples/seq2seq/test_seq2seq_examples_multi_gpu.py::TestSummarizationDistillerMultiGPU::test_multigpu
diff --git a/examples/rag/test_distributed_retriever.py b/examples/rag/test_distributed_retriever.py
index 80d8362d1edd1a..be874c83e8b37e 100644
--- a/examples/rag/test_distributed_retriever.py
+++ b/examples/rag/test_distributed_retriever.py
@@ -16,6 +16,7 @@
 from transformers.configuration_rag import RagConfig
 from transformers.file_utils import is_datasets_available, is_faiss_available, is_psutil_available, is_torch_available
 from transformers.retrieval_rag import CustomHFIndex
+from transformers.testing_utils import require_torch_non_multigpu_but_fix_me
 from transformers.tokenization_bart import BartTokenizer
 from transformers.tokenization_bert import VOCAB_FILES_NAMES as DPR_VOCAB_FILES_NAMES
 from transformers.tokenization_dpr import DPRQuestionEncoderTokenizer
@@ -178,6 +179,7 @@ def get_dummy_custom_hf_index_retriever(self, init_retrieval: bool, from_disk: b
             retriever.init_retrieval(port)
         return retriever
 
+    @require_torch_non_multigpu_but_fix_me
     def test_pytorch_distributed_retriever_retrieve(self):
         n_docs = 1
         retriever = self.get_dummy_pytorch_distributed_retriever(init_retrieval=True)
@@ -193,6 +195,7 @@ def test_pytorch_distributed_retriever_retrieve(self):
         self.assertEqual(doc_dicts[1]["id"][0], "0")  # max inner product is reached with first doc
         self.assertListEqual(doc_ids.tolist(), [[1], [0]])
 
+    @require_torch_non_multigpu_but_fix_me
     def test_custom_hf_index_retriever_retrieve(self):
         n_docs = 1
         retriever = self.get_dummy_custom_hf_index_retriever(init_retrieval=True, from_disk=False)
@@ -208,6 +211,7 @@ def test_custom_hf_index_retriever_retrieve(self):
         self.assertEqual(doc_dicts[1]["id"][0], "0")  # max inner product is reached with first doc
         self.assertListEqual(doc_ids.tolist(), [[1], [0]])
 
+    @require_torch_non_multigpu_but_fix_me
     def test_custom_pytorch_distributed_retriever_retrieve_from_disk(self):
         n_docs = 1
         retriever = self.get_dummy_custom_hf_index_retriever(init_retrieval=True, from_disk=True)
diff --git a/examples/seq2seq/test_bash_script.py b/examples/seq2seq/test_bash_script.py
index 71861ef4dbc6a3..3e30af217baf2d 100644
--- a/examples/seq2seq/test_bash_script.py
+++ b/examples/seq2seq/test_bash_script.py
@@ -15,7 +15,7 @@
 from finetune import SummarizationModule, main
 from test_seq2seq_examples import CUDA_AVAILABLE, MBART_TINY
 from transformers import BartForConditionalGeneration, MarianMTModel
-from transformers.testing_utils import TestCasePlus, slow
+from transformers.testing_utils import TestCasePlus, require_torch_non_multigpu_but_fix_me, slow
 from utils import load_json
 
 
@@ -26,6 +26,7 @@
 class TestAll(TestCasePlus):
     @slow
     @pytest.mark.skipif(not CUDA_AVAILABLE, reason="too slow to run on CPU")
+    @require_torch_non_multigpu_but_fix_me
     def test_model_download(self):
         """This warms up the cache so that we can time the next test without including download time, which varies between machines."""
         BartForConditionalGeneration.from_pretrained(MODEL_NAME)
@@ -34,6 +35,7 @@ def test_model_download(self):
     @timeout_decorator.timeout(120)
     @slow
     @pytest.mark.skipif(not CUDA_AVAILABLE, reason="too slow to run on CPU")
+    @require_torch_non_multigpu_but_fix_me
     def test_train_mbart_cc25_enro_script(self):
         data_dir = "examples/seq2seq/test_data/wmt_en_ro"
         env_vars_to_replace = {
@@ -110,6 +112,7 @@ def test_train_mbart_cc25_enro_script(self):
     @timeout_decorator.timeout(600)
     @slow
     @pytest.mark.skipif(not CUDA_AVAILABLE, reason="too slow to run on CPU")
+    @require_torch_non_multigpu_but_fix_me
     def test_opus_mt_distill_script(self):
         data_dir = "examples/seq2seq/test_data/wmt_en_ro"
         env_vars_to_replace = {
diff --git a/examples/seq2seq/test_datasets.py b/examples/seq2seq/test_datasets.py
index 4b5c95ed4e5770..625b6da347d3c1 100644
--- a/examples/seq2seq/test_datasets.py
+++ b/examples/seq2seq/test_datasets.py
@@ -11,7 +11,7 @@
 from test_seq2seq_examples import ARTICLES, BART_TINY, MARIAN_TINY, MBART_TINY, SUMMARIES, T5_TINY, make_test_data_dir
 from transformers import AutoTokenizer
 from transformers.modeling_bart import shift_tokens_right
-from transformers.testing_utils import TestCasePlus, slow
+from transformers.testing_utils import TestCasePlus, require_torch_non_multigpu_but_fix_me, slow
 from utils import FAIRSEQ_AVAILABLE, DistributedSortishSampler, LegacySeq2SeqDataset, Seq2SeqDataset
 
 
@@ -30,6 +30,7 @@ class TestAll(TestCasePlus):
         ],
     )
     @slow
+    @require_torch_non_multigpu_but_fix_me
     def test_seq2seq_dataset_truncation(self, tok_name):
         tokenizer = AutoTokenizer.from_pretrained(tok_name)
         tmp_dir = make_test_data_dir(tmp_dir=self.get_auto_remove_tmp_dir())
@@ -69,6 +70,7 @@ def test_seq2seq_dataset_truncation(self, tok_name):
             break  # No need to test every batch
 
     @parameterized.expand([BART_TINY, BERT_BASE_CASED])
+    @require_torch_non_multigpu_but_fix_me
     def test_legacy_dataset_truncation(self, tok):
         tokenizer = AutoTokenizer.from_pretrained(tok)
         tmp_dir = make_test_data_dir(tmp_dir=self.get_auto_remove_tmp_dir())
@@ -93,6 +95,7 @@ def test_legacy_dataset_truncation(self, tok):
             assert max_len_target > trunc_target  # Truncated
             break  # No need to test every batch
 
+    @require_torch_non_multigpu_but_fix_me
     def test_pack_dataset(self):
         tokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-cc25")
 
@@ -111,6 +114,7 @@ def test_pack_dataset(self):
         assert orig_paths == new_paths
 
     @pytest.mark.skipif(not FAIRSEQ_AVAILABLE, reason="This test requires fairseq")
+    @require_torch_non_multigpu_but_fix_me
     def test_dynamic_batch_size(self):
         if not FAIRSEQ_AVAILABLE:
             return
@@ -135,6 +139,7 @@ def test_dynamic_batch_size(self):
         if failures:
             raise AssertionError(f"too many tokens in {len(failures)} batches")
 
+    @require_torch_non_multigpu_but_fix_me
     def test_sortish_sampler_reduces_padding(self):
         ds, _, tokenizer = self._get_dataset(max_len=512)
         bs = 2
@@ -174,6 +179,7 @@ def _get_dataset(self, n_obs=1000, max_len=128):
         )
         return ds, max_tokens, tokenizer
 
+    @require_torch_non_multigpu_but_fix_me
     def test_distributed_sortish_sampler_splits_indices_between_procs(self):
         ds, max_tokens, tokenizer = self._get_dataset()
         ids1 = set(DistributedSortishSampler(ds, 256, num_replicas=2, rank=0, add_extra_examples=False))
@@ -189,6 +195,7 @@ def test_distributed_sortish_sampler_splits_indices_between_procs(self):
             PEGASUS_XSUM,
         ],
     )
+    @require_torch_non_multigpu_but_fix_me
     def test_dataset_kwargs(self, tok_name):
         tokenizer = AutoTokenizer.from_pretrained(tok_name)
         if tok_name == MBART_TINY:
diff --git a/examples/seq2seq/test_fsmt_bleu_score.py b/examples/seq2seq/test_fsmt_bleu_score.py
index beb7f2bc9857fd..2be6b7d5285282 100644
--- a/examples/seq2seq/test_fsmt_bleu_score.py
+++ b/examples/seq2seq/test_fsmt_bleu_score.py
@@ -19,7 +19,13 @@
 
 from parameterized import parameterized
 from transformers import FSMTForConditionalGeneration, FSMTTokenizer
-from transformers.testing_utils import get_tests_dir, require_torch, slow, torch_device
+from transformers.testing_utils import (
+    get_tests_dir,
+    require_torch,
+    require_torch_non_multigpu_but_fix_me,
+    slow,
+    torch_device,
+)
 from utils import calculate_bleu
 
 
@@ -48,6 +54,7 @@ def get_model(self, mname):
         ]
     )
     @slow
+    @require_torch_non_multigpu_but_fix_me
     def test_bleu_scores(self, pair, min_bleu_score):
         # note: this test is not testing the best performance since it only evals a small batch
         # but it should be enough to detect a regression in the output quality
diff --git a/examples/seq2seq/test_make_student.py b/examples/seq2seq/test_make_student.py
index 0a1688a95cc11e..28b5672f0e3a6d 100644
--- a/examples/seq2seq/test_make_student.py
+++ b/examples/seq2seq/test_make_student.py
@@ -4,7 +4,7 @@
 from make_student import create_student_by_copying_alternating_layers
 from transformers import AutoConfig
 from transformers.file_utils import cached_property
-from transformers.testing_utils import require_torch
+from transformers.testing_utils import require_torch, require_torch_non_multigpu_but_fix_me
 
 
 TINY_BART = "sshleifer/bart-tiny-random"
@@ -17,23 +17,28 @@ class MakeStudentTester(unittest.TestCase):
     def teacher_config(self):
         return AutoConfig.from_pretrained(TINY_BART)
 
+    @require_torch_non_multigpu_but_fix_me
     def test_valid_t5(self):
         student, *_ = create_student_by_copying_alternating_layers(TINY_T5, tempfile.mkdtemp(), e=1, d=1)
         self.assertEqual(student.config.num_hidden_layers, 1)
 
+    @require_torch_non_multigpu_but_fix_me
     def test_asymmetric_t5(self):
         student, *_ = create_student_by_copying_alternating_layers(TINY_T5, tempfile.mkdtemp(), e=1, d=None)
 
+    @require_torch_non_multigpu_but_fix_me
     def test_same_decoder_small_encoder(self):
         student, *_ = create_student_by_copying_alternating_layers(TINY_BART, tempfile.mkdtemp(), e=1, d=None)
         self.assertEqual(student.config.encoder_layers, 1)
         self.assertEqual(student.config.decoder_layers, self.teacher_config.encoder_layers)
 
+    @require_torch_non_multigpu_but_fix_me
     def test_small_enc_small_dec(self):
         student, *_ = create_student_by_copying_alternating_layers(TINY_BART, tempfile.mkdtemp(), e=1, d=1)
         self.assertEqual(student.config.encoder_layers, 1)
         self.assertEqual(student.config.decoder_layers, 1)
 
+    @require_torch_non_multigpu_but_fix_me
     def test_raises_assert(self):
         with self.assertRaises(AssertionError):
             create_student_by_copying_alternating_layers(TINY_BART, tempfile.mkdtemp(), e=None, d=None)
diff --git a/examples/seq2seq/test_seq2seq_examples.py b/examples/seq2seq/test_seq2seq_examples.py
index 09b34e552a925b..9afa6ab0f20d19 100644
--- a/examples/seq2seq/test_seq2seq_examples.py
+++ b/examples/seq2seq/test_seq2seq_examples.py
@@ -19,7 +19,14 @@
 from run_eval_search import run_search
 from transformers import AutoConfig, AutoModelForSeq2SeqLM
 from transformers.hf_api import HfApi
-from transformers.testing_utils import CaptureStderr, CaptureStdout, TestCasePlus, require_torch_gpu, slow
+from transformers.testing_utils import (
+    CaptureStderr,
+    CaptureStdout,
+    TestCasePlus,
+    require_torch_gpu,
+    require_torch_non_multigpu_but_fix_me,
+    slow,
+)
 from utils import ROUGE_KEYS, label_smoothed_nll_loss, lmap, load_json
 
 
@@ -126,6 +133,7 @@ def setUpClass(cls):
 
     @slow
     @require_torch_gpu
+    @require_torch_non_multigpu_but_fix_me
     def test_hub_configs(self):
         """I put require_torch_gpu cause I only want this to run with self-scheduled."""
 
@@ -143,10 +151,12 @@ def test_hub_configs(self):
                 failures.append(m)
         assert not failures, f"The following models could not be loaded through AutoConfig: {failures}"
 
+    @require_torch_non_multigpu_but_fix_me
     def test_distill_no_teacher(self):
         updates = dict(student_encoder_layers=2, student_decoder_layers=1, no_teacher=True)
         self._test_distiller_cli(updates)
 
+    @require_torch_non_multigpu_but_fix_me
     def test_distill_checkpointing_with_teacher(self):
         updates = dict(
             student_encoder_layers=2,
@@ -171,6 +181,7 @@ def test_distill_checkpointing_with_teacher(self):
         convert_pl_to_hf(ckpts[0], transformer_ckpts[0].parent, out_path_new)
         assert os.path.exists(os.path.join(out_path_new, "pytorch_model.bin"))
 
+    @require_torch_non_multigpu_but_fix_me
     def test_loss_fn(self):
         model = AutoModelForSeq2SeqLM.from_pretrained(BART_TINY, return_dict=True)
         input_ids, mask = model.dummy_inputs["input_ids"], model.dummy_inputs["attention_mask"]
@@ -191,6 +202,7 @@ def test_loss_fn(self):
             # TODO: understand why this breaks
             self.assertEqual(nll_loss, model_computed_loss)
 
+    @require_torch_non_multigpu_but_fix_me
     def test_distill_mbart(self):
         updates = dict(
             student_encoder_layers=2,
@@ -215,6 +227,7 @@ def test_distill_mbart(self):
         assert len(all_files) > 2
         self.assertEqual(len(transformer_ckpts), 2)
 
+    @require_torch_non_multigpu_but_fix_me
     def test_distill_t5(self):
         updates = dict(
             student_encoder_layers=1,
@@ -296,18 +309,21 @@ def run_eval_tester(self, model):
 
     # test one model to quickly (no-@slow) catch simple problems and do an
     # extensive testing of functionality with multiple models as @slow separately
+    @require_torch_non_multigpu_but_fix_me
     def test_run_eval(self):
         self.run_eval_tester(T5_TINY)
 
     # any extra models should go into the list here - can be slow
     @parameterized.expand([BART_TINY, MBART_TINY])
     @slow
+    @require_torch_non_multigpu_but_fix_me
     def test_run_eval_slow(self, model):
         self.run_eval_tester(model)
 
     # testing with 2 models to validate: 1. translation (t5) 2. summarization (mbart)
     @parameterized.expand([T5_TINY, MBART_TINY])
     @slow
+    @require_torch_non_multigpu_but_fix_me
     def test_run_eval_search(self, model):
         input_file_name = Path(self.get_auto_remove_tmp_dir()) / "utest_input.source"
         output_file_name = input_file_name.parent / "utest_output.txt"
@@ -358,6 +374,7 @@ def test_run_eval_search(self, model):
     @parameterized.expand(
         [T5_TINY, BART_TINY, MBART_TINY, MARIAN_TINY, FSMT_TINY],
     )
+    @require_torch_non_multigpu_but_fix_me
     def test_finetune(self, model):
         args_d: dict = CHEAP_ARGS.copy()
         task = "translation" if model in [MBART_TINY, MARIAN_TINY, FSMT_TINY] else "summarization"
@@ -409,6 +426,7 @@ def test_finetune(self, model):
         assert isinstance(example_batch, dict)
         assert len(example_batch) >= 4
 
+    @require_torch_non_multigpu_but_fix_me
     def test_finetune_extra_model_args(self):
         args_d: dict = CHEAP_ARGS.copy()
 
@@ -459,6 +477,7 @@ def test_finetune_extra_model_args(self):
             model = main(args)
         assert str(excinfo.value) == f"model config doesn't have a `{unsupported_param}` attribute"
 
+    @require_torch_non_multigpu_but_fix_me
     def test_finetune_lr_schedulers(self):
         args_d: dict = CHEAP_ARGS.copy()
 
diff --git a/examples/seq2seq/test_tatoeba_conversion.py b/examples/seq2seq/test_tatoeba_conversion.py
index 73a4f660fc1f16..4f97eca133ccd3 100644
--- a/examples/seq2seq/test_tatoeba_conversion.py
+++ b/examples/seq2seq/test_tatoeba_conversion.py
@@ -4,7 +4,7 @@
 
 from transformers.convert_marian_tatoeba_to_pytorch import DEFAULT_REPO, TatoebaConverter
 from transformers.file_utils import cached_property
-from transformers.testing_utils import slow
+from transformers.testing_utils import require_torch_non_multigpu_but_fix_me, slow
 
 
 @unittest.skipUnless(os.path.exists(DEFAULT_REPO), "Tatoeba directory does not exist.")
@@ -15,10 +15,12 @@ def resolver(self):
         return TatoebaConverter(save_dir=tmp_dir)
 
     @slow
+    @require_torch_non_multigpu_but_fix_me
     def test_resolver(self):
         self.resolver.convert_models(["heb-eng"])
 
     @slow
+    @require_torch_non_multigpu_but_fix_me
     def test_model_card(self):
         content, mmeta = self.resolver.write_model_card("opus-mt-he-en", dry_run=True)
         assert mmeta["long_pair"] == "heb-eng"
diff --git a/examples/test_examples.py b/examples/test_examples.py
index 4eda398537d715..4a8bc94e0af3e3 100644
--- a/examples/test_examples.py
+++ b/examples/test_examples.py
@@ -23,7 +23,7 @@
 import torch
 
 from transformers.file_utils import is_apex_available
-from transformers.testing_utils import TestCasePlus, torch_device
+from transformers.testing_utils import TestCasePlus, require_torch_non_multigpu_but_fix_me, torch_device
 
 
 SRC_DIRS = [
@@ -60,6 +60,7 @@ def is_cuda_and_apex_available():
 
 
 class ExamplesTests(TestCasePlus):
+    @require_torch_non_multigpu_but_fix_me
     def test_run_glue(self):
         stream_handler = logging.StreamHandler(sys.stdout)
         logger.addHandler(stream_handler)
@@ -92,6 +93,7 @@ def test_run_glue(self):
             for value in result.values():
                 self.assertGreaterEqual(value, 0.75)
 
+    @require_torch_non_multigpu_but_fix_me
     def test_run_pl_glue(self):
         stream_handler = logging.StreamHandler(sys.stdout)
         logger.addHandler(stream_handler)
@@ -129,6 +131,7 @@ def test_run_pl_glue(self):
             #         self.assertGreaterEqual(v, 0.75, f"({k})")
             #
 
+    @require_torch_non_multigpu_but_fix_me
     def test_run_clm(self):
         stream_handler = logging.StreamHandler(sys.stdout)
         logger.addHandler(stream_handler)
@@ -160,6 +163,7 @@ def test_run_clm(self):
             result = run_clm.main()
             self.assertLess(result["perplexity"], 100)
 
+    @require_torch_non_multigpu_but_fix_me
     def test_run_mlm(self):
         stream_handler = logging.StreamHandler(sys.stdout)
         logger.addHandler(stream_handler)
@@ -185,6 +189,7 @@ def test_run_mlm(self):
             result = run_mlm.main()
             self.assertLess(result["perplexity"], 42)
 
+    @require_torch_non_multigpu_but_fix_me
     def test_run_squad(self):
         stream_handler = logging.StreamHandler(sys.stdout)
         logger.addHandler(stream_handler)
@@ -213,6 +218,7 @@ def test_run_squad(self):
             self.assertGreaterEqual(result["f1"], 25)
             self.assertGreaterEqual(result["exact"], 21)
 
+    @require_torch_non_multigpu_but_fix_me
     def test_generation(self):
         stream_handler = logging.StreamHandler(sys.stdout)
         logger.addHandler(stream_handler)
diff --git a/examples/test_xla_examples.py b/examples/test_xla_examples.py
index ed1458a010ff36..f8026554b73cac 100644
--- a/examples/test_xla_examples.py
+++ b/examples/test_xla_examples.py
@@ -20,7 +20,7 @@
 from time import time
 from unittest.mock import patch
 
-from transformers.testing_utils import require_torch_tpu
+from transformers.testing_utils import require_torch_non_multigpu_but_fix_me, require_torch_tpu
 
 
 logging.basicConfig(level=logging.DEBUG)
@@ -30,6 +30,7 @@
 
 @require_torch_tpu
 class TorchXLAExamplesTests(unittest.TestCase):
+    @require_torch_non_multigpu_but_fix_me
     def test_run_glue(self):
         import xla_spawn
 
@@ -81,6 +82,7 @@ def test_run_glue(self):
             # Assert that the script takes less than 300 seconds to make sure it doesn't hang.
             self.assertLess(end - start, 500)
 
+    @require_torch_non_multigpu_but_fix_me
     def test_trainer_tpu(self):
         import xla_spawn
 
diff --git a/examples/token-classification/test_ner_examples.py b/examples/token-classification/test_ner_examples.py
index d6bb0b25fa3bca..3772d780f62766 100644
--- a/examples/token-classification/test_ner_examples.py
+++ b/examples/token-classification/test_ner_examples.py
@@ -4,7 +4,7 @@
 from unittest.mock import patch
 
 import run_ner
-from transformers.testing_utils import slow
+from transformers.testing_utils import require_torch_non_multigpu_but_fix_me, slow
 
 
 logging.basicConfig(level=logging.INFO)
@@ -14,6 +14,7 @@
 
 class ExamplesTests(unittest.TestCase):
     @slow
+    @require_torch_non_multigpu_but_fix_me
     def test_run_ner(self):
         stream_handler = logging.StreamHandler(sys.stdout)
         logger.addHandler(stream_handler)
@@ -34,6 +35,7 @@ def test_run_ner(self):
             result = run_ner.main()
             self.assertLess(result["eval_loss"], 1.5)
 
+    @require_torch_non_multigpu_but_fix_me
     def test_run_ner_pl(self):
         stream_handler = logging.StreamHandler(sys.stdout)
         logger.addHandler(stream_handler)
diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py
index 02998bcfd656b6..119ff433df5627 100644
--- a/src/transformers/testing_utils.py
+++ b/src/transformers/testing_utils.py
@@ -227,6 +227,12 @@ def require_torch_non_multigpu(test_case):
         return test_case
 
 
+# this decorator is identical to require_torch_non_multigpu, but is used as a quick band-aid to allow
+# all of the examples to be run on multi-gpu CI. It also reminds us that tests decorated with it still
+# need to be ported to multi-gpu, rather than being restricted to non-multi-gpu setups by design.
+require_torch_non_multigpu_but_fix_me = require_torch_non_multigpu
+
+
 def require_torch_tpu(test_case):
     """
     Decorator marking a test that requires a TPU (in PyTorch).

From 02287ee3b69b8dd1ce6359e4678432546168f0de Mon Sep 17 00:00:00 2001
From: Stas Bekman <stas@stason.org>
Date: Sat, 7 Nov 2020 17:50:08 -0800
Subject: [PATCH 6/7] style

---
 examples/seq2seq/test_bash_script.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/seq2seq/test_bash_script.py b/examples/seq2seq/test_bash_script.py
index bc354e1e5cfb80..fffe6c4be73f13 100644
--- a/examples/seq2seq/test_bash_script.py
+++ b/examples/seq2seq/test_bash_script.py
@@ -13,7 +13,7 @@
 from finetune import SummarizationModule, main
 from transformers import MarianMTModel
 from transformers.file_utils import cached_path
-from transformers.testing_utils import TestCasePlus, require_torch_non_multigpu_but_fix_me, require_torch_gpu, slow
+from transformers.testing_utils import TestCasePlus, require_torch_gpu, require_torch_non_multigpu_but_fix_me, slow
 from utils import load_json
 
 

From 07f65f6d1c3dc16f7de84caecff1edbbd8942070 Mon Sep 17 00:00:00 2001
From: Stas Bekman <stas00@users.noreply.github.com>
Date: Sun, 8 Nov 2020 17:26:03 -0800
Subject: [PATCH 7/7] Apply suggestions from code review

Co-authored-by: Sam Shleifer <sshleifer@gmail.com>
---
 .github/workflows/self-scheduled.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/self-scheduled.yml b/.github/workflows/self-scheduled.yml
index 1771dd95b586e3..6033a9e4e8322a 100644
--- a/.github/workflows/self-scheduled.yml
+++ b/.github/workflows/self-scheduled.yml
@@ -244,7 +244,7 @@ jobs:
         if: ${{ always() }}
         run: cat reports/tests_torch_multiple_gpu_failures_short.txt
 
-      - name: Run ported examples tests on multi-GPU
+      - name: Run examples tests on multi-GPU
         env:
           OMP_NUM_THREADS: 1
           RUN_SLOW: yes