[tests] multiple improvements #12294

Merged · 6 commits · Jun 22, 2021
3 changes: 3 additions & 0 deletions docs/source/testing.rst
@@ -431,6 +431,7 @@ decorators are used to set the requirements of tests CPU/GPU/TPU-wise:
* ``require_torch_gpu`` - as ``require_torch`` plus requires at least 1 GPU
* ``require_torch_multi_gpu`` - as ``require_torch`` plus requires at least 2 GPUs
* ``require_torch_non_multi_gpu`` - as ``require_torch`` plus requires 0 or 1 GPUs
* ``require_torch_up_to_2_gpus`` - as ``require_torch`` plus requires 0, 1, or 2 GPUs
* ``require_torch_tpu`` - as ``require_torch`` plus requires at least 1 TPU

Let's depict the GPU requirements in the following table:
@@ -447,6 +448,8 @@ Let's depict the GPU requirements in the following table:
+----------+----------------------------------+
| ``< 2`` | ``@require_torch_non_multi_gpu`` |
+----------+----------------------------------+
| ``< 3`` | ``@require_torch_up_to_2_gpus`` |
+----------+----------------------------------+


For example, here is a test that must be run only when there are 2 or more GPUs available and pytorch is installed:
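A minimal sketch of such a test (the test name and body are illustrative)::

    from transformers.testing_utils import require_torch_multi_gpu

    @require_torch_multi_gpu
    def test_example_with_multi_gpu():
        # skipped automatically unless torch is installed and torch.cuda.device_count() >= 2
        ...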
15 changes: 15 additions & 0 deletions src/transformers/testing_utils.py
@@ -383,6 +383,21 @@ def require_torch_non_multi_gpu(test_case):
return test_case


def require_torch_up_to_2_gpus(test_case):
"""
Decorator marking a test that requires a setup with 0, 1, or 2 GPUs (in PyTorch).
"""
if not is_torch_available():
return unittest.skip("test requires PyTorch")(test_case)

import torch

if torch.cuda.device_count() > 2:
return unittest.skip("test requires 0, 1, or 2 GPUs")(test_case)
else:
return test_case


def require_torch_tpu(test_case):
"""
Decorator marking a test that requires a TPU (in PyTorch).
259 changes: 138 additions & 121 deletions tests/test_trainer.py
@@ -15,7 +15,6 @@

import dataclasses
import gc
import math
import os
import random
import re
@@ -53,6 +52,8 @@
require_torch,
require_torch_gpu,
require_torch_multi_gpu,
require_torch_non_multi_gpu,
require_torch_up_to_2_gpus,
slow,
)
from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR
@@ -337,7 +338,14 @@ def check_trainer_state_are_the_same(self, trainer_state, trainer_state1):
@require_torch
@require_sentencepiece
@require_tokenizers
class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
class TrainerIntegrationPrerunTest(TestCasePlus, TrainerIntegrationCommon):
"""
Only tests that need to tap into the two auto-pre-run trainings:
- self.default_trained_model
- self.alternate_trained_model
either directly or via check_trained_model
"""

def setUp(self):
super().setUp()
args = TrainingArguments(".")
@@ -357,6 +365,115 @@ def check_trained_model(self, model, alternate_seed=False):
self.assertTrue(torch.allclose(model.a, a))
self.assertTrue(torch.allclose(model.b, b))

def test_reproducible_training(self):
# Checks that training worked, the model trained, and the seed made training reproducible.
trainer = get_regression_trainer(learning_rate=0.1)
trainer.train()
self.check_trained_model(trainer.model)

# Checks that a different seed gets different (reproducible) results.
trainer = get_regression_trainer(learning_rate=0.1, seed=314)
trainer.train()
self.check_trained_model(trainer.model, alternate_seed=True)

@require_datasets
def test_trainer_with_datasets(self):
import datasets

np.random.seed(42)
x = np.random.normal(size=(64,)).astype(np.float32)
y = 2.0 * x + 3.0 + np.random.normal(scale=0.1, size=(64,))
train_dataset = datasets.Dataset.from_dict({"input_x": x, "label": y})

# Base training. Should have the same results as test_reproducible_training
model = RegressionModel()
args = TrainingArguments("./regression", learning_rate=0.1)
trainer = Trainer(model, args, train_dataset=train_dataset)
trainer.train()
self.check_trained_model(trainer.model)

# Can return tensors.
train_dataset.set_format(type="torch", dtype=torch.float32)
model = RegressionModel()
trainer = Trainer(model, args, train_dataset=train_dataset)
trainer.train()
self.check_trained_model(trainer.model)

# Adding one column not used by the model should have no impact
z = np.random.normal(size=(64,)).astype(np.float32)
train_dataset = datasets.Dataset.from_dict({"input_x": x, "label": y, "extra": z})
model = RegressionModel()
trainer = Trainer(model, args, train_dataset=train_dataset)
trainer.train()
self.check_trained_model(trainer.model)

def test_model_init(self):
train_dataset = RegressionDataset()
args = TrainingArguments("./regression", learning_rate=0.1)
trainer = Trainer(args=args, train_dataset=train_dataset, model_init=lambda: RegressionModel())
trainer.train()
self.check_trained_model(trainer.model)

# Re-training should restart from scratch, thus lead to the same results.
trainer.train()
self.check_trained_model(trainer.model)

# Re-training should restart from scratch, thus lead to the same results, and the new seed should be used.
trainer.args.seed = 314
trainer.train()
self.check_trained_model(trainer.model, alternate_seed=True)

def test_gradient_accumulation(self):
# Training with half the batch size but accumulation steps as 2 should give the same results.
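# (A worked check, assuming TrainingArguments' default per_device_train_batch_size of 8:
# 4 samples per step x 2 accumulation steps gives an effective batch of 8, matching the pre-run training.)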
trainer = get_regression_trainer(
gradient_accumulation_steps=2, per_device_train_batch_size=4, learning_rate=0.1
)
trainer.train()
self.check_trained_model(trainer.model)

def test_custom_optimizer(self):
train_dataset = RegressionDataset()
args = TrainingArguments("./regression")
model = RegressionModel()
optimizer = torch.optim.SGD(model.parameters(), lr=1.0)
lr_scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda x: 1.0)
trainer = Trainer(model, args, train_dataset=train_dataset, optimizers=(optimizer, lr_scheduler))
trainer.train()

(a, b) = self.default_trained_model
self.assertFalse(torch.allclose(trainer.model.a, a))
self.assertFalse(torch.allclose(trainer.model.b, b))
self.assertEqual(trainer.optimizer.state_dict()["param_groups"][0]["lr"], 1.0)

def test_adafactor_lr_none(self):
# Test the special case where lr=None, since Trainer must always have an lr_scheduler.
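# AdafactorSchedule is a proxy schedule: with lr=None, Adafactor computes its learning rate
# internally, and the schedule reads that value back so Trainer's lr bookkeeping keeps working.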

from transformers.optimization import Adafactor, AdafactorSchedule

train_dataset = RegressionDataset()
args = TrainingArguments("./regression")
model = RegressionModel()
optimizer = Adafactor(model.parameters(), scale_parameter=True, relative_step=True, warmup_init=True, lr=None)
lr_scheduler = AdafactorSchedule(optimizer)
trainer = Trainer(model, args, train_dataset=train_dataset, optimizers=(optimizer, lr_scheduler))
trainer.train()

(a, b) = self.default_trained_model
self.assertFalse(torch.allclose(trainer.model.a, a))
self.assertFalse(torch.allclose(trainer.model.b, b))
self.assertGreater(trainer.optimizer.state_dict()["param_groups"][0]["lr"], 0)


@require_torch
@require_sentencepiece
@require_tokenizers
class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
def setUp(self):
super().setUp()
args = TrainingArguments(".")
self.n_epochs = args.num_train_epochs
self.batch_size = args.train_batch_size

def test_trainer_works_with_dict(self):
# Edge case because Apex with mode O2 will change our models to return dicts. This test checks it doesn't break
# anything.
@@ -394,17 +511,6 @@ def test_training_arguments_are_left_untouched(self):
if key != "logging_dir":
self.assertEqual(dict1[key], dict2[key])

def test_reproducible_training(self):
# Checks that training worked, model trained and seed made a reproducible training.
trainer = get_regression_trainer(learning_rate=0.1)
trainer.train()
self.check_trained_model(trainer.model)

# Checks that a different seed gets different (reproducible) results.
trainer = get_regression_trainer(learning_rate=0.1, seed=314)
trainer.train()
self.check_trained_model(trainer.model, alternate_seed=True)

def test_number_of_steps_in_training(self):
# Regular training has n_epochs * len(train_dl) steps
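# (e.g., assuming the regression defaults of 3 epochs, 64 samples, and batch size 8: 3 * 64 / 8 = 24 steps)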
trainer = get_regression_trainer(learning_rate=0.1)
@@ -558,70 +664,6 @@ def test_dynamic_shapes(self):
self.assertTrue(np.array_equal(2 * expected + 1, seen[: expected.shape[0]]))
self.assertTrue(np.all(seen[expected.shape[0] :] == -100))

@require_datasets
def test_trainer_with_datasets(self):
import datasets

np.random.seed(42)
x = np.random.normal(size=(64,)).astype(np.float32)
y = 2.0 * x + 3.0 + np.random.normal(scale=0.1, size=(64,))
train_dataset = datasets.Dataset.from_dict({"input_x": x, "label": y})

# Base training. Should have the same results as test_reproducible_training
model = RegressionModel()
args = TrainingArguments("./regression", learning_rate=0.1)
trainer = Trainer(model, args, train_dataset=train_dataset)
trainer.train()
self.check_trained_model(trainer.model)

# Can return tensors.
train_dataset.set_format(type="torch", dtype=torch.float32)
model = RegressionModel()
trainer = Trainer(model, args, train_dataset=train_dataset)
trainer.train()
self.check_trained_model(trainer.model)

# Adding one column not used by the model should have no impact
z = np.random.normal(size=(64,)).astype(np.float32)
train_dataset = datasets.Dataset.from_dict({"input_x": x, "label": y, "extra": z})
model = RegressionModel()
trainer = Trainer(model, args, train_dataset=train_dataset)
trainer.train()
self.check_trained_model(trainer.model)

def test_custom_optimizer(self):
train_dataset = RegressionDataset()
args = TrainingArguments("./regression")
model = RegressionModel()
optimizer = torch.optim.SGD(model.parameters(), lr=1.0)
lr_scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda x: 1.0)
trainer = Trainer(model, args, train_dataset=train_dataset, optimizers=(optimizer, lr_scheduler))
trainer.train()

(a, b) = self.default_trained_model
self.assertFalse(torch.allclose(trainer.model.a, a))
self.assertFalse(torch.allclose(trainer.model.b, b))
self.assertEqual(trainer.optimizer.state_dict()["param_groups"][0]["lr"], 1.0)

@require_torch
def test_adafactor_lr_none(self):
# test the special case where lr=None, since Trainer can't not have lr_scheduler

from transformers.optimization import Adafactor, AdafactorSchedule

train_dataset = RegressionDataset()
args = TrainingArguments("./regression")
model = RegressionModel()
optimizer = Adafactor(model.parameters(), scale_parameter=True, relative_step=True, warmup_init=True, lr=None)
lr_scheduler = AdafactorSchedule(optimizer)
trainer = Trainer(model, args, train_dataset=train_dataset, optimizers=(optimizer, lr_scheduler))
trainer.train()

(a, b) = self.default_trained_model
self.assertFalse(torch.allclose(trainer.model.a, a))
self.assertFalse(torch.allclose(trainer.model.b, b))
self.assertGreater(trainer.optimizer.state_dict()["param_groups"][0]["lr"], 0)

def test_log_level(self):
# testing only --log_level (--log_level_replica requires multiple nodes)
logger = logging.get_logger()
@@ -645,22 +687,6 @@ def test_log_level(self):
trainer.train()
self.assertNotIn(log_info_string, cl.out)

def test_model_init(self):
train_dataset = RegressionDataset()
args = TrainingArguments("./regression", learning_rate=0.1)
trainer = Trainer(args=args, train_dataset=train_dataset, model_init=lambda: RegressionModel())
trainer.train()
self.check_trained_model(trainer.model)

# Re-training should restart from scratch, thus lead the same results.
trainer.train()
self.check_trained_model(trainer.model)

# Re-training should restart from scratch, thus lead the same results and new seed should be used.
trainer.args.seed = 314
trainer.train()
self.check_trained_model(trainer.model, alternate_seed=True)

def test_save_checkpoints(self):
with tempfile.TemporaryDirectory() as tmpdir:
trainer = get_regression_trainer(output_dir=tmpdir, save_steps=5)
@@ -673,14 +699,6 @@
trainer.train()
self.check_saved_checkpoints(tmpdir, 5, int(self.n_epochs * 64 / self.batch_size), False)
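# (assuming the defaults of 3 epochs, 64 samples, and batch size 8, that is 24 total steps,
# so save_steps=5 should leave checkpoints at steps 5, 10, 15, and 20)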

def test_gradient_accumulation(self):
# Training with half the batch size but accumulation steps as 2 should give the same results.
trainer = get_regression_trainer(
gradient_accumulation_steps=2, per_device_train_batch_size=4, learning_rate=0.1
)
trainer.train()
self.check_trained_model(trainer.model)

@require_torch_multi_gpu
def test_run_seq2seq_double_train_wrap_once(self):
# test that we don't wrap the model more than once
@@ -694,12 +712,11 @@ def test_run_seq2seq_double_train_wrap_once(self):
model_wrapped_after = trainer.model_wrapped
self.assertIs(model_wrapped_before, model_wrapped_after, "should be not wrapped twice")

@require_torch_up_to_2_gpus
def test_can_resume_training(self):
if torch.cuda.device_count() > 2:
# This test will fail for more than 2 GPUs since the batch size will get bigger and with the number of
# save_steps, the checkpoint will resume training at epoch 2 or more (so the data seen by the model
# won't be the same since the training dataloader is shuffled).
return
# This test will fail for more than 2 GPUs since the batch size will get bigger and with the number of
# save_steps, the checkpoint will resume training at epoch 2 or more (so the data seen by the model
# won't be the same since the training dataloader is shuffled).
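# Concretely, assuming the default per-device batch size of 8: with train_len=128 below,
# 2 GPUs give 128 / (8 * 2) = 8 steps per epoch, so checkpoint-5 still falls in the first
# epoch, while e.g. 4 GPUs give only 4 steps per epoch and checkpoint-5 lands in epoch 2.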

with tempfile.TemporaryDirectory() as tmpdir:
kwargs = dict(output_dir=tmpdir, train_len=128, save_steps=5, learning_rate=0.1)
@@ -782,10 +799,10 @@ def test_can_resume_training(self):
trainer.train(resume_from_checkpoint=True)
self.assertTrue("No valid checkpoint found in output directory" in str(context.exception))

@require_torch_non_multi_gpu
def test_resume_training_with_randomness(self):
if torch.cuda.device_count() >= 2:
# This test will fail flakily for more than 2 GPUs since the result will be slightly more different.
return
# This test will fail flakily for more than 1 GPU since the results will be slightly different
# TODO: investigate why it fails for 2 GPUs

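# Some cudnn kernels are nondeterministic; forcing deterministic ones keeps the interrupted
# and resumed runs numerically comparable.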
if torch.cuda.is_available():
torch.backends.cudnn.deterministic = True
@@ -807,15 +824,15 @@ def test_resume_training_with_randomness(self):
trainer.train(resume_from_checkpoint=os.path.join(tmp_dir, "checkpoint-15"))
(a1, b1) = trainer.model.a.item(), trainer.model.b.item()

self.assertTrue(math.isclose(a, a1, rel_tol=1e-8))
self.assertTrue(math.isclose(b, b1, rel_tol=1e-8))
self.assertAlmostEqual(a, a1, delta=1e-8)
self.assertAlmostEqual(b, b1, delta=1e-8)
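# Note: assertAlmostEqual with delta checks the absolute difference, whereas the removed
# math.isclose(rel_tol=1e-8) checked a relative tolerance.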

@require_torch_up_to_2_gpus
def test_resume_training_with_gradient_accumulation(self):
if torch.cuda.device_count() > 2:
# This test will fail for more than 2 GPUs since the batch size will get bigger and with the number of
# save_steps, the checkpoint will resume training at epoch 2 or more (so the data seen by the model
# won't be the same since the training dataloader is shuffled).
return
# This test will fail for more than 2 GPUs since the batch size will get bigger and with the number of
# save_steps, the checkpoint will resume training at epoch 2 or more (so the data seen by the model
# won't be the same since the training dataloader is shuffled).

with tempfile.TemporaryDirectory() as tmpdir:
trainer = get_regression_trainer(
output_dir=tmpdir,
@@ -848,12 +865,12 @@ def test_resume_training_with_gradient_accumulation(self):
self.assertEqual(b, b1)
self.check_trainer_state_are_the_same(state, state1)

@require_torch_up_to_2_gpus
def test_resume_training_with_frozen_params(self):
if torch.cuda.device_count() > 2:
# This test will fail for more than 2 GPUs since the batch size will get bigger and with the number of
# save_steps, the checkpoint will resume training at epoch 2 or more (so the data seen by the model
# won't be the same since the training dataloader is shuffled).
return
# This test will fail for more than 2 GPUs since the batch size will get bigger and with the number of
# save_steps, the checkpoint will resume training at epoch 2 or more (so the data seen by the model
# won't be the same since the training dataloader is shuffled).

with tempfile.TemporaryDirectory() as tmpdir:
trainer = get_regression_trainer(
output_dir=tmpdir,