
Commit 12d6e44

carmocca and awaelchli authored
Grep for potential errors in standalone tests (#15341)
Co-authored-by: awaelchli <aedu.waelchli@gmail.com>
1 parent 7a8cf4e commit 12d6e44

7 files changed: +27 -9 lines changed

src/pytorch_lightning/strategies/fully_sharded.py

Lines changed: 2 additions & 2 deletions
@@ -197,8 +197,8 @@ def _setup_model(self, model: torch.nn.Module) -> FullyShardedDataParallel:
         log.detail(f"setting up `Fairscale FSDP` model with device id: {self.root_device.index}.")

         rank_zero_info(
-            "When using FairScale FSDP auto-wrap, make sure to initalize your model using trainer else"
-            " you will get an error.\ntorch.optim.Optimizer(self.trainer.model.parameters(), ...)"
+            "When using FairScale FSDP auto-wrap, make sure to initialize your model using trainer: "
+            "`torch.optim.Optimizer(self.trainer.model.parameters(), ...)`"
         )

         return FullyShardedDataParallel(
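
The updated message points at the usual pattern for FairScale FSDP auto-wrap: build the optimizer from `self.trainer.model.parameters()` (the wrapped module) rather than `self.parameters()`. A minimal sketch of what that looks like in a LightningModule (the module name and layer are illustrative, not part of this commit):

    import torch
    from pytorch_lightning import LightningModule

    class FSDPAutoWrapModel(LightningModule):  # hypothetical example module
        def __init__(self):
            super().__init__()
            self.layer = torch.nn.Linear(32, 2)

        def configure_optimizers(self):
            # With auto-wrap, the FSDP-wrapped parameters live on `self.trainer.model`,
            # so the optimizer must be created from those parameters.
            return torch.optim.SGD(self.trainer.model.parameters(), lr=0.1)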

tests/tests_lite/strategies/test_fairscale_integration.py

Lines changed: 4 additions & 1 deletion
@@ -43,10 +43,13 @@ def run(self, tmpdir, with_fairscale_oss=False):
         self.model.cpu()

         checkpoint_path = os.path.join(tmpdir, "checkpoint.ckpt")
+        # need to broadcast because tmpdir is different on each process
+        checkpoint_path = self.broadcast(checkpoint_path)
+
         checkpoint = {"model": self.model.state_dict(), "optimizer": self.optimizer.state_dict()}
         self.save(checkpoint, checkpoint_path)

-        self.barrier()
+        self.barrier()  # ensure the checkpoint is saved before load

         loaded_checkpoint = self.load(checkpoint_path)
         new_model = self.get_model()
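
The broadcast followed by a barrier is needed because the pytest `tmpdir` fixture resolves to a different directory in each process: every rank has to adopt rank 0's path and then wait until the checkpoint is actually written before loading it. The Trainer tests further down apply the same pattern via `trainer.strategy.broadcast` and `trainer.strategy.barrier`. A rough standalone sketch of the idea using plain `torch.distributed` (the helper is illustrative, not part of this commit):

    import os
    import torch
    import torch.distributed as dist

    def save_then_load(model: torch.nn.Module, tmpdir: str) -> dict:
        # Hypothetical helper illustrating the broadcast-then-barrier pattern.
        checkpoint_path = os.path.join(tmpdir, "checkpoint.ckpt")

        # `tmpdir` differs per process: make every rank use rank 0's path.
        paths = [checkpoint_path]
        dist.broadcast_object_list(paths, src=0)
        checkpoint_path = paths[0]

        if dist.get_rank() == 0:
            torch.save(model.state_dict(), checkpoint_path)

        # Ensure the checkpoint is fully written before any rank tries to load it.
        dist.barrier()
        return torch.load(checkpoint_path)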

tests/tests_pytorch/run_standalone_tests.sh

Lines changed: 8 additions & 2 deletions
@@ -18,12 +18,12 @@ set -e
 # Batch size for testing: Determines how many standalone test invocations run in parallel
 # It can be set through the env variable PL_STANDALONE_TESTS_BATCH_SIZE and defaults to 6 if not set
 test_batch_size="${PL_STANDALONE_TESTS_BATCH_SIZE:-6}"
-source="${PL_STANDALONE_TESTS_SOURCE}"
+source="${PL_STANDALONE_TESTS_SOURCE:-"pytorch_lightning"}"

 # this environment variable allows special tests to run
 export PL_RUN_STANDALONE_TESTS=1
 # python arguments
-defaults="-m coverage run --source $source --append -m pytest --no-header"
+defaults="-m coverage run --source $source --append -m pytest --no-header -v -s"

 # find tests marked as `@RunIf(standalone=True)`. done manually instead of with pytest because it is faster
 grep_output=$(grep --recursive --word-regexp . --regexp 'standalone=True' --include '*.py')

@@ -49,6 +49,12 @@ rm -f standalone_test_output.txt  # in case it exists, remove it
 function show_batched_output {
   if [ -f standalone_test_output.txt ]; then  # if exists
     cat standalone_test_output.txt
+    # heuristic: stop if there's mentions of errors. this can prevent false negatives when only some of the ranks fail
+    if grep --quiet --ignore-case --extended-regexp 'error|exception|traceback|failed' standalone_test_output.txt; then
+      echo "Potential error! Stopping."
+      rm standalone_test_output.txt
+      exit 1
+    fi
     rm standalone_test_output.txt
   fi
 }
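
This grep heuristic is the change the commit title refers to: a standalone multi-process test can exit with status 0 on the launching rank even though another rank crashed, so the batched output is scanned for common failure keywords and the whole run is aborted if any are found. A rough Python equivalent of the check, only to illustrate the logic (the function itself is illustrative; the file name and regex come from the script above):

    import re
    import sys
    from pathlib import Path

    def check_standalone_output(path: str = "standalone_test_output.txt") -> None:
        # Mirror of the bash heuristic above: show the batched output, then fail
        # loudly if it mentions anything that looks like an error.
        output_file = Path(path)
        if not output_file.is_file():
            return
        text = output_file.read_text()
        print(text, end="")
        output_file.unlink()
        if re.search(r"error|exception|traceback|failed", text, flags=re.IGNORECASE):
            print("Potential error! Stopping.")
            sys.exit(1)  # prevents false negatives when only some ranks fail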

tests/tests_pytorch/strategies/test_colossalai.py

Lines changed: 1 addition & 1 deletion
@@ -101,7 +101,7 @@ def test_gradient_clip_algorithm_error(tmpdir):


 @RunIf(min_cuda_gpus=1, standalone=True, colossalai=True)
-def test_gradient_accumulation_error(tmpdir):
+def test_gradient_accumulation_raises(tmpdir):
     model = ModelParallelBoringModel()
     trainer = Trainer(
         default_root_dir=tmpdir,

tests/tests_pytorch/strategies/test_ddp_strategy.py

Lines changed: 7 additions & 1 deletion
@@ -267,14 +267,17 @@ def configure_optimizers(self):
 @RunIf(min_cuda_gpus=2, fairscale=True)
 @pytest.mark.parametrize("strategy", (pytest.param("ddp", marks=RunIf(standalone=True)), "ddp_spawn"))
 def test_ddp_strategy_checkpoint_multi_gpu_fairscale_optimizer(tmpdir, strategy):
-    """Test to ensure that checkpoint is saved correctly when using faircale optimizer."""
+    """Test to ensure that checkpoint is saved correctly when using fairscale optimizer."""
     model = BoringFairScaleOptimizerModel()
     trainer = Trainer(accelerator="gpu", devices=2, strategy=strategy, max_steps=1)

     trainer.fit(model)

     checkpoint_path = os.path.join(tmpdir, "model.pt")
+    # need to broadcast because tmpdir is different on each process
+    checkpoint_path = trainer.strategy.broadcast(checkpoint_path)
     trainer.save_checkpoint(checkpoint_path)
+    trainer.strategy.barrier()  # ensure the checkpoint is saved before load
     saved_model = BoringModel.load_from_checkpoint(checkpoint_path)

     # Assert model parameters are identical after loading

@@ -297,7 +300,10 @@ def test_ddp_strategy_checkpoint_zero_redundancy_optimizer(tmpdir, strategy):
     trainer.fit(model)

     checkpoint_path = os.path.join(tmpdir, "model.pt")
+    # need to broadcast because tmpdir is different on each process
+    checkpoint_path = trainer.strategy.broadcast(checkpoint_path)
     trainer.save_checkpoint(checkpoint_path)
+    trainer.strategy.barrier()  # ensure the checkpoint is saved before load
     saved_model = BoringModel.load_from_checkpoint(checkpoint_path)

     # Assert model parameters are identical after loading

tests/tests_pytorch/strategies/test_ddp_strategy_with_comm_hook.py

Lines changed: 2 additions & 2 deletions
@@ -209,8 +209,8 @@ def test_post_local_sgd_model_averaging(average_parameters_mock, tmpdir):

 @RunIf(skip_windows=True, min_torch="1.10.0", min_cuda_gpus=2, standalone=True)
 @mock.patch("torch.distributed.algorithms.model_averaging.averagers.PeriodicModelAverager.average_parameters")
-def test_post_local_sgd_model_averaging_value_error(average_parameters_mock, tmpdir):
-    """Test that when using DDP with post-localSGD a ValueError is thrown when the optmizer is
+def test_post_local_sgd_model_averaging_raises(average_parameters_mock, tmpdir):
+    """Test that when using DDP with post-localSGD a ValueError is thrown when the optimizer is
     ZeroRedundancyOptimizer."""
     from torch.distributed.optim import ZeroRedundancyOptimizer

tests/tests_pytorch/strategies/test_sharded_strategy.py

Lines changed: 3 additions & 0 deletions
@@ -315,7 +315,10 @@ def test_ddp_sharded_strategy_checkpoint_multi_gpu_fairscale_optimizer(tmpdir, s
     trainer.fit(model)

     checkpoint_path = os.path.join(tmpdir, "model.pt")
+    # need to broadcast because tmpdir is different on each process
+    checkpoint_path = trainer.strategy.broadcast(checkpoint_path)
     trainer.save_checkpoint(checkpoint_path)
+    trainer.strategy.barrier()  # ensure the checkpoint is saved before load
     saved_model = BoringModel.load_from_checkpoint(checkpoint_path)

     # Assert model parameters are identical after loading
