This repository has been archived by the owner on Nov 3, 2023. It is now read-only.

[scale] Implement activation checkpointing for transformers. #3864

Merged 5 commits on Jul 28, 2021
3 changes: 3 additions & 0 deletions parlai/agents/transformer/modules/decoder.py
@@ -26,6 +26,7 @@
 from parlai.utils.misc import warn_once
 from parlai.utils.torch import PipelineHelper
 from parlai.utils.fsdp import fsdp_wrap
+from parlai.nn.checkpoint import checkpoint_wrapper


 @swappable(
@@ -286,6 +287,8 @@ def build_layers(self) -> nn.ModuleList:
                 activation=self.activation,
                 variant=self.variant,
             )
+            if self.opt.get('checkpoint_activations'):
+                layer = checkpoint_wrapper(layer)
             layers.append(fsdp_wrap(layer))  # type: ignore
         return layers

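For context: activation checkpointing trades compute for memory. A checkpoint-wrapped layer discards its intermediate activations during the forward pass and re-runs the forward during backward to recompute them. The sketch below shows the core idea with PyTorch's built-in torch.utils.checkpoint; it is illustrative only, and CheckpointedLayer is a hypothetical name, not ParlAI or fairscale code (fairscale's checkpoint_wrapper implements the same mechanism with extra handling for RNG state and keyword arguments).

import torch.nn as nn
from torch.utils.checkpoint import checkpoint


class CheckpointedLayer(nn.Module):
    """Hypothetical stand-in showing what a checkpoint wrapper does."""

    def __init__(self, module: nn.Module):
        super().__init__()
        self.module = module

    def forward(self, *args):
        # Forward runs without storing intermediate activations;
        # backward re-executes self.module to recompute them.
        return checkpoint(self.module, *args)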
3 changes: 3 additions & 0 deletions parlai/agents/transformer/modules/encoder.py
@@ -26,6 +26,7 @@
 from parlai.utils.misc import warn_once
 from parlai.utils.torch import PipelineHelper
 from parlai.utils.fsdp import fsdp_wrap
+from parlai.nn.checkpoint import checkpoint_wrapper


 @swappable(self_attention=MultiHeadAttention, feedforward=TransformerFFN)
@@ -236,6 +237,8 @@ def build_layers(self) -> nn.ModuleList:
                 variant=self.variant,
                 activation=self.activation,
             )
+            if self.opt.get('checkpoint_activations'):
+                layer = checkpoint_wrapper(layer)
             layers.append(fsdp_wrap(layer))
         return layers

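Note the composition order used in both build_layers implementations: the layer is checkpoint-wrapped first and fsdp_wrap is applied last, so activation recomputation happens inside each FSDP-sharded unit. Schematically (a restatement of the order in the diffs above, not additional ParlAI code):

layer = checkpoint_wrapper(layer)  # inner: recompute activations in backward
layer = fsdp_wrap(layer)           # outer: shard parameters and gradients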
6 changes: 6 additions & 0 deletions parlai/agents/transformer/transformer.py
@@ -141,6 +141,12 @@ def add_common_cmdline_args(parser):
         default=False,
         help='Shard the layers across multiple GPUs.',
     )
+    parser.add_argument(
+        '--checkpoint-activations',
+        type='bool',
+        default=False,
+        help='Recompute activations on backward pass to conserve memory.',
+    )


 class Transformer(Agent):
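Once registered, the option can be toggled from any ParlAI command line. A hypothetical invocation (the task and remaining arguments are illustrative, borrowed from the test added below):

parlai train_model --model transformer/generator --task integration_tests:overfit --checkpoint-activations true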
18 changes: 18 additions & 0 deletions parlai/nn/checkpoint.py
@@ -0,0 +1,18 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+try:
+    from fairscale.nn import checkpoint_wrapper
+except ImportError:
+
+    def checkpoint_wrapper(module):
+        """
+        Dummy checkpoint wrapper that raises an error.
+        """
+        raise ImportError(
+            'Please install fairscale with `pip install fairscale` to use '
+            '--checkpoint-activations true.'
+        )
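When fairscale is installed, the real checkpoint_wrapper is imported in place of this stub. A minimal sketch of direct usage, assuming fairscale's checkpoint_wrapper API (the module being wrapped is arbitrary and illustrative):

import torch.nn as nn
from fairscale.nn import checkpoint_wrapper  # requires `pip install fairscale`

ffn = nn.Sequential(nn.Linear(512, 2048), nn.ReLU(), nn.Linear(2048, 512))
ffn = checkpoint_wrapper(ffn)  # same outputs; backward recomputes activations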
17 changes: 17 additions & 0 deletions tests/test_transformers.py
@@ -284,6 +284,23 @@ def _overfit_train(self, **args):
         args.update(args)
         return testing_utils.train_model(args)

+    def test_checkpoint(self):
+        """
+        Checks --checkpoint-activations true.
+        """
+        valid, test = testing_utils.train_model(
+            dict(
+                task='integration_tests:overfit',
+                model='transformer/generator',
+                dict_file='zoo:unittest/transformer_generator2/model.dict',
+                batchsize=4,
+                skip_generation=True,
+                validation_metric='ppl',
+                max_train_steps=10,
+                checkpoint_activations=True,
+            )
+        )
+
     def test_greedysearch(self):
         """
         Test greedy search.