From 083d62f1ec0dae2cb8a2d27c77549e2cd2c8af43 Mon Sep 17 00:00:00 2001 From: Pengzhi Gao Date: Wed, 19 Feb 2020 14:24:24 -0500 Subject: [PATCH 1/2] Add ELMo modules --- requirements.txt | 1 + setup.py | 1 + .../data/tokenizers/elmo_tokenizer_utils.py | 136 ++ .../tokenizers/elmo_tokenizer_utils_test.py | 105 + texar/torch/modules/encoders/__init__.py | 1 + texar/torch/modules/encoders/elmo_encoder.py | 323 +++ .../modules/encoders/elmo_encoder_test.py | 146 ++ texar/torch/modules/pretrained/__init__.py | 1 + texar/torch/modules/pretrained/elmo.py | 104 + texar/torch/modules/pretrained/elmo_test.py | 71 + texar/torch/modules/pretrained/elmo_utils.py | 2166 +++++++++++++++++ .../modules/pretrained/elmo_utils_test.py | 882 +++++++ texar/torch/utils/test.py | 3 + 13 files changed, 3940 insertions(+) create mode 100644 texar/torch/data/tokenizers/elmo_tokenizer_utils.py create mode 100644 texar/torch/data/tokenizers/elmo_tokenizer_utils_test.py create mode 100644 texar/torch/modules/encoders/elmo_encoder.py create mode 100644 texar/torch/modules/encoders/elmo_encoder_test.py create mode 100644 texar/torch/modules/pretrained/elmo.py create mode 100644 texar/torch/modules/pretrained/elmo_test.py create mode 100644 texar/torch/modules/pretrained/elmo_utils.py create mode 100644 texar/torch/modules/pretrained/elmo_utils_test.py diff --git a/requirements.txt b/requirements.txt index efdba2f84..22f7f8cdd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,3 +4,4 @@ numpy >= 1.15.4 mypy_extensions >= 0.4.1 regex >= 2018.01.10 sentencepiece >= 0.1.8 +h5py >= 2.10.0 diff --git a/setup.py b/setup.py index 85fef9cb4..3b86d933c 100644 --- a/setup.py +++ b/setup.py @@ -33,6 +33,7 @@ install_requires=[ 'regex>=2018.01.10', 'numpy', + 'h5py>=2.10.0', 'requests', 'funcsigs', 'sentencepiece>=0.1.8', diff --git a/texar/torch/data/tokenizers/elmo_tokenizer_utils.py b/texar/torch/data/tokenizers/elmo_tokenizer_utils.py new file mode 100644 index 000000000..ea454d0d8 --- /dev/null +++ b/texar/torch/data/tokenizers/elmo_tokenizer_utils.py @@ -0,0 +1,136 @@ +# Copyright 2019 The Texar Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Utils of pre-trained ELMo tokenizer. + +Code adapted from: + `https://github.com/allenai/allennlp/blob/master/allennlp/data/token_indexers/elmo_indexer.py` +""" +from typing import Dict, List, Optional + +import torch + +from torch.nn.utils.rnn import pad_sequence + + +__all__ = [ + "ELMoCharacterMapper", + "batch_to_ids", +] + + +def _make_bos_eos( + character: int, + padding_character: int, + beginning_of_word_character: int, + end_of_word_character: int, + max_word_length: int, +): + char_ids = [padding_character] * max_word_length + char_ids[0] = beginning_of_word_character + char_ids[1] = character + char_ids[2] = end_of_word_character + return char_ids + + +class ELMoCharacterMapper: + r"""Maps individual tokens to sequences of character ids, compatible with + ELMo. 
To be consistent with previously trained models, we include it here as + special of existing character indexers. + + We allow to add optional additional special tokens with designated + character ids with `tokens_to_add`. + """ + + max_word_length = 50 + + # char ids 0-255 come from utf-8 encoding bytes + # assign 256-300 to special chars + beginning_of_sentence_character = 256 # + end_of_sentence_character = 257 # + beginning_of_word_character = 258 # + end_of_word_character = 259 # + padding_character = 260 # + + beginning_of_sentence_characters = _make_bos_eos( + beginning_of_sentence_character, + padding_character, + beginning_of_word_character, + end_of_word_character, + max_word_length, + ) + end_of_sentence_characters = _make_bos_eos( + end_of_sentence_character, + padding_character, + beginning_of_word_character, + end_of_word_character, + max_word_length, + ) + + bos_token = "" + eos_token = "" + + def __init__(self, tokens_to_add: Optional[Dict[str, int]] = None) -> None: + self.tokens_to_add = tokens_to_add or {} + + def convert_word_to_char_ids(self, word: str) -> List[int]: + if word in self.tokens_to_add: + char_ids = ([ELMoCharacterMapper.padding_character] * + ELMoCharacterMapper.max_word_length) + char_ids[0] = ELMoCharacterMapper.beginning_of_word_character + char_ids[1] = self.tokens_to_add[word] + char_ids[2] = ELMoCharacterMapper.end_of_word_character + elif word == ELMoCharacterMapper.bos_token: + char_ids = ELMoCharacterMapper.beginning_of_sentence_characters + elif word == ELMoCharacterMapper.eos_token: + char_ids = ELMoCharacterMapper.end_of_sentence_characters + else: + word_encoded = word.encode( + "utf-8", "ignore")[: (ELMoCharacterMapper.max_word_length - 2)] + char_ids = ([ELMoCharacterMapper.padding_character] * + ELMoCharacterMapper.max_word_length) + char_ids[0] = ELMoCharacterMapper.beginning_of_word_character + for k, chr_id in enumerate(word_encoded, start=1): + char_ids[k] = chr_id + char_ids[len(word_encoded) + 1] = \ + ELMoCharacterMapper.end_of_word_character + + # +1 one for masking + return [c + 1 for c in char_ids] + + def __eq__(self, other) -> bool: + if isinstance(self, other.__class__): + return self.__dict__ == other.__dict__ + return NotImplemented + + +def batch_to_ids(batch: List[List[str]]) -> torch.Tensor: + r"""Converts a batch of tokenized sentences to a tensor representing the + sentences with encoded characters (len(batch), max sentence length, + max word length). + + Args: + batch: A list of tokenized sentences. + + Returns: + A tensor of padded character ids. + """ + res = [] + mapper = ELMoCharacterMapper() + for sentence in batch: + character_ids = [mapper.convert_word_to_char_ids(token) + for token in sentence] + res.append(torch.tensor(character_ids)) + + return pad_sequence(res, batch_first=True) diff --git a/texar/torch/data/tokenizers/elmo_tokenizer_utils_test.py b/texar/torch/data/tokenizers/elmo_tokenizer_utils_test.py new file mode 100644 index 000000000..f8dac6703 --- /dev/null +++ b/texar/torch/data/tokenizers/elmo_tokenizer_utils_test.py @@ -0,0 +1,105 @@ +# Copyright 2019 The Texar Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Unit tests for pre-trained ELMo tokenizer. + +Code adapted from: + `https://github.com/allenai/allennlp/blob/master/allennlp/tests/data/token_indexers/elmo_indexer_test.py` +""" + +import unittest + +from texar.torch.data.tokenizers.elmo_tokenizer_utils import ( + ELMoCharacterMapper, batch_to_ids) + + +class ELMoTokenizerUtilsTest(unittest.TestCase): + + def test_bos_to_char_ids(self): + mapper = ELMoCharacterMapper() + indices = mapper.convert_word_to_char_ids('') + expected_indices = [ + 259, 257, 260, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, + 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, + 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, + 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, + ] + self.assertEqual(indices, expected_indices) + + def test_eos_to_char_ids(self): + mapper = ELMoCharacterMapper() + indices = mapper.convert_word_to_char_ids('') + expected_indices = [ + 259, 258, 260, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, + 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, + 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, + 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, + ] + self.assertEqual(indices, expected_indices) + + def test_unicode_to_char_ids(self): + mapper = ELMoCharacterMapper() + indices = mapper.convert_word_to_char_ids(chr(256) + "t") + expected_indices = [ + 259, 197, 129, 117, 260, 261, 261, 261, 261, 261, 261, 261, 261, + 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, + 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, + 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, + ] + self.assertEqual(indices, expected_indices) + + def test_additional_tokens(self): + mapper = ELMoCharacterMapper(tokens_to_add={"": 1}) + indices = mapper.convert_word_to_char_ids("") + expected_indices = [ + 259, 2, 260, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, + 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, + 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, + 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, + ] + self.assertEqual(indices, expected_indices) + + def test_batch_to_ids(self): + sentences = [['First', 'sentence', '.'], ['Another', '.']] + indices = batch_to_ids(sentences) + expected_indices = [[[ + 259, 71, 106, 115, 116, 117, 260, 261, 261, 261, 261, 261, 261, + 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, + 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, + 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261], [ + 259, 116, 102, 111, 117, 102, 111, 100, 102, 260, 261, 261, 261, + 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, + 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, + 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261], [ + 259, 47, 260, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, + 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, + 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, + 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261]], + [[259, 66, 
111, 112, 117, 105, 102, 115, 260, 261, 261, 261, 261, + 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, + 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, + 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261], + [259, 47, 260, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, + 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, + 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, + 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0]]] + self.assertEqual(indices.tolist(), expected_indices) + + +if __name__ == "__main__": + unittest.main() diff --git a/texar/torch/modules/encoders/__init__.py b/texar/torch/modules/encoders/__init__.py index ce69fd985..1031dab5c 100644 --- a/texar/torch/modules/encoders/__init__.py +++ b/texar/torch/modules/encoders/__init__.py @@ -17,6 +17,7 @@ from texar.torch.modules.encoders.bert_encoder import * from texar.torch.modules.encoders.conv_encoders import * +from texar.torch.modules.encoders.elmo_encoder import * from texar.torch.modules.encoders.encoder_base import * from texar.torch.modules.encoders.gpt2_encoder import * from texar.torch.modules.encoders.multihead_attention import * diff --git a/texar/torch/modules/encoders/elmo_encoder.py b/texar/torch/modules/encoders/elmo_encoder.py new file mode 100644 index 000000000..2011d3dbc --- /dev/null +++ b/texar/torch/modules/encoders/elmo_encoder.py @@ -0,0 +1,323 @@ +# Copyright 2019 The Texar Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +ELMo encoder. +""" +import json +import os +import tempfile +import warnings + +from typing import Any, Dict, List, Optional, Union + +import torch + +from torch.nn.modules import Dropout + +from texar.torch.modules.encoders.encoder_base import EncoderBase +from texar.torch.modules.pretrained.elmo import PretrainedELMoMixin +from texar.torch.modules.pretrained.elmo_utils import ( + _ElmoBiLm, ScalarMix, remove_sentence_boundaries) + +__all__ = [ + "ELMoEncoder", +] + + +class ELMoEncoder(EncoderBase, PretrainedELMoMixin): + r"""ELMo model for encoding sequences. Please see + :class:`~texar.torch.modules.PretrainedELMoMixin` for a brief description + of ELMo. + + Args: + pretrained_model_name (optional): a `str`, the name + of pre-trained model (e.g., ``elmo-small``). Please refer to + :class:`~texar.torch.modules.PretrainedELMoMixin` for + all supported models. + If `None`, the model name in :attr:`hparams` is used. + cache_dir (optional): the path to a folder in which the + pre-trained models will be cached. If `None` (default), + a default directory (``texar_data`` folder under user's home + directory) will be used. + hparams (dict or HParams, optional): Hyperparameters. Missing + hyperparameter will be set to default values. See + :meth:`default_hparams` for the hyperparameter structure + and default values. 
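    Example:
        A minimal usage sketch (assuming the default ``elmo-small``
        checkpoint can be downloaded; the sentences are only illustrative):

        .. code-block:: python

            from texar.torch.data.tokenizers.elmo_tokenizer_utils import (
                batch_to_ids)
            from texar.torch.modules.encoders.elmo_encoder import ELMoEncoder

            encoder = ELMoEncoder(pretrained_model_name="elmo-small")
            # Shape `[batch_size, max_time, 50]` character ids.
            character_ids = batch_to_ids(
                [["The", "sentence", "."], ["Another", "."]])
            outputs = encoder(character_ids)
            # `num_output_representations` (default 2) tensors, each of shape
            # `[batch_size, max_time, encoder.output_size]`.
            elmo_representations = outputs["elmo_representations"]
            mask = outputs["mask"]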
+ """ + def __init__(self, + pretrained_model_name: Optional[str] = None, + cache_dir: Optional[str] = None, + hparams=None): + super().__init__(hparams=hparams) + + self.load_pretrained_config(pretrained_model_name, cache_dir) + + options_file = None + weight_file = None + tmp_dir = tempfile.TemporaryDirectory() + if self.pretrained_model_dir is not None: + info = list(os.walk(self.pretrained_model_dir)) + root, _, files = info[0] + for file in files: + if file.endswith('options.json'): + options_file = os.path.join(root, file) + if file.endswith('weights.hdf5'): + weight_file = os.path.join(root, file) + else: + with open(os.path.join(tmp_dir.name, 'options.json'), "w") as fp: + json.dump(self.hparams.encoder.todict(), fp) + options_file = os.path.join(tmp_dir.name, 'options.json') + + assert options_file is not None + self._elmo_lstm = _ElmoBiLm( + options_file, + weight_file, # type: ignore + requires_grad=self.hparams.requires_grad, + vocab_to_cache=self.hparams.vocab_to_cache, + ) + tmp_dir.cleanup() + + self._has_cached_vocab = self.hparams.vocab_to_cache is not None + self._keep_sentence_boundaries = self.hparams.keep_sentence_boundaries + self._dropout = Dropout(p=self.hparams.dropout) + self._scalar_mixes: Any = [] + for k in range(self.hparams.num_output_representations): + scalar_mix = ScalarMix( + self._elmo_lstm.num_layers, + do_layer_norm=self.hparams.do_layer_norm, + initial_scalar_parameters=self.hparams.scalar_mix_parameters, + trainable=self.hparams.scalar_mix_parameters is None, + ) + self.add_module("scalar_mix_{}".format(k), scalar_mix) + self._scalar_mixes.append(scalar_mix) + + @staticmethod + def default_hparams(): + r"""Returns a dictionary of hyperparameters with default values. + + * The encoder arch is determined by the constructor argument + :attr:`pretrained_model_name` if it's specified. In this case, + `hparams` are ignored. + * Otherwise, the encoder arch is determined by + `hparams['pretrained_model_name']` if it's specified. All other + configurations in `hparams` are ignored. + * If the above two are `None`, the encoder arch is defined by the + configurations in `hparams` and weights are randomly initialized. + + .. code-block:: python + + { + "pretrained_model_name": "elmo-small", + "encoder": { + "lstm": { + "use_skip_connections": True, + "projection_dim": 128, + "cell_clip": 3, + "proj_clip": 3, + "dim": 1024, + "n_layers": 2 + }, + "char_cnn": { + "activation": "relu", + "filters": [[1, 32], [2, 32], [3, 64], [4, 128], + [5, 256], [6, 512], [7, 1024]], + "n_highway": 1, + "embedding": { + "dim": 16 + }, + "n_characters": 262, + "max_characters_per_token": 50 + } + }, + "num_output_representations": 2, + "requires_grad": False, + "do_layer_norm": False, + "dropout": 0.5, + "vocab_to_cache": None, + "keep_sentence_boundaries": False, + "scalar_mix_parameters": None, + "name": "elmo_encoder", + } + + Here: + + The default parameters are values for elmo-small model. + + `"pretrained_model_name"`: str or None + The name of the pre-trained ELMo model. If None, the model + will be randomly initialized. + + `"encoder"`: dict + Hyperparameters for ELMo encoder. + + `"num_output_representations"`: int + The number of ELMo representation to output with different linear + weighted combination of the 3 layers (i.e., character-convnet + output, 1st lstm output, 2nd lstm output). + + `"requires_grad"`: bool + If True, compute gradient of ELMo parameters for fine tuning. + + `"do_layer_norm"`: bool + Should we apply layer normalization (passed to `ScalarMix`)? 
+ + `"dropout"`: float + The dropout to be applied to the ELMo representations. + + `"vocab_to_cache"`: List[str] + A list of words to pre-compute and cache character convolutions + for. If you use this option, Elmo expects that you pass word + indices of shape (batch_size, timesteps) to forward, instead + of character indices. If you use this option and pass a word which + wasn't pre-cached, this will break. + + `"keep_sentence_boundaries"`: bool + If True, the representation of the sentence boundary tokens are + not removed. + + `"scalar_mix_parameters"`: List[float] + If not `None`, use these scalar mix parameters to weight the + representations produced by different layers. These mixing weights + are not updated during training. The mixing weights here should be + the unnormalized (i.e., pre-softmax) weights. So, if you wanted to + use only the 1st layer of a 2-layer ELMo, you can set this to + [-9e10, 1, -9e10 ]. + + `"name"`: str + Name of the module. + """ + return { + 'pretrained_model_name': 'elmo-small', + 'encoder': { + "lstm": { + "use_skip_connections": True, + "projection_dim": 128, + "cell_clip": 3, + "proj_clip": 3, + "dim": 1024, + "n_layers": 2 + }, + "char_cnn": { + "activation": "relu", + "filters": [[1, 32], [2, 32], [3, 64], [4, 128], [5, 256], + [6, 512], [7, 1024]], + "n_highway": 1, + "embedding": { + "dim": 16 + }, + "n_characters": 262, + "max_characters_per_token": 50 + } + }, + 'num_output_representations': 2, + 'requires_grad': False, + 'do_layer_norm': False, + 'dropout': 0.5, + 'vocab_to_cache': None, + 'keep_sentence_boundaries': False, + 'scalar_mix_parameters': None, + 'name': 'elmo_encoder', + '@no_typecheck': ['pretrained_model_name'] + } + + def forward(self, # type: ignore + inputs: torch.Tensor, + word_inputs: Optional[torch.Tensor] = None) -> \ + Dict[str, Union[torch.Tensor, List[torch.Tensor]]]: + r"""Encodes the inputs. + + Args: + inputs: Shape `[batch_size, max_time, 50]` of character ids + representing the current batch. + word_inputs: If you passed a cached vocab, you can in addition pass + a tensor of shape `[batch_size, max_time]`, which represent + word ids which have been pre-cached. + + Returns: + A Dict with keys: + + - :attr:`elmo_representations`: A `num_output_representations` list + of ELMo representations for the input sequence. Each + representation is shape `[batch_size, max_time, embedding_dim]` + + - :attr:`mask`: Shape `(batch_size, timesteps)` long tensor + with sequence mask. 
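        For concreteness, a sketch of consuming this output (names are
        illustrative; `character_ids` here is assumed to come from
        :func:`~texar.torch.data.tokenizers.elmo_tokenizer_utils.batch_to_ids`):

        .. code-block:: python

            outputs = encoder(character_ids)
            # One of the `num_output_representations` mixtures, with shape
            # `[batch_size, max_time, encoder.output_size]`.
            representation = outputs["elmo_representations"][0]
            # Per-example sequence lengths recovered from the mask.
            lengths = outputs["mask"].sum(dim=-1)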
+ """ + # reshape the input if needed + original_shape = inputs.size() + if len(original_shape) > 3: + timesteps, num_characters = original_shape[-2:] + reshaped_inputs = inputs.view(-1, timesteps, num_characters) + else: + reshaped_inputs = inputs + + if word_inputs is not None: + original_word_size = word_inputs.size() + if self._has_cached_vocab and len(original_word_size) > 2: + reshaped_word_inputs = word_inputs.view(-1, + original_word_size[-1]) + elif not self._has_cached_vocab: + warnings.warn( + "Word inputs were passed to ELMo but it does not have a " + "cached vocab.") + reshaped_word_inputs = None # type: ignore + else: + reshaped_word_inputs = word_inputs + else: + reshaped_word_inputs = word_inputs # type: ignore + + # run the biLM + bilm_output = self._elmo_lstm(reshaped_inputs, reshaped_word_inputs) + layer_activations = bilm_output["activations"] + mask_with_bos_eos = bilm_output["mask"] + + # compute the elmo representations + representations = [] + for i in range(len(self._scalar_mixes)): + scalar_mix = getattr(self, "scalar_mix_{}".format(i)) + representation_with_bos_eos = scalar_mix(layer_activations, + mask_with_bos_eos) + if self._keep_sentence_boundaries: + processed_representation = representation_with_bos_eos + processed_mask = mask_with_bos_eos + else: + representation_without_bos_eos, mask_without_bos_eos = \ + remove_sentence_boundaries( + representation_with_bos_eos, mask_with_bos_eos) + processed_representation = representation_without_bos_eos + processed_mask = mask_without_bos_eos + representations.append(self._dropout(processed_representation)) + + # reshape if necessary + if word_inputs is not None and len(original_word_size) > 2: + mask = processed_mask.view(original_word_size) + elmo_representations = [ + representation.view(original_word_size + (-1,)) + for representation in representations + ] + elif len(original_shape) > 3: + mask = processed_mask.view(original_shape[:-1]) + elmo_representations = [ + representation.view(original_shape[:-1] + (-1,)) + for representation in representations + ] + else: + mask = processed_mask + elmo_representations = representations + + return {"elmo_representations": elmo_representations, "mask": mask} + + @property + def output_size(self): + return self._elmo_lstm.get_output_dim() diff --git a/texar/torch/modules/encoders/elmo_encoder_test.py b/texar/torch/modules/encoders/elmo_encoder_test.py new file mode 100644 index 000000000..04a34b359 --- /dev/null +++ b/texar/torch/modules/encoders/elmo_encoder_test.py @@ -0,0 +1,146 @@ +# Copyright 2019 The Texar Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Unit tests for ELMo Encoder. 
+ +Code adapted from: + `https://github.com/allenai/allennlp/blob/master/allennlp/tests/modules/elmo_test.py` +""" + +import unittest + +from texar.torch.data.tokenizers.elmo_tokenizer_utils import batch_to_ids +from texar.torch.modules.encoders.elmo_encoder import ELMoEncoder +from texar.torch.utils.test import pretrained_test + + +class ELMoEncoderTest(unittest.TestCase): + r"""Tests :class:`~texar.torch.modules.ELMoEncoder` class. + """ + + @pretrained_test + def test_model_loading(self): + r"""Tests model loading functionality.""" + sentences = [ + ["The", "sentence", "."], + ["ELMo", "helps", "disambiguate", "ELMo", "from", "Elmo", "."], + ] + character_ids = batch_to_ids(sentences) + for pretrained_model_name in ELMoEncoder.available_checkpoints(): + encoder = ELMoEncoder(pretrained_model_name=pretrained_model_name) + _ = encoder(character_ids) + + def test_encode(self): + r"""Tests encoding. + """ + hparams = { + "pretrained_model_name": None, + 'encoder': { + "lstm": { + "cell_clip": 3, + "use_skip_connections": True, + "n_layers": 2, + "proj_clip": 3, + "projection_dim": 16, + "dim": 64 + }, + "char_cnn": { + "embedding": { + "dim": 4 + }, + "filters": [[1, 4], [2, 8], [3, 16], [4, 32], [5, 64]], + "n_highway": 2, + "n_characters": 262, + "max_characters_per_token": 50, + "activation": "relu" + } + } + } + encoder = ELMoEncoder(hparams=hparams) + + sentences = [ + ["The", "sentence", "."], + ["ELMo", "helps", "disambiguate", "ELMo", "from", "Elmo", "."], + ] + character_ids = batch_to_ids(sentences) + output = encoder(character_ids) + elmo_representations = output["elmo_representations"] + mask = output["mask"] + + assert len(elmo_representations) == 2 + assert list(elmo_representations[0].size()) == [2, 7, 32] + assert list(elmo_representations[1].size()) == [2, 7, 32] + assert list(mask.size()) == [2, 7] + + def test_elmo_keep_sentence_boundaries(self): + hparams = { + "pretrained_model_name": None, + 'encoder': { + "lstm": { + "cell_clip": 3, + "use_skip_connections": True, + "n_layers": 2, + "proj_clip": 3, + "projection_dim": 16, + "dim": 64 + }, + "char_cnn": { + "embedding": { + "dim": 4 + }, + "filters": [[1, 4], [2, 8], [3, 16], [4, 32], [5, 64]], + "n_highway": 2, + "n_characters": 262, + "max_characters_per_token": 50, + "activation": "relu" + } + }, + 'dropout': 0.0, + 'keep_sentence_boundaries': True, + } + encoder = ELMoEncoder(hparams=hparams) + + sentences = [ + ["The", "sentence", "."], + ["ELMo", "helps", "disambiguate", "ELMo", "from", "Elmo", "."], + ] + character_ids = batch_to_ids(sentences) + output = encoder(character_ids) + elmo_representations = output["elmo_representations"] + mask = output["mask"] + + assert len(elmo_representations) == 2 + # Add 2 to the lengths because we're keeping the start and end of + # sentence tokens. 
+ assert list(elmo_representations[0].size()) == [2, 7 + 2, 32] + assert list(elmo_representations[1].size()) == [2, 7 + 2, 32] + assert list(mask.size()) == [2, 7 + 2] + + @pretrained_test + def test_trainable_variables(self): + encoder = ELMoEncoder() + elmo_grads = [ + param.requires_grad for param in encoder._elmo_lstm.parameters() + ] + assert all(grad is False for grad in elmo_grads) + + encoder = ELMoEncoder(hparams={'requires_grad': True}) + elmo_grads = [ + param.requires_grad for param in encoder._elmo_lstm.parameters() + ] + assert all(grad is True for grad in elmo_grads) + + +if __name__ == "__main__": + unittest.main() diff --git a/texar/torch/modules/pretrained/__init__.py b/texar/torch/modules/pretrained/__init__.py index 1f06a87a9..1e0ae19d3 100644 --- a/texar/torch/modules/pretrained/__init__.py +++ b/texar/torch/modules/pretrained/__init__.py @@ -17,6 +17,7 @@ from texar.torch.modules.pretrained.pretrained_base import * from texar.torch.modules.pretrained.bert import * +from texar.torch.modules.pretrained.elmo import * from texar.torch.modules.pretrained.gpt2 import * from texar.torch.modules.pretrained.roberta import * from texar.torch.modules.pretrained.xlnet import * diff --git a/texar/torch/modules/pretrained/elmo.py b/texar/torch/modules/pretrained/elmo.py new file mode 100644 index 000000000..ef616d1b5 --- /dev/null +++ b/texar/torch/modules/pretrained/elmo.py @@ -0,0 +1,104 @@ +# Copyright 2019 The Texar Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Utils of ELMo Modules. +""" + +import json +import os + +from abc import ABC +from typing import Any, Dict + +from texar.torch.modules.pretrained.pretrained_base import PretrainedMixin + +__all__ = [ + "PretrainedELMoMixin", +] + +_ELMo_PATH = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/" + + +class PretrainedELMoMixin(PretrainedMixin, ABC): + r"""A mixin class to support loading pre-trained checkpoints for modules + that implement the ELMo model. + + The ELMo model was proposed in + `Deep contextualized word representations`_ + by `Peters et al.` from Allen Institute for Artificial Intelligence. It is + a deep bidirectional language model (biLM), which is pre-trained on a + large text corpus. + + The available ELMo models are as follows: + + * ``elmo-small``: 13.6M parameters, trained on 800M tokens. + * ``elmo-medium``: 28.0M parameters, trained on 800M tokens. + * ``elmo-original``: 93.6M parameters, trained on 800M tokens. + * ``elmo-original-5.5b``: 93.6M parameters, trained on 5.5B tokens. + + We provide the following ELMo classes: + + * :class:`~texar.torch.modules.ELMoEncoder` for text encoding. + + .. 
_`Deep contextualized word representations`: + https://arxiv.org/abs/1802.05365 + """ + _MODEL_NAME = "ELMo" + _MODEL2URL = { + 'elmo-small': [ + _ELMo_PATH + '2x1024_128_2048cnn_1xhighway/' + 'elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5', + _ELMo_PATH + '2x1024_128_2048cnn_1xhighway/' + 'elmo_2x1024_128_2048cnn_1xhighway_options.json', + ], + 'elmo-medium': [ + _ELMo_PATH + '2x2048_256_2048cnn_1xhighway/' + 'elmo_2x2048_256_2048cnn_1xhighway_weights.hdf5', + _ELMo_PATH + '2x2048_256_2048cnn_1xhighway/' + 'elmo_2x2048_256_2048cnn_1xhighway_options.json', + ], + 'elmo-original': [ + _ELMo_PATH + '2x4096_512_2048cnn_2xhighway/' + 'elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5', + _ELMo_PATH + '2x4096_512_2048cnn_2xhighway/' + 'elmo_2x4096_512_2048cnn_2xhighway_options.json', + ], + 'elmo-original-5.5b': [ + _ELMo_PATH + '2x4096_512_2048cnn_2xhighway_5.5B/' + 'elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5', + _ELMo_PATH + '2x4096_512_2048cnn_2xhighway_5.5B/' + 'elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json', + ], + } + + @classmethod + def _transform_config(cls, pretrained_model_name: str, + cache_dir: str) -> Dict[str, Any]: + info = list(os.walk(cache_dir)) + root, _, files = info[0] + config_path = None + for file in files: + if file.endswith('options.json'): + config_path = os.path.join(root, file) + if config_path is None: + raise ValueError(f"Cannot find the config file in {cache_dir}") + + with open(config_path) as f: + config_elmo = json.loads(f.read()) + + return {'encoder': config_elmo} + + def _init_from_checkpoint(self, pretrained_model_name: str, + cache_dir: str, **kwargs): + return diff --git a/texar/torch/modules/pretrained/elmo_test.py b/texar/torch/modules/pretrained/elmo_test.py new file mode 100644 index 000000000..d31bb1f5a --- /dev/null +++ b/texar/torch/modules/pretrained/elmo_test.py @@ -0,0 +1,71 @@ +# Copyright 2019 The Texar Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Unit tests for ELMo utils. +""" + +import os +import unittest + +from texar.torch.modules.pretrained.elmo import * +from texar.torch.utils.test import pretrained_test + + +class ELMoUtilsTest(unittest.TestCase): + r"""Tests ELMo Utils. 
+ """ + + @pretrained_test + def test_load_pretrained_elmo_AND_transform_elmo_to_texar_config(self): + pretrained_model_dir = PretrainedELMoMixin.download_checkpoint( + pretrained_model_name="elmo-small") + + info = list(os.walk(pretrained_model_dir)) + _, _, files = info[0] + self.assertIn('elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5', files) + self.assertIn('elmo_2x1024_128_2048cnn_1xhighway_options.json', files) + + model_config = PretrainedELMoMixin._transform_config( + pretrained_model_name="elmo-small", + cache_dir=pretrained_model_dir) + + exp_config = { + 'encoder': { + "lstm": { + "use_skip_connections": True, + "projection_dim": 128, + "cell_clip": 3, + "proj_clip": 3, + "dim": 1024, + "n_layers": 2 + }, + "char_cnn": { + "activation": "relu", + "filters": [[1, 32], [2, 32], [3, 64], [4, 128], [5, 256], + [6, 512], [7, 1024]], + "n_highway": 1, + "embedding": { + "dim": 16 + }, + "n_characters": 262, + "max_characters_per_token": 50 + } + }, + } + + self.assertDictEqual(model_config, exp_config) + + +if __name__ == "__main__": + unittest.main() diff --git a/texar/torch/modules/pretrained/elmo_utils.py b/texar/torch/modules/pretrained/elmo_utils.py new file mode 100644 index 000000000..65b8f2f69 --- /dev/null +++ b/texar/torch/modules/pretrained/elmo_utils.py @@ -0,0 +1,2166 @@ +# Copyright 2019 The Texar Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Utils of ELMo Modules. 
+ +Code adapted from: + `https://github.com/allenai/allennlp/blob/master/allennlp/common/checks.py` + `https://github.com/allenai/allennlp/blob/master/allennlp/common/util.py` + `https://github.com/allenai/allennlp/blob/master/allennlp/modules/elmo.py` + `https://github.com/allenai/allennlp/blob/master/allennlp/modules/elmo_lstm.py` + `https://github.com/allenai/allennlp/blob/master/allennlp/modules/encoder_base.py` + `https://github.com/allenai/allennlp/blob/master/allennlp/modules/lstm_cell_with_projection.py` + `https://github.com/allenai/allennlp/blob/master/allennlp/modules/highway.py` + `https://github.com/allenai/allennlp/blob/master/allennlp/modules/scalar_mix.py` + `https://github.com/allenai/allennlp/blob/master/allennlp/modules/time_distributed.py` + `https://github.com/allenai/allennlp/blob/master/allennlp/modules/token_embedders/embedding.py` + `https://github.com/allenai/allennlp/blob/master/allennlp/nn/initializers.py` + `https://github.com/allenai/allennlp/blob/master/allennlp/nn/util.py` +""" +import itertools +import json +import logging + +from itertools import islice +from typing import (Any, Callable, Dict, Iterable, Iterator, List, Optional, + Tuple, TypeVar, Union) + +import h5py +import numpy +import torch + +from torch.nn import ParameterList, Parameter +from torch.nn.functional import embedding +from torch.nn.utils.rnn import ( + pad_packed_sequence, pack_padded_sequence, PackedSequence) + +from texar.torch.data.tokenizers.elmo_tokenizer_utils import ( + batch_to_ids, ELMoCharacterMapper) + +# pylint: disable=attribute-defined-outside-init,protected-access + +__all__ = [ + "_ElmoBiLm", + "_ElmoCharacterEncoder", + "_EncoderBase", + "ConfigurationError", + "ElmoLstm", + "Embedding", + "Highway", + "LstmCellWithProjection", + "ScalarMix", + "TimeDistributed", + "add_sentence_boundary_token_ids", + "block_orthogonal", + "combine_initial_dims", + "get_device_of", + "get_dropout_mask", + "get_lengths_from_binary_sequence_mask", + "lazy_groups_of", + "remove_sentence_boundaries", + "sort_batch_by_length", + "uncombine_initial_dims", +] + + +class _ElmoBiLm(torch.nn.Module): + r"""Run a pre-trained bidirectional language model, outputting the + activations at each layer for weighting together into an ELMo + representation (with `allennlp.modules.seq2seq_encoders.Elmo`). + This is a lower level class, useful for advanced uses, but most users + should use `allennlp.modules.Elmo` directly. + + # Parameters + + options_file : `str` + ELMo JSON options file + weight_file : `str` + ELMo hdf5 weight file + requires_grad : `bool`, optional, (default = False). + If True, compute gradient of ELMo parameters for fine tuning. + vocab_to_cache : `List[str]`, optional, (default = None). + A list of words to pre-compute and cache character convolutions + for. If you use this option, _ElmoBiLm expects that you pass word + indices of shape (batch_size, timesteps) to forward, instead + of character indices. If you use this option and pass a word which + wasn't pre-cached, this will break. + """ + + def __init__( + self, + options_file: str, + weight_file: str, + requires_grad: bool = False, + vocab_to_cache: Optional[List[str]] = None, + ) -> None: + super().__init__() + + self._token_embedder = _ElmoCharacterEncoder( + options_file, weight_file, requires_grad=requires_grad + ) + + self._requires_grad = requires_grad + if requires_grad and vocab_to_cache: + logging.warning( + "You are fine tuning ELMo and caching char CNN word vectors. 
" + "This behaviour is not guaranteed to be well defined, " + "particularly. " + "if not all of your inputs will occur in the vocabulary cache." + ) + # This is an embedding, used to look up cached + # word vectors built from character level cnn embeddings. + self._word_embedding = None + self._bos_embedding: torch.Tensor = None # type: ignore + self._eos_embedding: torch.Tensor = None # type: ignore + if vocab_to_cache: + logging.info( + "Caching character cnn layers for words in vocabulary.") + # This sets 3 attributes, _word_embedding, _bos_embedding and + # _eos_embedding. They are set in the method so they can be accessed + # from outside the constructor. + self.create_cached_cnn_embeddings(vocab_to_cache) + + with open(options_file, "r") as fin: + options = json.load(fin) + if not options["lstm"].get("use_skip_connections"): + raise ConfigurationError( + "We only support pretrained biLMs with residual connections") + self._elmo_lstm = ElmoLstm( + input_size=options["lstm"]["projection_dim"], + hidden_size=options["lstm"]["projection_dim"], + cell_size=options["lstm"]["dim"], + num_layers=options["lstm"]["n_layers"], + memory_cell_clip_value=options["lstm"]["cell_clip"], + state_projection_clip_value=options["lstm"]["proj_clip"], + requires_grad=requires_grad, + ) + + if weight_file is not None: + self._elmo_lstm.load_weights(weight_file) + # Number of representation layers including context independent layer + self.num_layers = options["lstm"]["n_layers"] + 1 + + def get_output_dim(self): + return 2 * self._token_embedder.get_output_dim() + + def forward( # type: ignore + self, inputs: torch.Tensor, word_inputs: Optional[torch.Tensor] = None + ) -> Dict[str, Union[torch.Tensor, List[torch.Tensor]]]: + r"""Encodes the inputs. + + # Parameters + + inputs : `torch.Tensor`, required. + Shape `(batch_size, timesteps, 50)` of character ids representing + the current batch. + word_inputs : `torch.Tensor`, required. + If you passed a cached vocab, you can in addition pass a tensor of + shape `(batch_size, timesteps)`, which represent word ids which + have been pre-cached. + + # Returns + + Dict with keys: + + `'activations'` : `List[torch.Tensor]` + A list of activations at each layer of the network, each of shape + `(batch_size, timesteps + 2, embedding_dim)` + `'mask'`: `torch.Tensor` + Shape `(batch_size, timesteps + 2)` long tensor with sequence mask. + + Note that the output tensors all include additional special begin and + end of sequence markers. + """ + if self._word_embedding is not None and word_inputs is not None: + try: + mask_without_bos_eos = (word_inputs > 0).long() + # The character cnn part is cached - just look it up. + embedded_inputs = self._word_embedding( + word_inputs) + # shape (batch_size, timesteps + 2, embedding_dim) + type_representation, mask = add_sentence_boundary_token_ids( + embedded_inputs, mask_without_bos_eos, self._bos_embedding, + self._eos_embedding + ) + except RuntimeError: + # Back off to running the character convolutions, + # as we might not have the words in the cache. + token_embedding = self._token_embedder(inputs) + mask = token_embedding["mask"] + type_representation = token_embedding["token_embedding"] + else: + token_embedding = self._token_embedder(inputs) + mask = token_embedding["mask"] + type_representation = token_embedding["token_embedding"] + lstm_outputs = self._elmo_lstm(type_representation, mask) + + # Prepare the output. The first layer is duplicated. 
+ # Because of minor differences in how masking is applied depending + # on whether the char cnn layers are cached, we'll be defensive and + # multiply by the mask here. It's not strictly necessary, as the + # mask passed on is correct, but the values in the padded areas + # of the char cnn representations can change. + output_tensors = [ + torch.cat([type_representation, type_representation], dim=-1) + * mask.float().unsqueeze(-1) + ] + for layer_activations in torch.chunk(lstm_outputs, + lstm_outputs.size(0), dim=0): + output_tensors.append(layer_activations.squeeze(0)) + + return {"activations": output_tensors, "mask": mask} + + def create_cached_cnn_embeddings(self, tokens: List[str]) -> None: + r"""Given a list of tokens, this method precomputes word representations + by running just the character convolutions and highway layers of elmo, + essentially creating uncontextual word vectors. On subsequent forward + passes, the word ids are looked up from an embedding, rather than being + computed on the fly via the CNN encoder. + + This function sets 3 attributes: + + _word_embedding : `torch.Tensor` + The word embedding for each word in the tokens passed to this + method. + _bos_embedding : `torch.Tensor` + The embedding for the BOS token. + _eos_embedding : `torch.Tensor` + The embedding for the EOS token. + + # Parameters + + tokens : `List[str]`, required. + A list of tokens to precompute character convolutions for. + """ + tokens = [ELMoCharacterMapper.bos_token, + ELMoCharacterMapper.eos_token] + tokens + timesteps = 32 + batch_size = 32 + chunked_tokens = lazy_groups_of(iter(tokens), timesteps) + + all_embeddings = [] + device = get_device_of(next(self.parameters())) + for batch in lazy_groups_of(chunked_tokens, batch_size): + # Shape (batch_size, timesteps, 50) + batched_tensor = batch_to_ids(batch) + # NOTE: This device check is for when a user calls this method + # having already placed the model on a device. If this is called in + # the constructor, it will probably happen on the CPU. This isn't + # too bad, because it's only a few convolutions and will likely + # be very fast. + if device >= 0: + batched_tensor = batched_tensor.cuda(device) + output = self._token_embedder(batched_tensor) + token_embedding = output["token_embedding"] + mask = output["mask"] + token_embedding, _ = remove_sentence_boundaries(token_embedding, + mask) + all_embeddings.append(token_embedding.view( + -1, token_embedding.size(-1))) + full_embedding = torch.cat(all_embeddings, 0) + + # We might have some trailing embeddings from padding in the batch, so + # we clip the embedding and lookup to the right size. + full_embedding = full_embedding[: len(tokens), :] + embedding_ = full_embedding[2: len(tokens), :] + vocab_size, embedding_dim = list(embedding_.size()) + + self._bos_embedding = full_embedding[0, :] + self._eos_embedding = full_embedding[1, :] + self._word_embedding = Embedding( # type: ignore + vocab_size, + embedding_dim, + weight=embedding_.data, + trainable=self._requires_grad, + padding_index=0, + ) + + +class _ElmoCharacterEncoder(torch.nn.Module): + r"""Compute context insensitive token representation using pretrained biLM. + + This embedder has input character ids of size + (batch_size, sequence_length, 50) + and returns (batch_size, sequence_length + 2, embedding_dim), where + embedding_dim is specified in the options file (typically 512). + + We add special entries at the beginning and end of each sequence + corresponding to and , the beginning and end of sentence tokens. 
+ + Note: this is a lower level class useful for advanced usage. Most users + should use `ElmoTokenEmbedder` or `allennlp.modules.Elmo` instead. + + # Parameters + + options_file : `str` + ELMo JSON options file + weight_file : `str` + ELMo hdf5 weight file + requires_grad : `bool`, optional, (default = False). + If True, compute gradient of ELMo parameters for fine tuning. + + The relevant section of the options file is something like: + .. example-code:: + + .. code-block:: python + + {'char_cnn': { + 'activation': 'relu', + 'embedding': {'dim': 4}, + 'filters': [[1, 4], [2, 8], [3, 16], [4, 32], [5, 64]], + 'max_characters_per_token': 50, + 'n_characters': 262, + 'n_highway': 2 + } + } + """ + + def __init__(self, options_file: str, weight_file: str, + requires_grad: bool = False) -> None: + super().__init__() + + with open(options_file, "r") as fin: + self._options = json.load(fin) + self._weight_file = weight_file + + self.output_dim = self._options["lstm"]["projection_dim"] + self.requires_grad = requires_grad + + if weight_file is not None: + self._load_weights() + else: + # Do not load the weights + self._load_weights(False) + + # Cache the arrays for use in forward -- +1 due to masking. + self._beginning_of_sentence_characters = torch.from_numpy( + numpy.array( + ELMoCharacterMapper.beginning_of_sentence_characters) + 1 + ) + self._end_of_sentence_characters = torch.from_numpy( + numpy.array(ELMoCharacterMapper.end_of_sentence_characters) + 1 + ) + + def get_output_dim(self): + return self.output_dim + + def forward(self, # type: ignore + inputs: torch.Tensor) -> Dict[str, torch.Tensor]: + r"""Compute context insensitive token embeddings for ELMo + representations. + + # Parameters + + inputs : `torch.Tensor` + Shape `(batch_size, sequence_length, 50)` of character ids + representing the current batch. + + # Returns + + Dict with keys: + `'token_embedding'` : `torch.Tensor` + Shape `(batch_size, sequence_length + 2, embedding_dim)` tensor + with context insensitive token representations. + `'mask'`: `torch.Tensor` + Shape `(batch_size, sequence_length + 2)` long tensor with + sequence mask. 
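        For example, under the small configuration used in the unit tests
        above (``projection_dim = 16``), a batch of 2 sentences padded to 7
        tokens yields (shapes only; the instance name is illustrative):

        .. code-block:: python

            output = char_encoder(inputs)      # inputs: (2, 7, 50)
            output["token_embedding"].size()   # (2, 7 + 2, 16); BOS/EOS added
            output["mask"].size()              # (2, 7 + 2)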
+ """ + # Add BOS/EOS + mask = ((inputs > 0).long().sum(dim=-1) > 0).long() + character_ids_with_bos_eos, mask_with_bos_eos = \ + add_sentence_boundary_token_ids( + inputs, mask, self._beginning_of_sentence_characters, + self._end_of_sentence_characters) + + # the character id embedding + max_chars_per_token = \ + self._options["char_cnn"]["max_characters_per_token"] + # (batch_size * sequence_length, max_chars_per_token, embed_dim) + character_embedding = torch.nn.functional.embedding( + character_ids_with_bos_eos.view(-1, max_chars_per_token), + self._char_embedding_weights) + + # run convolutions + cnn_options = self._options["char_cnn"] + activation: Callable + if cnn_options["activation"] == "tanh": + activation = torch.tanh + elif cnn_options["activation"] == "relu": + activation = torch.nn.functional.relu + else: + raise ConfigurationError("Unknown activation") + + # (batch_size * sequence_length, embed_dim, max_chars_per_token) + character_embedding = torch.transpose(character_embedding, 1, 2) + convs = [] + for i in range(len(self._convolutions)): + conv = getattr(self, "char_conv_{}".format(i)) + convolved = conv(character_embedding) + # (batch_size * sequence_length, n_filters for this width) + convolved, _ = torch.max(convolved, dim=-1) + convolved = activation(convolved) + convs.append(convolved) + + # (batch_size * sequence_length, n_filters) + token_embedding = torch.cat(convs, dim=-1) + + # apply the highway layers (batch_size * sequence_length, n_filters) + token_embedding = self._highways(token_embedding) + + # final projection (batch_size * sequence_length, embedding_dim) + token_embedding = self._projection(token_embedding) + + # reshape to (batch_size, sequence_length, embedding_dim) + batch_size, sequence_length, _ = character_ids_with_bos_eos.size() + + return { + "mask": mask_with_bos_eos, + "token_embedding": token_embedding.view(batch_size, + sequence_length, -1), + } + + def _load_weights(self, load_weights=True): + self._load_char_embedding(load_weights) + self._load_cnn_weights(load_weights) + self._load_highway(load_weights) + self._load_projection(load_weights) + + def _load_char_embedding(self, load_weights): + + if load_weights: + with h5py.File(self._weight_file, "r") as fin: + char_embed_weights = fin["char_embed"][...] + + weights = numpy.zeros( + (char_embed_weights.shape[0] + 1, char_embed_weights.shape[1]), + dtype="float32" + ) + weights[1:, :] = char_embed_weights + + self._char_embedding_weights = torch.nn.Parameter( + torch.FloatTensor(weights), requires_grad=self.requires_grad + ) + else: + weights = numpy.zeros( + (self._options['char_cnn']['n_characters'], + self._options['char_cnn']['embedding']['dim']), + dtype="float32" + ) + self._char_embedding_weights = torch.nn.Parameter( + torch.FloatTensor(weights), requires_grad=self.requires_grad + ) + + def _load_cnn_weights(self, load_weights): + cnn_options = self._options["char_cnn"] + filters = cnn_options["filters"] + char_embed_dim = cnn_options["embedding"]["dim"] + + convolutions = [] + for i, (width, num) in enumerate(filters): + conv = torch.nn.Conv1d( + in_channels=char_embed_dim, out_channels=num, + kernel_size=width, bias=True + ) + + if load_weights: + # load the weights + with h5py.File(self._weight_file, "r") as fin: + weight = fin["CNN"]["W_cnn_{}".format(i)][...] + bias = fin["CNN"]["b_cnn_{}".format(i)][...] 
+ + w_reshaped = numpy.transpose(weight.squeeze(axis=0), + axes=(2, 1, 0)) + if w_reshaped.shape != tuple(conv.weight.data.shape): + raise ValueError("Invalid weight file") + conv.weight.data.copy_(torch.FloatTensor(w_reshaped)) + conv.bias.data.copy_(torch.FloatTensor(bias)) + + conv.weight.requires_grad = self.requires_grad + conv.bias.requires_grad = self.requires_grad + + convolutions.append(conv) + self.add_module("char_conv_{}".format(i), conv) + + self._convolutions = convolutions + + def _load_highway(self, load_weights): + + # the highway layers have same dimensionality as the number of cnn + # filters + cnn_options = self._options["char_cnn"] + filters = cnn_options["filters"] + n_filters = sum(f[1] for f in filters) + n_highway = cnn_options["n_highway"] + + # create the layers, and load the weights + self._highways = Highway(n_filters, n_highway, + activation=torch.nn.functional.relu) + + if load_weights: + for k in range(n_highway): + # The AllenNLP highway is one matrix multplication with + # concatenation of transform and carry weights. + with h5py.File(self._weight_file, "r") as fin: + # The weights are transposed due to multiplication order + # assumptions in tf vs pytorch (tf.matmul(X, W) vs + # pytorch.matmul(W, X)) + w_transform = numpy.transpose( + fin["CNN_high_{}".format(k)]["W_transform"][...]) + # -1.0 since AllenNLP is g * x + (1 - g) * f(x) but + # tf is (1 - g) * x + g * f(x) + w_carry = -1.0 * numpy.transpose( + fin["CNN_high_{}".format(k)]["W_carry"][...]) + weight = numpy.concatenate([w_transform, w_carry], axis=0) + self._highways._layers[k].weight.data.copy_( + torch.FloatTensor(weight)) + self._highways._layers[k].weight.requires_grad = \ + self.requires_grad + + b_transform = \ + fin["CNN_high_{}".format(k)]["b_transform"][...] + b_carry = \ + -1.0 * fin["CNN_high_{}".format(k)]["b_carry"][...] + bias = numpy.concatenate([b_transform, b_carry], axis=0) + self._highways._layers[k].bias.data.copy_( + torch.FloatTensor(bias)) + self._highways._layers[k].bias.requires_grad = \ + self.requires_grad + + def _load_projection(self, load_weights): + cnn_options = self._options["char_cnn"] + filters = cnn_options["filters"] + n_filters = sum(f[1] for f in filters) + + self._projection = torch.nn.Linear(n_filters, self.output_dim, + bias=True) + + if load_weights: + with h5py.File(self._weight_file, "r") as fin: + weight = fin["CNN_proj"]["W_proj"][...] + bias = fin["CNN_proj"]["b_proj"][...] + self._projection.weight.data.copy_(torch.FloatTensor( + numpy.transpose(weight))) + self._projection.bias.data.copy_(torch.FloatTensor(bias)) + self._projection.weight.requires_grad = self.requires_grad + self._projection.bias.requires_grad = self.requires_grad + + +RnnState = Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]] +RnnStateStorage = Tuple[torch.Tensor, ...] + + +class _EncoderBase(torch.nn.Module): + r"""This abstract class serves as a base for the 3 `Encoder` abstractions + in AllenNLP. + - [`Seq2SeqEncoders`](./seq2seq_encoders/seq2seq_encoder.md) + - [`Seq2VecEncoders`](./seq2vec_encoders/seq2vec_encoder.md) + + Additionally, this class provides functionality for sorting sequences by + length so they can be consumed by Pytorch RNN classes, which require their + inputs to be sorted by length. Finally, it also provides optional + statefulness to all of it's subclasses by allowing the caching and + retrieving of the hidden states of RNNs. 
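    A condensed sketch of the calling pattern a subclass typically follows
    (`self._lstm` is a hypothetical wrapped RNN module; restoring the
    original time padding is elided):

    .. code-block:: python

        # Inside a hypothetical subclass's `forward`:
        packed_output, final_states, restoration_indices = \
            self.sort_and_run_forward(self._lstm, inputs, mask)
        if self.stateful:
            self._update_states(final_states, restoration_indices)
        unpacked, _ = pad_packed_sequence(packed_output, batch_first=True)
        # Completely padded rows were clipped off before running the module,
        # so zero-pad back to the full batch size before un-sorting.
        num_valid, length, dim = unpacked.size()
        zeros = unpacked.new_zeros(mask.size(0) - num_valid, length, dim)
        output = torch.cat([unpacked, zeros], 0).index_select(
            0, restoration_indices)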
+ """ + + def __init__(self, stateful: bool = False) -> None: + super().__init__() + self.stateful = stateful + self._states: Optional[RnnStateStorage] = None + + def sort_and_run_forward( + self, + module: Callable[ + [PackedSequence, Optional[RnnState]], + Tuple[Union[PackedSequence, torch.Tensor], RnnState], + ], + inputs: torch.Tensor, + mask: torch.Tensor, + hidden_state: Optional[RnnState] = None, + ): + r"""This function exists because Pytorch RNNs require that their inputs + be sorted before being passed as input. As all of our Seq2xxxEncoders + use this functionality, it is provided in a base class. This method can + be called on any module which takes as input a `PackedSequence` and + some `hidden_state`, which can either be a tuple of tensors or a tensor. + + As all of our Seq2xxxEncoders have different return types, we return + `sorted` outputs from the module, which is called directly. + Additionally, we return the indices into the batch dimension required + to restore the tensor to it's correct, unsorted order and the number of + valid batch elements (i.e the number of elements in the batch which are + not completely masked). This un-sorting and re-padding of the module + outputs is left to the subclasses because their outputs have different + types and handling them smoothly here is difficult. + + # Parameters + + module : `Callable[[PackedSequence, Optional[RnnState]], + Tuple[Union[PackedSequence, torch.Tensor], + RnnState]]`, required. + A function to run on the inputs. In most cases, this is a + `torch.nn.Module`. + inputs : `torch.Tensor`, required. + A tensor of shape `(batch_size, sequence_length, embedding_size)` + representing the inputs to the Encoder. + mask : `torch.Tensor`, required. + A tensor of shape `(batch_size, sequence_length)`, representing + masked and non-masked elements of the sequence for each element in + the batch. + hidden_state : `Optional[RnnState]`, (default = None). + A single tensor of shape (num_layers, batch_size, hidden_size) + representing the state of an RNN with or a tuple of tensors of + shapes (num_layers, batch_size, hidden_size) and + (num_layers, batch_size, memory_size), representing the hidden + state and memory state of an LSTM-like RNN. + + # Returns + + module_output : `Union[torch.Tensor, PackedSequence]`. + A Tensor or PackedSequence representing the output of the Pytorch + Module. The batch size dimension will be equal to `num_valid`, as + sequences of zero length are clipped off before the module is + called, as Pytorch cannot handle zero length sequences. + final_states : `Optional[RnnState]` + A Tensor representing the hidden state of the Pytorch Module. This + can either be a single tensor of shape (num_layers, num_valid, + hidden_size), for instance in the case of a GRU, or a tuple of + tensors, such as those required for an LSTM. + restoration_indices : `torch.LongTensor` + A tensor of shape `(batch_size,)`, describing the re-indexing + required to transform the outputs back to their original batch + order. + """ + # In some circumstances you may have sequences of zero length. + # `pack_padded_sequence` requires all sequence lengths to be > 0, so + # remove sequences of zero length before calling self._module, then + # fill with zeros. + + # First count how many sequences are empty. 
+ batch_size = mask.size(0) + num_valid = torch.sum(mask[:, 0]).int().item() + + sequence_lengths = get_lengths_from_binary_sequence_mask(mask) + ( + sorted_inputs, + sorted_sequence_lengths, + restoration_indices, + sorting_indices, + ) = sort_batch_by_length(inputs, sequence_lengths) + + # Now create a PackedSequence with only the non-empty, sorted sequences. + packed_sequence_input = pack_padded_sequence( + sorted_inputs[:num_valid, :, :], + sorted_sequence_lengths[:num_valid].data.tolist(), + batch_first=True, + ) + # Prepare the initial states. + if not self.stateful: + if hidden_state is None: + initial_states: Any = hidden_state + elif isinstance(hidden_state, tuple): + initial_states = [ + state.index_select( + 1, sorting_indices)[:, :num_valid, :].contiguous() + for state in hidden_state + ] + else: + initial_states = hidden_state.index_select(1, sorting_indices)[ + :, :num_valid, :].contiguous() + + else: + initial_states = self._get_initial_states(batch_size, num_valid, + sorting_indices) + + # Actually call the module on the sorted PackedSequence. + module_output, final_states = module(packed_sequence_input, + initial_states) + + return module_output, final_states, restoration_indices + + def _get_initial_states( + self, batch_size: int, num_valid: int, sorting_indices: torch.LongTensor + ) -> Optional[RnnState]: + r"""Returns an initial state for use in an RNN. Additionally, this + method handles the batch size changing across calls by mutating the + state to append initial states for new elements in the batch. Finally, + it also handles sorting the states with respect to the sequence lengths + of elements in the batch and removing rows which are completely padded. + Importantly, this `mutates` the state if the current batch size is + larger than when it was previously called. + + # Parameters + + batch_size : `int`, required. + The batch size can change size across calls to stateful RNNs, so we + need to know if we need to expand or shrink the states before + returning them. Expanded states will be set to zero. + num_valid : `int`, required. + The batch may contain completely padded sequences which get removed + before the sequence is passed through the encoder. We also need to + clip these off of the state too. + sorting_indices `torch.LongTensor`, required. + Pytorch RNNs take sequences sorted by length. When we return the + states to be used for a given call to `module.forward`, we need the + states to match up to the sorted sequences, so before returning + them, we sort the states using the same indices used to sort the + sequences. + + # Returns + + This method has a complex return type because it has to deal with the + first time it is called, when it has no state, and the fact that types + of RNN have heterogeneous states. + + If it is the first time the module has been called, it returns `None`, + regardless of the type of the `Module`. + + Otherwise, for LSTMs, it returns a tuple of `torch.Tensors` with shape + `(num_layers, num_valid, state_size)` and `(num_layers, num_valid, + memory_size)` respectively, or for GRUs, it returns a single + `torch.Tensor` of shape `(num_layers, num_valid, state_size)`. + """ + # We don't know the state sizes the first time calling forward, + # so we let the module define what it's initial hidden state looks like. + if self._states is None: + return None + + # Otherwise, we have some previous states. + if batch_size > self._states[0].size(1): + # This batch is larger than the all previous states. + # If so, resize the states. 
+ num_states_to_concat = batch_size - self._states[0].size(1) + resized_states = [] + # state has shape (num_layers, batch_size, hidden_size) + for state in self._states: + # This _must_ be inside the loop because some + # RNNs have states with different last dimension sizes. + zeros = state.new_zeros(state.size(0), num_states_to_concat, + state.size(2)) + resized_states.append(torch.cat([state, zeros], 1)) + self._states = tuple(resized_states) + correctly_shaped_states = self._states + + elif batch_size < self._states[0].size(1): + # This batch is smaller than the previous one. + correctly_shaped_states = tuple(state[:, :batch_size, :] for state + in self._states) + else: + correctly_shaped_states = self._states + + # At this point, our states are of shape (num_layers, batch_size, + # hidden_size). However, the encoder uses sorted sequences and + # additionally removes elements of the batch which are fully padded. + # We need the states to match up to these sorted and filtered + # sequences, so we do that in the next two blocks before returning the + # state/s. + if len(self._states) == 1: + # GRUs only have a single state. This `unpacks` it from the + # tuple and returns the tensor directly. + correctly_shaped_state = correctly_shaped_states[0] + sorted_state = correctly_shaped_state.index_select( + 1, sorting_indices) + return sorted_state[:, :num_valid, :].contiguous() + else: + # LSTMs have a state tuple of (state, memory). + sorted_states = [ + state.index_select(1, sorting_indices) for state in + correctly_shaped_states + ] + return tuple(state[:, :num_valid, :].contiguous() # type: ignore + for state in sorted_states) + + def _update_states(self, final_states: RnnStateStorage, + restoration_indices: torch.LongTensor) -> None: + r"""After the RNN has run forward, the states need to be updated. + This method just sets the state to the updated new state, performing + several pieces of book-keeping along the way - namely, unsorting the + states and ensuring that the states of completely padded sequences are + not updated. Finally, it also detaches the state variable from the + computational graph, such that the graph can be garbage collected after + each batch iteration. + + # Parameters + + final_states : `RnnStateStorage`, required. + The hidden states returned as output from the RNN. + restoration_indices : `torch.LongTensor`, required. + The indices that invert the sorting used in `sort_and_run_forward` + to order the states with respect to the lengths of the sequences in + the batch. + """ + # TODO(Mark): seems weird to sort here, but append zeros in the + # subclasses. + # which way around is best? + new_unsorted_states = [state.index_select(1, restoration_indices) for + state in final_states] + + if self._states is None: + # We don't already have states, so just set the + # ones we receive to be the current state. + self._states = tuple(state.data for state in new_unsorted_states) + else: + # Now we've sorted the states back so that they correspond to the + # original indices, we need to figure out what states we need to + # update, because if we didn't use a state for a particular row, + # we want to preserve its state. Thankfully, the rows which are + # all zero in the state correspond exactly to those which aren't + # used, so we create masks of shape (new_batch_size,), denoting + # which states were used in the RNN computation. 
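The masking trick described above can be seen in isolation in the following sketch (illustrative shapes, mirroring the update that follows): rows of the new state that are all zeros were never used in the forward pass, so the old state is preserved for exactly those rows.

import torch

old_states = torch.randn(1, 5, 7)        # (num_layers, batch, hidden)
new_states = torch.randn(1, 5, 7)
new_states[:, -2:, :] = 0                # two rows were never used

# 1.0 where a new state was actually produced, 0.0 where it is all zeros.
used_mask = (new_states[0].sum(-1) != 0.0).float().view(1, 5, 1)
merged = new_states + old_states * (1 - used_mask)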
+ current_state_batch_size = self._states[0].size(1) + new_state_batch_size = final_states[0].size(1) + # Masks for the unused states of shape (1, new_batch_size, 1) + used_new_rows_mask = [ + (state[0, :, :].sum(-1) != 0.0).float().view( + 1, new_state_batch_size, 1) + for state in new_unsorted_states + ] + new_states = [] + if current_state_batch_size > new_state_batch_size: + # The new state is smaller than the old one, + # so just update the indices which we used. + for old_state, new_state, used_mask in zip( + self._states, new_unsorted_states, used_new_rows_mask + ): + # zero out all rows in the previous state + # which _were_ used in the current state. + masked_old_state = \ + old_state[:, :new_state_batch_size, :] * (1 - used_mask) + # The old state is larger, so update the relevant parts of + # it. + old_state[:, :new_state_batch_size, :] = \ + new_state + masked_old_state + new_states.append(old_state.detach()) + else: + # The states are the same size, so we just have to + # deal with the possibility that some rows weren't used. + new_states = [] + for old_state, new_state, used_mask in zip( + self._states, new_unsorted_states, used_new_rows_mask + ): + # zero out all rows which _were_ used in the current state. + masked_old_state = old_state * (1 - used_mask) + # The old state is larger, so update the relevant parts of + # it. + new_state += masked_old_state + new_states.append(new_state.detach()) + + # It looks like there should be another case handled here - when + # the current_state_batch_size < new_state_batch_size. However, + # this never happens, because the states themeselves are mutated + # by appending zeros when calling _get_inital_states, meaning that + # the new states are either of equal size, or smaller, in the case + # that there are some unused elements (zero-length) for the RNN + # computation. + self._states = tuple(new_states) + + def reset_states(self, mask: Optional[torch.Tensor] = None) -> None: + r"""Resets the internal states of a stateful encoder. + + # Parameters + + mask : `torch.Tensor`, optional. + A tensor of shape `(batch_size,)` indicating which states should + be reset. If not provided, all states will be reset. + """ + if mask is None: + self._states = None + else: + # state has shape (num_layers, batch_size, hidden_size). We reshape + # mask to have shape (1, batch_size, 1) so that operations + # broadcast properly. + mask_batch_size = mask.size(0) + mask = mask.float().view(1, mask_batch_size, 1) + new_states = [] + assert self._states is not None + for old_state in self._states: + old_state_batch_size = old_state.size(1) + if old_state_batch_size != mask_batch_size: + raise ValueError( + f"Trying to reset states using mask with incorrect " + f"batch size. " + f"Expected batch size: {old_state_batch_size}. " + f"Provided batch size: {mask_batch_size}." + ) + new_state = (1 - mask) * old_state + new_states.append(new_state.detach()) + self._states = tuple(new_states) + + +class ElmoLstm(_EncoderBase): + r"""A stacked, bidirectional LSTM which uses + [`LstmCellWithProjection`'s](./lstm_cell_with_projection.md) + with highway layers between the inputs to layers. + The inputs to the forward and backward directions are independent - + forward and backward states are not concatenated between layers. + + Additionally, this LSTM maintains its `own` state, which is updated every + time `forward` is called. 
It is dynamically resized for different batch + sizes and is designed for use with non-continuous inputs (i.e inputs which + aren't formatted as a stream, such as text used for a language modeling + task, which is how stateful RNNs are typically used). + This is non-standard, but can be thought of as having an "end of sentence" + state, which is carried across different sentences. + + # Parameters + + input_size : `int`, required + The dimension of the inputs to the LSTM. + hidden_size : `int`, required + The dimension of the outputs of the LSTM. + cell_size : `int`, required. + The dimension of the memory cell of the `LstmCellWithProjection`. + num_layers : `int`, required + The number of bidirectional LSTMs to use. + requires_grad : `bool`, optional + If True, compute gradient of ELMo parameters for fine tuning. + recurrent_dropout_probability : `float`, optional (default = 0.0) + The dropout probability to be used in a dropout scheme as stated in + [A Theoretically Grounded Application of Dropout in Recurrent Neural + Networks](https://arxiv.org/abs/1512.05287). + state_projection_clip_value : `float`, optional, (default = None) + The magnitude with which to clip the hidden_state after projecting it. + memory_cell_clip_value : `float`, optional, (default = None) + The magnitude with which to clip the memory cell. + """ + + def __init__( + self, + input_size: int, + hidden_size: int, + cell_size: int, + num_layers: int, + requires_grad: bool = False, + recurrent_dropout_probability: float = 0.0, + memory_cell_clip_value: Optional[float] = None, + state_projection_clip_value: Optional[float] = None, + ) -> None: + super().__init__(stateful=True) + + # Required to be wrapped with a `PytorchSeq2SeqWrapper`. + self.input_size = input_size + self.hidden_size = hidden_size + self.num_layers = num_layers + self.cell_size = cell_size + self.requires_grad = requires_grad + + forward_layers = [] + backward_layers = [] + + lstm_input_size = input_size + go_forward = True + for layer_index in range(num_layers): + forward_layer = LstmCellWithProjection( + lstm_input_size, + hidden_size, + cell_size, + go_forward, + recurrent_dropout_probability, + memory_cell_clip_value, + state_projection_clip_value, + ) + backward_layer = LstmCellWithProjection( + lstm_input_size, + hidden_size, + cell_size, + not go_forward, + recurrent_dropout_probability, + memory_cell_clip_value, + state_projection_clip_value, + ) + lstm_input_size = hidden_size + + self.add_module("forward_layer_{}".format(layer_index), + forward_layer) + self.add_module("backward_layer_{}".format(layer_index), + backward_layer) + forward_layers.append(forward_layer) + backward_layers.append(backward_layer) + self.forward_layers = forward_layers + self.backward_layers = backward_layers + + def forward(self, inputs: torch.Tensor, # type: ignore + mask: torch.LongTensor) -> torch.Tensor: + r"""Encodes the inputs. + + # Parameters + + inputs : `torch.Tensor`, required. + A Tensor of shape `(batch_size, sequence_length, hidden_size)`. + mask : `torch.LongTensor`, required. + A binary mask of shape `(batch_size, sequence_length)` representing + the non-padded elements in each sequence in the batch. + + # Returns + + A `torch.Tensor` of shape (num_layers, batch_size, sequence_length, + hidden_size), where the num_layers dimension represents the LSTM output + from that layer. 
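A usage sketch for the bidirectional stack defined here (illustrative sizes; the import path is assumed to match the one used by the accompanying tests for this module):

import torch
from texar.torch.modules.pretrained.elmo_utils import ElmoLstm  # assumed path

elmo_lstm = ElmoLstm(input_size=10, hidden_size=10, cell_size=20, num_layers=2)
inputs = torch.randn(3, 6, 10)           # (batch, time, input_size)
mask = torch.ones(3, 6, dtype=torch.long)
mask[1, 4:] = 0                          # second sequence has length 4
stacked = elmo_lstm(inputs, mask)
# stacked: (2, 3, 6, 20) = (num_layers, batch, time, forward + backward states)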
+ """ + batch_size, total_sequence_length = mask.size() + stacked_sequence_output, final_states, restoration_indices = \ + self.sort_and_run_forward(self._lstm_forward, inputs, mask) + + num_layers, num_valid, returned_timesteps, encoder_dim = \ + stacked_sequence_output.size() + # Add back invalid rows which were removed in the call to + # sort_and_run_forward. + if num_valid < batch_size: + zeros = stacked_sequence_output.new_zeros( + num_layers, batch_size - num_valid, returned_timesteps, + encoder_dim + ) + stacked_sequence_output = torch.cat( + [stacked_sequence_output, zeros], 1) + + # The states also need to have invalid rows added back. + new_states = [] + for state in final_states: + state_dim = state.size(-1) + zeros = state.new_zeros(num_layers, batch_size - num_valid, + state_dim) + new_states.append(torch.cat([state, zeros], 1)) + final_states = new_states + + # It's possible to need to pass sequences which are padded to longer + # than the max length of the sequence to a Seq2StackEncoder. However, + # packing and unpacking the sequences mean that the returned tensor + # won't include these dimensions, because the RNN did not need to + # process them. We add them back on in the form of zeros here. + sequence_length_difference = total_sequence_length - returned_timesteps + if sequence_length_difference > 0: + zeros = stacked_sequence_output.new_zeros( + num_layers, + batch_size, + sequence_length_difference, + stacked_sequence_output[0].size(-1), + ) + stacked_sequence_output = torch.cat( + [stacked_sequence_output, zeros], 2) + + self._update_states(final_states, restoration_indices) + + # Restore the original indices and return the sequence. + # Has shape (num_layers, batch_size, sequence_length, hidden_size) + return stacked_sequence_output.index_select(1, restoration_indices) + + def _lstm_forward( + self, + inputs: PackedSequence, + initial_state: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, + ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: + r"""Encodes the inputs. + + # Parameters + + inputs : `PackedSequence`, required. + A batch first `PackedSequence` to run the stacked LSTM over. + initial_state : `Tuple[torch.Tensor, torch.Tensor]`, optional, + (default = None) + A tuple (state, memory) representing the initial hidden state and + memory of the LSTM, with shape + (num_layers, batch_size, 2 * hidden_size) and + (num_layers, batch_size, 2 * cell_size) respectively. + + # Returns + + output_sequence : `torch.FloatTensor` + The encoded sequence of shape + (num_layers, batch_size, sequence_length, hidden_size) + final_states : `Tuple[torch.FloatTensor, torch.FloatTensor]` + The per-layer final (state, memory) states of the LSTM, with shape + (num_layers, batch_size, 2 * hidden_size) and + (num_layers, batch_size, 2 * cell_size) + respectively. The last dimension is duplicated because it + contains the state/memory for both the forward and backward layers. + """ + if initial_state is None: + hidden_states: List[Optional[Tuple[torch.Tensor, torch.Tensor]]] = \ + [None] * len(self.forward_layers) + elif initial_state[0].size()[0] != len(self.forward_layers): + raise ConfigurationError( + "Initial states were passed to forward() but the number of " + "initial states does not match the number of layers." 
+ ) + else: + hidden_states = list(zip(initial_state[0].split(1, 0), + initial_state[1].split(1, 0))) + + inputs, batch_lengths = pad_packed_sequence(inputs, batch_first=True) + forward_output_sequence = inputs + backward_output_sequence = inputs + + final_states = [] + sequence_outputs = [] + for layer_index, state in enumerate(hidden_states): + forward_layer = getattr(self, "forward_layer_{}".format( + layer_index)) + backward_layer = getattr(self, "backward_layer_{}".format( + layer_index)) + + forward_cache = forward_output_sequence + backward_cache = backward_output_sequence + + if state is not None: + forward_hidden_state, backward_hidden_state = state[0].split( + self.hidden_size, 2) + forward_memory_state, backward_memory_state = state[1].split( + self.cell_size, 2) + forward_state = (forward_hidden_state, forward_memory_state) + backward_state = (backward_hidden_state, backward_memory_state) + else: + forward_state = None # type: ignore + backward_state = None # type: ignore + + forward_output_sequence, forward_state = forward_layer( + forward_output_sequence, batch_lengths, forward_state + ) + backward_output_sequence, backward_state = backward_layer( + backward_output_sequence, batch_lengths, backward_state + ) + # Skip connections, just adding the input to the output. + if layer_index != 0: + forward_output_sequence += forward_cache + backward_output_sequence += backward_cache + + sequence_outputs.append( + torch.cat([forward_output_sequence, backward_output_sequence], + -1) + ) + # Append the state tuples in a list, so that we can return + # the final states for all the layers. + final_states.append( + ( + torch.cat([forward_state[0], backward_state[0]], -1), + torch.cat([forward_state[1], backward_state[1]], -1), + ) + ) + + stacked_sequence_outputs: torch.FloatTensor = torch.stack( + sequence_outputs) + # Stack the hidden state and memory for each layer into 2 tensors of + # shape (num_layers, batch_size, hidden_size) and + # (num_layers, batch_size, cell_size) respectively. + final_hidden_states, final_memory_states = zip(*final_states) + final_state_tuple: Tuple[torch.FloatTensor, torch.FloatTensor] = ( + torch.cat(final_hidden_states, 0), + torch.cat(final_memory_states, 0), + ) + return stacked_sequence_outputs, final_state_tuple + + def load_weights(self, weight_file: str) -> None: + r"""Load the pre-trained weights from the file. + """ + requires_grad = self.requires_grad + + with h5py.File(weight_file, "r") as fin: + for i_layer, lstms in enumerate(zip(self.forward_layers, + self.backward_layers)): + for j_direction, lstm in enumerate(lstms): + # lstm is an instance of LSTMCellWithProjection + cell_size = lstm.cell_size + + dataset = fin["RNN_%s" % j_direction]["RNN"][ + "MultiRNNCell"][ + "Cell%s" % i_layer + ]["LSTMCell"] + + # tensorflow packs together both W and U matrices into one + # matrix, but pytorch maintains individual matrices. In + # addition, tensorflow packs the gates as input, memory, + # forget, output but pytorch uses input, forget, memory, + # output. So we need to modify the weights. 
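The gate re-ordering described in the comment above amounts to swapping the second and third `cell_size`-wide blocks; a toy sketch with a flat array (illustrative only):

import numpy

cell_size = 4
tf_w = numpy.arange(4 * cell_size)       # packed as (input, memory, forget, output)
torch_w = tf_w.copy()                    # target order (input, forget, memory, output)
torch_w[1 * cell_size:2 * cell_size] = tf_w[2 * cell_size:3 * cell_size]
torch_w[2 * cell_size:3 * cell_size] = tf_w[1 * cell_size:2 * cell_size]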
+                    tf_weights = numpy.transpose(dataset["W_0"][...])
+                    torch_weights = tf_weights.copy()
+
+                    # split the W from U matrices
+                    input_size = lstm.input_size
+                    input_weights = torch_weights[:, :input_size]
+                    recurrent_weights = torch_weights[:, input_size:]
+                    tf_input_weights = tf_weights[:, :input_size]
+                    tf_recurrent_weights = tf_weights[:, input_size:]
+
+                    # handle the different gate order convention
+                    for torch_w, tf_w in [
+                        [input_weights, tf_input_weights],
+                        [recurrent_weights, tf_recurrent_weights],
+                    ]:
+                        torch_w[(1 * cell_size): (2 * cell_size), :] = tf_w[
+                            (2 * cell_size): (3 * cell_size), :
+                        ]
+                        torch_w[(2 * cell_size): (3 * cell_size), :] = tf_w[
+                            (1 * cell_size): (2 * cell_size), :
+                        ]
+
+                    lstm.input_linearity.weight.data.copy_(torch.FloatTensor(
+                        input_weights))
+                    lstm.state_linearity.weight.data.copy_(torch.FloatTensor(
+                        recurrent_weights))
+                    lstm.input_linearity.weight.requires_grad = requires_grad
+                    lstm.state_linearity.weight.requires_grad = requires_grad
+
+                    # the bias weights
+                    tf_bias = dataset["B"][...]
+                    # tensorflow adds 1.0 to forget gate bias instead of
+                    # modifying the parameters...
+                    tf_bias[(2 * cell_size): (3 * cell_size)] += 1
+                    torch_bias = tf_bias.copy()
+                    torch_bias[(1 * cell_size): (2 * cell_size)] = tf_bias[
+                        (2 * cell_size): (3 * cell_size)
+                    ]
+                    torch_bias[(2 * cell_size): (3 * cell_size)] = tf_bias[
+                        (1 * cell_size): (2 * cell_size)
+                    ]
+                    lstm.state_linearity.bias.data.copy_(torch.FloatTensor(
+                        torch_bias))
+                    lstm.state_linearity.bias.requires_grad = requires_grad
+
+                    # the projection weights
+                    proj_weights = numpy.transpose(dataset["W_P_0"][...])
+                    lstm.state_projection.weight.data.copy_(torch.FloatTensor(
+                        proj_weights))
+                    lstm.state_projection.weight.requires_grad = requires_grad
+
+
+class LstmCellWithProjection(torch.nn.Module):
+    r"""An LSTM with Recurrent Dropout and a projected and clipped hidden state
+    and memory. Note: this implementation is slower than the native Pytorch
+    LSTM because it cannot make use of CUDNN optimizations for stacked RNNs,
+    due to the variational dropout and the custom nature of the cell state.
+
+    # Parameters
+
+    input_size : `int`, required.
+        The dimension of the inputs to the LSTM.
+    hidden_size : `int`, required.
+        The dimension of the outputs of the LSTM.
+    cell_size : `int`, required.
+        The dimension of the memory cell used for the LSTM.
+    go_forward : `bool`, optional (default = True)
+        The direction in which the LSTM is applied to the sequence.
+        Forwards by default, or backwards if False.
+    recurrent_dropout_probability : `float`, optional (default = 0.0)
+        The dropout probability to be used in a dropout scheme as stated in
+        [A Theoretically Grounded Application of Dropout in Recurrent Neural
+        Networks](https://arxiv.org/abs/1512.05287). Implementation-wise,
+        this simply applies a fixed dropout mask per sequence to the recurrent
+        connection of the LSTM.
+    state_projection_clip_value : `float`, optional, (default = None)
+        The magnitude with which to clip the hidden_state after projecting it.
+    memory_cell_clip_value : `float`, optional, (default = None)
+        The magnitude with which to clip the memory cell.
+
+    # Returns
+
+    output_accumulator : `torch.FloatTensor`
+        The outputs of the LSTM for each timestep. A tensor of shape
+        (batch_size, max_timesteps, hidden_size) where for a given batch
+        element, all outputs past the sequence length for that batch are
+        zero tensors.
+ final_state : `Tuple[torch.FloatTensor, torch.FloatTensor]` + The final (state, memory) states of the LSTM, with shape + (1, batch_size, hidden_size) and (1, batch_size, cell_size) + respectively. The first dimension is 1 in order to match the Pytorch + API for returning stacked LSTM states. + """ + + def __init__( + self, + input_size: int, + hidden_size: int, + cell_size: int, + go_forward: bool = True, + recurrent_dropout_probability: float = 0.0, + memory_cell_clip_value: Optional[float] = None, + state_projection_clip_value: Optional[float] = None, + ) -> None: + super().__init__() + # Required to be wrapped with a `PytorchSeq2SeqWrapper`. + self.input_size = input_size + self.hidden_size = hidden_size + self.cell_size = cell_size + + self.go_forward = go_forward + self.state_projection_clip_value = state_projection_clip_value + self.memory_cell_clip_value = memory_cell_clip_value + self.recurrent_dropout_probability = recurrent_dropout_probability + + # We do the projections for all the gates all at once. + self.input_linearity = torch.nn.Linear( + input_size, 4 * cell_size, bias=False) + self.state_linearity = torch.nn.Linear( + hidden_size, 4 * cell_size, bias=True) + + # Additional projection matrix for making the hidden state smaller. + self.state_projection = torch.nn.Linear( + cell_size, hidden_size, bias=False) + self.reset_parameters() + + def reset_parameters(self): + # Use sensible default initializations for parameters. + block_orthogonal(self.input_linearity.weight.data, + [self.cell_size, self.input_size]) + block_orthogonal(self.state_linearity.weight.data, + [self.cell_size, self.hidden_size]) + + self.state_linearity.bias.data.fill_(0.0) + # Initialize forget gate biases to 1.0 as per An Empirical + # Exploration of Recurrent Network Architectures, (Jozefowicz, 2015). + self.state_linearity.bias.data[self.cell_size: + 2 * self.cell_size].fill_(1.0) + + def forward( # type: ignore + self, + inputs: torch.FloatTensor, + batch_lengths: List[int], + initial_state: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, + ): + r"""Process the inputs. + + # Parameters + + inputs : `torch.FloatTensor`, required. + A tensor of shape (batch_size, num_timesteps, input_size) + to apply the LSTM over. + batch_lengths : `List[int]`, required. + A list of length batch_size containing the lengths of the sequences + in batch. + initial_state : `Tuple[torch.Tensor, torch.Tensor]`, optional, + (default = None) + A tuple (state, memory) representing the initial hidden state and + memory of the LSTM. The `state` has shape (1, batch_size, + hidden_size) and the `memory` has shape (1, batch_size, cell_size). + + # Returns + + output_accumulator : `torch.FloatTensor` + The outputs of the LSTM for each timestep. A tensor of shape + (batch_size, max_timesteps, hidden_size) where for a given batch + element, all outputs past the sequence length for that batch are + zero tensors. + final_state : `Tuple[`torch.FloatTensor, torch.FloatTensor]` + A tuple (state, memory) representing the initial hidden state and + memory of the LSTM. The `state` has shape (1, batch_size, + hidden_size) and the `memory` has shape (1, batch_size, cell_size). 
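A usage sketch for this cell (illustrative sizes; note that, as with the packed batches above, sequences must be ordered longest to shortest):

import torch
from texar.torch.modules.pretrained.elmo_utils import LstmCellWithProjection

cell = LstmCellWithProjection(input_size=3, hidden_size=5, cell_size=7)
inputs = torch.randn(4, 6, 3)            # (batch, time, input_size)
batch_lengths = [6, 5, 3, 2]             # sorted, longest first
outputs, (state, memory) = cell(inputs, batch_lengths)
# outputs: (4, 6, 5); state: (1, 4, 5); memory: (1, 4, 7)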
+        """
+        batch_size = inputs.size()[0]
+        total_timesteps = inputs.size()[1]
+
+        output_accumulator = inputs.new_zeros(batch_size, total_timesteps,
+                                              self.hidden_size)
+
+        if initial_state is None:
+            full_batch_previous_memory = inputs.new_zeros(batch_size,
+                                                          self.cell_size)
+            full_batch_previous_state = inputs.new_zeros(batch_size,
+                                                         self.hidden_size)
+        else:
+            full_batch_previous_state = initial_state[0].squeeze(0)
+            full_batch_previous_memory = initial_state[1].squeeze(0)
+
+        current_length_index = batch_size - 1 if self.go_forward else 0
+        if self.recurrent_dropout_probability > 0.0 and self.training:
+            dropout_mask = get_dropout_mask(
+                self.recurrent_dropout_probability, full_batch_previous_state
+            )
+        else:
+            dropout_mask = None
+
+        for timestep in range(total_timesteps):
+            # The index depends on which end we start.
+            index = timestep if self.go_forward else \
+                total_timesteps - timestep - 1
+
+            # What we are doing here is finding the index into the batch
+            # dimension which we need to use for this timestep. Because the
+            # sequences have variable length, once the index is greater than
+            # the length of a particular batch sequence, we no longer need
+            # to do the computation for that sequence. The key thing to
+            # recognise here is that the batch inputs must be _ordered_ by
+            # length from longest (first in batch) to shortest (last), so
+            # initially we are going forwards with every sequence, and as we
+            # pass the index at which the shortest elements of the batch
+            # finish, we stop picking them up for the computation.
+            if self.go_forward:
+                while batch_lengths[current_length_index] <= index:
+                    current_length_index -= 1
+            # If we're going backwards, we are _picking up_ more indices.
+            else:
+                # First conditional: Are we already at the maximum number of
+                # elements in the batch?
+                # Second conditional: Does the next shortest sequence beyond
+                # the current batch index require computation at this timestep?
+                while (
+                    current_length_index < (len(batch_lengths) - 1)
+                    and batch_lengths[current_length_index + 1] > index
+                ):
+                    current_length_index += 1
+
+            # Actually get the slices of the batch which we
+            # need for the computation at this timestep.
+            # shape (batch_size, cell_size)
+            previous_memory = \
+                full_batch_previous_memory[0: current_length_index + 1].clone()
+            # Shape (batch_size, hidden_size)
+            previous_state = \
+                full_batch_previous_state[0: current_length_index + 1].clone()
+            # Shape (batch_size, input_size)
+            timestep_input = inputs[0: current_length_index + 1, index]
+
+            # Do the projections for all the gates all at once.
+            # Both have shape (batch_size, 4 * cell_size)
+            projected_input = self.input_linearity(timestep_input)
+            projected_state = self.state_linearity(previous_state)
+
+            # Main LSTM equations using relevant chunks of the big linear
+            # projections of the hidden state and inputs.
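For reference, the update implemented by the gate computations that follow is the standard LSTM cell with a projected, clipped hidden state (a sketch; biases are folded into the linear terms and `clip` is applied only when a clip value is configured):

    i_t = sigmoid(W_i x_t + U_i h_{t-1})          # input gate
    f_t = sigmoid(W_f x_t + U_f h_{t-1})          # forget gate
    g_t = tanh(W_g x_t + U_g h_{t-1})             # candidate memory
    o_t = sigmoid(W_o x_t + U_o h_{t-1})          # output gate
    c_t = clip(i_t * g_t + f_t * c_{t-1})         # memory cell
    h_t = clip(W_proj (o_t * tanh(c_t)))          # projected hidden state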
+ input_gate = torch.sigmoid( + projected_input[:, (0 * self.cell_size): (1 * self.cell_size)] + + projected_state[:, (0 * self.cell_size): (1 * self.cell_size)] + ) + forget_gate = torch.sigmoid( + projected_input[:, (1 * self.cell_size): (2 * self.cell_size)] + + projected_state[:, (1 * self.cell_size): (2 * self.cell_size)] + ) + memory_init = torch.tanh( + projected_input[:, (2 * self.cell_size): (3 * self.cell_size)] + + projected_state[:, (2 * self.cell_size): (3 * self.cell_size)] + ) + output_gate = torch.sigmoid( + projected_input[:, (3 * self.cell_size): (4 * self.cell_size)] + + projected_state[:, (3 * self.cell_size): (4 * self.cell_size)] + ) + memory = input_gate * memory_init + forget_gate * previous_memory + + # Here is the non-standard part of this LSTM cell; first, we clip + # the memory cell, then we project the output of the timestep to a + # smaller size and again clip it. + + if self.memory_cell_clip_value: + + memory = torch.clamp( + memory, -self.memory_cell_clip_value, + self.memory_cell_clip_value + ) + + # shape (current_length_index, cell_size) + pre_projection_timestep_output = output_gate * torch.tanh(memory) + + # shape (current_length_index, hidden_size) + timestep_output = self.state_projection( + pre_projection_timestep_output) + if self.state_projection_clip_value: + + timestep_output = torch.clamp( + timestep_output, + -self.state_projection_clip_value, + self.state_projection_clip_value, + ) + + # Only do dropout if the dropout prob is > 0.0 and we are in + # training mode. + if dropout_mask is not None: + timestep_output = \ + timestep_output * dropout_mask[0: current_length_index + 1] + + # We've been doing computation with less than the full batch, so + # here we create a new variable for the the whole batch at this + # timestep and insert the result for the relevant elements of the + # batch into it. + full_batch_previous_memory = full_batch_previous_memory.clone() + full_batch_previous_state = full_batch_previous_state.clone() + full_batch_previous_memory[0: current_length_index + 1] = memory + full_batch_previous_state[0: current_length_index + 1] = \ + timestep_output + output_accumulator[0: current_length_index + 1, index] = \ + timestep_output + + # Mimic the pytorch API by returning state in the following shape: + # (num_layers * num_directions, batch_size, ...). As this + # LSTM cell cannot be stacked, the first dimension here is just 1. + final_state = ( + full_batch_previous_state.unsqueeze(0), + full_batch_previous_memory.unsqueeze(0), + ) + + return output_accumulator, final_state + + +class Highway(torch.nn.Module): + r"""A [Highway layer](https://arxiv.org/abs/1505.00387) does a gated + combination of a linear transformation and a non-linear transformation of + its input. :math:`y = g * x + (1 - g) * f(A(x))`, + where :math:`A` is a linear transformation, :math:`f` is an element-wise + non-linearity, and :math:`g` is an element-wise gate, computed + as :math:`sigmoid(B(x))`. + + This module will apply a fixed number of highway layers to its input, + returning the final result. + + # Parameters + + input_dim : `int`, required + The dimensionality of :math:`x`. We assume the input has shape + `(batch_size, ..., input_dim)`. + num_layers : `int`, optional (default=`1`) + The number of highway layers to apply to the input. + activation : `Callable[[torch.Tensor], torch.Tensor]`, optional + (default=`torch.nn.functional.relu`) + The non-linearity to use in the highway layers. 
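A usage sketch (illustrative sizes; the import path matches the one used by the accompanying tests):

import torch
from texar.torch.modules.pretrained.elmo_utils import Highway

highway = Highway(input_dim=8, num_layers=2)
x = torch.randn(4, 6, 8)                 # any shape ending in input_dim
y = highway(x)                           # same shape as x: (4, 6, 8)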
+ """ + + def __init__(self, input_dim: int, num_layers: int = 1, + activation: Callable[[torch.Tensor], torch.Tensor] = + torch.nn.functional.relu,) -> None: + super().__init__() + self._input_dim = input_dim + self._layers = torch.nn.ModuleList( + [torch.nn.Linear(input_dim, input_dim * 2) + for _ in range(num_layers)] + ) + self._activation = activation + for layer in self._layers: + # We should bias the highway layer to just carry its input forward. + # We do that by setting the bias on `B(x)` to be positive, because + # that means `g` will be biased to be high, so we will carry the + # input forward. The bias on `B(x)` is the second half of the + # bias vector in each Linear layer. + layer.bias[input_dim:].data.fill_(1) # type: ignore + + def forward(self, inputs: torch.Tensor) -> torch.Tensor: # type: ignore + current_input = inputs + for layer in self._layers: + projected_input = layer(current_input) + linear_part = current_input + # NOTE: if you modify this, think about whether you should modify + # the initialization above, too. + nonlinear_part, gate = projected_input.chunk(2, dim=-1) + nonlinear_part = self._activation(nonlinear_part) + gate = torch.sigmoid(gate) + current_input = gate * linear_part + (1 - gate) * nonlinear_part + return current_input + + +class Embedding(torch.nn.Module): + r"""A more featureful embedding module than the default in Pytorch. Adds + the ability to: + + 1. embed higher-order inputs + 2. pre-specify the weight matrix + 3. use a non-trainable embedding + 4. project the resultant embeddings to some other dimension (which only + makes sense with non-trainable embeddings). + 5. build all of this easily `from_params` + + Note that if you are using our data API and are trying to embed a + [`TextField`](../../data/fields/text_field.md), you should use a + [`TextFieldEmbedder`](../text_field_embedders/text_field_embedder.md) + instead of using this directly. + + # Parameters + + num_embeddings : `int` + Size of the dictionary of embeddings (vocabulary size). + embedding_dim : `int` + The size of each embedding vector. + projection_dim : `int`, (optional, default=None) + If given, we add a projection layer after the embedding layer. This + really only makes sense if `trainable` is `False`. + weight : `torch.FloatTensor`, (optional, default=None) + A pre-initialised weight matrix for the embedding lookup, allowing the + use of pretrained vectors. + padding_index : `int`, (optional, default=None) + If given, pads the output with zeros whenever it encounters the index. + trainable : `bool`, (optional, default=True) + Whether or not to optimize the embedding parameters. + max_norm : `float`, (optional, default=None) + If given, will renormalize the embeddings to always have a norm lesser + than this + norm_type : `float`, (optional, default=2) + The p of the p-norm to compute for the max_norm option + scale_grad_by_freq : `bool`, (optional, default=False) + If given, this will scale gradients by the frequency of the words in + the mini-batch. + sparse : `bool`, (optional, default=False) + Whether or not the Pytorch backend should use a sparse representation + of the embedding weight. + vocab_namespace : `str`, (optional, default=None) + In case of fine-tuning/transfer learning, the model's embedding matrix + needs to be extended according to the size of extended-vocabulary. To + be able to know how much to extend the embedding-matrix, it's necessary + to know which vocab_namspace was used to construct it in the original + training. 
We store vocab_namespace used during the original training as + an attribute, so that it can be retrieved during fine-tuning. + pretrained_file : `str`, (optional, default=None) + Used to keep track of what is the source of the weights and loading + more embeddings at test time. **It does not load the weights from this + pretrained_file.** For that purpose, use `Embedding.from_params`. + + # Returns + + An Embedding module. + """ + + default_implementation = "embedding" + + def __init__( + self, + num_embeddings: int, + embedding_dim: int, + projection_dim: Optional[int] = None, + weight: Optional[torch.FloatTensor] = None, + padding_index: Optional[int] = None, + trainable: bool = True, + max_norm: Optional[float] = None, + norm_type: float = 2.0, + scale_grad_by_freq: bool = False, + sparse: bool = False, + vocab_namespace: Optional[str] = None, + pretrained_file: Optional[str] = None, + ) -> None: + super().__init__() + self.num_embeddings = num_embeddings + self.padding_index = padding_index + self.max_norm = max_norm + self.norm_type = norm_type + self.scale_grad_by_freq = scale_grad_by_freq + self.sparse = sparse + self._vocab_namespace = vocab_namespace + self._pretrained_file = pretrained_file + + self.output_dim = projection_dim or embedding_dim + + if weight is None: + weight = torch.FloatTensor(num_embeddings, embedding_dim) + self.weight = torch.nn.Parameter(weight, requires_grad=trainable) + torch.nn.init.xavier_uniform_(self.weight) + else: + if weight.size() != (num_embeddings, embedding_dim): + raise ConfigurationError( + "A weight matrix was passed with contradictory embedding " + "shapes." + ) + self.weight = torch.nn.Parameter(weight, + requires_grad=trainable) + + if self.padding_index is not None: + self.weight.data[self.padding_index].fill_(0) + + if projection_dim: + self._projection = torch.nn.Linear(embedding_dim, projection_dim) + else: + self._projection = None # type: ignore + + def forward(self, tokens: torch.Tensor) -> torch.Tensor: # type: ignore + # tokens may have extra dimensions + # (batch_size, d1, ..., dn, sequence_length), + # but embedding expects (batch_size, sequence_length), so pass tokens to + # util.combine_initial_dims (which is a no-op if there are no extra + # dimensions). Remember the original size. + original_size = tokens.size() + tokens = combine_initial_dims(tokens) + + embedded = embedding( + tokens, + self.weight, + padding_idx=self.padding_index, + max_norm=self.max_norm, + norm_type=self.norm_type, + scale_grad_by_freq=self.scale_grad_by_freq, + sparse=self.sparse, + ) + + # Now (if necessary) add back in the extra dimensions. + embedded = uncombine_initial_dims(embedded, original_size) + + if self._projection: + projection = self._projection + for _ in range(embedded.dim() - 2): + projection = TimeDistributed(projection) # type: ignore + embedded = projection(embedded) + return embedded + + +class TimeDistributed(torch.nn.Module): + r"""Given an input shaped like `(batch_size, time_steps, [rest])` and a + `Module` that takes inputs like `(batch_size, [rest])`, `TimeDistributed` + reshapes the input to be `(batch_size * time_steps, [rest])`, applies the + contained `Module`, then reshapes it back. + + Note that while the above gives shapes with `batch_size` first, this + `Module` also works if `batch_size` is second - we always just combine the + first two dimensions, then split them. + + It also reshapes keyword arguments unless they are not tensors or their + name is specified in the optional `pass_through` iterable. 
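A usage sketch (illustrative sizes): wrapping a `Linear` so it is applied independently at every timestep of a `(batch, time, features)` tensor.

import torch
from texar.torch.modules.pretrained.elmo_utils import TimeDistributed

linear = torch.nn.Linear(4, 2)
distributed = TimeDistributed(linear)
x = torch.randn(3, 5, 4)                 # (batch_size, time_steps, features)
y = distributed(x)                       # (3, 5, 2)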
+ """ + + def __init__(self, module): + super().__init__() + self._module = module + + def forward(self, *inputs, + pass_through: Optional[List[str]] = None, **kwargs): + + pass_through = pass_through or [] + + reshaped_inputs = [self._reshape_tensor(input_tensor) + for input_tensor in inputs] + + # Need some input to then get the batch_size and time_steps. + some_input = None + if inputs: + some_input = inputs[-1] + + reshaped_kwargs = {} + for key, value in kwargs.items(): + if isinstance(value, torch.Tensor) and key not in pass_through: + if some_input is None: + some_input = value + + value = self._reshape_tensor(value) + + reshaped_kwargs[key] = value + + reshaped_outputs = self._module(*reshaped_inputs, **reshaped_kwargs) + + if some_input is None: + raise RuntimeError("No input tensor to time-distribute") + + # Now get the output back into the right shape. + # (batch_size, time_steps, **output_size) + new_size = some_input.size()[:2] + reshaped_outputs.size()[1:] + outputs = reshaped_outputs.contiguous().view(new_size) + + return outputs + + @staticmethod + def _reshape_tensor(input_tensor): + input_size = input_tensor.size() + if len(input_size) <= 2: + raise RuntimeError(f"No dimension to distribute: {input_size}") + # Squash batch_size and time_steps into a single axis; result has shape + # (batch_size * time_steps, **input_size). + squashed_shape = [-1] + list(input_size[2:]) + return input_tensor.contiguous().view(*squashed_shape) + + +def add_sentence_boundary_token_ids( + tensor: torch.Tensor, mask: torch.Tensor, + sentence_begin_token: Any, sentence_end_token: Any) -> \ + Tuple[torch.Tensor, torch.Tensor]: + r"""Add begin/end of sentence tokens to the batch of sentences. + Given a batch of sentences with size `(batch_size, timesteps)` or + `(batch_size, timesteps, dim)` this returns a tensor of shape + `(batch_size, timesteps + 2)` or `(batch_size, timesteps + 2, dim)` + respectively. + + Returns both the new tensor and updated mask. + + # Parameters + + tensor : `torch.Tensor` + A tensor of shape `(batch_size, timesteps)` or + `(batch_size, timesteps, dim)` + mask : `torch.Tensor` + A tensor of shape `(batch_size, timesteps)` + sentence_begin_token: Any (anything that can be broadcast in torch for + assignment) + For 2D input, a scalar with the id. For 3D input, a tensor with + length dim. + sentence_end_token: Any (anything that can be broadcast in torch for + assignment) + For 2D input, a scalar with the id. For 3D input, a tensor with + length dim. + + # Returns + + tensor_with_boundary_tokens : `torch.Tensor` + The tensor with the appended and prepended boundary tokens. If the + input was 2D, it has shape (batch_size, timesteps + 2) and if the + input was 3D, it has shape (batch_size, timesteps + 2, dim). + new_mask : `torch.Tensor` + The new mask for the tensor, taking into account the appended tokens + marking the beginning and end of the sentence. 
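A usage sketch with hypothetical token ids (9 and 10 stand in for the begin- and end-of-sentence ids):

import torch
from texar.torch.modules.pretrained.elmo_utils import (
    add_sentence_boundary_token_ids)

tensor = torch.tensor([[1, 2, 3], [4, 5, 0]])
mask = (tensor != 0).long()
with_bounds, new_mask = add_sentence_boundary_token_ids(tensor, mask, 9, 10)
# with_bounds: [[9, 1, 2, 3, 10], [9, 4, 5, 10, 0]]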
+ """ + # TODO: matthewp, profile this transfer + sequence_lengths = mask.sum(dim=1).detach().cpu().numpy() + tensor_shape = list(tensor.data.shape) + new_shape = list(tensor_shape) + new_shape[1] = tensor_shape[1] + 2 + tensor_with_boundary_tokens = tensor.new_zeros(*new_shape) + if len(tensor_shape) == 2: + tensor_with_boundary_tokens[:, 1:-1] = tensor + tensor_with_boundary_tokens[:, 0] = sentence_begin_token + for i, j in enumerate(sequence_lengths): + tensor_with_boundary_tokens[i, j + 1] = sentence_end_token + new_mask = (tensor_with_boundary_tokens != 0).long() + elif len(tensor_shape) == 3: + tensor_with_boundary_tokens[:, 1:-1, :] = tensor + for i, j in enumerate(sequence_lengths): + tensor_with_boundary_tokens[i, 0, :] = sentence_begin_token + tensor_with_boundary_tokens[i, j + 1, :] = sentence_end_token + new_mask = ( + (tensor_with_boundary_tokens > 0).long().sum(dim=-1) > 0).long() + else: + raise ValueError( + "add_sentence_boundary_token_ids only accepts 2D and 3D input") + + return tensor_with_boundary_tokens, new_mask + + +def get_device_of(tensor: torch.Tensor) -> int: + r"""Returns the device of the tensor. + """ + if not tensor.is_cuda: + return -1 + else: + return tensor.get_device() + + +def remove_sentence_boundaries(tensor: torch.Tensor, mask: torch.Tensor) -> \ + Tuple[torch.Tensor, torch.Tensor]: + r"""Remove begin/end of sentence embeddings from the batch of sentences. + Given a batch of sentences with size `(batch_size, timesteps, dim)` + this returns a tensor of shape `(batch_size, timesteps - 2, dim)` after + removing the beginning and end sentence markers. The sentences are + assumed to be padded on the right, with the beginning of each sentence + assumed to occur at index 0 (i.e., `mask[:, 0]` is assumed to be 1). + + Returns both the new tensor and updated mask. + + This function is the inverse of `add_sentence_boundary_token_ids`. + + # Parameters + + tensor : `torch.Tensor` + A tensor of shape `(batch_size, timesteps, dim)` + mask : `torch.Tensor` + A tensor of shape `(batch_size, timesteps)` + + # Returns + + tensor_without_boundary_tokens : `torch.Tensor` + The tensor after removing the boundary tokens of shape + `(batch_size, timesteps - 2, dim)` + new_mask : `torch.Tensor` + The new mask for the tensor of shape `(batch_size, timesteps - 2)`. + """ + # TODO: matthewp, profile this transfer + sequence_lengths = mask.sum(dim=1).detach().cpu().numpy() + tensor_shape = list(tensor.data.shape) + new_shape = list(tensor_shape) + new_shape[1] = tensor_shape[1] - 2 + tensor_without_boundary_tokens = tensor.new_zeros(*new_shape) + new_mask = tensor.new_zeros((new_shape[0], new_shape[1]), dtype=torch.long) + for i, j in enumerate(sequence_lengths): + if j > 2: + tensor_without_boundary_tokens[i, : (j - 2), :] = \ + tensor[i, 1: (j - 1), :] + new_mask[i, : (j - 2)] = 1 + + return tensor_without_boundary_tokens, new_mask + + +A = TypeVar("A") + + +def lazy_groups_of(iterable: Iterable[A], group_size: int) -> Iterator[List[A]]: + r"""Takes an iterable and batches the individual instances into lists of the + specified size. The last list may be smaller if there are instances left + over. + """ + iterator = iter(iterable) + while True: + s = list(islice(iterator, group_size)) + if len(s) > 0: + yield s + else: + break + + +class ConfigurationError(Exception): + r"""The exception raised by any AllenNLP object when it's misconfigured + (e.g. missing properties, invalid properties, unknown properties). 
+ """ + + def __init__(self, message): + super().__init__() + self.message = message + + def __str__(self): + # TODO(brendanr): Is there some reason why we need repr here? It + # produces horrible output for simple multi-line error messages. + return self.message + + +def get_lengths_from_binary_sequence_mask(mask: torch.Tensor): + r"""Compute sequence lengths for each batch element in a tensor using a + binary mask. + + # Parameters + + mask : torch.Tensor, required. + A 2D binary mask of shape (batch_size, sequence_length) to + calculate the per-batch sequence lengths from. + + # Returns + + A torch.LongTensor of shape (batch_size,) representing the lengths + of the sequences in the batch. + """ + return mask.long().sum(-1) + + +def sort_batch_by_length(tensor: torch.Tensor, sequence_lengths: torch.Tensor): + r"""Sort a batch first tensor by some specified lengths. + + # Parameters + + tensor : torch.FloatTensor, required. + A batch first Pytorch tensor. + sequence_lengths : torch.LongTensor, required. + A tensor representing the lengths of some dimension of the tensor which + we want to sort by. + + # Returns + + sorted_tensor : torch.FloatTensor + The original tensor sorted along the batch dimension with respect to + sequence_lengths. + sorted_sequence_lengths : torch.LongTensor + The original sequence_lengths sorted by decreasing size. + restoration_indices : torch.LongTensor + Indices into the sorted_tensor such that + `sorted_tensor.index_select(0, restoration_indices) == original_tensor` + permutation_index : torch.LongTensor + The indices used to sort the tensor. This is useful if you want to sort + many tensors using the same ordering. + """ + + if not isinstance(tensor, torch.Tensor) or not isinstance(sequence_lengths, + torch.Tensor): + raise ConfigurationError( + "Both the tensor and sequence lengths must be torch.Tensors.") + + sorted_sequence_lengths, permutation_index = sequence_lengths.sort( + 0, descending=True) + sorted_tensor = tensor.index_select(0, permutation_index) + + index_range = torch.arange(0, len(sequence_lengths), + device=sequence_lengths.device) + # This is the equivalent of zipping with index, sorting by the original + # sequence lengths and returning the now sorted indices. + _, reverse_mapping = permutation_index.sort(0, descending=False) + restoration_indices = index_range.index_select(0, reverse_mapping) + return (sorted_tensor, sorted_sequence_lengths, restoration_indices, + permutation_index) + + +def block_orthogonal(tensor: torch.Tensor, split_sizes: List[int], + gain: float = 1.0) -> None: + r"""An initializer which allows initializing model parameters in "blocks". + This is helpful in the case of recurrent models which use multiple gates + applied to linear projections, which can be computed efficiently if they + are concatenated together. However, they are separate parameters which + should be initialized independently. + + # Parameters + + tensor : `torch.Tensor`, required. + A tensor to initialize. + split_sizes : List[int], required. + A list of length `tensor.ndim()` specifying the size of the + blocks along that particular dimension. E.g. `[10, 20]` would + result in the tensor being split into chunks of size 10 along the + first dimension and 20 along the second. + gain : float, optional (default = 1.0) + The gain (scaling) applied to the orthogonal initialization. 
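A usage sketch: initialising a `(12, 6)` weight as independent orthogonal `(3, 3)` blocks, which is how the gate projections above are initialised (sizes illustrative).

import torch
from texar.torch.modules.pretrained.elmo_utils import block_orthogonal

weight = torch.empty(12, 6)
block_orthogonal(weight, split_sizes=[3, 3])   # eight orthogonal (3, 3) blocks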
+ """ + data = tensor.data + sizes = list(tensor.size()) + if any(a % b != 0 for a, b in zip(sizes, split_sizes)): + raise ConfigurationError( + "tensor dimensions must be divisible by their respective " + "split_sizes. Found size: {} and split_sizes: {}".format( + sizes, split_sizes) + ) + indexes = [list(range(0, max_size, split)) for max_size, split in zip( + sizes, split_sizes)] + # Iterate over all possible blocks within the tensor. + for block_start_indices in itertools.product(*indexes): + # A list of tuples containing the index to start at for this block + # and the appropriate step size (i.e split_size[i] for dimension i). + index_and_step_tuples = zip(block_start_indices, split_sizes) + # This is a tuple of slices corresponding to: + # tensor[index: index + step_size, ...]. This is required because we + # could have an arbitrary number of dimensions. The actual slices we + # need are the start_index: start_index + step for each dimension in + # the tensor. + block_slice = tuple( + slice(start_index, start_index + step) for start_index, step in + index_and_step_tuples + ) + data[block_slice] = torch.nn.init.orthogonal_( + tensor[block_slice].contiguous(), gain=gain) + + +def get_dropout_mask(dropout_probability: float, + tensor_for_masking: torch.Tensor): + r"""Computes and returns an element-wise dropout mask for a given tensor, + where each element in the mask is dropped out with probability + dropout_probability. Note that the mask is NOT applied to the tensor - + the tensor is passed to retain the correct CUDA tensor type for the mask. + + # Parameters + + dropout_probability : float, required. + Probability of dropping a dimension of the input. + tensor_for_masking : torch.Tensor, required. + + # Returns + + A torch.FloatTensor consisting of the binary mask scaled by + 1/ (1 - dropout_probability). + This scaling ensures expected values and variances of the output of + applying this mask and the original tensor are the same. + """ + binary_mask = ( + torch.rand(tensor_for_masking.size()) > dropout_probability).to( + tensor_for_masking.device + ) + # Scale mask by 1/keep_prob to preserve output statistics. + dropout_mask = binary_mask.float().div(1.0 - dropout_probability) + return dropout_mask + + +def combine_initial_dims(tensor: torch.Tensor) -> torch.Tensor: + r"""Given a (possibly higher order) tensor of ids with shape + (d1, ..., dn, sequence_length) Return a view that's + (d1 * ... * dn, sequence_length). If original tensor is 1-d or 2-d, + return it as is. + """ + if tensor.dim() <= 2: + return tensor + else: + return tensor.view(-1, tensor.size(-1)) + + +def uncombine_initial_dims(tensor: torch.Tensor, original_size: torch.Size) -> \ + torch.Tensor: + r"""Given a tensor of embeddings with shape + (d1 * ... * dn, sequence_length, embedding_dim) and the original shape + (d1, ..., dn, sequence_length), return the reshaped tensor of embeddings + with shape (d1, ..., dn, sequence_length, embedding_dim). + If original size is 1-d or 2-d, return it as is. + """ + if len(original_size) <= 2: + return tensor + else: + view_args = list(original_size) + [tensor.size(-1)] + return tensor.view(*view_args) + + +class ScalarMix(torch.nn.Module): + r"""Computes a parameterised scalar mixture of N tensors, + `mixture = gamma * sum(s_k * tensor_k)` where `s = softmax(w)`, with `w` + and `gamma` scalar parameters. + + In addition, if `do_layer_norm=True` then apply layer normalization to + each tensor before weighting. 
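A usage sketch (illustrative sizes), mixing three layer activations of the same shape:

import torch
from texar.torch.modules.pretrained.elmo_utils import ScalarMix

mix = ScalarMix(mixture_size=3)
layers = [torch.randn(2, 7, 32) for _ in range(3)]
mixed = mix(layers)                      # (2, 7, 32)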
+ """ + + def __init__(self, mixture_size: int, do_layer_norm: bool = False, + initial_scalar_parameters: Optional[List[float]] = None, + trainable: bool = True,) -> None: + super().__init__() + self.mixture_size = mixture_size + self.do_layer_norm = do_layer_norm + + if initial_scalar_parameters is None: + initial_scalar_parameters = [0.0] * mixture_size + elif len(initial_scalar_parameters) != mixture_size: + raise ConfigurationError( + "Length of initial_scalar_parameters {} differs " + "from mixture_size {}".format(initial_scalar_parameters, + mixture_size) + ) + + self.scalar_parameters = ParameterList( + [ + Parameter( + torch.FloatTensor([initial_scalar_parameters[i]]), + requires_grad=trainable + ) + for i in range(mixture_size) + ] + ) + self.gamma = Parameter(torch.FloatTensor([1.0]), + requires_grad=trainable) + + def forward(self, tensors: List[torch.Tensor], # type: ignore + mask: Optional[torch.Tensor] = None) -> torch.Tensor: + r"""Compute a weighted average of the `tensors`. The input tensors an + be any shape with at least two dimensions, but must all be the same + shape. + + When `do_layer_norm=True`, the `mask` is required input. If the + `tensors` are dimensioned `(dim_0, ..., dim_{n-1}, dim_n)`, then the + `mask` is dimensioned `(dim_0, ..., dim_{n-1})`, as in the typical + case with `tensors` of shape `(batch_size, timesteps, dim)` and `mask` + of shape `(batch_size, timesteps)`. + + When `do_layer_norm=False` the `mask` is ignored. + """ + if len(tensors) != self.mixture_size: + raise ConfigurationError( + "{} tensors were passed, but the module was initialized to " + "mix {} tensors.".format(len(tensors), self.mixture_size) + ) + + def _do_layer_norm(tensor, broadcast_mask, num_elements_not_masked): + tensor_masked = tensor * broadcast_mask + mean = torch.sum(tensor_masked) / num_elements_not_masked + variance = ( + torch.sum(((tensor_masked - mean) * broadcast_mask) ** 2) / + num_elements_not_masked + ) + return (tensor - mean) / torch.sqrt(variance + 1e-12) + + # pylint: disable=unnecessary-comprehension + normed_weights = torch.nn.functional.softmax( + torch.cat([parameter for parameter in self.scalar_parameters]), + dim=0 + ) + normed_weights = torch.split(normed_weights, split_size_or_sections=1) + + if not self.do_layer_norm: + pieces = [] + for weight, tensor in zip(normed_weights, tensors): + pieces.append(weight * tensor) + return self.gamma * sum(pieces) + + else: + assert mask is not None + mask_float = mask.float() + broadcast_mask = mask_float.unsqueeze(-1) + input_dim = tensors[0].size(-1) + num_elements_not_masked = torch.sum(mask_float) * input_dim + + pieces = [] + for weight, tensor in zip(normed_weights, tensors): + pieces.append( + weight * _do_layer_norm(tensor, broadcast_mask, + num_elements_not_masked) + ) + return self.gamma * sum(pieces) diff --git a/texar/torch/modules/pretrained/elmo_utils_test.py b/texar/torch/modules/pretrained/elmo_utils_test.py new file mode 100644 index 000000000..34d826241 --- /dev/null +++ b/texar/torch/modules/pretrained/elmo_utils_test.py @@ -0,0 +1,882 @@ +# Copyright 2019 The Texar Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Unit tests for utils of ELMo modules. + +Code adapted from: + `https://github.com/allenai/allennlp/blob/master/allennlp/tests/common/util_test.py` + `https://github.com/allenai/allennlp/blob/master/allennlp/tests/modules/elmo_test.py` + `https://github.com/allenai/allennlp/blob/master/allennlp/tests/modules/encoder_base_test.py` + `https://github.com/allenai/allennlp/blob/master/allennlp/tests/modules/lstm_cell_with_projection_test.py` + `https://github.com/allenai/allennlp/blob/master/allennlp/tests/modules/highway_test.py` + `https://github.com/allenai/allennlp/blob/master/allennlp/tests/modules/time_distributed_test.py` + `https://github.com/allenai/allennlp/blob/master/allennlp/tests/nn/initializers_test.py` + `https://github.com/allenai/allennlp/blob/master/allennlp/tests/nn/util_test.py` +""" + +import unittest + +import h5py +import json +import numpy +import tempfile +import torch + +from numpy.testing import assert_array_almost_equal, assert_almost_equal +from torch.nn import LSTM, RNN, Embedding, Module, Parameter + +from texar.torch.data.tokenizers.elmo_tokenizer_utils import batch_to_ids +from texar.torch.data.data_utils import maybe_download +from texar.torch.modules.pretrained.elmo_utils import ( + Highway, LstmCellWithProjection, _EncoderBase, _ElmoBiLm, TimeDistributed, + sort_batch_by_length, get_lengths_from_binary_sequence_mask, + remove_sentence_boundaries, add_sentence_boundary_token_ids, + lazy_groups_of, block_orthogonal, ConfigurationError, combine_initial_dims, + uncombine_initial_dims, ScalarMix) +from texar.torch.utils.test import cuda_test + + +class TestElmoBiLm(unittest.TestCase): + + def setUp(self): + super().setUp() + self.tmp_dir = tempfile.TemporaryDirectory() + self.options_file = maybe_download( + 'https://github.com/allenai/allennlp/blob/master/allennlp/tests/' + 'fixtures/elmo/options.json?raw=true', + self.tmp_dir.name) + self.weight_file = maybe_download( + 'https://github.com/allenai/allennlp/blob/master/allennlp/tests/' + 'fixtures/elmo/lm_weights.hdf5?raw=true', + self.tmp_dir.name) + self.sentences_json_file = maybe_download( + 'https://github.com/allenai/allennlp/blob/master/allennlp/tests/' + 'fixtures/elmo/sentences.json?raw=true', + self.tmp_dir.name) + + def tearDown(self): + self.tmp_dir.cleanup() + + def _load_sentences_embeddings(self): + r"""Load the test sentences and the expected LM embeddings. + + These files loaded in this method were created with a batch-size of 3. + Due to idiosyncrasies with TensorFlow, the 30 sentences in + sentences.json are split into 3 files in which the k-th sentence in + each is from batch k. + + This method returns a (sentences, embeddings) pair where each is a + list of length batch_size. Each list contains a sublist with + total_sentence_count / batch_size elements. As with the original files, + the k-th element in the sublist is in batch k. 
+ """ + with open(self.sentences_json_file) as fin: + sentences = json.load(fin) + + # the expected embeddings + expected_lm_embeddings = [] + for k in range(len(sentences)): + embed_fname = maybe_download( + 'https://github.com/allenai/allennlp/blob/master/allennlp/' + 'tests/fixtures/elmo/lm_embeddings_{}.hdf5?raw=true'.format(k), + self.tmp_dir.name) + expected_lm_embeddings.append([]) + with h5py.File(embed_fname, "r") as fin: + for i in range(10): + sent_embeds = fin["%s" % i][...] + sent_embeds_concat = numpy.concatenate( + (sent_embeds[0, :, :], sent_embeds[1, :, :]), axis=-1 + ) + expected_lm_embeddings[-1].append(sent_embeds_concat) + + return sentences, expected_lm_embeddings + + def test_elmo_bilm(self): + # get the raw data + sentences, expected_lm_embeddings = self._load_sentences_embeddings() + + # load the test model + elmo_bilm = _ElmoBiLm(self.options_file, self.weight_file) + + batches = [[sentences[j][i].split() for j in range(3)] + for i in range(10)] + + # Now finally we can iterate through batches. + for i, batch in enumerate(batches): + lm_embeddings = elmo_bilm(batch_to_ids(batch[:3])) + top_layer_embeddings, mask = remove_sentence_boundaries( + lm_embeddings["activations"][2], lm_embeddings["mask"] + ) + + # check the mask lengths + lengths = mask.data.numpy().sum(axis=1) + batch_sentences = [sentences[k][i] for k in range(3)] + expected_lengths = [len(sentence.split()) for sentence in + batch_sentences] + self.assertEqual(lengths.tolist(), expected_lengths) + + # get the expected embeddings and compare! + expected_top_layer = [expected_lm_embeddings[k][i] for k in + range(3)] + for k in range(3): + self.assertTrue( + numpy.allclose( + top_layer_embeddings[k, : lengths[k], :].data.numpy(), + expected_top_layer[k], + atol=1.0e-6, + ) + ) + + +class TestEncoderBase(unittest.TestCase): + + def setUp(self): + super().setUp() + self.lstm = LSTM( + bidirectional=True, num_layers=3, input_size=3, hidden_size=7, + batch_first=True + ) + self.rnn = RNN( + bidirectional=True, num_layers=3, input_size=3, hidden_size=7, + batch_first=True + ) + self.encoder_base = _EncoderBase(stateful=True) + + tensor = torch.rand([5, 7, 3]) + tensor[1, 6:, :] = 0 + tensor[3, 2:, :] = 0 + self.tensor = tensor + mask = torch.ones(5, 7) + mask[1, 6:] = 0 + mask[2, :] = 0 # <= completely masked + mask[3, 2:] = 0 + mask[4, :] = 0 # <= completely masked + self.mask = mask + + self.batch_size = 5 + self.num_valid = 3 + sequence_lengths = get_lengths_from_binary_sequence_mask(mask) + _, _, restoration_indices, sorting_indices = sort_batch_by_length( + tensor, sequence_lengths) + self.sorting_indices = sorting_indices + self.restoration_indices = restoration_indices + + def test_non_stateful_states_are_sorted_correctly(self): + encoder_base = _EncoderBase(stateful=False) + initial_states = (torch.randn(6, 5, 7), torch.randn(6, 5, 7)) + # Check that we sort the state for non-stateful encoders. To test + # we'll just use a "pass through" encoder, as we aren't actually testing + # the functionality of the encoder here anyway. + _, states, restoration_indices = encoder_base.sort_and_run_forward( + lambda *x: x, self.tensor, self.mask, initial_states + ) + # Our input tensor had 2 zero length sequences, so we need + # to concat a tensor of shape + # (num_layers * num_directions, batch_size - num_valid, hidden_dim), + # to the output before unsorting it. 
+ zeros = torch.zeros([6, 2, 7]) + + # sort_and_run_forward strips fully-padded instances from the batch; + # in order to use the restoration_indices we need to add back the two + # that got stripped. What we get back should match what we started with. + for state, original in zip(states, initial_states): + assert list(state.size()) == [6, 3, 7] + state_with_zeros = torch.cat([state, zeros], 1) + unsorted_state = state_with_zeros.index_select(1, + restoration_indices) + for index in [0, 1, 3]: + numpy.testing.assert_array_equal( + unsorted_state[:, index, :].data.numpy(), + original[:, index, :].data.numpy() + ) + + def test_get_initial_states(self): + # First time we call it, there should be no state, so we should return + # None. + assert ( + self.encoder_base._get_initial_states( + self.batch_size, self.num_valid, self.sorting_indices + ) + is None + ) + + # First test the case that the previous state is _smaller_ than the + # current state input. + initial_states = (torch.randn([1, 3, 7]), torch.randn([1, 3, 7])) + self.encoder_base._states = initial_states + # sorting indices are: [0, 1, 3, 2, 4] + returned_states = self.encoder_base._get_initial_states( + self.batch_size, self.num_valid, self.sorting_indices + ) + + correct_expanded_states = [ + torch.cat([state, torch.zeros([1, 2, 7])], 1) + for state in initial_states + ] + # State should have been expanded with zeros to have shape + # (1, batch_size, hidden_size). + numpy.testing.assert_array_equal( + self.encoder_base._states[0].data.numpy(), + correct_expanded_states[0].data.numpy() + ) + numpy.testing.assert_array_equal( + self.encoder_base._states[1].data.numpy(), + correct_expanded_states[1].data.numpy() + ) + + # The returned states should be of shape (1, num_valid, hidden_size) and + # they also should have been sorted with respect to the indices. + # sorting indices are: [0, 1, 3, 2, 4] + + correct_returned_states = [ + state.index_select(1, self.sorting_indices)[:, : self.num_valid, :] + for state in correct_expanded_states + ] + + numpy.testing.assert_array_equal( + returned_states[0].data.numpy(), + correct_returned_states[0].data.numpy() + ) + numpy.testing.assert_array_equal( + returned_states[1].data.numpy(), + correct_returned_states[1].data.numpy() + ) + + # Now test the case that the previous state is larger: + original_states = (torch.randn([1, 10, 7]), torch.randn([1, 10, 7])) + self.encoder_base._states = original_states + # sorting indices are: [0, 1, 3, 2, 4] + returned_states = self.encoder_base._get_initial_states( + self.batch_size, self.num_valid, self.sorting_indices + ) + # State should not have changed, as they were larger + # than the batch size of the requested states. + numpy.testing.assert_array_equal( + self.encoder_base._states[0].data.numpy(), + original_states[0].data.numpy() + ) + numpy.testing.assert_array_equal( + self.encoder_base._states[1].data.numpy(), + original_states[1].data.numpy() + ) + + # The returned states should be of shape (1, num_valid, hidden_size) + # and they also should have been sorted with respect to the indices. 
+ correct_returned_state = [ + x.index_select(1, self.sorting_indices)[:, : self.num_valid, :] + for x in original_states + ] + numpy.testing.assert_array_equal( + returned_states[0].data.numpy(), + correct_returned_state[0].data.numpy() + ) + numpy.testing.assert_array_equal( + returned_states[1].data.numpy(), + correct_returned_state[1].data.numpy() + ) + + def test_update_states(self): + assert self.encoder_base._states is None + initial_states = torch.randn([1, 5, 7]), torch.randn([1, 5, 7]) + + index_selected_initial_states = ( + initial_states[0].index_select(1, self.restoration_indices), + initial_states[1].index_select(1, self.restoration_indices), + ) + + self.encoder_base._update_states(initial_states, + self.restoration_indices) + # State was None, so the updated state should just be the sorted given + # state. + numpy.testing.assert_array_equal( + self.encoder_base._states[0].data.numpy(), + index_selected_initial_states[0].data.numpy() + ) + numpy.testing.assert_array_equal( + self.encoder_base._states[1].data.numpy(), + index_selected_initial_states[1].data.numpy() + ) + + new_states = torch.randn([1, 5, 7]), torch.randn([1, 5, 7]) + # tensor has 2 completely masked rows, so the last 2 rows of the _ + # sorted_ states will be completely zero, having been appended after + # calling the respective encoder. + new_states[0][:, -2:, :] = 0 + new_states[1][:, -2:, :] = 0 + + index_selected_new_states = ( + new_states[0].index_select(1, self.restoration_indices), + new_states[1].index_select(1, self.restoration_indices), + ) + + self.encoder_base._update_states(new_states, self.restoration_indices) + # Check that the update _preserved_ the state for the rows which were + # completely masked (2 and 4): + for index in [2, 4]: + numpy.testing.assert_array_equal( + self.encoder_base._states[0][:, index, :].data.numpy(), + index_selected_initial_states[0][:, index, :].data.numpy(), + ) + numpy.testing.assert_array_equal( + self.encoder_base._states[1][:, index, :].data.numpy(), + index_selected_initial_states[1][:, index, :].data.numpy(), + ) + # Now the states which were updated: + for index in [0, 1, 3]: + numpy.testing.assert_array_equal( + self.encoder_base._states[0][:, index, :].data.numpy(), + index_selected_new_states[0][:, index, :].data.numpy(), + ) + numpy.testing.assert_array_equal( + self.encoder_base._states[1][:, index, :].data.numpy(), + index_selected_new_states[1][:, index, :].data.numpy(), + ) + + # Now test the case that the new state is smaller: + small_new_states = torch.randn([1, 3, 7]), torch.randn([1, 3, 7]) + # pretend the 2nd sequence in the batch was fully masked. 
+ small_restoration_indices = torch.LongTensor([2, 0, 1]) + small_new_states[0][:, 0, :] = 0 + small_new_states[1][:, 0, :] = 0 + + index_selected_small_states = ( + small_new_states[0].index_select(1, small_restoration_indices), + small_new_states[1].index_select(1, small_restoration_indices), + ) + self.encoder_base._update_states(small_new_states, + small_restoration_indices) + + # Check the index for the row we didn't update is the same as the + # previous step: + for index in [1, 3]: + numpy.testing.assert_array_equal( + self.encoder_base._states[0][:, index, :].data.numpy(), + index_selected_new_states[0][:, index, :].data.numpy(), + ) + numpy.testing.assert_array_equal( + self.encoder_base._states[1][:, index, :].data.numpy(), + index_selected_new_states[1][:, index, :].data.numpy(), + ) + # Indices we did update: + for index in [0, 2]: + numpy.testing.assert_array_equal( + self.encoder_base._states[0][:, index, :].data.numpy(), + index_selected_small_states[0][:, index, :].data.numpy(), + ) + numpy.testing.assert_array_equal( + self.encoder_base._states[1][:, index, :].data.numpy(), + index_selected_small_states[1][:, index, :].data.numpy(), + ) + + # We didn't update index 4 in the previous step either, so it should + # be equal to the 4th index of initial states. + numpy.testing.assert_array_equal( + self.encoder_base._states[0][:, 4, :].data.numpy(), + index_selected_initial_states[0][:, 4, :].data.numpy(), + ) + numpy.testing.assert_array_equal( + self.encoder_base._states[1][:, 4, :].data.numpy(), + index_selected_initial_states[1][:, 4, :].data.numpy(), + ) + + def test_reset_states(self): + # Initialize the encoder states. + assert self.encoder_base._states is None + initial_states = torch.randn([1, 5, 7]), torch.randn([1, 5, 7]) + index_selected_initial_states = ( + initial_states[0].index_select(1, self.restoration_indices), + initial_states[1].index_select(1, self.restoration_indices), + ) + self.encoder_base._update_states(initial_states, + self.restoration_indices) + + # Check that only some of the states are reset when a mask is provided. + mask = torch.FloatTensor([1, 1, 0, 0, 0]) + self.encoder_base.reset_states(mask) + # First two states should be zeros + numpy.testing.assert_array_equal( + self.encoder_base._states[0][:, :2, :].data.numpy(), + torch.zeros_like(initial_states[0])[:, :2, :].data.numpy(), + ) + numpy.testing.assert_array_equal( + self.encoder_base._states[1][:, :2, :].data.numpy(), + torch.zeros_like(initial_states[1])[:, :2, :].data.numpy(), + ) + # Remaining states should be the same + numpy.testing.assert_array_equal( + self.encoder_base._states[0][:, 2:, :].data.numpy(), + index_selected_initial_states[0][:, 2:, :].data.numpy(), + ) + numpy.testing.assert_array_equal( + self.encoder_base._states[1][:, 2:, :].data.numpy(), + index_selected_initial_states[1][:, 2:, :].data.numpy(), + ) + + # Check that error is raised if mask has wrong batch size. + bad_mask = torch.FloatTensor([1, 1, 0]) + with self.assertRaises(ValueError): + self.encoder_base.reset_states(bad_mask) + + # Check that states are reset to None if no mask is provided. + self.encoder_base.reset_states() + assert self.encoder_base._states is None + + def test_non_contiguous_initial_states_handled(self): + # Check that the encoder is robust to non-contiguous initial states. + + # Case 1: Encoder is not stateful + + # A transposition will make the tensors non-contiguous, start them off + # at the wrong shape and transpose them into the right shape. 
+ encoder_base = _EncoderBase(stateful=False) + initial_states = ( + torch.randn(5, 6, 7).permute(1, 0, 2), + torch.randn(5, 6, 7).permute(1, 0, 2), + ) + assert not initial_states[0].is_contiguous() and \ + not initial_states[1].is_contiguous() + assert initial_states[0].size() == torch.Size([6, 5, 7]) + assert initial_states[1].size() == torch.Size([6, 5, 7]) + + # We'll pass them through an LSTM encoder and a vanilla RNN encoder to + # make sure it works whether the initial states are a tuple of tensors + # or just a single tensor. + encoder_base.sort_and_run_forward(self.lstm, self.tensor, + self.mask, initial_states) + encoder_base.sort_and_run_forward(self.rnn, self.tensor, + self.mask, initial_states[0]) + + # Case 2: Encoder is stateful + + # For stateful encoders, the initial state may be non-contiguous if + # its state was previously updated with non-contiguous tensors. As in + # the non-stateful tests, we check that the encoder still works on + # initial states for RNNs and LSTMs. + final_states = initial_states + # Check LSTM + encoder_base = _EncoderBase(stateful=True) + encoder_base._update_states(final_states, self.restoration_indices) + encoder_base.sort_and_run_forward(self.lstm, self.tensor, self.mask) + # Check RNN + encoder_base.reset_states() + encoder_base._update_states([final_states[0]], self.restoration_indices) + encoder_base.sort_and_run_forward(self.rnn, self.tensor, self.mask) + + @cuda_test + def test_non_contiguous_initial_states_handled_on_gpu(self): + # Some PyTorch operations which produce contiguous tensors on the CPU + # produce non-contiguous tensors on the GPU (e.g. forward pass of an + # RNN when batch_first=True). Accordingly, we perform the same checks + # from previous test on the GPU to ensure the encoder is not affected + # by which device it is on. + + # Case 1: Encoder is not stateful + + # A transposition will make the tensors non-contiguous, start them off + # at the wrong shape and transpose them into the right shape. + encoder_base = _EncoderBase(stateful=False).cuda() + initial_states = ( + torch.randn(5, 6, 7).cuda().permute(1, 0, 2), + torch.randn(5, 6, 7).cuda().permute(1, 0, 2), + ) + assert not initial_states[0].is_contiguous() and not initial_states[ + 1].is_contiguous() + assert initial_states[0].size() == torch.Size([6, 5, 7]) + assert initial_states[1].size() == torch.Size([6, 5, 7]) + + # We'll pass them through an LSTM encoder and a vanilla RNN encoder to + # make sure it works whether the initial states are a tuple of tensors + # or just a single tensor. + encoder_base.sort_and_run_forward( + self.lstm.cuda(), self.tensor.cuda(), self.mask.cuda(), + initial_states + ) + encoder_base.sort_and_run_forward( + self.rnn.cuda(), self.tensor.cuda(), self.mask.cuda(), + initial_states[0] + ) + + # Case 2: Encoder is stateful + + # For stateful encoders, the initial state may be non-contiguous if its + # state was previously updated with non-contiguous tensors. As in the + # non-stateful tests, we check that the encoder still works on initial + # states for RNNs and LSTMs. 
+ final_states = initial_states + # Check LSTM + encoder_base = _EncoderBase(stateful=True).cuda() + encoder_base._update_states(final_states, + self.restoration_indices.cuda()) + encoder_base.sort_and_run_forward(self.lstm.cuda(), self.tensor.cuda(), + self.mask.cuda()) + # Check RNN + encoder_base.reset_states() + encoder_base._update_states([final_states[0]], + self.restoration_indices.cuda()) + encoder_base.sort_and_run_forward(self.rnn.cuda(), self.tensor.cuda(), + self.mask.cuda()) + + +class TestHighway(unittest.TestCase): + + def test_forward_works_on_simple_input(self): + highway = Highway(2, 2) + + highway._layers[0].weight.data.fill_(1) + highway._layers[0].bias.data.fill_(0) + highway._layers[1].weight.data.fill_(2) + highway._layers[1].bias.data.fill_(-2) + input_tensor = torch.FloatTensor([[-2, 1], [3, -2]]) + result = highway(input_tensor).data.numpy() + assert result.shape == (2, 2) + # This was checked by hand. + assert_almost_equal(result, [[-0.0394, 0.0197], [1.7527, -0.5550]], + decimal=4) + + def test_forward_works_on_nd_input(self): + highway = Highway(2, 2) + input_tensor = torch.ones(2, 2, 2) + output = highway(input_tensor) + assert output.size() == (2, 2, 2) + + +class TestLstmCellWithProjection(unittest.TestCase): + + def test_elmo_lstm_cell_completes_forward_pass(self): + input_tensor = torch.rand(4, 5, 3) + input_tensor[1, 4:, :] = 0.0 + input_tensor[2, 2:, :] = 0.0 + input_tensor[3, 1:, :] = 0.0 + + initial_hidden_state = torch.ones([1, 4, 5]) + initial_memory_state = torch.ones([1, 4, 7]) + + lstm = LstmCellWithProjection( + input_size=3, + hidden_size=5, + cell_size=7, + memory_cell_clip_value=2, + state_projection_clip_value=1, + ) + output_sequence, lstm_state = lstm( + input_tensor, [5, 4, 2, 1], (initial_hidden_state, + initial_memory_state) + ) + numpy.testing.assert_array_equal( + output_sequence.data[1, 4:, :].numpy(), 0.0) + numpy.testing.assert_array_equal( + output_sequence.data[2, 2:, :].numpy(), 0.0) + numpy.testing.assert_array_equal( + output_sequence.data[3, 1:, :].numpy(), 0.0) + + # Test the state clipping. + numpy.testing.assert_array_less(output_sequence.data.numpy(), 1.0) + numpy.testing.assert_array_less(-output_sequence.data.numpy(), 1.0) + + # LSTM state should be (num_layers, batch_size, hidden_size) + assert list(lstm_state[0].size()) == [1, 4, 5] + # LSTM memory cell should be (num_layers, batch_size, cell_size) + assert list((lstm_state[1].size())) == [1, 4, 7] + + # Test the cell clipping. 
+ numpy.testing.assert_array_less(lstm_state[0].data.numpy(), 2.0) + numpy.testing.assert_array_less(-lstm_state[0].data.numpy(), 2.0) + + +class TestTimeDistributed(unittest.TestCase): + + def test_time_distributed_reshapes_named_arg_correctly(self): + char_embedding = Embedding(2, 2) + char_embedding.weight = Parameter( + torch.FloatTensor([[0.4, 0.4], [0.5, 0.5]])) + distributed_embedding = TimeDistributed(char_embedding) + char_input = torch.LongTensor([[[1, 0], [1, 1]]]) + output = distributed_embedding(char_input) + assert_almost_equal( + output.data.numpy(), + [[[[0.5, 0.5], [0.4, 0.4]], [[0.5, 0.5], [0.5, 0.5]]]] + ) + + def test_time_distributed_reshapes_positional_kwarg_correctly(self): + char_embedding = Embedding(2, 2) + char_embedding.weight = Parameter(torch.FloatTensor( + [[0.4, 0.4], [0.5, 0.5]])) + distributed_embedding = TimeDistributed(char_embedding) + char_input = torch.LongTensor([[[1, 0], [1, 1]]]) + output = distributed_embedding(input=char_input) + assert_almost_equal( + output.data.numpy(), + [[[[0.5, 0.5], [0.4, 0.4]], [[0.5, 0.5], [0.5, 0.5]]]] + ) + + def test_time_distributed_works_with_multiple_inputs(self): + module = lambda x, y: x + y + distributed = TimeDistributed(module) + x_input = torch.LongTensor([[[1, 2], [3, 4]]]) + y_input = torch.LongTensor([[[4, 2], [9, 1]]]) + output = distributed(x_input, y_input) + assert_almost_equal(output.data.numpy(), [[[5, 4], [12, 5]]]) + + def test_time_distributed_reshapes_multiple_inputs_with_pass_through_tensor_correctly(self): + + class FakeModule(Module): + + def forward(self, input_tensor, tensor_to_pass_through=None, + another_tensor=None): + + return input_tensor + tensor_to_pass_through + another_tensor + + module = FakeModule() + distributed_module = TimeDistributed(module) + + input_tensor1 = torch.LongTensor([[[1, 2], [3, 4]]]) + input_to_pass_through = torch.LongTensor([3, 7]) + input_tensor2 = torch.LongTensor([[[4, 2], [9, 1]]]) + + output = distributed_module( + input_tensor1, + tensor_to_pass_through=input_to_pass_through, + another_tensor=input_tensor2, + pass_through=["tensor_to_pass_through"], + ) + assert_almost_equal(output.data.numpy(), [[[8, 11], [15, 12]]]) + + def test_time_distributed_reshapes_multiple_inputs_with_pass_through_non_tensor_correctly(self): + + class FakeModule(Module): + + def forward(self, input_tensor, number=0, another_tensor=None): + + return input_tensor + number + another_tensor + + module = FakeModule() + distributed_module = TimeDistributed(module) + + input_tensor1 = torch.LongTensor([[[1, 2], [3, 4]]]) + input_number = 5 + input_tensor2 = torch.LongTensor([[[4, 2], [9, 1]]]) + + output = distributed_module( + input_tensor1, + number=input_number, + another_tensor=input_tensor2, + pass_through=["number"], + ) + assert_almost_equal(output.data.numpy(), [[[10, 9], [17, 10]]]) + + +class TestUtils(unittest.TestCase): + + def test_add_sentence_boundary_token_ids_handles_2D_input(self): + tensor = torch.from_numpy(numpy.array([[1, 2, 3], [4, 5, 0]])) + mask = (tensor > 0).long() + bos = 9 + eos = 10 + new_tensor, new_mask = add_sentence_boundary_token_ids( + tensor, mask, bos, eos) + expected_new_tensor = numpy.array([[9, 1, 2, 3, 10], [9, 4, 5, 10, 0]]) + assert (new_tensor.data.numpy() == expected_new_tensor).all() + assert (new_mask.data.numpy() == (expected_new_tensor > 0)).all() + + def test_add_sentence_boundary_token_ids_handles_3D_input(self): + tensor = torch.from_numpy( + numpy.array( + [ + [[1, 2, 3, 4], [5, 5, 5, 5], [6, 8, 1, 2]], + [[4, 3, 2, 1], [8, 7, 6, 5], 
[0, 0, 0, 0]], + ] + ) + ) + mask = ((tensor > 0).sum(dim=-1) > 0).type(torch.LongTensor) + bos = torch.from_numpy(numpy.array([9, 9, 9, 9])) + eos = torch.from_numpy(numpy.array([10, 10, 10, 10])) + new_tensor, new_mask = add_sentence_boundary_token_ids( + tensor, mask, bos, eos) + expected_new_tensor = numpy.array( + [ + [[9, 9, 9, 9], [1, 2, 3, 4], [5, 5, 5, 5], [6, 8, 1, 2], + [10, 10, 10, 10]], + [[9, 9, 9, 9], [4, 3, 2, 1], [8, 7, 6, 5], [10, 10, 10, 10], + [0, 0, 0, 0]], + ] + ) + assert (new_tensor.data.numpy() == expected_new_tensor).all() + assert (new_mask.data.numpy() == ( + (expected_new_tensor > 0).sum(axis=-1) > 0)).all() + + def test_remove_sentence_boundaries(self): + tensor = torch.from_numpy(numpy.random.rand(3, 5, 7)) + mask = torch.from_numpy( + # The mask with two elements is to test the corner case + # of an empty sequence, so here we are removing boundaries + # from " " + numpy.array([[1, 1, 0, 0, 0], [1, 1, 1, 1, 1], [1, 1, 1, 1, 0]]) + ).long() + new_tensor, new_mask = remove_sentence_boundaries(tensor, mask) + + expected_new_tensor = torch.zeros(3, 3, 7) + expected_new_tensor[1, 0:3, :] = tensor[1, 1:4, :] + expected_new_tensor[2, 0:2, :] = tensor[2, 1:3, :] + assert_array_almost_equal(new_tensor.data.numpy(), + expected_new_tensor.data.numpy()) + + expected_new_mask = torch.from_numpy(numpy.array( + [[0, 0, 0], [1, 1, 1], [1, 1, 0]])).long() + assert (new_mask.data.numpy() == expected_new_mask.data.numpy()).all() + + def test_lazy_groups_of(self): + xs = [1, 2, 3, 4, 5, 6, 7] + groups = lazy_groups_of(iter(xs), group_size=3) + assert next(groups) == [1, 2, 3] + assert next(groups) == [4, 5, 6] + assert next(groups) == [7] + with self.assertRaises(StopIteration): + _ = next(groups) + + def test_get_sequence_lengths_from_binary_mask(self): + binary_mask = torch.ByteTensor( + [[1, 1, 1, 0, 0, 0], [1, 1, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1], + [1, 0, 0, 0, 0, 0]] + ) + lengths = get_lengths_from_binary_sequence_mask(binary_mask) + numpy.testing.assert_array_equal(lengths.numpy(), + numpy.array([3, 2, 6, 1])) + + def test_sort_tensor_by_length(self): + tensor = torch.rand([5, 7, 9]) + tensor[0, 3:, :] = 0 + tensor[1, 4:, :] = 0 + tensor[2, 1:, :] = 0 + tensor[3, 5:, :] = 0 + + sequence_lengths = torch.LongTensor([3, 4, 1, 5, 7]) + sorted_tensor, sorted_lengths, reverse_indices, _ = \ + sort_batch_by_length(tensor, sequence_lengths) + + # Test sorted indices are padded correctly. + numpy.testing.assert_array_equal( + sorted_tensor[1, 5:, :].data.numpy(), 0.0) + numpy.testing.assert_array_equal( + sorted_tensor[2, 4:, :].data.numpy(), 0.0) + numpy.testing.assert_array_equal( + sorted_tensor[3, 3:, :].data.numpy(), 0.0) + numpy.testing.assert_array_equal( + sorted_tensor[4, 1:, :].data.numpy(), 0.0) + + assert sorted_lengths.data.equal(torch.LongTensor([7, 5, 4, 3, 1])) + + # Test restoration indices correctly recover the original tensor. 
+ assert sorted_tensor.index_select(0, reverse_indices).data.equal( + tensor.data) + + def test_block_orthogonal_can_initialize(self): + tensor = torch.zeros([10, 6]) + block_orthogonal(tensor, [5, 3]) + tensor = tensor.data.numpy() + + def test_block_is_orthogonal(block) -> None: + matrix_product = block.T @ block + numpy.testing.assert_array_almost_equal( + matrix_product, numpy.eye(matrix_product.shape[-1]), 6 + ) + + test_block_is_orthogonal(tensor[:5, :3]) + test_block_is_orthogonal(tensor[:5, 3:]) + test_block_is_orthogonal(tensor[5:, 3:]) + test_block_is_orthogonal(tensor[5:, :3]) + + def test_block_orthogonal_raises_on_mismatching_dimensions(self): + tensor = torch.zeros([10, 6, 8]) + with self.assertRaises(ConfigurationError): + block_orthogonal(tensor, [7, 2, 1]) + + def test_combine_initial_dims(self): + tensor = torch.randn(4, 10, 20, 17, 5) + + tensor2d = combine_initial_dims(tensor) + assert list(tensor2d.size()) == [4 * 10 * 20 * 17, 5] + + def test_uncombine_initial_dims(self): + embedding2d = torch.randn(4 * 10 * 20 * 17 * 5, 12) + + embedding = uncombine_initial_dims(embedding2d, + torch.Size((4, 10, 20, 17, 5))) + assert list(embedding.size()) == [4, 10, 20, 17, 5, 12] + + +class TestScalarMix(unittest.TestCase): + + def test_scalar_mix_can_run_forward(self): + mixture = ScalarMix(3) + tensors = [torch.randn([3, 4, 5]) for _ in range(3)] + for k in range(3): + mixture.scalar_parameters[k].data[0] = 0.1 * (k + 1) + mixture.gamma.data[0] = 0.5 + result = mixture(tensors) + + weights = [0.1, 0.2, 0.3] + normed_weights = numpy.exp(weights) / numpy.sum(numpy.exp(weights)) + expected_result = sum(normed_weights[k] * tensors[k].data.numpy() + for k in range(3)) + expected_result *= 0.5 + numpy.testing.assert_almost_equal(expected_result, result.data.numpy()) + + def test_scalar_mix_throws_error_on_incorrect_number_of_inputs(self): + mixture = ScalarMix(3) + tensors = [torch.randn([3, 4, 5]) for _ in range(5)] + with self.assertRaises(ConfigurationError): + _ = mixture(tensors) + + def test_scalar_mix_throws_error_on_incorrect_initial_scalar_parameters_length(self): + with self.assertRaises(ConfigurationError): + ScalarMix(3, initial_scalar_parameters=[0.0, 0.0]) + + def test_scalar_mix_trainable_with_initial_scalar_parameters(self): + initial_scalar_parameters = [1.0, 2.0, 3.0] + mixture = ScalarMix(3, + initial_scalar_parameters=initial_scalar_parameters, + trainable=False) + for i, scalar_mix_parameter in enumerate(mixture.scalar_parameters): + assert scalar_mix_parameter.requires_grad is False + assert scalar_mix_parameter.item() == initial_scalar_parameters[i] + + def test_scalar_mix_layer_norm(self): + mixture = ScalarMix(3, do_layer_norm="scalar_norm_reg") + + tensors = [torch.randn([3, 4, 5]) for _ in range(3)] + numpy_mask = numpy.ones((3, 4), dtype="int32") + numpy_mask[1, 2:] = 0 + mask = torch.from_numpy(numpy_mask) + + weights = [0.1, 0.2, 0.3] + for k in range(3): + mixture.scalar_parameters[k].data[0] = weights[k] + mixture.gamma.data[0] = 0.5 + result = mixture(tensors, mask) + + normed_weights = numpy.exp(weights) / numpy.sum(numpy.exp(weights)) + expected_result = numpy.zeros((3, 4, 5)) + for k in range(3): + mean = numpy.mean(tensors[k].data.numpy()[numpy_mask == 1]) + std = numpy.std(tensors[k].data.numpy()[numpy_mask == 1]) + normed_tensor = (tensors[k].data.numpy() - mean) / (std + 1e-12) + expected_result += normed_tensor * normed_weights[k] + expected_result *= 0.5 + + numpy.testing.assert_almost_equal(expected_result, result.data.numpy(), + decimal=6) + 
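[Editor's note] The `ScalarMix` tests above spell out the arithmetic being verified: the learned scalar parameters are softmax-normalized, each layer tensor is weighted by its normalized scalar, and the weighted sum is scaled by `gamma`. A minimal NumPy sketch of that expected computation follows; the shapes and weight values are illustrative only, mirroring the test fixtures rather than any real checkpoint.

    import numpy as np

    # Three "layer" activations of shape (batch, time, dim) plus the
    # unnormalized scalars and gamma of a ScalarMix(3)-style mixture.
    layers = [np.random.randn(3, 4, 5) for _ in range(3)]
    scalars = np.array([0.1, 0.2, 0.3])
    gamma = 0.5

    # Softmax-normalize the scalars, then take the weighted sum of the
    # layers and scale by gamma -- the same reference value the test builds.
    normed = np.exp(scalars) / np.sum(np.exp(scalars))
    mixed = gamma * sum(w * t for w, t in zip(normed, layers))

    assert mixed.shape == (3, 4, 5)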
+ +if __name__ == "__main__": + unittest.main() diff --git a/texar/torch/utils/test.py b/texar/torch/utils/test.py index 26bdfe10e..50a28eb27 100644 --- a/texar/torch/utils/test.py +++ b/texar/torch/utils/test.py @@ -21,6 +21,7 @@ __all__ = [ "pretrained_test", "data_test", + "cuda_test", "external_library_test", ] @@ -35,6 +36,8 @@ def define_skip_condition(flag: str, explanation: str): 'TEST_PRETRAINED', "Test requires loading pre-trained checkpoints.") data_test = define_skip_condition( 'TEST_DATA', "Test requires loading large data files.") +cuda_test = define_skip_condition( + 'TEST_CUDA', "Test requires cuda.") def external_library_test(name: str): From 1ad8c7d1c4518e385c07462bc54dbebcd46d95bb Mon Sep 17 00:00:00 2001 From: Pengzhi Gao Date: Fri, 21 Feb 2020 15:23:21 -0500 Subject: [PATCH 2/2] Polish ELMo modules --- docs/code/modules.rst | 10 + docs/spelling_wordlist.txt | 1 + .../data/tokenizers/elmo_tokenizer_utils.py | 39 +- .../tokenizers/elmo_tokenizer_utils_test.py | 67 +- texar/torch/modules/encoders/elmo_encoder.py | 29 +- texar/torch/modules/pretrained/elmo.py | 2 +- texar/torch/modules/pretrained/elmo_utils.py | 1460 ++++++----------- .../modules/pretrained/elmo_utils_test.py | 265 +-- texar/torch/utils/utils.py | 111 +- texar/torch/utils/utils_test.py | 63 +- 10 files changed, 781 insertions(+), 1266 deletions(-) diff --git a/docs/code/modules.rst b/docs/code/modules.rst index a1a8b1392..eada424b4 100644 --- a/docs/code/modules.rst +++ b/docs/code/modules.rst @@ -64,6 +64,11 @@ Encoders .. autoclass:: texar.torch.modules.BERTEncoder :members: +:hidden:`ELMoEncoder` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autoclass:: texar.torch.modules.ELMoEncoder + :members: + :hidden:`RoBERTaEncoder` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autoclass:: texar.torch.modules.RoBERTaEncoder @@ -283,6 +288,11 @@ Pre-trained .. autoclass:: texar.torch.modules.PretrainedBERTMixin :members: +:hidden:`PretrainedELMoMixin` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autoclass:: texar.torch.modules.PretrainedELMoMixin + :members: + :hidden:`PretrainedRoBERTaMixin` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. 
autoclass:: texar.torch.modules.PretrainedRoBERTaMixin diff --git a/docs/spelling_wordlist.txt b/docs/spelling_wordlist.txt index 38a5a84af..3b2871bd2 100644 --- a/docs/spelling_wordlist.txt +++ b/docs/spelling_wordlist.txt @@ -72,3 +72,4 @@ tokenizer wordpiece unigram TF +convnet diff --git a/texar/torch/data/tokenizers/elmo_tokenizer_utils.py b/texar/torch/data/tokenizers/elmo_tokenizer_utils.py index ea454d0d8..9f51168a0 100644 --- a/texar/torch/data/tokenizers/elmo_tokenizer_utils.py +++ b/texar/torch/data/tokenizers/elmo_tokenizer_utils.py @@ -30,13 +30,11 @@ ] -def _make_bos_eos( - character: int, - padding_character: int, - beginning_of_word_character: int, - end_of_word_character: int, - max_word_length: int, -): +def _make_bos_eos(character: int, + padding_character: int, + beginning_of_word_character: int, + end_of_word_character: int, + max_word_length: int): char_ids = [padding_character] * max_word_length char_ids[0] = beginning_of_word_character char_ids[1] = character @@ -86,25 +84,22 @@ def __init__(self, tokens_to_add: Optional[Dict[str, int]] = None) -> None: def convert_word_to_char_ids(self, word: str) -> List[int]: if word in self.tokens_to_add: - char_ids = ([ELMoCharacterMapper.padding_character] * - ELMoCharacterMapper.max_word_length) - char_ids[0] = ELMoCharacterMapper.beginning_of_word_character + char_ids = [self.padding_character] * self.max_word_length + char_ids[0] = self.beginning_of_word_character char_ids[1] = self.tokens_to_add[word] - char_ids[2] = ELMoCharacterMapper.end_of_word_character - elif word == ELMoCharacterMapper.bos_token: - char_ids = ELMoCharacterMapper.beginning_of_sentence_characters - elif word == ELMoCharacterMapper.eos_token: - char_ids = ELMoCharacterMapper.end_of_sentence_characters + char_ids[2] = self.end_of_word_character + elif word == self.bos_token: + char_ids = self.beginning_of_sentence_characters + elif word == self.eos_token: + char_ids = self.end_of_sentence_characters else: - word_encoded = word.encode( - "utf-8", "ignore")[: (ELMoCharacterMapper.max_word_length - 2)] - char_ids = ([ELMoCharacterMapper.padding_character] * - ELMoCharacterMapper.max_word_length) - char_ids[0] = ELMoCharacterMapper.beginning_of_word_character + word_encoded = word.encode("utf-8", "ignore")[: ( + self.max_word_length - 2)] + char_ids = [self.padding_character] * self.max_word_length + char_ids[0] = self.beginning_of_word_character for k, chr_id in enumerate(word_encoded, start=1): char_ids[k] = chr_id - char_ids[len(word_encoded) + 1] = \ - ELMoCharacterMapper.end_of_word_character + char_ids[len(word_encoded) + 1] = self.end_of_word_character # +1 one for masking return [c + 1 for c in char_ids] diff --git a/texar/torch/data/tokenizers/elmo_tokenizer_utils_test.py b/texar/torch/data/tokenizers/elmo_tokenizer_utils_test.py index f8dac6703..32e2c7a24 100644 --- a/texar/torch/data/tokenizers/elmo_tokenizer_utils_test.py +++ b/texar/torch/data/tokenizers/elmo_tokenizer_utils_test.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -Unit tests for pre-trained ELMo tokenizer. +Unit tests for the utils of pre-trained ELMo tokenizer. 
Code adapted from: `https://github.com/allenai/allennlp/blob/master/allennlp/tests/data/token_indexers/elmo_indexer_test.py` @@ -29,75 +29,42 @@ class ELMoTokenizerUtilsTest(unittest.TestCase): def test_bos_to_char_ids(self): mapper = ELMoCharacterMapper() indices = mapper.convert_word_to_char_ids('') - expected_indices = [ - 259, 257, 260, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, - 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, - 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, - 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, - ] + # [, , , , ... ] + expected_indices = [259, 257, 260] + expected_indices.extend([261] * (50 - len(expected_indices))) self.assertEqual(indices, expected_indices) def test_eos_to_char_ids(self): mapper = ELMoCharacterMapper() indices = mapper.convert_word_to_char_ids('') - expected_indices = [ - 259, 258, 260, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, - 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, - 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, - 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, - ] + expected_indices = [259, 258, 260] + expected_indices.extend([261] * (50 - len(expected_indices))) self.assertEqual(indices, expected_indices) def test_unicode_to_char_ids(self): mapper = ELMoCharacterMapper() indices = mapper.convert_word_to_char_ids(chr(256) + "t") - expected_indices = [ - 259, 197, 129, 117, 260, 261, 261, 261, 261, 261, 261, 261, 261, - 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, - 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, - 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, - ] + expected_indices = [259, 197, 129, 117, 260] + expected_indices.extend([261] * (50 - len(expected_indices))) self.assertEqual(indices, expected_indices) def test_additional_tokens(self): mapper = ELMoCharacterMapper(tokens_to_add={"": 1}) indices = mapper.convert_word_to_char_ids("") - expected_indices = [ - 259, 2, 260, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, - 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, - 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, - 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, - ] + expected_indices = [259, 2, 260] + expected_indices.extend([261] * (50 - len(expected_indices))) self.assertEqual(indices, expected_indices) def test_batch_to_ids(self): sentences = [['First', 'sentence', '.'], ['Another', '.']] indices = batch_to_ids(sentences) - expected_indices = [[[ - 259, 71, 106, 115, 116, 117, 260, 261, 261, 261, 261, 261, 261, - 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, - 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, - 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261], [ - 259, 116, 102, 111, 117, 102, 111, 100, 102, 260, 261, 261, 261, - 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, - 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, - 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261], [ - 259, 47, 260, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, - 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, - 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, - 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261]], - [[259, 66, 111, 112, 117, 105, 102, 115, 260, 261, 261, 261, 261, - 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, - 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, - 261, 261, 
261, 261, 261, 261, 261, 261, 261, 261, 261], - [259, 47, 260, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, - 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, - 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, - 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0]]] + expected_indices = [[ + [259, 71, 106, 115, 116, 117, 260] + [261] * 43, + [259, 116, 102, 111, 117, 102, 111, 100, 102, 260] + [261] * 40, + [259, 47, 260] + [261] * 47], [ + [259, 66, 111, 112, 117, 105, 102, 115, 260] + [261] * 41, + [259, 47, 260] + [261] * 47, + [0] * 50]] self.assertEqual(indices.tolist(), expected_indices) diff --git a/texar/torch/modules/encoders/elmo_encoder.py b/texar/torch/modules/encoders/elmo_encoder.py index 2011d3dbc..98c05f8a1 100644 --- a/texar/torch/modules/encoders/elmo_encoder.py +++ b/texar/torch/modules/encoders/elmo_encoder.py @@ -81,11 +81,9 @@ def __init__(self, assert options_file is not None self._elmo_lstm = _ElmoBiLm( - options_file, - weight_file, # type: ignore + options_file, weight_file, requires_grad=self.hparams.requires_grad, - vocab_to_cache=self.hparams.vocab_to_cache, - ) + vocab_to_cache=self.hparams.vocab_to_cache) tmp_dir.cleanup() self._has_cached_vocab = self.hparams.vocab_to_cache is not None @@ -97,8 +95,7 @@ def __init__(self, self._elmo_lstm.num_layers, do_layer_norm=self.hparams.do_layer_norm, initial_scalar_parameters=self.hparams.scalar_mix_parameters, - trainable=self.hparams.scalar_mix_parameters is None, - ) + trainable=self.hparams.scalar_mix_parameters is None) self.add_module("scalar_mix_{}".format(k), scalar_mix) self._scalar_mixes.append(scalar_mix) @@ -152,7 +149,7 @@ def default_hparams(): Here: - The default parameters are values for elmo-small model. + The default parameters are values for ELMo small model. `"pretrained_model_name"`: str or None The name of the pre-trained ELMo model. If None, the model @@ -164,7 +161,7 @@ def default_hparams(): `"num_output_representations"`: int The number of ELMo representation to output with different linear weighted combination of the 3 layers (i.e., character-convnet - output, 1st lstm output, 2nd lstm output). + output, the first LSTM output, the second LSTM output). `"requires_grad"`: bool If True, compute gradient of ELMo parameters for fine tuning. @@ -175,12 +172,12 @@ def default_hparams(): `"dropout"`: float The dropout to be applied to the ELMo representations. - `"vocab_to_cache"`: List[str] + `"vocab_to_cache"`: List[string] A list of words to pre-compute and cache character convolutions - for. If you use this option, Elmo expects that you pass word - indices of shape (batch_size, timesteps) to forward, instead + for. If you use this option, ELMo expects that you pass word + indices of shape `(batch_size, timesteps)` to forward, instead of character indices. If you use this option and pass a word which - wasn't pre-cached, this will break. + was not pre-cached, this will break. `"keep_sentence_boundaries"`: bool If True, the representation of the sentence boundary tokens are @@ -245,7 +242,7 @@ def forward(self, # type: ignore word ids which have been pre-cached. Returns: - A Dict with keys: + A Dictionary with keys: - :attr:`elmo_representations`: A `num_output_representations` list of ELMo representations for the input sequence. 
Each @@ -304,14 +301,12 @@ def forward(self, # type: ignore mask = processed_mask.view(original_word_size) elmo_representations = [ representation.view(original_word_size + (-1,)) - for representation in representations - ] + for representation in representations] elif len(original_shape) > 3: mask = processed_mask.view(original_shape[:-1]) elmo_representations = [ representation.view(original_shape[:-1] + (-1,)) - for representation in representations - ] + for representation in representations] else: mask = processed_mask elmo_representations = representations diff --git a/texar/torch/modules/pretrained/elmo.py b/texar/torch/modules/pretrained/elmo.py index ef616d1b5..2783aa4e5 100644 --- a/texar/torch/modules/pretrained/elmo.py +++ b/texar/torch/modules/pretrained/elmo.py @@ -37,7 +37,7 @@ class PretrainedELMoMixin(PretrainedMixin, ABC): The ELMo model was proposed in `Deep contextualized word representations`_ by `Peters et al.` from Allen Institute for Artificial Intelligence. It is - a deep bidirectional language model (biLM), which is pre-trained on a + a deep bidirectional language model (`biLM`), which is pre-trained on a large text corpus. The available ELMo models are as follows: diff --git a/texar/torch/modules/pretrained/elmo_utils.py b/texar/torch/modules/pretrained/elmo_utils.py index 65b8f2f69..be6e76c5d 100644 --- a/texar/torch/modules/pretrained/elmo_utils.py +++ b/texar/torch/modules/pretrained/elmo_utils.py @@ -15,8 +15,6 @@ Utils of ELMo Modules. Code adapted from: - `https://github.com/allenai/allennlp/blob/master/allennlp/common/checks.py` - `https://github.com/allenai/allennlp/blob/master/allennlp/common/util.py` `https://github.com/allenai/allennlp/blob/master/allennlp/modules/elmo.py` `https://github.com/allenai/allennlp/blob/master/allennlp/modules/elmo_lstm.py` `https://github.com/allenai/allennlp/blob/master/allennlp/modules/encoder_base.py` @@ -32,9 +30,7 @@ import json import logging -from itertools import islice -from typing import (Any, Callable, Dict, Iterable, Iterator, List, Optional, - Tuple, TypeVar, Union) +from typing import Any, Callable, Dict, List, Optional, Tuple, Union import h5py import numpy @@ -47,6 +43,9 @@ from texar.torch.data.tokenizers.elmo_tokenizer_utils import ( batch_to_ids, ELMoCharacterMapper) +from texar.torch.utils.utils import ( + combine_initial_dims, get_device_of, lazy_groups_of, sort_batch_by_length, + uncombine_initial_dims) # pylint: disable=attribute-defined-outside-init,protected-access @@ -54,7 +53,6 @@ "_ElmoBiLm", "_ElmoCharacterEncoder", "_EncoderBase", - "ConfigurationError", "ElmoLstm", "Embedding", "Highway", @@ -63,61 +61,34 @@ "TimeDistributed", "add_sentence_boundary_token_ids", "block_orthogonal", - "combine_initial_dims", - "get_device_of", "get_dropout_mask", - "get_lengths_from_binary_sequence_mask", - "lazy_groups_of", "remove_sentence_boundaries", - "sort_batch_by_length", - "uncombine_initial_dims", ] class _ElmoBiLm(torch.nn.Module): r"""Run a pre-trained bidirectional language model, outputting the activations at each layer for weighting together into an ELMo - representation (with `allennlp.modules.seq2seq_encoders.Elmo`). - This is a lower level class, useful for advanced uses, but most users - should use `allennlp.modules.Elmo` directly. - - # Parameters - - options_file : `str` - ELMo JSON options file - weight_file : `str` - ELMo hdf5 weight file - requires_grad : `bool`, optional, (default = False). - If True, compute gradient of ELMo parameters for fine tuning. 
- vocab_to_cache : `List[str]`, optional, (default = None). - A list of words to pre-compute and cache character convolutions - for. If you use this option, _ElmoBiLm expects that you pass word - indices of shape (batch_size, timesteps) to forward, instead - of character indices. If you use this option and pass a word which - wasn't pre-cached, this will break. + representation. + + Args: + options_file: ELMo JSON options file + weight_file: ELMo hdf5 weight file + requires_grad: If True, compute gradient of ELMo parameters for fine + tuning. + vocab_to_cache: A list of words to pre-compute and cache character + convolutions for. If you use this option, `_ElmoBiLm` expects that + you pass word indices of shape `(batch_size, timesteps)` to forward, + instead of character indices. If you use this option and pass a word + which wasn't pre-cached, this will break. """ - - def __init__( - self, - options_file: str, - weight_file: str, - requires_grad: bool = False, - vocab_to_cache: Optional[List[str]] = None, - ) -> None: + def __init__(self, options_file: str, weight_file: Optional[str] = None, + requires_grad: bool = False, + vocab_to_cache: Optional[List[str]] = None) -> None: super().__init__() - self._token_embedder = _ElmoCharacterEncoder( - options_file, weight_file, requires_grad=requires_grad - ) - + options_file, weight_file, requires_grad=requires_grad) self._requires_grad = requires_grad - if requires_grad and vocab_to_cache: - logging.warning( - "You are fine tuning ELMo and caching char CNN word vectors. " - "This behaviour is not guaranteed to be well defined, " - "particularly. " - "if not all of your inputs will occur in the vocabulary cache." - ) # This is an embedding, used to look up cached # word vectors built from character level cnn embeddings. self._word_embedding = None @@ -134,7 +105,7 @@ def __init__( with open(options_file, "r") as fin: options = json.load(fin) if not options["lstm"].get("use_skip_connections"): - raise ConfigurationError( + raise ValueError( "We only support pretrained biLMs with residual connections") self._elmo_lstm = ElmoLstm( input_size=options["lstm"]["projection_dim"], @@ -143,8 +114,7 @@ def __init__( num_layers=options["lstm"]["n_layers"], memory_cell_clip_value=options["lstm"]["cell_clip"], state_projection_clip_value=options["lstm"]["proj_clip"], - requires_grad=requires_grad, - ) + requires_grad=requires_grad) if weight_file is not None: self._elmo_lstm.load_weights(weight_file) @@ -159,28 +129,24 @@ def forward( # type: ignore ) -> Dict[str, Union[torch.Tensor, List[torch.Tensor]]]: r"""Encodes the inputs. - # Parameters - - inputs : `torch.Tensor`, required. - Shape `(batch_size, timesteps, 50)` of character ids representing - the current batch. - word_inputs : `torch.Tensor`, required. - If you passed a cached vocab, you can in addition pass a tensor of - shape `(batch_size, timesteps)`, which represent word ids which - have been pre-cached. + Args: + inputs: Shape `(batch_size, timesteps, 50)` of character ids + representing the current batch. + word_inputs: If you passed a cached vocab, you can in addition pass + a tensor of shape `(batch_size, timesteps)`, which represent + word ids which have been pre-cached. - # Returns + Returns: + Dict with keys: - Dict with keys: + - `'activations'`: A list of activations at each layer of the + network, each of shape `(batch_size, timesteps + 2, + embedding_dim)`. + - `'mask'`: Shape `(batch_size, timesteps + 2)` long tensor with + sequence mask. 
- `'activations'` : `List[torch.Tensor]` - A list of activations at each layer of the network, each of shape - `(batch_size, timesteps + 2, embedding_dim)` - `'mask'`: `torch.Tensor` - Shape `(batch_size, timesteps + 2)` long tensor with sequence mask. - - Note that the output tensors all include additional special begin and - end of sequence markers. + Note that the output tensors all include additional special begin + and end of sequence markers. """ if self._word_embedding is not None and word_inputs is not None: try: @@ -191,8 +157,7 @@ def forward( # type: ignore # shape (batch_size, timesteps + 2, embedding_dim) type_representation, mask = add_sentence_boundary_token_ids( embedded_inputs, mask_without_bos_eos, self._bos_embedding, - self._eos_embedding - ) + self._eos_embedding) except RuntimeError: # Back off to running the character convolutions, # as we might not have the words in the cache. @@ -213,8 +178,7 @@ def forward( # type: ignore # of the char cnn representations can change. output_tensors = [ torch.cat([type_representation, type_representation], dim=-1) - * mask.float().unsqueeze(-1) - ] + * mask.float().unsqueeze(-1)] for layer_activations in torch.chunk(lstm_outputs, lstm_outputs.size(0), dim=0): output_tensors.append(layer_activations.squeeze(0)) @@ -230,18 +194,13 @@ def create_cached_cnn_embeddings(self, tokens: List[str]) -> None: This function sets 3 attributes: - _word_embedding : `torch.Tensor` - The word embedding for each word in the tokens passed to this - method. - _bos_embedding : `torch.Tensor` - The embedding for the BOS token. - _eos_embedding : `torch.Tensor` - The embedding for the EOS token. - - # Parameters + _word_embedding: The word embedding for each word in the tokens passed + to this method. + _bos_embedding: The embedding for the BOS token. + _eos_embedding: The embedding for the EOS token. - tokens : `List[str]`, required. - A list of tokens to precompute character convolutions for. + Args: + tokens: A list of tokens to precompute character convolutions for. """ tokens = [ELMoCharacterMapper.bos_token, ELMoCharacterMapper.eos_token] + tokens @@ -283,50 +242,27 @@ def create_cached_cnn_embeddings(self, tokens: List[str]) -> None: embedding_dim, weight=embedding_.data, trainable=self._requires_grad, - padding_index=0, - ) + padding_index=0) class _ElmoCharacterEncoder(torch.nn.Module): - r"""Compute context insensitive token representation using pretrained biLM. + r"""Compute context insensitive token representation using pre-trained biLM. This embedder has input character ids of size - (batch_size, sequence_length, 50) - and returns (batch_size, sequence_length + 2, embedding_dim), where - embedding_dim is specified in the options file (typically 512). + `(batch_size, sequence_length, 50)` + and returns `(batch_size, sequence_length + 2, embedding_dim)`, where + `embedding_dim` is specified in the options file (typically 512). We add special entries at the beginning and end of each sequence corresponding to and , the beginning and end of sentence tokens. - Note: this is a lower level class useful for advanced usage. Most users - should use `ElmoTokenEmbedder` or `allennlp.modules.Elmo` instead. - - # Parameters - - options_file : `str` - ELMo JSON options file - weight_file : `str` - ELMo hdf5 weight file - requires_grad : `bool`, optional, (default = False). - If True, compute gradient of ELMo parameters for fine tuning. - - The relevant section of the options file is something like: - .. example-code:: - - .. 
code-block:: python - - {'char_cnn': { - 'activation': 'relu', - 'embedding': {'dim': 4}, - 'filters': [[1, 4], [2, 8], [3, 16], [4, 32], [5, 64]], - 'max_characters_per_token': 50, - 'n_characters': 262, - 'n_highway': 2 - } - } + Args: + options_file: ELMo JSON options file. + weight_file: ELMo hdf5 weight file. + requires_grad: If True, compute gradient of ELMo parameters for fine + tuning. """ - - def __init__(self, options_file: str, weight_file: str, + def __init__(self, options_file: str, weight_file: Optional[str] = None, requires_grad: bool = False) -> None: super().__init__() @@ -346,11 +282,9 @@ def __init__(self, options_file: str, weight_file: str, # Cache the arrays for use in forward -- +1 due to masking. self._beginning_of_sentence_characters = torch.from_numpy( numpy.array( - ELMoCharacterMapper.beginning_of_sentence_characters) + 1 - ) + ELMoCharacterMapper.beginning_of_sentence_characters) + 1) self._end_of_sentence_characters = torch.from_numpy( - numpy.array(ELMoCharacterMapper.end_of_sentence_characters) + 1 - ) + numpy.array(ELMoCharacterMapper.end_of_sentence_characters) + 1) def get_output_dim(self): return self.output_dim @@ -360,21 +294,18 @@ def forward(self, # type: ignore r"""Compute context insensitive token embeddings for ELMo representations. - # Parameters + Args: + inputs: Shape `(batch_size, sequence_length, 50)` of character ids + representing the current batch. - inputs : `torch.Tensor` - Shape `(batch_size, sequence_length, 50)` of character ids - representing the current batch. + Returns: + Dict with keys: - # Returns - - Dict with keys: - `'token_embedding'` : `torch.Tensor` - Shape `(batch_size, sequence_length + 2, embedding_dim)` tensor - with context insensitive token representations. - `'mask'`: `torch.Tensor` - Shape `(batch_size, sequence_length + 2)` long tensor with - sequence mask. + - `'token_embedding'`: Shape `(batch_size, sequence_length + 2, + embedding_dim)` tensor with context insensitive token + representations. + - `'mask'`: Shape `(batch_size, sequence_length + 2)` long tensor + with sequence mask. 
""" # Add BOS/EOS mask = ((inputs > 0).long().sum(dim=-1) > 0).long() @@ -399,7 +330,7 @@ def forward(self, # type: ignore elif cnn_options["activation"] == "relu": activation = torch.nn.functional.relu else: - raise ConfigurationError("Unknown activation") + raise ValueError("Unknown activation") # (batch_size * sequence_length, embed_dim, max_chars_per_token) character_embedding = torch.transpose(character_embedding, 1, 2) @@ -414,21 +345,16 @@ def forward(self, # type: ignore # (batch_size * sequence_length, n_filters) token_embedding = torch.cat(convs, dim=-1) - # apply the highway layers (batch_size * sequence_length, n_filters) token_embedding = self._highways(token_embedding) - # final projection (batch_size * sequence_length, embedding_dim) token_embedding = self._projection(token_embedding) - # reshape to (batch_size, sequence_length, embedding_dim) batch_size, sequence_length, _ = character_ids_with_bos_eos.size() - return { - "mask": mask_with_bos_eos, - "token_embedding": token_embedding.view(batch_size, - sequence_length, -1), - } + return {"mask": mask_with_bos_eos, + "token_embedding": token_embedding.view( + batch_size, sequence_length, -1)} def _load_weights(self, load_weights=True): self._load_char_embedding(load_weights) @@ -437,29 +363,23 @@ def _load_weights(self, load_weights=True): self._load_projection(load_weights) def _load_char_embedding(self, load_weights): - if load_weights: with h5py.File(self._weight_file, "r") as fin: char_embed_weights = fin["char_embed"][...] weights = numpy.zeros( (char_embed_weights.shape[0] + 1, char_embed_weights.shape[1]), - dtype="float32" - ) + dtype="float32") weights[1:, :] = char_embed_weights self._char_embedding_weights = torch.nn.Parameter( - torch.FloatTensor(weights), requires_grad=self.requires_grad - ) + torch.FloatTensor(weights), requires_grad=self.requires_grad) else: - weights = numpy.zeros( - (self._options['char_cnn']['n_characters'], - self._options['char_cnn']['embedding']['dim']), - dtype="float32" - ) + weights = numpy.zeros(( + self._options['char_cnn']['n_characters'], + self._options['char_cnn']['embedding']['dim']), dtype="float32") self._char_embedding_weights = torch.nn.Parameter( - torch.FloatTensor(weights), requires_grad=self.requires_grad - ) + torch.FloatTensor(weights), requires_grad=self.requires_grad) def _load_cnn_weights(self, load_weights): cnn_options = self._options["char_cnn"] @@ -468,11 +388,8 @@ def _load_cnn_weights(self, load_weights): convolutions = [] for i, (width, num) in enumerate(filters): - conv = torch.nn.Conv1d( - in_channels=char_embed_dim, out_channels=num, - kernel_size=width, bias=True - ) - + conv = torch.nn.Conv1d(in_channels=char_embed_dim, out_channels=num, + kernel_size=width, bias=True) if load_weights: # load the weights with h5py.File(self._weight_file, "r") as fin: @@ -485,17 +402,14 @@ def _load_cnn_weights(self, load_weights): raise ValueError("Invalid weight file") conv.weight.data.copy_(torch.FloatTensor(w_reshaped)) conv.bias.data.copy_(torch.FloatTensor(bias)) - conv.weight.requires_grad = self.requires_grad conv.bias.requires_grad = self.requires_grad convolutions.append(conv) self.add_module("char_conv_{}".format(i), conv) - self._convolutions = convolutions def _load_highway(self, load_weights): - # the highway layers have same dimensionality as the number of cnn # filters cnn_options = self._options["char_cnn"] @@ -506,7 +420,6 @@ def _load_highway(self, load_weights): # create the layers, and load the weights self._highways = Highway(n_filters, n_highway, 
activation=torch.nn.functional.relu) - if load_weights: for k in range(n_highway): # The AllenNLP highway is one matrix multplication with @@ -526,7 +439,6 @@ def _load_highway(self, load_weights): torch.FloatTensor(weight)) self._highways._layers[k].weight.requires_grad = \ self.requires_grad - b_transform = \ fin["CNN_high_{}".format(k)]["b_transform"][...] b_carry = \ @@ -544,7 +456,6 @@ def _load_projection(self, load_weights): self._projection = torch.nn.Linear(n_filters, self.output_dim, bias=True) - if load_weights: with h5py.File(self._weight_file, "r") as fin: weight = fin["CNN_proj"]["W_proj"][...] @@ -561,34 +472,27 @@ def _load_projection(self, load_weights): class _EncoderBase(torch.nn.Module): - r"""This abstract class serves as a base for the 3 `Encoder` abstractions - in AllenNLP. - - [`Seq2SeqEncoders`](./seq2seq_encoders/seq2seq_encoder.md) - - [`Seq2VecEncoders`](./seq2vec_encoders/seq2vec_encoder.md) + r"""This abstract class serves as a base for `Encoder`. Additionally, this class provides functionality for sorting sequences by - length so they can be consumed by Pytorch RNN classes, which require their + length so they can be consumed by PyTorch RNN classes, which require their inputs to be sorted by length. Finally, it also provides optional statefulness to all of it's subclasses by allowing the caching and retrieving of the hidden states of RNNs. """ - def __init__(self, stateful: bool = False) -> None: super().__init__() self.stateful = stateful self._states: Optional[RnnStateStorage] = None def sort_and_run_forward( - self, - module: Callable[ - [PackedSequence, Optional[RnnState]], - Tuple[Union[PackedSequence, torch.Tensor], RnnState], - ], - inputs: torch.Tensor, - mask: torch.Tensor, - hidden_state: Optional[RnnState] = None, - ): - r"""This function exists because Pytorch RNNs require that their inputs + self, + module: Callable[[PackedSequence, Optional[RnnState]], + Tuple[Union[PackedSequence, torch.Tensor], + RnnState]], + inputs: torch.Tensor, mask: torch.Tensor, + hidden_state: Optional[RnnState] = None): + r"""This function exists because PyTorch RNNs require that their inputs be sorted before being passed as input. As all of our Seq2xxxEncoders use this functionality, it is provided in a base class. This method can be called on any module which takes as input a `PackedSequence` and @@ -603,43 +507,34 @@ def sort_and_run_forward( outputs is left to the subclasses because their outputs have different types and handling them smoothly here is difficult. - # Parameters - - module : `Callable[[PackedSequence, Optional[RnnState]], - Tuple[Union[PackedSequence, torch.Tensor], - RnnState]]`, required. - A function to run on the inputs. In most cases, this is a - `torch.nn.Module`. - inputs : `torch.Tensor`, required. - A tensor of shape `(batch_size, sequence_length, embedding_size)` - representing the inputs to the Encoder. - mask : `torch.Tensor`, required. - A tensor of shape `(batch_size, sequence_length)`, representing - masked and non-masked elements of the sequence for each element in - the batch. - hidden_state : `Optional[RnnState]`, (default = None). - A single tensor of shape (num_layers, batch_size, hidden_size) - representing the state of an RNN with or a tuple of tensors of - shapes (num_layers, batch_size, hidden_size) and - (num_layers, batch_size, memory_size), representing the hidden - state and memory state of an LSTM-like RNN. - - # Returns - - module_output : `Union[torch.Tensor, PackedSequence]`. 
- A Tensor or PackedSequence representing the output of the Pytorch - Module. The batch size dimension will be equal to `num_valid`, as - sequences of zero length are clipped off before the module is - called, as Pytorch cannot handle zero length sequences. - final_states : `Optional[RnnState]` - A Tensor representing the hidden state of the Pytorch Module. This - can either be a single tensor of shape (num_layers, num_valid, - hidden_size), for instance in the case of a GRU, or a tuple of - tensors, such as those required for an LSTM. - restoration_indices : `torch.LongTensor` - A tensor of shape `(batch_size,)`, describing the re-indexing - required to transform the outputs back to their original batch - order. + Args: + module: A function to run on the inputs. In most cases, this is a + `torch.nn.Module`. + inputs: A tensor of shape `(batch_size, sequence_length, + embedding_size)` representing the inputs to the Encoder. + mask: A tensor of shape `(batch_size, sequence_length)`, + representing masked and non-masked elements of the sequence for + each element in the batch. + hidden_state: A single tensor of shape `(num_layers, batch_size, + hidden_size)` representing the state of an RNN with or a tuple + of tensors of shapes `(num_layers, batch_size, hidden_size)` and + `(num_layers, batch_size, memory_size)`, representing the hidden + state and memory state of an LSTM-like RNN. + + Returns: + module_output: A Tensor or `PackedSequence` representing the output + of the PyTorch Module. The batch size dimension will be equal to + `num_valid`, as sequences of zero length are clipped off before + the module is called, as PyTorch cannot handle zero length + sequences. + final_states: A Tensor representing the hidden state of the PyTorch + Module. This can either be a single tensor of shape + `(num_layers, num_valid, hidden_size)`, for instance in the case + of a GRU, or a tuple of tensors, such as those required for an + LSTM. + restoration_indices: A tensor of shape `(batch_size,)`, describing + the re-indexing required to transform the outputs back to their + original batch order. """ # In some circumstances you may have sequences of zero length. # `pack_padded_sequence` requires all sequence lengths to be > 0, so @@ -650,34 +545,25 @@ def sort_and_run_forward( batch_size = mask.size(0) num_valid = torch.sum(mask[:, 0]).int().item() - sequence_lengths = get_lengths_from_binary_sequence_mask(mask) - ( - sorted_inputs, - sorted_sequence_lengths, - restoration_indices, - sorting_indices, - ) = sort_batch_by_length(inputs, sequence_lengths) + sequence_lengths = mask.long().sum(-1) + (sorted_inputs, sorted_sequence_lengths, restoration_indices, + sorting_indices) = sort_batch_by_length(inputs, sequence_lengths) # Now create a PackedSequence with only the non-empty, sorted sequences. packed_sequence_input = pack_padded_sequence( sorted_inputs[:num_valid, :, :], - sorted_sequence_lengths[:num_valid].data.tolist(), - batch_first=True, - ) + sorted_sequence_lengths[:num_valid].data.tolist(), batch_first=True) # Prepare the initial states. 
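# Illustrative sketch (toy sizes; the LSTM and tensors below are stand-ins,
# not objects from this module) of the sort -> pack -> run -> restore pattern
# that `sort_and_run_forward` wraps for its callers.
import torch
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

lstm = torch.nn.LSTM(input_size=3, hidden_size=7, batch_first=True)
inputs = torch.randn(5, 11, 3)                       # (batch, time, dim)
lengths = torch.tensor([11, 3, 0, 7, 2])             # one all-padding row

sorted_lengths, sorting_indices = lengths.sort(0, descending=True)
_, restoration_indices = sorting_indices.sort(0)     # inverse permutation
num_valid = int((lengths > 0).sum())                 # drop zero-length rows

packed = pack_padded_sequence(
    inputs.index_select(0, sorting_indices)[:num_valid],
    sorted_lengths[:num_valid].tolist(), batch_first=True)
output, _ = lstm(packed)
unpacked, _ = pad_packed_sequence(output, batch_first=True)

# Pad the clipped rows back with zeros, then undo the length sort.
zeros = unpacked.new_zeros(5 - num_valid, unpacked.size(1), unpacked.size(2))
restored = torch.cat([unpacked, zeros], 0).index_select(0, restoration_indices)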
if not self.stateful: if hidden_state is None: initial_states: Any = hidden_state elif isinstance(hidden_state, tuple): - initial_states = [ - state.index_select( - 1, sorting_indices)[:, :num_valid, :].contiguous() - for state in hidden_state - ] + initial_states = [state.index_select( + 1, sorting_indices)[:, :num_valid, :].contiguous() + for state in hidden_state] else: initial_states = hidden_state.index_select(1, sorting_indices)[ :, :num_valid, :].contiguous() - else: initial_states = self._get_initial_states(batch_size, num_valid, sorting_indices) @@ -685,12 +571,11 @@ def sort_and_run_forward( # Actually call the module on the sorted PackedSequence. module_output, final_states = module(packed_sequence_input, initial_states) - return module_output, final_states, restoration_indices - def _get_initial_states( - self, batch_size: int, num_valid: int, sorting_indices: torch.LongTensor - ) -> Optional[RnnState]: + def _get_initial_states(self, batch_size: int, num_valid: int, + sorting_indices: torch.LongTensor) -> \ + Optional[RnnState]: r"""Returns an initial state for use in an RNN. Additionally, this method handles the batch size changing across calls by mutating the state to append initial states for new elements in the batch. Finally, @@ -699,36 +584,32 @@ def _get_initial_states( Importantly, this `mutates` the state if the current batch size is larger than when it was previously called. - # Parameters - - batch_size : `int`, required. - The batch size can change size across calls to stateful RNNs, so we - need to know if we need to expand or shrink the states before - returning them. Expanded states will be set to zero. - num_valid : `int`, required. - The batch may contain completely padded sequences which get removed - before the sequence is passed through the encoder. We also need to - clip these off of the state too. - sorting_indices `torch.LongTensor`, required. - Pytorch RNNs take sequences sorted by length. When we return the - states to be used for a given call to `module.forward`, we need the - states to match up to the sorted sequences, so before returning - them, we sort the states using the same indices used to sort the - sequences. - - # Returns - - This method has a complex return type because it has to deal with the - first time it is called, when it has no state, and the fact that types - of RNN have heterogeneous states. - - If it is the first time the module has been called, it returns `None`, - regardless of the type of the `Module`. - - Otherwise, for LSTMs, it returns a tuple of `torch.Tensors` with shape - `(num_layers, num_valid, state_size)` and `(num_layers, num_valid, - memory_size)` respectively, or for GRUs, it returns a single - `torch.Tensor` of shape `(num_layers, num_valid, state_size)`. + Args: + batch_size: The batch size can change size across calls to stateful + RNNs, so we need to know if we need to expand or shrink the + states before returning them. Expanded states will be set to + zero. + num_valid: The batch may contain completely padded sequences which + get removed before the sequence is passed through the encoder. + We also need to clip these off of the state too. + sorting_indices: Pytorch RNNs take sequences sorted by length. When + we return the states to be used for a given call to + `module.forward`, we need the states to match up to the sorted + sequences, so before returning them, we sort the states using + the same indices used to sort the sequences. 
+ + Returns: + This method has a complex return type because it has to deal with + the first time it is called, when it has no state, and the fact that + types of RNN have heterogeneous states. + + If it is the first time the module has been called, it returns + `None`, regardless of the type of the `Module`. + + Otherwise, for LSTMs, it returns a tuple of `torch.Tensors` with + shape `(num_layers, num_valid, state_size)` and `(num_layers, + num_valid, memory_size)` respectively, for GRUs, it returns a single + `torch.Tensor` of shape `(num_layers, num_valid, state_size)`. """ # We don't know the state sizes the first time calling forward, # so we let the module define what it's initial hidden state looks like. @@ -750,7 +631,6 @@ def _get_initial_states( resized_states.append(torch.cat([state, zeros], 1)) self._states = tuple(resized_states) correctly_shaped_states = self._states - elif batch_size < self._states[0].size(1): # This batch is smaller than the previous one. correctly_shaped_states = tuple(state[:, :batch_size, :] for state @@ -773,10 +653,8 @@ def _get_initial_states( return sorted_state[:, :num_valid, :].contiguous() else: # LSTMs have a state tuple of (state, memory). - sorted_states = [ - state.index_select(1, sorting_indices) for state in - correctly_shaped_states - ] + sorted_states = [state.index_select(1, sorting_indices) for state in + correctly_shaped_states] return tuple(state[:, :num_valid, :].contiguous() # type: ignore for state in sorted_states) @@ -790,18 +668,12 @@ def _update_states(self, final_states: RnnStateStorage, computational graph, such that the graph can be garbage collected after each batch iteration. - # Parameters - - final_states : `RnnStateStorage`, required. - The hidden states returned as output from the RNN. - restoration_indices : `torch.LongTensor`, required. - The indices that invert the sorting used in `sort_and_run_forward` - to order the states with respect to the lengths of the sequences in - the batch. + Args: + final_states: The hidden states returned as output from the RNN. + restoration_indices: The indices that invert the sorting used in + `sort_and_run_forward` to order the states with respect to the + lengths of the sequences in the batch. """ - # TODO(Mark): seems weird to sort here, but append zeros in the - # subclasses. - # which way around is best? new_unsorted_states = [state.index_select(1, restoration_indices) for state in final_states] @@ -820,18 +692,14 @@ def _update_states(self, final_states: RnnStateStorage, current_state_batch_size = self._states[0].size(1) new_state_batch_size = final_states[0].size(1) # Masks for the unused states of shape (1, new_batch_size, 1) - used_new_rows_mask = [ - (state[0, :, :].sum(-1) != 0.0).float().view( - 1, new_state_batch_size, 1) - for state in new_unsorted_states - ] + used_new_rows_mask = [(state[0, :, :].sum(-1) != 0.0).float().view( + 1, new_state_batch_size, 1) for state in new_unsorted_states] new_states = [] if current_state_batch_size > new_state_batch_size: # The new state is smaller than the old one, # so just update the indices which we used. for old_state, new_state, used_mask in zip( - self._states, new_unsorted_states, used_new_rows_mask - ): + self._states, new_unsorted_states, used_new_rows_mask): # zero out all rows in the previous state # which _were_ used in the current state. masked_old_state = \ @@ -846,8 +714,7 @@ def _update_states(self, final_states: RnnStateStorage, # deal with the possibility that some rows weren't used. 
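# Toy illustration (assumed sizes) of the "used rows" masking applied here:
# a row of the newly returned state counts as used iff its first-layer
# activations are not all zero, and only used rows overwrite the cached state.
import torch

cached = torch.ones(2, 4, 3)                  # (num_layers, batch, dim)
new = torch.zeros(2, 4, 3)
new[:, :2, :] = 2.0                           # only rows 0 and 1 were used
used = (new[0, :, :].sum(-1) != 0.0).float().view(1, 4, 1)
updated = cached * (1 - used) + new           # rows 0-1 replaced, 2-3 kept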
new_states = [] for old_state, new_state, used_mask in zip( - self._states, new_unsorted_states, used_new_rows_mask - ): + self._states, new_unsorted_states, used_new_rows_mask): # zero out all rows which _were_ used in the current state. masked_old_state = old_state * (1 - used_mask) # The old state is larger, so update the relevant parts of @@ -867,11 +734,9 @@ def _update_states(self, final_states: RnnStateStorage, def reset_states(self, mask: Optional[torch.Tensor] = None) -> None: r"""Resets the internal states of a stateful encoder. - # Parameters - - mask : `torch.Tensor`, optional. - A tensor of shape `(batch_size,)` indicating which states should - be reset. If not provided, all states will be reset. + Args: + mask: A tensor of shape `(batch_size,)` indicating which states + should be reset. If not provided, all states will be reset. """ if mask is None: self._states = None @@ -890,19 +755,17 @@ def reset_states(self, mask: Optional[torch.Tensor] = None) -> None: f"Trying to reset states using mask with incorrect " f"batch size. " f"Expected batch size: {old_state_batch_size}. " - f"Provided batch size: {mask_batch_size}." - ) + f"Provided batch size: {mask_batch_size}.") new_state = (1 - mask) * old_state new_states.append(new_state.detach()) self._states = tuple(new_states) class ElmoLstm(_EncoderBase): - r"""A stacked, bidirectional LSTM which uses - [`LstmCellWithProjection`'s](./lstm_cell_with_projection.md) - with highway layers between the inputs to layers. - The inputs to the forward and backward directions are independent - - forward and backward states are not concatenated between layers. + r"""A stacked, bidirectional LSTM which uses `LstmCellWithProjection`'s + with highway layers between the inputs to layers. The inputs to the forward + and backward directions are independent - forward and backward states are + not concatenated between layers. Additionally, this LSTM maintains its `own` state, which is updated every time `forward` is called. It is dynamically resized for different batch @@ -912,39 +775,29 @@ class ElmoLstm(_EncoderBase): This is non-standard, but can be thought of as having an "end of sentence" state, which is carried across different sentences. - # Parameters - - input_size : `int`, required - The dimension of the inputs to the LSTM. - hidden_size : `int`, required - The dimension of the outputs of the LSTM. - cell_size : `int`, required. - The dimension of the memory cell of the `LstmCellWithProjection`. - num_layers : `int`, required - The number of bidirectional LSTMs to use. - requires_grad : `bool`, optional - If True, compute gradient of ELMo parameters for fine tuning. - recurrent_dropout_probability : `float`, optional (default = 0.0) - The dropout probability to be used in a dropout scheme as stated in - [A Theoretically Grounded Application of Dropout in Recurrent Neural - Networks](https://arxiv.org/abs/1512.05287). - state_projection_clip_value : `float`, optional, (default = None) - The magnitude with which to clip the hidden_state after projecting it. - memory_cell_clip_value : `float`, optional, (default = None) - The magnitude with which to clip the memory cell. + Args: + input_size: The dimension of the inputs to the LSTM. + hidden_size: The dimension of the outputs of the LSTM. + cell_size: The dimension of the memory cell of the + `LstmCellWithProjection`. + num_layers: The number of bidirectional LSTMs to use. + requires_grad: If True, compute gradient of ELMo parameters for fine + tuning. 
+ recurrent_dropout_probability: The dropout probability to be used in a + dropout scheme as stated in [A Theoretically Grounded Application of + Dropout in Recurrent Neural Networks] + (https://arxiv.org/abs/1512.05287). + state_projection_clip_value: The magnitude with which to clip the + `hidden_state` after projecting it. + memory_cell_clip_value: The magnitude with which to clip the memory + cell. """ - def __init__( - self, - input_size: int, - hidden_size: int, - cell_size: int, - num_layers: int, - requires_grad: bool = False, - recurrent_dropout_probability: float = 0.0, - memory_cell_clip_value: Optional[float] = None, - state_projection_clip_value: Optional[float] = None, - ) -> None: + def __init__(self, input_size: int, hidden_size: int, cell_size: int, + num_layers: int, requires_grad: bool = False, + recurrent_dropout_probability: float = 0.0, + memory_cell_clip_value: Optional[float] = None, + state_projection_clip_value: Optional[float] = None) -> None: super().__init__(stateful=True) # Required to be wrapped with a `PytorchSeq2SeqWrapper`. @@ -961,23 +814,13 @@ def __init__( go_forward = True for layer_index in range(num_layers): forward_layer = LstmCellWithProjection( - lstm_input_size, - hidden_size, - cell_size, - go_forward, - recurrent_dropout_probability, - memory_cell_clip_value, - state_projection_clip_value, - ) + lstm_input_size, hidden_size, cell_size, go_forward, + recurrent_dropout_probability, memory_cell_clip_value, + state_projection_clip_value) backward_layer = LstmCellWithProjection( - lstm_input_size, - hidden_size, - cell_size, - not go_forward, - recurrent_dropout_probability, - memory_cell_clip_value, - state_projection_clip_value, - ) + lstm_input_size, hidden_size, cell_size, not go_forward, + recurrent_dropout_probability, memory_cell_clip_value, + state_projection_clip_value) lstm_input_size = hidden_size self.add_module("forward_layer_{}".format(layer_index), @@ -993,19 +836,17 @@ def forward(self, inputs: torch.Tensor, # type: ignore mask: torch.LongTensor) -> torch.Tensor: r"""Encodes the inputs. - # Parameters - - inputs : `torch.Tensor`, required. - A Tensor of shape `(batch_size, sequence_length, hidden_size)`. - mask : `torch.LongTensor`, required. - A binary mask of shape `(batch_size, sequence_length)` representing - the non-padded elements in each sequence in the batch. - - # Returns - - A `torch.Tensor` of shape (num_layers, batch_size, sequence_length, - hidden_size), where the num_layers dimension represents the LSTM output - from that layer. + Args: + inputs: A Tensor of shape + `(batch_size, sequence_length, hidden_size)`. + mask: A binary mask of shape `(batch_size, sequence_length)` + representing the non-padded elements in each sequence in the + batch. + + Returns: + A `torch.Tensor` of shape `(num_layers, batch_size, sequence_length, + hidden_size)`, where the `num_layers` dimension represents the LSTM + output from that layer. """ batch_size, total_sequence_length = mask.size() stacked_sequence_output, final_states, restoration_indices = \ @@ -1018,11 +859,9 @@ def forward(self, inputs: torch.Tensor, # type: ignore if num_valid < batch_size: zeros = stacked_sequence_output.new_zeros( num_layers, batch_size - num_valid, returned_timesteps, - encoder_dim - ) + encoder_dim) stacked_sequence_output = torch.cat( [stacked_sequence_output, zeros], 1) - # The states also need to have invalid rows added back. 
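# Illustrative sketch of running the `ElmoLstm` defined in this module; the
# hyper-parameters below are toy values assumed for the example, not the ones
# read from a pre-trained options file.
import torch

lstm = ElmoLstm(input_size=32, hidden_size=32, cell_size=64, num_layers=2,
                memory_cell_clip_value=3.0, state_projection_clip_value=3.0)
token_embeddings = torch.rand(4, 10, 32)      # (batch, time, input_size)
mask = torch.ones(4, 10, dtype=torch.long)    # no padding in this toy batch
layer_outputs = lstm(token_embeddings, mask)  # (num_layers, 4, 10, 2 * 32)
lstm.reset_states()                           # drop the carried-over state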
new_states = [] for state in final_states: @@ -1040,58 +879,45 @@ def forward(self, inputs: torch.Tensor, # type: ignore sequence_length_difference = total_sequence_length - returned_timesteps if sequence_length_difference > 0: zeros = stacked_sequence_output.new_zeros( - num_layers, - batch_size, - sequence_length_difference, - stacked_sequence_output[0].size(-1), - ) + num_layers, batch_size, sequence_length_difference, + stacked_sequence_output[0].size(-1)) stacked_sequence_output = torch.cat( [stacked_sequence_output, zeros], 2) - self._update_states(final_states, restoration_indices) # Restore the original indices and return the sequence. # Has shape (num_layers, batch_size, sequence_length, hidden_size) return stacked_sequence_output.index_select(1, restoration_indices) - def _lstm_forward( - self, - inputs: PackedSequence, - initial_state: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, - ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: + def _lstm_forward(self, inputs: PackedSequence, + initial_state: Optional[ + Tuple[torch.Tensor, torch.Tensor]] = None) -> \ + Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: r"""Encodes the inputs. - # Parameters - - inputs : `PackedSequence`, required. - A batch first `PackedSequence` to run the stacked LSTM over. - initial_state : `Tuple[torch.Tensor, torch.Tensor]`, optional, - (default = None) - A tuple (state, memory) representing the initial hidden state and - memory of the LSTM, with shape - (num_layers, batch_size, 2 * hidden_size) and - (num_layers, batch_size, 2 * cell_size) respectively. - - # Returns - - output_sequence : `torch.FloatTensor` - The encoded sequence of shape - (num_layers, batch_size, sequence_length, hidden_size) - final_states : `Tuple[torch.FloatTensor, torch.FloatTensor]` - The per-layer final (state, memory) states of the LSTM, with shape - (num_layers, batch_size, 2 * hidden_size) and - (num_layers, batch_size, 2 * cell_size) - respectively. The last dimension is duplicated because it - contains the state/memory for both the forward and backward layers. + Args: + inputs: A batch first `PackedSequence` to run the stacked LSTM over. + initial_state: A tuple (state, memory) representing the initial + hidden state and memory of the LSTM, with shape + `(num_layers, batch_size, 2 * hidden_size)` and + `(num_layers, batch_size, 2 * cell_size)` respectively. + + Returns: + output_sequence: The encoded sequence of shape + `(num_layers, batch_size, sequence_length, hidden_size)`. + final_states: The per-layer final (state, memory) states of the + LSTM, with shape `(num_layers, batch_size, 2 * hidden_size)` and + `(num_layers, batch_size, 2 * cell_size)` respectively. The + last dimension is duplicated because it contains the + state/memory for both the forward and backward layers. """ if initial_state is None: hidden_states: List[Optional[Tuple[torch.Tensor, torch.Tensor]]] = \ [None] * len(self.forward_layers) elif initial_state[0].size()[0] != len(self.forward_layers): - raise ConfigurationError( + raise ValueError( "Initial states were passed to forward() but the number of " - "initial states does not match the number of layers." 
- ) + "initial states does not match the number of layers.") else: hidden_states = list(zip(initial_state[0].split(1, 0), initial_state[1].split(1, 0))) @@ -1111,6 +937,8 @@ def _lstm_forward( forward_cache = forward_output_sequence backward_cache = backward_output_sequence + forward_state = None + backward_state = None if state is not None: forward_hidden_state, backward_hidden_state = state[0].split( self.hidden_size, 2) @@ -1118,16 +946,11 @@ def _lstm_forward( self.cell_size, 2) forward_state = (forward_hidden_state, forward_memory_state) backward_state = (backward_hidden_state, backward_memory_state) - else: - forward_state = None # type: ignore - backward_state = None # type: ignore forward_output_sequence, forward_state = forward_layer( - forward_output_sequence, batch_lengths, forward_state - ) + forward_output_sequence, batch_lengths, forward_state) backward_output_sequence, backward_state = backward_layer( - backward_output_sequence, batch_lengths, backward_state - ) + backward_output_sequence, batch_lengths, backward_state) # Skip connections, just adding the input to the output. if layer_index != 0: forward_output_sequence += forward_cache @@ -1135,16 +958,12 @@ def _lstm_forward( sequence_outputs.append( torch.cat([forward_output_sequence, backward_output_sequence], - -1) - ) + -1)) # Append the state tuples in a list, so that we can return # the final states for all the layers. final_states.append( - ( - torch.cat([forward_state[0], backward_state[0]], -1), - torch.cat([forward_state[1], backward_state[1]], -1), - ) - ) + (torch.cat([forward_state[0], backward_state[0]], -1), + torch.cat([forward_state[1], backward_state[1]], -1))) stacked_sequence_outputs: torch.FloatTensor = torch.stack( sequence_outputs) @@ -1154,8 +973,7 @@ def _lstm_forward( final_hidden_states, final_memory_states = zip(*final_states) final_state_tuple: Tuple[torch.FloatTensor, torch.FloatTensor] = ( torch.cat(final_hidden_states, 0), - torch.cat(final_memory_states, 0), - ) + torch.cat(final_memory_states, 0)) return stacked_sequence_outputs, final_state_tuple def load_weights(self, weight_file: str) -> None: @@ -1169,12 +987,8 @@ def load_weights(self, weight_file: str) -> None: for j_direction, lstm in enumerate(lstms): # lstm is an instance of LSTMCellWithProjection cell_size = lstm.cell_size - dataset = fin["RNN_%s" % j_direction]["RNN"][ - "MultiRNNCell"][ - "Cell%s" % i_layer - ]["LSTMCell"] - + "MultiRNNCell"]["Cell%s" % i_layer]["LSTMCell"] # tensorflow packs together both W and U matrices into one # matrix, but pytorch maintains individual matrices. 
In # addition, tensorflow packs the gates as input, memory, @@ -1193,14 +1007,11 @@ def load_weights(self, weight_file: str) -> None: # handle the different gate order convention for torch_w, tf_w in [ [input_weights, tf_input_weights], - [recurrent_weights, tf_recurrent_weights], - ]: + [recurrent_weights, tf_recurrent_weights]]: torch_w[(1 * cell_size): (2 * cell_size), :] = tf_w[ - (2 * cell_size): (3 * cell_size), : - ] + (2 * cell_size): (3 * cell_size), :] torch_w[(2 * cell_size): (3 * cell_size), :] = tf_w[ - (1 * cell_size): (2 * cell_size), : - ] + (1 * cell_size): (2 * cell_size), :] lstm.input_linearity.weight.data.copy_(torch.FloatTensor( input_weights)) @@ -1216,11 +1027,9 @@ def load_weights(self, weight_file: str) -> None: tf_bias[(2 * cell_size): (3 * cell_size)] += 1 torch_bias = tf_bias.copy() torch_bias[(1 * cell_size): (2 * cell_size)] = tf_bias[ - (2 * cell_size): (3 * cell_size) - ] + (2 * cell_size): (3 * cell_size)] torch_bias[(2 * cell_size): (3 * cell_size)] = tf_bias[ - (1 * cell_size): (2 * cell_size) - ] + (1 * cell_size): (2 * cell_size)] lstm.state_linearity.bias.data.copy_(torch.FloatTensor( torch_bias)) lstm.state_linearity.bias.requires_grad = requires_grad @@ -1234,56 +1043,43 @@ def load_weights(self, weight_file: str) -> None: class LstmCellWithProjection(torch.nn.Module): r"""An LSTM with Recurrent Dropout and a projected and clipped hidden state - and memory. Note: this implementation is slower than the native Pytorch + and memory. Note: this implementation is slower than the native PyTorch LSTM because it cannot make use of CUDNN optimizations for stacked RNNs due to and variational dropout and the custom nature of the cell state. - # Parameters - - input_size : `int`, required. - The dimension of the inputs to the LSTM. - hidden_size : `int`, required. - The dimension of the outputs of the LSTM. - cell_size : `int`, required. - The dimension of the memory cell used for the LSTM. - go_forward : `bool`, optional (default = True) - The direction in which the LSTM is applied to the sequence. - Forwards by default, or backwards if False. - recurrent_dropout_probability : `float`, optional (default = 0.0) - The dropout probability to be used in a dropout scheme as stated in - [A Theoretically Grounded Application of Dropout in Recurrent Neural - Networks] (https://arxiv.org/abs/1512.05287). Implementation wise, - this simply applies a fixed dropout mask per sequence to the recurrent - connection of the LSTM. - state_projection_clip_value : `float`, optional, (default = None) - The magnitude with which to clip the hidden_state after projecting it. - memory_cell_clip_value : `float`, optional, (default = None) - The magnitude with which to clip the memory cell. - - # Returns - - output_accumulator : `torch.FloatTensor` - The outputs of the LSTM for each timestep. A tensor of shape - (batch_size, max_timesteps, hidden_size) where for a given batch - element, all outputs past the sequence length for that batch are - zero tensors. - final_state : `Tuple[torch.FloatTensor, torch.FloatTensor]` - The final (state, memory) states of the LSTM, with shape - (1, batch_size, hidden_size) and (1, batch_size, cell_size) - respectively. The first dimension is 1 in order to match the Pytorch - API for returning stacked LSTM states. + Args: + input_size: The dimension of the inputs to the LSTM. + hidden_size: The dimension of the outputs of the LSTM. + cell_size: The dimension of the memory cell used for the LSTM. 
+ go_forward: The direction in which the LSTM is applied to the sequence. + Forwards by default, or backwards if False. + recurrent_dropout_probability: The dropout probability to be used in a + dropout scheme as stated in [A Theoretically Grounded Application of + Dropout in Recurrent Neural Networks] + (https://arxiv.org/abs/1512.05287). Implementation wise, this simply + applies a fixed dropout mask per sequence to the recurrent + connection of the LSTM. + state_projection_clip_value: The magnitude with which to clip the + `hidden_state` after projecting it. + memory_cell_clip_value: The magnitude with which to clip the memory + cell. + + Returns: + output_accumulator: The outputs of the LSTM for each timestep. A tensor + of shape `(batch_size, max_timesteps, hidden_size)` where for a + given batch element, all outputs past the sequence length for that + batch are zero tensors. + final_state: The final (state, memory) states of the LSTM, with shape + `(1, batch_size, hidden_size)` and `(1, batch_size, cell_size)` + respectively. The first dimension is 1 in order to match the PyTorch + API for returning stacked LSTM states. """ - def __init__( - self, - input_size: int, - hidden_size: int, - cell_size: int, - go_forward: bool = True, - recurrent_dropout_probability: float = 0.0, - memory_cell_clip_value: Optional[float] = None, - state_projection_clip_value: Optional[float] = None, - ) -> None: + def __init__(self, input_size: int, hidden_size: int, cell_size: int, + go_forward: bool = True, + recurrent_dropout_probability: float = 0.0, + memory_cell_clip_value: Optional[float] = None, + state_projection_clip_value: Optional[float] = None) -> None: super().__init__() # Required to be wrapped with a `PytorchSeq2SeqWrapper`. self.input_size = input_size @@ -1296,14 +1092,13 @@ def __init__( self.recurrent_dropout_probability = recurrent_dropout_probability # We do the projections for all the gates all at once. - self.input_linearity = torch.nn.Linear( - input_size, 4 * cell_size, bias=False) - self.state_linearity = torch.nn.Linear( - hidden_size, 4 * cell_size, bias=True) - + self.input_linearity = torch.nn.Linear(input_size, 4 * cell_size, + bias=False) + self.state_linearity = torch.nn.Linear(hidden_size, 4 * cell_size, + bias=True) # Additional projection matrix for making the hidden state smaller. - self.state_projection = torch.nn.Linear( - cell_size, hidden_size, bias=False) + self.state_projection = torch.nn.Linear(cell_size, hidden_size, + bias=False) self.reset_parameters() def reset_parameters(self): @@ -1319,46 +1114,36 @@ def reset_parameters(self): self.state_linearity.bias.data[self.cell_size: 2 * self.cell_size].fill_(1.0) - def forward( # type: ignore - self, - inputs: torch.FloatTensor, - batch_lengths: List[int], - initial_state: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, - ): + def forward(self, inputs: torch.FloatTensor, # type: ignore + batch_lengths: List[int], + initial_state: Optional[Tuple[torch.Tensor, + torch.Tensor]] = None): r"""Process the inputs. - # Parameters - - inputs : `torch.FloatTensor`, required. - A tensor of shape (batch_size, num_timesteps, input_size) - to apply the LSTM over. - batch_lengths : `List[int]`, required. - A list of length batch_size containing the lengths of the sequences - in batch. - initial_state : `Tuple[torch.Tensor, torch.Tensor]`, optional, - (default = None) - A tuple (state, memory) representing the initial hidden state and - memory of the LSTM. 
The `state` has shape (1, batch_size, - hidden_size) and the `memory` has shape (1, batch_size, cell_size). - - # Returns - - output_accumulator : `torch.FloatTensor` - The outputs of the LSTM for each timestep. A tensor of shape - (batch_size, max_timesteps, hidden_size) where for a given batch - element, all outputs past the sequence length for that batch are - zero tensors. - final_state : `Tuple[`torch.FloatTensor, torch.FloatTensor]` - A tuple (state, memory) representing the initial hidden state and - memory of the LSTM. The `state` has shape (1, batch_size, - hidden_size) and the `memory` has shape (1, batch_size, cell_size). + Args: + inputs: A tensor of shape `(batch_size, num_timesteps, input_size)` + to apply the LSTM over. + batch_lengths: A list of length batch_size containing the lengths + of the sequences in batch. + initial_state: A tuple (state, memory) representing the initial + hidden state and memory of the LSTM. The `state` has shape + `(1, batch_size, hidden_size)` and the `memory` has shape + `(1, batch_size, cell_size)`. + + Returns: + output_accumulator: The outputs of the LSTM for each timestep. A + tensor of shape `(batch_size, max_timesteps, hidden_size)` where + for a given batch element, all outputs past the sequence length + for that batch are zero tensors. + final_state: A tuple (state, memory) representing the initial hidden + state and memory of the LSTM. The `state` has shape + `(1, batch_size, hidden_size)` and the `memory` has shape + `(1, batch_size, cell_size)`. """ batch_size = inputs.size()[0] total_timesteps = inputs.size()[1] - output_accumulator = inputs.new_zeros(batch_size, total_timesteps, self.hidden_size) - if initial_state is None: full_batch_previous_memory = inputs.new_zeros(batch_size, self.cell_size) @@ -1371,8 +1156,7 @@ def forward( # type: ignore current_length_index = batch_size - 1 if self.go_forward else 0 if self.recurrent_dropout_probability > 0.0 and self.training: dropout_mask = get_dropout_mask( - self.recurrent_dropout_probability, full_batch_previous_state - ) + self.recurrent_dropout_probability, full_batch_previous_state) else: dropout_mask = None @@ -1380,7 +1164,6 @@ def forward( # type: ignore # The index depends on which end we start. index = timestep if self.go_forward else \ total_timesteps - timestep - 1 - # What we are doing here is finding the index into the batch # dimension which we need to use for this timestep, because the # sequences have variable length, so once the index is greater than @@ -1400,10 +1183,8 @@ def forward( # type: ignore # elements in the batch? # Second conditional: Does the next shortest sequence beyond # the current batch index require computation use this timestep? - while ( - current_length_index < (len(batch_lengths) - 1) - and batch_lengths[current_length_index + 1] > index - ): + while (current_length_index < (len(batch_lengths) - 1) + and batch_lengths[current_length_index + 1] > index): current_length_index += 1 # Actually get the slices of the batch which we @@ -1445,13 +1226,9 @@ def forward( # type: ignore # Here is the non-standard part of this LSTM cell; first, we clip # the memory cell, then we project the output of the timestep to a # smaller size and again clip it. 
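# Illustrative sketch (assumed toy sizes) of calling `LstmCellWithProjection`
# directly; `batch_lengths` must be in decreasing order, matching how
# `ElmoLstm` feeds it from a length-sorted `PackedSequence`.
import torch

cell = LstmCellWithProjection(input_size=8, hidden_size=5, cell_size=10,
                              memory_cell_clip_value=3.0,
                              state_projection_clip_value=3.0)
inputs = torch.rand(3, 6, 8)          # (batch, max_timesteps, input_size)
batch_lengths = [6, 4, 2]             # decreasing sequence lengths
outputs, (state, memory) = cell(inputs, batch_lengths)
# outputs: (3, 6, 5); state: (1, 3, 5); memory: (1, 3, 10)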
- if self.memory_cell_clip_value: - - memory = torch.clamp( - memory, -self.memory_cell_clip_value, - self.memory_cell_clip_value - ) + memory = torch.clamp(memory, -self.memory_cell_clip_value, + self.memory_cell_clip_value) # shape (current_length_index, cell_size) pre_projection_timestep_output = output_gate * torch.tanh(memory) @@ -1460,12 +1237,9 @@ def forward( # type: ignore timestep_output = self.state_projection( pre_projection_timestep_output) if self.state_projection_clip_value: - timestep_output = torch.clamp( - timestep_output, - -self.state_projection_clip_value, - self.state_projection_clip_value, - ) + timestep_output, -self.state_projection_clip_value, + self.state_projection_clip_value) # Only do dropout if the dropout prob is > 0.0 and we are in # training mode. @@ -1488,46 +1262,36 @@ def forward( # type: ignore # Mimic the pytorch API by returning state in the following shape: # (num_layers * num_directions, batch_size, ...). As this # LSTM cell cannot be stacked, the first dimension here is just 1. - final_state = ( - full_batch_previous_state.unsqueeze(0), - full_batch_previous_memory.unsqueeze(0), - ) - + final_state = (full_batch_previous_state.unsqueeze(0), + full_batch_previous_memory.unsqueeze(0)) return output_accumulator, final_state class Highway(torch.nn.Module): r"""A [Highway layer](https://arxiv.org/abs/1505.00387) does a gated combination of a linear transformation and a non-linear transformation of - its input. :math:`y = g * x + (1 - g) * f(A(x))`, - where :math:`A` is a linear transformation, :math:`f` is an element-wise - non-linearity, and :math:`g` is an element-wise gate, computed - as :math:`sigmoid(B(x))`. + its input. :math:`y = g * x + (1 - g) * f(A(x))`, where :math:`A` is a + linear transformation, :math:`f` is an element-wise non-linearity, and + :math:`g` is an element-wise gate, computed as :math:`sigmoid(B(x))`. This module will apply a fixed number of highway layers to its input, returning the final result. - # Parameters - - input_dim : `int`, required - The dimensionality of :math:`x`. We assume the input has shape - `(batch_size, ..., input_dim)`. - num_layers : `int`, optional (default=`1`) - The number of highway layers to apply to the input. - activation : `Callable[[torch.Tensor], torch.Tensor]`, optional - (default=`torch.nn.functional.relu`) - The non-linearity to use in the highway layers. + Args: + input_dim: The dimensionality of :math:`x`. We assume the input has + shape `(batch_size, ..., input_dim)`. + num_layers: The number of highway layers to apply to the input. + activation: The non-linearity to use in the highway layers. """ def __init__(self, input_dim: int, num_layers: int = 1, activation: Callable[[torch.Tensor], torch.Tensor] = - torch.nn.functional.relu,) -> None: + torch.nn.functional.relu) -> None: super().__init__() self._input_dim = input_dim self._layers = torch.nn.ModuleList( [torch.nn.Linear(input_dim, input_dim * 2) - for _ in range(num_layers)] - ) + for _ in range(num_layers)]) self._activation = activation for layer in self._layers: # We should bias the highway layer to just carry its input forward. @@ -1560,74 +1324,51 @@ class Embedding(torch.nn.Module): 3. use a non-trainable embedding 4. project the resultant embeddings to some other dimension (which only makes sense with non-trainable embeddings). - 5. 
build all of this easily `from_params` - - Note that if you are using our data API and are trying to embed a - [`TextField`](../../data/fields/text_field.md), you should use a - [`TextFieldEmbedder`](../text_field_embedders/text_field_embedder.md) - instead of using this directly. - - # Parameters - - num_embeddings : `int` - Size of the dictionary of embeddings (vocabulary size). - embedding_dim : `int` - The size of each embedding vector. - projection_dim : `int`, (optional, default=None) - If given, we add a projection layer after the embedding layer. This - really only makes sense if `trainable` is `False`. - weight : `torch.FloatTensor`, (optional, default=None) - A pre-initialised weight matrix for the embedding lookup, allowing the - use of pretrained vectors. - padding_index : `int`, (optional, default=None) - If given, pads the output with zeros whenever it encounters the index. - trainable : `bool`, (optional, default=True) - Whether or not to optimize the embedding parameters. - max_norm : `float`, (optional, default=None) - If given, will renormalize the embeddings to always have a norm lesser - than this - norm_type : `float`, (optional, default=2) - The p of the p-norm to compute for the max_norm option - scale_grad_by_freq : `bool`, (optional, default=False) - If given, this will scale gradients by the frequency of the words in - the mini-batch. - sparse : `bool`, (optional, default=False) - Whether or not the Pytorch backend should use a sparse representation - of the embedding weight. - vocab_namespace : `str`, (optional, default=None) - In case of fine-tuning/transfer learning, the model's embedding matrix - needs to be extended according to the size of extended-vocabulary. To - be able to know how much to extend the embedding-matrix, it's necessary - to know which vocab_namspace was used to construct it in the original - training. We store vocab_namespace used during the original training as - an attribute, so that it can be retrieved during fine-tuning. - pretrained_file : `str`, (optional, default=None) - Used to keep track of what is the source of the weights and loading - more embeddings at test time. **It does not load the weights from this - pretrained_file.** For that purpose, use `Embedding.from_params`. - - # Returns - - An Embedding module. + + Args: + num_embeddings: Size of the dictionary of embeddings (vocabulary size). + embedding_dim: The size of each embedding vector. + projection_dim: If given, we add a projection layer after the embedding + layer. This really only makes sense if `trainable` is `False`. + weight: A pre-initialised weight matrix for the embedding lookup, + allowing the use of pre-trained vectors. + padding_index: If given, pads the output with zeros whenever it + encounters the index. + trainable: Whether or not to optimize the embedding parameters. + max_norm: If given, will renormalize the embeddings to always have a + norm lesser than this. + norm_type: The p of the p-norm to compute for the max_norm option. + scale_grad_by_freq: If given, this will scale gradients by the frequency + of the words in the mini-batch. + sparse: Whether or not the Pytorch backend should use a sparse + representation of the embedding weight. + vocab_namespace: In case of fine-tuning/transfer learning, the model's + embedding matrix needs to be extended according to the size of + extended-vocabulary. To be able to know how much to extend the + embedding-matrix, it's necessary to know which `vocab_namspace` was + used to construct it in the original training. 
We store + vocab_namespace used during the original training as an attribute, + so that it can be retrieved during fine-tuning. + pretrained_file: Used to keep track of what is the source of the weights + and loading more embeddings at test time. **It does not load the + weights from this pretrained_file.** For that purpose, use + `Embedding.from_params`. + + Returns: + An Embedding module. """ default_implementation = "embedding" - def __init__( - self, - num_embeddings: int, - embedding_dim: int, - projection_dim: Optional[int] = None, - weight: Optional[torch.FloatTensor] = None, - padding_index: Optional[int] = None, - trainable: bool = True, - max_norm: Optional[float] = None, - norm_type: float = 2.0, - scale_grad_by_freq: bool = False, - sparse: bool = False, - vocab_namespace: Optional[str] = None, - pretrained_file: Optional[str] = None, - ) -> None: + def __init__(self, + num_embeddings: int, embedding_dim: int, + projection_dim: Optional[int] = None, + weight: Optional[torch.FloatTensor] = None, + padding_index: Optional[int] = None, trainable: bool = True, + max_norm: Optional[float] = None, norm_type: float = 2.0, + scale_grad_by_freq: bool = False, sparse: bool = False, + vocab_namespace: Optional[str] = None, + pretrained_file: Optional[str] = None) -> None: super().__init__() self.num_embeddings = num_embeddings self.padding_index = padding_index @@ -1637,7 +1378,6 @@ def __init__( self.sparse = sparse self._vocab_namespace = vocab_namespace self._pretrained_file = pretrained_file - self.output_dim = projection_dim or embedding_dim if weight is None: @@ -1646,40 +1386,29 @@ def __init__( torch.nn.init.xavier_uniform_(self.weight) else: if weight.size() != (num_embeddings, embedding_dim): - raise ConfigurationError( + raise ValueError( "A weight matrix was passed with contradictory embedding " - "shapes." - ) - self.weight = torch.nn.Parameter(weight, - requires_grad=trainable) + "shapes.") + self.weight = torch.nn.Parameter(weight, requires_grad=trainable) if self.padding_index is not None: self.weight.data[self.padding_index].fill_(0) + self._projection = None if projection_dim: self._projection = torch.nn.Linear(embedding_dim, projection_dim) - else: - self._projection = None # type: ignore def forward(self, tokens: torch.Tensor) -> torch.Tensor: # type: ignore - # tokens may have extra dimensions - # (batch_size, d1, ..., dn, sequence_length), - # but embedding expects (batch_size, sequence_length), so pass tokens to - # util.combine_initial_dims (which is a no-op if there are no extra - # dimensions). Remember the original size. + # tokens may have extra dimensions (batch_size, d1, ..., dn, + # sequence_length), but embedding expects (batch_size, sequence_length), + # so pass tokens to util.combine_initial_dims (which is a no-op if + # there are no extra dimensions). Remember the original size. original_size = tokens.size() tokens = combine_initial_dims(tokens) - embedded = embedding( - tokens, - self.weight, - padding_idx=self.padding_index, - max_norm=self.max_norm, - norm_type=self.norm_type, - scale_grad_by_freq=self.scale_grad_by_freq, - sparse=self.sparse, - ) - + tokens, self.weight, padding_idx=self.padding_index, + max_norm=self.max_norm, norm_type=self.norm_type, + scale_grad_by_freq=self.scale_grad_by_freq, sparse=self.sparse) # Now (if necessary) add back in the extra dimensions. 
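# Illustrative sketch (toy sizes): the `Embedding` above accepts ids with
# extra leading dimensions, e.g. character ids shaped (batch, tokens, chars),
# and returns embeddings shaped (batch, tokens, chars, embedding_dim).
import torch

embed = Embedding(num_embeddings=262, embedding_dim=16)
char_ids = torch.randint(0, 262, (2, 7, 50))  # (batch, tokens, max_chars)
char_embeddings = embed(char_ids)             # shape (2, 7, 50, 16)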
embedded = uncombine_initial_dims(embedded, original_size) @@ -1711,37 +1440,27 @@ def __init__(self, module): def forward(self, *inputs, pass_through: Optional[List[str]] = None, **kwargs): - pass_through = pass_through or [] - reshaped_inputs = [self._reshape_tensor(input_tensor) for input_tensor in inputs] - # Need some input to then get the batch_size and time_steps. some_input = None if inputs: some_input = inputs[-1] - reshaped_kwargs = {} for key, value in kwargs.items(): if isinstance(value, torch.Tensor) and key not in pass_through: if some_input is None: some_input = value - value = self._reshape_tensor(value) - reshaped_kwargs[key] = value - reshaped_outputs = self._module(*reshaped_inputs, **reshaped_kwargs) - if some_input is None: raise RuntimeError("No input tensor to time-distribute") - # Now get the output back into the right shape. # (batch_size, time_steps, **output_size) new_size = some_input.size()[:2] + reshaped_outputs.size()[1:] outputs = reshaped_outputs.contiguous().view(new_size) - return outputs @staticmethod @@ -1755,6 +1474,85 @@ def _reshape_tensor(input_tensor): return input_tensor.contiguous().view(*squashed_shape) +class ScalarMix(torch.nn.Module): + r"""Computes a parameterised scalar mixture of N tensors, + `mixture = gamma * sum(s_k * tensor_k)` where `s = softmax(w)`, with `w` + and `gamma` scalar parameters. + + In addition, if `do_layer_norm=True` then apply layer normalization to + each tensor before weighting. + """ + def __init__(self, mixture_size: int, do_layer_norm: bool = False, + initial_scalar_parameters: Optional[List[float]] = None, + trainable: bool = True) -> None: + super().__init__() + self.mixture_size = mixture_size + self.do_layer_norm = do_layer_norm + + if initial_scalar_parameters is None: + initial_scalar_parameters = [0.0] * mixture_size + elif len(initial_scalar_parameters) != mixture_size: + raise ValueError( + "Length of initial_scalar_parameters {} differs " + "from mixture_size {}".format(initial_scalar_parameters, + mixture_size)) + self.scalar_parameters = ParameterList([ + Parameter(torch.FloatTensor([initial_scalar_parameters[i]]), + requires_grad=trainable) for i in range(mixture_size)]) + self.gamma = Parameter(torch.FloatTensor([1.0]), + requires_grad=trainable) + + def forward(self, tensors: List[torch.Tensor], # type: ignore + mask: Optional[torch.Tensor] = None) -> torch.Tensor: + r"""Compute a weighted average of the `tensors`. The input tensors can + be any shape with at least two dimensions, but must all be the same + shape. + + When `do_layer_norm=True`, the `mask` is required input. If the + `tensors` are dimensioned `(dim_0, ..., dim_{n-1}, dim_n)`, then the + `mask` is dimensioned `(dim_0, ..., dim_{n-1})`, as in the typical + case with `tensors` of shape `(batch_size, timesteps, dim)` and `mask` + of shape `(batch_size, timesteps)`. + + When `do_layer_norm=False` the `mask` is ignored. 
+ """ + if len(tensors) != self.mixture_size: + raise ValueError( + "{} tensors were passed, but the module was initialized to " + "mix {} tensors.".format(len(tensors), self.mixture_size)) + + def _do_layer_norm(tensor, broadcast_mask, num_elements_not_masked): + tensor_masked = tensor * broadcast_mask + mean = torch.sum(tensor_masked) / num_elements_not_masked + variance = ( + torch.sum(((tensor_masked - mean) * broadcast_mask) ** 2) / + num_elements_not_masked) + return (tensor - mean) / torch.sqrt(variance + 1e-12) + + # pylint: disable=unnecessary-comprehension + normed_weights = torch.nn.functional.softmax( + torch.cat([parameter for parameter in self.scalar_parameters]), + dim=0) + normed_weights = torch.split(normed_weights, split_size_or_sections=1) + + if not self.do_layer_norm: + pieces = [] + for weight, tensor in zip(normed_weights, tensors): + pieces.append(weight * tensor) + return self.gamma * sum(pieces) + else: + assert mask is not None + mask_float = mask.float() + broadcast_mask = mask_float.unsqueeze(-1) + input_dim = tensors[0].size(-1) + num_elements_not_masked = torch.sum(mask_float) * input_dim + pieces = [] + for weight, tensor in zip(normed_weights, tensors): + pieces.append(weight * _do_layer_norm(tensor, broadcast_mask, + num_elements_not_masked)) + return self.gamma * sum(pieces) + + def add_sentence_boundary_token_ids( tensor: torch.Tensor, mask: torch.Tensor, sentence_begin_token: Any, sentence_end_token: Any) -> \ @@ -1767,33 +1565,23 @@ def add_sentence_boundary_token_ids( Returns both the new tensor and updated mask. - # Parameters - - tensor : `torch.Tensor` - A tensor of shape `(batch_size, timesteps)` or - `(batch_size, timesteps, dim)` - mask : `torch.Tensor` - A tensor of shape `(batch_size, timesteps)` - sentence_begin_token: Any (anything that can be broadcast in torch for - assignment) - For 2D input, a scalar with the id. For 3D input, a tensor with - length dim. - sentence_end_token: Any (anything that can be broadcast in torch for - assignment) - For 2D input, a scalar with the id. For 3D input, a tensor with - length dim. - - # Returns - - tensor_with_boundary_tokens : `torch.Tensor` - The tensor with the appended and prepended boundary tokens. If the - input was 2D, it has shape (batch_size, timesteps + 2) and if the - input was 3D, it has shape (batch_size, timesteps + 2, dim). - new_mask : `torch.Tensor` - The new mask for the tensor, taking into account the appended tokens - marking the beginning and end of the sentence. + Args: + tensor: A tensor of shape `(batch_size, timesteps)` or + `(batch_size, timesteps, dim)`. + mask: A tensor of shape `(batch_size, timesteps)`. + sentence_begin_token: For 2D input, a scalar with the id. + For 3D input, a tensor with length dim. + sentence_end_token: For 2D input, a scalar with the id. + For 3D input, a tensor with length dim. + + Returns: + tensor_with_boundary_tokens: The tensor with the appended and prepended + boundary tokens. If the input was 2D, it has shape + `(batch_size, timesteps + 2)` and if the input was 3D, it has shape + `(batch_size, timesteps + 2, dim)`. + new_mask: The new mask for the tensor, taking into account the appended + tokens marking the beginning and end of the sentence. 
""" - # TODO: matthewp, profile this transfer sequence_lengths = mask.sum(dim=1).detach().cpu().numpy() tensor_shape = list(tensor.data.shape) new_shape = list(tensor_shape) @@ -1815,19 +1603,9 @@ def add_sentence_boundary_token_ids( else: raise ValueError( "add_sentence_boundary_token_ids only accepts 2D and 3D input") - return tensor_with_boundary_tokens, new_mask -def get_device_of(tensor: torch.Tensor) -> int: - r"""Returns the device of the tensor. - """ - if not tensor.is_cuda: - return -1 - else: - return tensor.get_device() - - def remove_sentence_boundaries(tensor: torch.Tensor, mask: torch.Tensor) -> \ Tuple[torch.Tensor, torch.Tensor]: r"""Remove begin/end of sentence embeddings from the batch of sentences. @@ -1841,22 +1619,16 @@ def remove_sentence_boundaries(tensor: torch.Tensor, mask: torch.Tensor) -> \ This function is the inverse of `add_sentence_boundary_token_ids`. - # Parameters - - tensor : `torch.Tensor` - A tensor of shape `(batch_size, timesteps, dim)` - mask : `torch.Tensor` - A tensor of shape `(batch_size, timesteps)` - - # Returns + Args: + tensor: A tensor of shape `(batch_size, timesteps, dim)`. + mask: A tensor of shape `(batch_size, timesteps)`. - tensor_without_boundary_tokens : `torch.Tensor` - The tensor after removing the boundary tokens of shape - `(batch_size, timesteps - 2, dim)` - new_mask : `torch.Tensor` - The new mask for the tensor of shape `(batch_size, timesteps - 2)`. + Returns: + tensor_without_boundary_tokens: The tensor after removing the boundary + tokens of shape `(batch_size, timesteps - 2, dim)`. + new_mask: The new mask for the tensor of shape + `(batch_size, timesteps - 2)`. """ - # TODO: matthewp, profile this transfer sequence_lengths = mask.sum(dim=1).detach().cpu().numpy() tensor_shape = list(tensor.data.shape) new_shape = list(tensor_shape) @@ -1868,105 +1640,9 @@ def remove_sentence_boundaries(tensor: torch.Tensor, mask: torch.Tensor) -> \ tensor_without_boundary_tokens[i, : (j - 2), :] = \ tensor[i, 1: (j - 1), :] new_mask[i, : (j - 2)] = 1 - return tensor_without_boundary_tokens, new_mask -A = TypeVar("A") - - -def lazy_groups_of(iterable: Iterable[A], group_size: int) -> Iterator[List[A]]: - r"""Takes an iterable and batches the individual instances into lists of the - specified size. The last list may be smaller if there are instances left - over. - """ - iterator = iter(iterable) - while True: - s = list(islice(iterator, group_size)) - if len(s) > 0: - yield s - else: - break - - -class ConfigurationError(Exception): - r"""The exception raised by any AllenNLP object when it's misconfigured - (e.g. missing properties, invalid properties, unknown properties). - """ - - def __init__(self, message): - super().__init__() - self.message = message - - def __str__(self): - # TODO(brendanr): Is there some reason why we need repr here? It - # produces horrible output for simple multi-line error messages. - return self.message - - -def get_lengths_from_binary_sequence_mask(mask: torch.Tensor): - r"""Compute sequence lengths for each batch element in a tensor using a - binary mask. - - # Parameters - - mask : torch.Tensor, required. - A 2D binary mask of shape (batch_size, sequence_length) to - calculate the per-batch sequence lengths from. - - # Returns - - A torch.LongTensor of shape (batch_size,) representing the lengths - of the sequences in the batch. - """ - return mask.long().sum(-1) - - -def sort_batch_by_length(tensor: torch.Tensor, sequence_lengths: torch.Tensor): - r"""Sort a batch first tensor by some specified lengths. 
- - # Parameters - - tensor : torch.FloatTensor, required. - A batch first Pytorch tensor. - sequence_lengths : torch.LongTensor, required. - A tensor representing the lengths of some dimension of the tensor which - we want to sort by. - - # Returns - - sorted_tensor : torch.FloatTensor - The original tensor sorted along the batch dimension with respect to - sequence_lengths. - sorted_sequence_lengths : torch.LongTensor - The original sequence_lengths sorted by decreasing size. - restoration_indices : torch.LongTensor - Indices into the sorted_tensor such that - `sorted_tensor.index_select(0, restoration_indices) == original_tensor` - permutation_index : torch.LongTensor - The indices used to sort the tensor. This is useful if you want to sort - many tensors using the same ordering. - """ - - if not isinstance(tensor, torch.Tensor) or not isinstance(sequence_lengths, - torch.Tensor): - raise ConfigurationError( - "Both the tensor and sequence lengths must be torch.Tensors.") - - sorted_sequence_lengths, permutation_index = sequence_lengths.sort( - 0, descending=True) - sorted_tensor = tensor.index_select(0, permutation_index) - - index_range = torch.arange(0, len(sequence_lengths), - device=sequence_lengths.device) - # This is the equivalent of zipping with index, sorting by the original - # sequence lengths and returning the now sorted indices. - _, reverse_mapping = permutation_index.sort(0, descending=False) - restoration_indices = index_range.index_select(0, reverse_mapping) - return (sorted_tensor, sorted_sequence_lengths, restoration_indices, - permutation_index) - - def block_orthogonal(tensor: torch.Tensor, split_sizes: List[int], gain: float = 1.0) -> None: r"""An initializer which allows initializing model parameters in "blocks". @@ -1975,26 +1651,21 @@ def block_orthogonal(tensor: torch.Tensor, split_sizes: List[int], are concatenated together. However, they are separate parameters which should be initialized independently. - # Parameters - - tensor : `torch.Tensor`, required. - A tensor to initialize. - split_sizes : List[int], required. - A list of length `tensor.ndim()` specifying the size of the - blocks along that particular dimension. E.g. `[10, 20]` would - result in the tensor being split into chunks of size 10 along the - first dimension and 20 along the second. - gain : float, optional (default = 1.0) - The gain (scaling) applied to the orthogonal initialization. + Args: + tensor: A tensor to initialize. + split_sizes: A list of length `tensor.ndim()` specifying the size of the + blocks along that particular dimension. E.g. `[10, 20]` would + result in the tensor being split into chunks of size 10 along the + first dimension and 20 along the second. + gain: The gain (scaling) applied to the orthogonal initialization. """ data = tensor.data sizes = list(tensor.size()) if any(a % b != 0 for a, b in zip(sizes, split_sizes)): - raise ConfigurationError( + raise ValueError( "tensor dimensions must be divisible by their respective " "split_sizes. Found size: {} and split_sizes: {}".format( - sizes, split_sizes) - ) + sizes, split_sizes)) indexes = [list(range(0, max_size, split)) for max_size, split in zip( sizes, split_sizes)] # Iterate over all possible blocks within the tensor. @@ -2009,8 +1680,7 @@ def block_orthogonal(tensor: torch.Tensor, split_sizes: List[int], # the tensor. 
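# Illustrative sketch (toy sizes) of `block_orthogonal`: the four gate blocks
# of a fused LSTM-style weight are initialized independently along dim 0.
import torch

weight = torch.zeros(4 * 10, 8)
block_orthogonal(weight, split_sizes=[10, 8])  # four independent (10, 8) blocks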
block_slice = tuple( slice(start_index, start_index + step) for start_index, step in - index_and_step_tuples - ) + index_and_step_tuples) data[block_slice] = torch.nn.init.orthogonal_( tensor[block_slice].contiguous(), gain=gain) @@ -2022,145 +1692,19 @@ def get_dropout_mask(dropout_probability: float, dropout_probability. Note that the mask is NOT applied to the tensor - the tensor is passed to retain the correct CUDA tensor type for the mask. - # Parameters + Args: + dropout_probability: Probability of dropping a dimension of the input. + tensor_for_masking: torch.Tensor, required. - dropout_probability : float, required. - Probability of dropping a dimension of the input. - tensor_for_masking : torch.Tensor, required. - - # Returns - - A torch.FloatTensor consisting of the binary mask scaled by - 1/ (1 - dropout_probability). - This scaling ensures expected values and variances of the output of - applying this mask and the original tensor are the same. + Returns: + A torch.FloatTensor consisting of the binary mask scaled by + `1 / (1 - dropout_probability)`. This scaling ensures expected values + and variances of the output of applying this mask and the original + tensor are the same. """ binary_mask = ( torch.rand(tensor_for_masking.size()) > dropout_probability).to( - tensor_for_masking.device - ) + tensor_for_masking.device) # Scale mask by 1/keep_prob to preserve output statistics. dropout_mask = binary_mask.float().div(1.0 - dropout_probability) return dropout_mask - - -def combine_initial_dims(tensor: torch.Tensor) -> torch.Tensor: - r"""Given a (possibly higher order) tensor of ids with shape - (d1, ..., dn, sequence_length) Return a view that's - (d1 * ... * dn, sequence_length). If original tensor is 1-d or 2-d, - return it as is. - """ - if tensor.dim() <= 2: - return tensor - else: - return tensor.view(-1, tensor.size(-1)) - - -def uncombine_initial_dims(tensor: torch.Tensor, original_size: torch.Size) -> \ - torch.Tensor: - r"""Given a tensor of embeddings with shape - (d1 * ... * dn, sequence_length, embedding_dim) and the original shape - (d1, ..., dn, sequence_length), return the reshaped tensor of embeddings - with shape (d1, ..., dn, sequence_length, embedding_dim). - If original size is 1-d or 2-d, return it as is. - """ - if len(original_size) <= 2: - return tensor - else: - view_args = list(original_size) + [tensor.size(-1)] - return tensor.view(*view_args) - - -class ScalarMix(torch.nn.Module): - r"""Computes a parameterised scalar mixture of N tensors, - `mixture = gamma * sum(s_k * tensor_k)` where `s = softmax(w)`, with `w` - and `gamma` scalar parameters. - - In addition, if `do_layer_norm=True` then apply layer normalization to - each tensor before weighting. 
- """ - - def __init__(self, mixture_size: int, do_layer_norm: bool = False, - initial_scalar_parameters: Optional[List[float]] = None, - trainable: bool = True,) -> None: - super().__init__() - self.mixture_size = mixture_size - self.do_layer_norm = do_layer_norm - - if initial_scalar_parameters is None: - initial_scalar_parameters = [0.0] * mixture_size - elif len(initial_scalar_parameters) != mixture_size: - raise ConfigurationError( - "Length of initial_scalar_parameters {} differs " - "from mixture_size {}".format(initial_scalar_parameters, - mixture_size) - ) - - self.scalar_parameters = ParameterList( - [ - Parameter( - torch.FloatTensor([initial_scalar_parameters[i]]), - requires_grad=trainable - ) - for i in range(mixture_size) - ] - ) - self.gamma = Parameter(torch.FloatTensor([1.0]), - requires_grad=trainable) - - def forward(self, tensors: List[torch.Tensor], # type: ignore - mask: Optional[torch.Tensor] = None) -> torch.Tensor: - r"""Compute a weighted average of the `tensors`. The input tensors an - be any shape with at least two dimensions, but must all be the same - shape. - - When `do_layer_norm=True`, the `mask` is required input. If the - `tensors` are dimensioned `(dim_0, ..., dim_{n-1}, dim_n)`, then the - `mask` is dimensioned `(dim_0, ..., dim_{n-1})`, as in the typical - case with `tensors` of shape `(batch_size, timesteps, dim)` and `mask` - of shape `(batch_size, timesteps)`. - - When `do_layer_norm=False` the `mask` is ignored. - """ - if len(tensors) != self.mixture_size: - raise ConfigurationError( - "{} tensors were passed, but the module was initialized to " - "mix {} tensors.".format(len(tensors), self.mixture_size) - ) - - def _do_layer_norm(tensor, broadcast_mask, num_elements_not_masked): - tensor_masked = tensor * broadcast_mask - mean = torch.sum(tensor_masked) / num_elements_not_masked - variance = ( - torch.sum(((tensor_masked - mean) * broadcast_mask) ** 2) / - num_elements_not_masked - ) - return (tensor - mean) / torch.sqrt(variance + 1e-12) - - # pylint: disable=unnecessary-comprehension - normed_weights = torch.nn.functional.softmax( - torch.cat([parameter for parameter in self.scalar_parameters]), - dim=0 - ) - normed_weights = torch.split(normed_weights, split_size_or_sections=1) - - if not self.do_layer_norm: - pieces = [] - for weight, tensor in zip(normed_weights, tensors): - pieces.append(weight * tensor) - return self.gamma * sum(pieces) - - else: - assert mask is not None - mask_float = mask.float() - broadcast_mask = mask_float.unsqueeze(-1) - input_dim = tensors[0].size(-1) - num_elements_not_masked = torch.sum(mask_float) * input_dim - - pieces = [] - for weight, tensor in zip(normed_weights, tensors): - pieces.append( - weight * _do_layer_norm(tensor, broadcast_mask, - num_elements_not_masked) - ) - return self.gamma * sum(pieces) diff --git a/texar/torch/modules/pretrained/elmo_utils_test.py b/texar/torch/modules/pretrained/elmo_utils_test.py index 34d826241..46bf680f5 100644 --- a/texar/torch/modules/pretrained/elmo_utils_test.py +++ b/texar/torch/modules/pretrained/elmo_utils_test.py @@ -15,7 +15,6 @@ Unit tests for utils of ELMo modules. 
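# Illustrative sketch of how the pieces in this patch fit together: character
# ids from `batch_to_ids` go through the `_ElmoBiLm` backbone, and a
# `ScalarMix` collapses the layer activations into one representation. The
# `_ElmoBiLm` constructor arguments (options and weight file paths) are
# assumed to follow the AllenNLP module this code is adapted from; the paths
# below are placeholders.
import torch
from texar.torch.data.tokenizers.elmo_tokenizer_utils import batch_to_ids
from texar.torch.modules.pretrained.elmo_utils import _ElmoBiLm, ScalarMix

options_file = "elmo_options.json"      # placeholder checkpoint paths
weight_file = "elmo_weights.hdf5"
elmo_bilm = _ElmoBiLm(options_file, weight_file)

character_ids = batch_to_ids([["The", "sentence", "."], ["Another", "one"]])
outputs = elmo_bilm(character_ids)
# `outputs["activations"]` is a list of layer tensors (three for the
# pretrained ELMo used in these tests) and `outputs["mask"]` marks real
# positions, including the added sentence-boundary tokens (see
# `remove_sentence_boundaries` for stripping them afterwards).
mixture = ScalarMix(mixture_size=3)
mixed_representation = mixture(outputs["activations"], outputs["mask"])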
Code adapted from: - `https://github.com/allenai/allennlp/blob/master/allennlp/tests/common/util_test.py` `https://github.com/allenai/allennlp/blob/master/allennlp/tests/modules/elmo_test.py` `https://github.com/allenai/allennlp/blob/master/allennlp/tests/modules/encoder_base_test.py` `https://github.com/allenai/allennlp/blob/master/allennlp/tests/modules/lstm_cell_with_projection_test.py` @@ -40,11 +39,10 @@ from texar.torch.data.data_utils import maybe_download from texar.torch.modules.pretrained.elmo_utils import ( Highway, LstmCellWithProjection, _EncoderBase, _ElmoBiLm, TimeDistributed, - sort_batch_by_length, get_lengths_from_binary_sequence_mask, remove_sentence_boundaries, add_sentence_boundary_token_ids, - lazy_groups_of, block_orthogonal, ConfigurationError, combine_initial_dims, - uncombine_initial_dims, ScalarMix) + block_orthogonal, ScalarMix) from texar.torch.utils.test import cuda_test +from texar.torch.utils.utils import sort_batch_by_length class TestElmoBiLm(unittest.TestCase): @@ -96,8 +94,7 @@ def _load_sentences_embeddings(self): for i in range(10): sent_embeds = fin["%s" % i][...] sent_embeds_concat = numpy.concatenate( - (sent_embeds[0, :, :], sent_embeds[1, :, :]), axis=-1 - ) + (sent_embeds[0, :, :], sent_embeds[1, :, :]), axis=-1) expected_lm_embeddings[-1].append(sent_embeds_concat) return sentences, expected_lm_embeddings @@ -116,8 +113,7 @@ def test_elmo_bilm(self): for i, batch in enumerate(batches): lm_embeddings = elmo_bilm(batch_to_ids(batch[:3])) top_layer_embeddings, mask = remove_sentence_boundaries( - lm_embeddings["activations"][2], lm_embeddings["mask"] - ) + lm_embeddings["activations"][2], lm_embeddings["mask"]) # check the mask lengths lengths = mask.data.numpy().sum(axis=1) @@ -134,9 +130,7 @@ def test_elmo_bilm(self): numpy.allclose( top_layer_embeddings[k, : lengths[k], :].data.numpy(), expected_top_layer[k], - atol=1.0e-6, - ) - ) + atol=1.0e-6,)) class TestEncoderBase(unittest.TestCase): @@ -145,12 +139,10 @@ def setUp(self): super().setUp() self.lstm = LSTM( bidirectional=True, num_layers=3, input_size=3, hidden_size=7, - batch_first=True - ) + batch_first=True) self.rnn = RNN( bidirectional=True, num_layers=3, input_size=3, hidden_size=7, - batch_first=True - ) + batch_first=True) self.encoder_base = _EncoderBase(stateful=True) tensor = torch.rand([5, 7, 3]) @@ -166,7 +158,7 @@ def setUp(self): self.batch_size = 5 self.num_valid = 3 - sequence_lengths = get_lengths_from_binary_sequence_mask(mask) + sequence_lengths = mask.long().sum(-1) _, _, restoration_indices, sorting_indices = sort_batch_by_length( tensor, sequence_lengths) self.sorting_indices = sorting_indices @@ -179,8 +171,7 @@ def test_non_stateful_states_are_sorted_correctly(self): # we'll just use a "pass through" encoder, as we aren't actually testing # the functionality of the encoder here anyway. 
_, states, restoration_indices = encoder_base.sort_and_run_forward( - lambda *x: x, self.tensor, self.mask, initial_states - ) + lambda *x: x, self.tensor, self.mask, initial_states) # Our input tensor had 2 zero length sequences, so we need # to concat a tensor of shape # (num_layers * num_directions, batch_size - num_valid, hidden_dim), @@ -198,18 +189,13 @@ def test_non_stateful_states_are_sorted_correctly(self): for index in [0, 1, 3]: numpy.testing.assert_array_equal( unsorted_state[:, index, :].data.numpy(), - original[:, index, :].data.numpy() - ) + original[:, index, :].data.numpy()) def test_get_initial_states(self): # First time we call it, there should be no state, so we should return # None. - assert ( - self.encoder_base._get_initial_states( - self.batch_size, self.num_valid, self.sorting_indices - ) - is None - ) + assert (self.encoder_base._get_initial_states( + self.batch_size, self.num_valid, self.sorting_indices) is None) # First test the case that the previous state is _smaller_ than the # current state input. @@ -217,23 +203,18 @@ def test_get_initial_states(self): self.encoder_base._states = initial_states # sorting indices are: [0, 1, 3, 2, 4] returned_states = self.encoder_base._get_initial_states( - self.batch_size, self.num_valid, self.sorting_indices - ) + self.batch_size, self.num_valid, self.sorting_indices) - correct_expanded_states = [ - torch.cat([state, torch.zeros([1, 2, 7])], 1) - for state in initial_states - ] + correct_expanded_states = [torch.cat([state, torch.zeros([1, 2, 7])], 1) + for state in initial_states] # State should have been expanded with zeros to have shape # (1, batch_size, hidden_size). numpy.testing.assert_array_equal( self.encoder_base._states[0].data.numpy(), - correct_expanded_states[0].data.numpy() - ) + correct_expanded_states[0].data.numpy()) numpy.testing.assert_array_equal( self.encoder_base._states[1].data.numpy(), - correct_expanded_states[1].data.numpy() - ) + correct_expanded_states[1].data.numpy()) # The returned states should be of shape (1, num_valid, hidden_size) and # they also should have been sorted with respect to the indices. @@ -241,50 +222,41 @@ def test_get_initial_states(self): correct_returned_states = [ state.index_select(1, self.sorting_indices)[:, : self.num_valid, :] - for state in correct_expanded_states - ] + for state in correct_expanded_states] numpy.testing.assert_array_equal( returned_states[0].data.numpy(), - correct_returned_states[0].data.numpy() - ) + correct_returned_states[0].data.numpy()) numpy.testing.assert_array_equal( returned_states[1].data.numpy(), - correct_returned_states[1].data.numpy() - ) + correct_returned_states[1].data.numpy()) # Now test the case that the previous state is larger: original_states = (torch.randn([1, 10, 7]), torch.randn([1, 10, 7])) self.encoder_base._states = original_states # sorting indices are: [0, 1, 3, 2, 4] returned_states = self.encoder_base._get_initial_states( - self.batch_size, self.num_valid, self.sorting_indices - ) + self.batch_size, self.num_valid, self.sorting_indices) # State should not have changed, as they were larger # than the batch size of the requested states. 
numpy.testing.assert_array_equal( self.encoder_base._states[0].data.numpy(), - original_states[0].data.numpy() - ) + original_states[0].data.numpy()) numpy.testing.assert_array_equal( self.encoder_base._states[1].data.numpy(), - original_states[1].data.numpy() - ) + original_states[1].data.numpy()) # The returned states should be of shape (1, num_valid, hidden_size) # and they also should have been sorted with respect to the indices. correct_returned_state = [ x.index_select(1, self.sorting_indices)[:, : self.num_valid, :] - for x in original_states - ] + for x in original_states] numpy.testing.assert_array_equal( returned_states[0].data.numpy(), - correct_returned_state[0].data.numpy() - ) + correct_returned_state[0].data.numpy()) numpy.testing.assert_array_equal( returned_states[1].data.numpy(), - correct_returned_state[1].data.numpy() - ) + correct_returned_state[1].data.numpy()) def test_update_states(self): assert self.encoder_base._states is None @@ -292,8 +264,7 @@ def test_update_states(self): index_selected_initial_states = ( initial_states[0].index_select(1, self.restoration_indices), - initial_states[1].index_select(1, self.restoration_indices), - ) + initial_states[1].index_select(1, self.restoration_indices),) self.encoder_base._update_states(initial_states, self.restoration_indices) @@ -301,12 +272,10 @@ def test_update_states(self): # state. numpy.testing.assert_array_equal( self.encoder_base._states[0].data.numpy(), - index_selected_initial_states[0].data.numpy() - ) + index_selected_initial_states[0].data.numpy()) numpy.testing.assert_array_equal( self.encoder_base._states[1].data.numpy(), - index_selected_initial_states[1].data.numpy() - ) + index_selected_initial_states[1].data.numpy()) new_states = torch.randn([1, 5, 7]), torch.randn([1, 5, 7]) # tensor has 2 completely masked rows, so the last 2 rows of the _ @@ -317,8 +286,7 @@ def test_update_states(self): index_selected_new_states = ( new_states[0].index_select(1, self.restoration_indices), - new_states[1].index_select(1, self.restoration_indices), - ) + new_states[1].index_select(1, self.restoration_indices),) self.encoder_base._update_states(new_states, self.restoration_indices) # Check that the update _preserved_ the state for the rows which were @@ -326,22 +294,18 @@ def test_update_states(self): for index in [2, 4]: numpy.testing.assert_array_equal( self.encoder_base._states[0][:, index, :].data.numpy(), - index_selected_initial_states[0][:, index, :].data.numpy(), - ) + index_selected_initial_states[0][:, index, :].data.numpy(),) numpy.testing.assert_array_equal( self.encoder_base._states[1][:, index, :].data.numpy(), - index_selected_initial_states[1][:, index, :].data.numpy(), - ) + index_selected_initial_states[1][:, index, :].data.numpy(),) # Now the states which were updated: for index in [0, 1, 3]: numpy.testing.assert_array_equal( self.encoder_base._states[0][:, index, :].data.numpy(), - index_selected_new_states[0][:, index, :].data.numpy(), - ) + index_selected_new_states[0][:, index, :].data.numpy(),) numpy.testing.assert_array_equal( self.encoder_base._states[1][:, index, :].data.numpy(), - index_selected_new_states[1][:, index, :].data.numpy(), - ) + index_selected_new_states[1][:, index, :].data.numpy(),) # Now test the case that the new state is smaller: small_new_states = torch.randn([1, 3, 7]), torch.randn([1, 3, 7]) @@ -352,8 +316,7 @@ def test_update_states(self): index_selected_small_states = ( small_new_states[0].index_select(1, small_restoration_indices), - 
small_new_states[1].index_select(1, small_restoration_indices), - ) + small_new_states[1].index_select(1, small_restoration_indices),) self.encoder_base._update_states(small_new_states, small_restoration_indices) @@ -362,33 +325,27 @@ def test_update_states(self): for index in [1, 3]: numpy.testing.assert_array_equal( self.encoder_base._states[0][:, index, :].data.numpy(), - index_selected_new_states[0][:, index, :].data.numpy(), - ) + index_selected_new_states[0][:, index, :].data.numpy(),) numpy.testing.assert_array_equal( self.encoder_base._states[1][:, index, :].data.numpy(), - index_selected_new_states[1][:, index, :].data.numpy(), - ) + index_selected_new_states[1][:, index, :].data.numpy(),) # Indices we did update: for index in [0, 2]: numpy.testing.assert_array_equal( self.encoder_base._states[0][:, index, :].data.numpy(), - index_selected_small_states[0][:, index, :].data.numpy(), - ) + index_selected_small_states[0][:, index, :].data.numpy(),) numpy.testing.assert_array_equal( self.encoder_base._states[1][:, index, :].data.numpy(), - index_selected_small_states[1][:, index, :].data.numpy(), - ) + index_selected_small_states[1][:, index, :].data.numpy(),) # We didn't update index 4 in the previous step either, so it should # be equal to the 4th index of initial states. numpy.testing.assert_array_equal( self.encoder_base._states[0][:, 4, :].data.numpy(), - index_selected_initial_states[0][:, 4, :].data.numpy(), - ) + index_selected_initial_states[0][:, 4, :].data.numpy(),) numpy.testing.assert_array_equal( self.encoder_base._states[1][:, 4, :].data.numpy(), - index_selected_initial_states[1][:, 4, :].data.numpy(), - ) + index_selected_initial_states[1][:, 4, :].data.numpy(),) def test_reset_states(self): # Initialize the encoder states. @@ -396,8 +353,7 @@ def test_reset_states(self): initial_states = torch.randn([1, 5, 7]), torch.randn([1, 5, 7]) index_selected_initial_states = ( initial_states[0].index_select(1, self.restoration_indices), - initial_states[1].index_select(1, self.restoration_indices), - ) + initial_states[1].index_select(1, self.restoration_indices),) self.encoder_base._update_states(initial_states, self.restoration_indices) @@ -407,21 +363,17 @@ def test_reset_states(self): # First two states should be zeros numpy.testing.assert_array_equal( self.encoder_base._states[0][:, :2, :].data.numpy(), - torch.zeros_like(initial_states[0])[:, :2, :].data.numpy(), - ) + torch.zeros_like(initial_states[0])[:, :2, :].data.numpy(),) numpy.testing.assert_array_equal( self.encoder_base._states[1][:, :2, :].data.numpy(), - torch.zeros_like(initial_states[1])[:, :2, :].data.numpy(), - ) + torch.zeros_like(initial_states[1])[:, :2, :].data.numpy(),) # Remaining states should be the same numpy.testing.assert_array_equal( self.encoder_base._states[0][:, 2:, :].data.numpy(), - index_selected_initial_states[0][:, 2:, :].data.numpy(), - ) + index_selected_initial_states[0][:, 2:, :].data.numpy(),) numpy.testing.assert_array_equal( self.encoder_base._states[1][:, 2:, :].data.numpy(), - index_selected_initial_states[1][:, 2:, :].data.numpy(), - ) + index_selected_initial_states[1][:, 2:, :].data.numpy(),) # Check that error is raised if mask has wrong batch size. bad_mask = torch.FloatTensor([1, 1, 0]) @@ -440,10 +392,8 @@ def test_non_contiguous_initial_states_handled(self): # A transposition will make the tensors non-contiguous, start them off # at the wrong shape and transpose them into the right shape. 
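# Illustrative sketch of the `sort_and_run_forward` pattern these tests
# exercise: the batch is sorted by mask-derived lengths, the wrapped RNN runs
# on the sorted sequences, and `restoration_indices` undo the sort. The
# three-way unpacking mirrors the pass-through call in
# `test_non_stateful_states_are_sorted_correctly`; post-processing of the
# module output (unpacking and padding) is left to the caller.
import torch
from torch.nn import LSTM
from texar.torch.modules.pretrained.elmo_utils import _EncoderBase

lstm = LSTM(bidirectional=True, num_layers=3, input_size=3, hidden_size=7,
            batch_first=True)
encoder = _EncoderBase(stateful=False)
inputs = torch.rand(5, 7, 3)                    # (batch, time, dim)
mask = torch.ones(5, 7)
mask[2, 3:] = 0                                 # shorter sequences
mask[4, 1:] = 0
initial_states = (torch.zeros(6, 5, 7), torch.zeros(6, 5, 7))
module_output, final_states, restoration_indices = \
    encoder.sort_and_run_forward(lstm, inputs, mask, initial_states)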
encoder_base = _EncoderBase(stateful=False) - initial_states = ( - torch.randn(5, 6, 7).permute(1, 0, 2), - torch.randn(5, 6, 7).permute(1, 0, 2), - ) + initial_states = (torch.randn(5, 6, 7).permute(1, 0, 2), + torch.randn(5, 6, 7).permute(1, 0, 2),) assert not initial_states[0].is_contiguous() and \ not initial_states[1].is_contiguous() assert initial_states[0].size() == torch.Size([6, 5, 7]) @@ -486,10 +436,8 @@ def test_non_contiguous_initial_states_handled_on_gpu(self): # A transposition will make the tensors non-contiguous, start them off # at the wrong shape and transpose them into the right shape. encoder_base = _EncoderBase(stateful=False).cuda() - initial_states = ( - torch.randn(5, 6, 7).cuda().permute(1, 0, 2), - torch.randn(5, 6, 7).cuda().permute(1, 0, 2), - ) + initial_states = (torch.randn(5, 6, 7).cuda().permute(1, 0, 2), + torch.randn(5, 6, 7).cuda().permute(1, 0, 2),) assert not initial_states[0].is_contiguous() and not initial_states[ 1].is_contiguous() assert initial_states[0].size() == torch.Size([6, 5, 7]) @@ -500,12 +448,10 @@ def test_non_contiguous_initial_states_handled_on_gpu(self): # or just a single tensor. encoder_base.sort_and_run_forward( self.lstm.cuda(), self.tensor.cuda(), self.mask.cuda(), - initial_states - ) + initial_states) encoder_base.sort_and_run_forward( self.rnn.cuda(), self.tensor.cuda(), self.mask.cuda(), - initial_states[0] - ) + initial_states[0]) # Case 2: Encoder is stateful @@ -567,12 +513,10 @@ def test_elmo_lstm_cell_completes_forward_pass(self): hidden_size=5, cell_size=7, memory_cell_clip_value=2, - state_projection_clip_value=1, - ) + state_projection_clip_value=1,) output_sequence, lstm_state = lstm( input_tensor, [5, 4, 2, 1], (initial_hidden_state, - initial_memory_state) - ) + initial_memory_state)) numpy.testing.assert_array_equal( output_sequence.data[1, 4:, :].numpy(), 0.0) numpy.testing.assert_array_equal( @@ -605,8 +549,7 @@ def test_time_distributed_reshapes_named_arg_correctly(self): output = distributed_embedding(char_input) assert_almost_equal( output.data.numpy(), - [[[[0.5, 0.5], [0.4, 0.4]], [[0.5, 0.5], [0.5, 0.5]]]] - ) + [[[[0.5, 0.5], [0.4, 0.4]], [[0.5, 0.5], [0.5, 0.5]]]]) def test_time_distributed_reshapes_positional_kwarg_correctly(self): char_embedding = Embedding(2, 2) @@ -617,8 +560,7 @@ def test_time_distributed_reshapes_positional_kwarg_correctly(self): output = distributed_embedding(input=char_input) assert_almost_equal( output.data.numpy(), - [[[[0.5, 0.5], [0.4, 0.4]], [[0.5, 0.5], [0.5, 0.5]]]] - ) + [[[[0.5, 0.5], [0.4, 0.4]], [[0.5, 0.5], [0.5, 0.5]]]]) def test_time_distributed_works_with_multiple_inputs(self): module = lambda x, y: x + y @@ -631,10 +573,8 @@ def test_time_distributed_works_with_multiple_inputs(self): def test_time_distributed_reshapes_multiple_inputs_with_pass_through_tensor_correctly(self): class FakeModule(Module): - def forward(self, input_tensor, tensor_to_pass_through=None, another_tensor=None): - return input_tensor + tensor_to_pass_through + another_tensor module = FakeModule() @@ -648,8 +588,7 @@ def forward(self, input_tensor, tensor_to_pass_through=None, input_tensor1, tensor_to_pass_through=input_to_pass_through, another_tensor=input_tensor2, - pass_through=["tensor_to_pass_through"], - ) + pass_through=["tensor_to_pass_through"],) assert_almost_equal(output.data.numpy(), [[[8, 11], [15, 12]]]) def test_time_distributed_reshapes_multiple_inputs_with_pass_through_non_tensor_correctly(self): @@ -671,8 +610,7 @@ def forward(self, input_tensor, number=0, 
another_tensor=None): input_tensor1, number=input_number, another_tensor=input_tensor2, - pass_through=["number"], - ) + pass_through=["number"],) assert_almost_equal(output.data.numpy(), [[[10, 9], [17, 10]]]) @@ -691,26 +629,18 @@ def test_add_sentence_boundary_token_ids_handles_2D_input(self): def test_add_sentence_boundary_token_ids_handles_3D_input(self): tensor = torch.from_numpy( - numpy.array( - [ - [[1, 2, 3, 4], [5, 5, 5, 5], [6, 8, 1, 2]], - [[4, 3, 2, 1], [8, 7, 6, 5], [0, 0, 0, 0]], - ] - ) - ) + numpy.array([[[1, 2, 3, 4], [5, 5, 5, 5], [6, 8, 1, 2]], + [[4, 3, 2, 1], [8, 7, 6, 5], [0, 0, 0, 0]]])) mask = ((tensor > 0).sum(dim=-1) > 0).type(torch.LongTensor) bos = torch.from_numpy(numpy.array([9, 9, 9, 9])) eos = torch.from_numpy(numpy.array([10, 10, 10, 10])) new_tensor, new_mask = add_sentence_boundary_token_ids( tensor, mask, bos, eos) expected_new_tensor = numpy.array( - [ - [[9, 9, 9, 9], [1, 2, 3, 4], [5, 5, 5, 5], [6, 8, 1, 2], - [10, 10, 10, 10]], - [[9, 9, 9, 9], [4, 3, 2, 1], [8, 7, 6, 5], [10, 10, 10, 10], - [0, 0, 0, 0]], - ] - ) + [[[9, 9, 9, 9], [1, 2, 3, 4], [5, 5, 5, 5], [6, 8, 1, 2], + [10, 10, 10, 10]], + [[9, 9, 9, 9], [4, 3, 2, 1], [8, 7, 6, 5], [10, 10, 10, 10], + [0, 0, 0, 0]]]) assert (new_tensor.data.numpy() == expected_new_tensor).all() assert (new_mask.data.numpy() == ( (expected_new_tensor > 0).sum(axis=-1) > 0)).all() @@ -735,51 +665,6 @@ def test_remove_sentence_boundaries(self): [[0, 0, 0], [1, 1, 1], [1, 1, 0]])).long() assert (new_mask.data.numpy() == expected_new_mask.data.numpy()).all() - def test_lazy_groups_of(self): - xs = [1, 2, 3, 4, 5, 6, 7] - groups = lazy_groups_of(iter(xs), group_size=3) - assert next(groups) == [1, 2, 3] - assert next(groups) == [4, 5, 6] - assert next(groups) == [7] - with self.assertRaises(StopIteration): - _ = next(groups) - - def test_get_sequence_lengths_from_binary_mask(self): - binary_mask = torch.ByteTensor( - [[1, 1, 1, 0, 0, 0], [1, 1, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1], - [1, 0, 0, 0, 0, 0]] - ) - lengths = get_lengths_from_binary_sequence_mask(binary_mask) - numpy.testing.assert_array_equal(lengths.numpy(), - numpy.array([3, 2, 6, 1])) - - def test_sort_tensor_by_length(self): - tensor = torch.rand([5, 7, 9]) - tensor[0, 3:, :] = 0 - tensor[1, 4:, :] = 0 - tensor[2, 1:, :] = 0 - tensor[3, 5:, :] = 0 - - sequence_lengths = torch.LongTensor([3, 4, 1, 5, 7]) - sorted_tensor, sorted_lengths, reverse_indices, _ = \ - sort_batch_by_length(tensor, sequence_lengths) - - # Test sorted indices are padded correctly. - numpy.testing.assert_array_equal( - sorted_tensor[1, 5:, :].data.numpy(), 0.0) - numpy.testing.assert_array_equal( - sorted_tensor[2, 4:, :].data.numpy(), 0.0) - numpy.testing.assert_array_equal( - sorted_tensor[3, 3:, :].data.numpy(), 0.0) - numpy.testing.assert_array_equal( - sorted_tensor[4, 1:, :].data.numpy(), 0.0) - - assert sorted_lengths.data.equal(torch.LongTensor([7, 5, 4, 3, 1])) - - # Test restoration indices correctly recover the original tensor. 
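# Illustrative sketch of `TimeDistributed`, mirroring the tests above: the
# wrapper folds the extra time dimension into the batch dimension, applies
# the wrapped module, and unfolds the result again, so a per-token module can
# run over (batch, num_tokens, num_chars) input.
import torch
from torch.nn import Embedding
from texar.torch.modules.pretrained.elmo_utils import TimeDistributed

char_embedding = TimeDistributed(Embedding(num_embeddings=262,
                                           embedding_dim=16))
char_ids = torch.randint(0, 262, (2, 5, 50))    # (batch, tokens, chars)
char_vectors = char_embedding(char_ids)         # -> (2, 5, 50, 16)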
- assert sorted_tensor.index_select(0, reverse_indices).data.equal( - tensor.data) - def test_block_orthogonal_can_initialize(self): tensor = torch.zeros([10, 6]) block_orthogonal(tensor, [5, 3]) @@ -788,8 +673,7 @@ def test_block_orthogonal_can_initialize(self): def test_block_is_orthogonal(block) -> None: matrix_product = block.T @ block numpy.testing.assert_array_almost_equal( - matrix_product, numpy.eye(matrix_product.shape[-1]), 6 - ) + matrix_product, numpy.eye(matrix_product.shape[-1]), 6) test_block_is_orthogonal(tensor[:5, :3]) test_block_is_orthogonal(tensor[:5, 3:]) @@ -798,22 +682,9 @@ def test_block_is_orthogonal(block) -> None: def test_block_orthogonal_raises_on_mismatching_dimensions(self): tensor = torch.zeros([10, 6, 8]) - with self.assertRaises(ConfigurationError): + with self.assertRaises(ValueError): block_orthogonal(tensor, [7, 2, 1]) - def test_combine_initial_dims(self): - tensor = torch.randn(4, 10, 20, 17, 5) - - tensor2d = combine_initial_dims(tensor) - assert list(tensor2d.size()) == [4 * 10 * 20 * 17, 5] - - def test_uncombine_initial_dims(self): - embedding2d = torch.randn(4 * 10 * 20 * 17 * 5, 12) - - embedding = uncombine_initial_dims(embedding2d, - torch.Size((4, 10, 20, 17, 5))) - assert list(embedding.size()) == [4, 10, 20, 17, 5, 12] - class TestScalarMix(unittest.TestCase): @@ -835,11 +706,11 @@ def test_scalar_mix_can_run_forward(self): def test_scalar_mix_throws_error_on_incorrect_number_of_inputs(self): mixture = ScalarMix(3) tensors = [torch.randn([3, 4, 5]) for _ in range(5)] - with self.assertRaises(ConfigurationError): + with self.assertRaises(ValueError): _ = mixture(tensors) def test_scalar_mix_throws_error_on_incorrect_initial_scalar_parameters_length(self): - with self.assertRaises(ConfigurationError): + with self.assertRaises(ValueError): ScalarMix(3, initial_scalar_parameters=[0.0, 0.0]) def test_scalar_mix_trainable_with_initial_scalar_parameters(self): diff --git a/texar/torch/utils/utils.py b/texar/torch/utils/utils.py index 426081587..bb71e76c8 100644 --- a/texar/torch/utils/utils.py +++ b/texar/torch/utils/utils.py @@ -19,10 +19,12 @@ import copy import inspect from functools import lru_cache +from itertools import islice from pydoc import locate from typing import ( - Any, Callable, Collection, Dict, List, MutableMapping, Optional, Sequence, - Tuple, Type, TypeVar, Union, cast, no_type_check, overload) + Any, Callable, Collection, Dict, Iterable, Iterator, List, MutableMapping, + Optional, Sequence, Tuple, Type, TypeVar, Union, cast, no_type_check, + overload) import funcsigs import numpy as np @@ -67,6 +69,11 @@ 'uniquify_str', 'ceildiv', 'sum_tensors', + 'lazy_groups_of', + 'sort_batch_by_length', + 'get_device_of', + 'combine_initial_dims', + 'uncombine_initial_dims', ] T = TypeVar('T') # type argument @@ -1196,3 +1203,103 @@ def truncate_seq_pair(tokens_a: Union[List[int], List[str]], tokens_a.pop() else: tokens_b.pop() + + +A = TypeVar("A") + + +def lazy_groups_of(iterable: Iterable[A], group_size: int) -> Iterator[List[A]]: + r"""Takes an iterable and batches the individual instances into lists of the + specified size. The last list may be smaller if there are instances left + over. + + Args: + iterable: An iterable object. + group_size: The group size. + + Returns: + An iterator. 
+ """ + iterator = iter(iterable) + while True: + s = list(islice(iterator, group_size)) + if len(s) > 0: + yield s + else: + break + + +def sort_batch_by_length(tensor: torch.Tensor, + sequence_lengths: torch.Tensor) -> \ + Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + r"""Sort a batch first tensor by some specified lengths. + + Args: + tensor: A batch first tensor. + sequence_lengths: A tensor representing the lengths of some dimension of + the tensor which we want to sort by. + + Returns: + sorted_tensor: The original tensor sorted along the batch dimension + with respect to `sequence_lengths`. + sorted_sequence_lengths: The original `sequence_lengths` sorted by + decreasing size. + restoration_indices: Indices into the `sorted_tensor` such that + ``sorted_tensor.index_select(0, restoration_indices) == + original_tensor`` + permutation_index: The indices used to sort the tensor. This is useful + if you want to sort many tensors using the same ordering. + """ + if not isinstance(tensor, torch.Tensor) or \ + not isinstance(sequence_lengths, torch.Tensor): + raise ValueError( + "Both the tensor and sequence lengths must be torch.Tensors.") + + sorted_sequence_lengths, permutation_index = sequence_lengths.sort( + 0, descending=True) + sorted_tensor = tensor.index_select(0, permutation_index) + + index_range = torch.arange(0, len(sequence_lengths), + device=sequence_lengths.device) + # This is the equivalent of zipping with index, sorting by the original + # sequence lengths and returning the now sorted indices. + _, reverse_mapping = permutation_index.sort(0, descending=False) + restoration_indices = index_range.index_select(0, reverse_mapping) + return (sorted_tensor, sorted_sequence_lengths, restoration_indices, + permutation_index) + + +def get_device_of(tensor: torch.Tensor) -> int: + r"""Returns the device of the tensor. + """ + if not tensor.is_cuda: + return -1 + else: + return tensor.get_device() + + +def combine_initial_dims(tensor: torch.Tensor) -> torch.Tensor: + r"""Given a (possibly higher order) tensor with shape + `[d1, ..., dn, sequence_length]` Return a view that's + `[d1 * ... * dn, sequence_length]`. If original tensor is 1-d or 2-d, + return it as is. + """ + if tensor.dim() <= 2: + return tensor + else: + return tensor.view(-1, tensor.size(-1)) + + +def uncombine_initial_dims(tensor: torch.Tensor, + original_size: torch.Size) -> torch.Tensor: + r"""Given a tensor of embeddings with shape + `[d1 * ... * dn, sequence_length, embedding_dim]` and the original shape + `[d1, ..., dn, sequence_length]`, return the reshaped tensor of embeddings + with shape `[d1, ..., dn, sequence_length, embedding_dim]`. + If original size is 1-d or 2-d, return it as is. + """ + if len(original_size) <= 2: + return tensor + else: + view_args = list(original_size) + [tensor.size(-1)] + return tensor.view(*view_args) diff --git a/texar/torch/utils/utils_test.py b/texar/torch/utils/utils_test.py index 2eb543a77..190d95e62 100644 --- a/texar/torch/utils/utils_test.py +++ b/texar/torch/utils/utils_test.py @@ -195,25 +195,50 @@ def test_truncate_seq_pair(self): self.assertListEqual(tokens_a, [1]) self.assertListEqual(tokens_b, [2, 3]) - # def test_map_ids_to_strs(self): - # """Tests :func:`texar.torch.utils.map_ids_to_strs`. 
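# Illustrative sketch of the helpers this patch moves into
# `texar.torch.utils.utils` (the import path used by the updated tests).
import torch
from texar.torch.utils.utils import (
    combine_initial_dims, lazy_groups_of, sort_batch_by_length,
    uncombine_initial_dims)

# Lazily batch an iterable; the final group may be smaller.
groups = list(lazy_groups_of(range(7), group_size=3))
# -> [[0, 1, 2], [3, 4, 5], [6]]

# Sort a batch-first tensor by length (e.g. before packing for an RNN), then
# use `restoration_indices` to put results back into the original order.
batch = torch.rand(4, 9, 5)
lengths = torch.LongTensor([3, 9, 1, 6])
sorted_batch, sorted_lengths, restoration_indices, _ = \
    sort_batch_by_length(batch, lengths)
restored = sorted_batch.index_select(0, restoration_indices)   # == batch

# Flatten leading dimensions for modules that expect 2-D id tensors, then
# restore the original leading dimensions afterwards.
ids = torch.randint(0, 100, (2, 3, 8))                   # (d1, d2, seq_len)
flat_ids = combine_initial_dims(ids)                     # (6, 8)
embedded = torch.nn.Embedding(100, 12)(flat_ids)         # (6, 8, 12)
embedded = uncombine_initial_dims(embedded, ids.size())  # (2, 3, 8, 12)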
- # """ - # vocab_list = ['word', '词'] - # vocab_file = tempfile.NamedTemporaryFile() - # vocab_file.write('\n'.join(vocab_list).encode("utf-8")) - # vocab_file.flush() - # vocab = Vocab(vocab_file.name) - - # text = [['', 'word', '词', '', ''], - # ['word', '词', 'word', '词', '']] - # text = np.asarray(text) - # ids = vocab.map_tokens_to_ids_py(text) - - # ids = ids.tolist() - # text_ = utils.map_ids_to_strs(ids, vocab) - - # self.assertEqual(text_[0], 'word 词') - # self.assertEqual(text_[1], 'word 词 word 词') + def test_lazy_groups_of(self): + xs = [1, 2, 3, 4, 5, 6, 7] + groups = utils.lazy_groups_of(iter(xs), group_size=3) + assert next(groups) == [1, 2, 3] + assert next(groups) == [4, 5, 6] + assert next(groups) == [7] + with self.assertRaises(StopIteration): + _ = next(groups) + + def test_sort_batch_by_length(self): + tensor = torch.rand([5, 7, 9]) + tensor[0, 3:, :] = 0 + tensor[1, 4:, :] = 0 + tensor[2, 1:, :] = 0 + tensor[3, 5:, :] = 0 + + sequence_lengths = torch.LongTensor([3, 4, 1, 5, 7]) + sorted_tensor, sorted_lengths, reverse_indices, _ = \ + utils.sort_batch_by_length(tensor, sequence_lengths) + + # Test sorted indices are padded correctly. + np.testing.assert_array_equal(sorted_tensor[1, 5:, :].data.numpy(), 0.0) + np.testing.assert_array_equal(sorted_tensor[2, 4:, :].data.numpy(), 0.0) + np.testing.assert_array_equal(sorted_tensor[3, 3:, :].data.numpy(), 0.0) + np.testing.assert_array_equal(sorted_tensor[4, 1:, :].data.numpy(), 0.0) + + assert sorted_lengths.data.equal(torch.LongTensor([7, 5, 4, 3, 1])) + + # Test restoration indices correctly recover the original tensor. + assert sorted_tensor.index_select(0, reverse_indices).data.equal( + tensor.data) + + def test_combine_initial_dims(self): + tensor = torch.randn(4, 10, 20, 17, 5) + + tensor2d = utils.combine_initial_dims(tensor) + assert list(tensor2d.size()) == [4 * 10 * 20 * 17, 5] + + def test_uncombine_initial_dims(self): + embedding2d = torch.randn(4 * 10 * 20 * 17 * 5, 12) + + embedding = utils.uncombine_initial_dims(embedding2d, + torch.Size((4, 10, 20, 17, 5))) + assert list(embedding.size()) == [4, 10, 20, 17, 5, 12] if __name__ == "__main__":