From 5dc08f906151e2b3fcadebe5ba6062e2ad744515 Mon Sep 17 00:00:00 2001 From: spadeaiverxin Date: Mon, 4 Oct 2021 17:07:49 +0800 Subject: [PATCH 01/23] add bert japanese --- .../bert/convert_bert_japanese_params.py | 69 ++++ paddlenlp/transformers/bert/modeling.py | 63 ++++ paddlenlp/transformers/bert/tokenizer.py | 343 +++++++++++++++++- tests/transformers/bert/test_tokenizer.py | 85 ++++- 4 files changed, 553 insertions(+), 7 deletions(-) create mode 100644 paddlenlp/transformers/bert/convert_bert_japanese_params.py diff --git a/paddlenlp/transformers/bert/convert_bert_japanese_params.py b/paddlenlp/transformers/bert/convert_bert_japanese_params.py new file mode 100644 index 000000000000..1b0ab452d456 --- /dev/null +++ b/paddlenlp/transformers/bert/convert_bert_japanese_params.py @@ -0,0 +1,69 @@ +import paddle +import torch +import numpy as np +from paddle.utils.download import get_path_from_url + +model_names = [ + "bert-base-japanese", "bert-base-japanese-whole-word-masking", + "bert-base-japanese-char", "bert-base-japanese-char-whole-word-masking" +] + +for model_name in model_names: + torch_model_url = "https://huggingface.co/cl-tohoku/%s/resolve/main/pytorch_model.bin" % model_name + torch_model_path = get_path_from_url(torch_model_url, '.') + torch_state_dict = torch.load(torch_model_path) + + paddle_model_path = "%s.pdparams" % model_name + paddle_state_dict = {} + + # State_dict's keys mapping: from torch to paddle + keys_dict = { + # about embeddings + "embeddings.LayerNorm.gamma": "embeddings.layer_norm.weight", + "embeddings.LayerNorm.beta": "embeddings.layer_norm.bias", + + # about encoder layer + 'encoder.layer': 'encoder.layers', + 'attention.self.query': 'self_attn.q_proj', + 'attention.self.key': 'self_attn.k_proj', + 'attention.self.value': 'self_attn.v_proj', + 'attention.output.dense': 'self_attn.out_proj', + 'attention.output.LayerNorm.gamma': 'norm1.weight', + 'attention.output.LayerNorm.beta': 'norm1.bias', + 'intermediate.dense': 'linear1', + 'output.dense': 'linear2', + 'output.LayerNorm.gamma': 'norm2.weight', + 'output.LayerNorm.beta': 'norm2.bias', + + # about cls predictions + 'cls.predictions.transform.dense': 'cls.predictions.transform', + 'cls.predictions.decoder.weight': 'cls.predictions.decoder_weight', + 'cls.predictions.transform.LayerNorm.gamma': + 'cls.predictions.layer_norm.weight', + 'cls.predictions.transform.LayerNorm.beta': + 'cls.predictions.layer_norm.bias', + 'cls.predictions.bias': 'cls.predictions.decoder_bias' + } + + for torch_key in torch_state_dict: + paddle_key = torch_key + for k in keys_dict: + if k in paddle_key: + paddle_key = paddle_key.replace(k, keys_dict[k]) + + if ('linear' in paddle_key) or ('proj' in paddle_key) or ( + 'vocab' in paddle_key and 'weight' in paddle_key) or ( + "dense.weight" in paddle_key) or ( + 'transform.weight' in paddle_key) or ( + 'seq_relationship.weight' in paddle_key): + paddle_state_dict[paddle_key] = paddle.to_tensor(torch_state_dict[ + torch_key].cpu().numpy().transpose()) + else: + paddle_state_dict[paddle_key] = paddle.to_tensor(torch_state_dict[ + torch_key].cpu().numpy()) + + print("torch: ", torch_key, "\t", torch_state_dict[torch_key].shape) + print("paddle: ", paddle_key, "\t", paddle_state_dict[paddle_key].shape, + "\n") + + paddle.save(paddle_state_dict, paddle_model_path) diff --git a/paddlenlp/transformers/bert/modeling.py b/paddlenlp/transformers/bert/modeling.py index 46ba389663c0..587bd995ddd8 100644 --- a/paddlenlp/transformers/bert/modeling.py +++ 
b/paddlenlp/transformers/bert/modeling.py @@ -270,6 +270,62 @@ class BertPretrainedModel(PretrainedModel): "initializer_range": 0.02, "pad_token_id": 0, }, + "bert-base-japanese": { + "vocab_size": 32000, + "hidden_size": 768, + "num_hidden_layers": 12, + "num_attention_heads": 12, + "intermediate_size": 3072, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "max_position_embeddings": 512, + "type_vocab_size": 2, + "initializer_range": 0.02, + "pad_token_id": 0, + }, + "bert-base-japanese-whole-word-masking": { + "vocab_size": 30522, + "hidden_size": 768, + "num_hidden_layers": 12, + "num_attention_heads": 12, + "intermediate_size": 3072, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "max_position_embeddings": 512, + "type_vocab_size": 2, + "initializer_range": 0.02, + "pad_token_id": 0, + }, + "bert-base-japanese-char ": { + "vocab_size": 4000, + "hidden_size": 768, + "num_hidden_layers": 12, + "num_attention_heads": 12, + "intermediate_size": 3072, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "max_position_embeddings": 512, + "type_vocab_size": 2, + "initializer_range": 0.02, + "pad_token_id": 0, + }, + "bert-base-japanese-char-whole-word-masking": { + "vocab_size": 4000, + "hidden_size": 768, + "num_hidden_layers": 12, + "num_attention_heads": 12, + "intermediate_size": 3072, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "max_position_embeddings": 512, + "type_vocab_size": 2, + "initializer_range": 0.02, + "pad_token_id": 0, + } } resource_files_names = {"model_state": "model_state.pdparams"} pretrained_resource_files_map = { @@ -298,6 +354,13 @@ class BertPretrainedModel(PretrainedModel): "https://paddlenlp.bj.bcebos.com/models/transformers/macbert/macbert-large-chinese.pdparams", "simbert-base-chinese": "https://paddlenlp.bj.bcebos.com/models/transformers/simbert/simbert-base-chinese-v1.pdparams", + "bert-base-japanese": "bert-base-japanese.pdparams", # 从百度网盘下载 + "bert-base-japanese-whole-word-masking": + "bert-base-japanese-whole-word-masking.pdparams", # 从百度网盘下载 + "bert-base-japanese-char": + "bert-base-japanese-char.pdparams", # 从百度网盘下载 + "bert-base-japanese-char-whole-word-masking": + "bert-base-japanese-char-whole-word-masking.pdparams" # 从百度网盘下载 } } base_model_prefix = "bert" diff --git a/paddlenlp/transformers/bert/tokenizer.py b/paddlenlp/transformers/bert/tokenizer.py index dac665d0c343..ac9e419965e8 100644 --- a/paddlenlp/transformers/bert/tokenizer.py +++ b/paddlenlp/transformers/bert/tokenizer.py @@ -14,16 +14,17 @@ # limitations under the License. import copy -import io -import json import os -import six import unicodedata +import collections from .. 
import PretrainedTokenizer from ..tokenizer_utils import convert_to_unicode, whitespace_tokenize, _is_whitespace, _is_control, _is_punctuation -__all__ = ['BasicTokenizer', 'BertTokenizer', 'WordpieceTokenizer'] +__all__ = [ + 'BasicTokenizer', 'BertTokenizer', 'WordpieceTokenizer', + 'BertJapaneseTokenizer', 'MecabTokenizer', 'CharacterTokenizer' +] class BasicTokenizer(object): @@ -296,7 +297,7 @@ class BertTokenizer(PretrainedTokenizer): print(inputs) ''' - {'input_ids': [101, 2002, 2001, 1037, 13997, 11510, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0]} + ['he', 'was', 'a', 'puppet', '##eer'] ''' """ @@ -327,6 +328,14 @@ class BertTokenizer(PretrainedTokenizer): "https://paddle-hapi.bj.bcebos.com/models/bert/bert-base-chinese-vocab.txt", "simbert-base-chinese": "https://paddlenlp.bj.bcebos.com/models/transformers/simbert/vocab.txt", + "bert-base-japanese": + "https://huggingface.co/cl-tohoku/bert-base-japanese/resolve/main/vocab.txt", + "bert-base-japanese-whole-word-masking": + "https://huggingface.co/cl-tohoku/bert-base-japanese-whole-word-masking/resolve/main/vocab.txt", + "bert-base-japanese-char": + "https://huggingface.co/cl-tohoku/bert-base-japanese-char/resolve/main/vocab.txt", + "bert-base-japanese-char-whole-word-masking": + "https://huggingface.co/cl-tohoku/bert-base-japanese-char-whole-word-masking/resolve/main/vocab.txt" } } pretrained_init_configuration = { @@ -366,6 +375,26 @@ class BertTokenizer(PretrainedTokenizer): "simbert-base-chinese": { "do_lower_case": True }, + "bert-base-japanese": { + "do_lower_case": False, + "word_tokenizer_type": "mecab", + "subword_tokenizer_type": "wordpiece", + }, + "bert-base-japanese-whole-word-masking": { + "do_lower_case": False, + "word_tokenizer_type": "mecab", + "subword_tokenizer_type": "wordpiece", + }, + "bert-base-japanese-char": { + "do_lower_case": False, + "word_tokenizer_type": "mecab", + "subword_tokenizer_type": "character", + }, + "bert-base-japanese-char-whole-word-masking": { + "do_lower_case": False, + "word_tokenizer_type": "mecab", + "subword_tokenizer_type": "character", + }, } padding_side = 'right' @@ -554,7 +583,7 @@ def create_token_type_ids_from_sequences(self, 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 | first sequence | second sequence | - If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s). + If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s). Args: token_ids_0 (List[int]): @@ -606,3 +635,305 @@ def get_special_tokens_mask(self, return [1] + ([0] * len(token_ids_0)) + [1] + ( [0] * len(token_ids_1)) + [1] return [1] + ([0] * len(token_ids_0)) + [1] + + +class BertJapaneseTokenizer(BertTokenizer): + """ + Construct a BERT tokenizer for Japanese text, based on a MecabTokenizer. + Args: + vocab_file (str): + The vocabulary file path (ends with '.txt') required to instantiate + a `WordpieceTokenizer`. + do_lower_case (bool): + Whether or not to lowercase the input when tokenizing. + Defaults to`False`. + do_word_tokenize (bool): + Whether to do word tokenization. Defaults to`True`. + do_subword_tokenize (bool): + Whether to do subword tokenization. Defaults to`True`. + word_tokenizer_type (str): + Type of word tokenizer. Defaults to`basic`. + subword_tokenizer_type (str): + Type of subword tokenizer. Defaults to`wordpiece`. + unk_token (str): + A special token representing the *unknown (out-of-vocabulary)* token. + An unknown token is set to be `unk_token` inorder to be converted to an ID. + Defaults to "[UNK]". 
+ sep_token (str): + A special token separating two different sentences in the same input. + Defaults to "[SEP]". + pad_token (str): + A special token used to make arrays of tokens the same size for batching purposes. + Defaults to "[PAD]". + cls_token (str): + A special token used for sequence classification. It is the last token + of the sequence when built with special tokens. Defaults to "[CLS]". + mask_token (str): + A special token representing a masked token. This is the token used + in the masked language modeling task which the model tries to predict the original unmasked ones. + Defaults to "[MASK]". + mecab_kwargs (str): + Dictionary passed to the `MecabTokenizer` constructor. + + Examples: + .. code-block:: + + from paddlenlp.transformers import BertJapaneseTokenizer + berttokenizer = BertJapaneseTokenizer.from_pretrained('bert-base-japanese') + + inputs = berttokenizer.tokenize('こんにちは') + print(inputs) + + ''' + ['こん', '##にち', '##は'] + ''' + + """ + + def __init__(self, + vocab_file, + do_lower_case=False, + do_word_tokenize=True, + do_subword_tokenize=True, + word_tokenizer_type="basic", + subword_tokenizer_type="wordpiece", + never_split=None, + mecab_kwargs=None, + unk_token="[UNK]", + sep_token="[SEP]", + pad_token="[PAD]", + cls_token="[CLS]", + mask_token="[MASK]"): + + if not os.path.isfile(vocab_file): + raise ValueError( + "Can't find a vocabulary file at path '{}'. To load the " + "vocabulary from a pretrained model please use " + "`tokenizer = BertJapaneseTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`" + .format(vocab_file)) + + self.vocab = self.load_vocabulary(vocab_file, unk_token=unk_token) + self.ids_to_tokens = collections.OrderedDict( + [(ids, tok) for tok, ids in self.vocab.idx_to_token.items()]) + + self.do_word_tokenize = do_word_tokenize + self.word_tokenizer_type = word_tokenizer_type + self.lower_case = do_lower_case + self.never_split = never_split + self.mecab_kwargs = copy.deepcopy(mecab_kwargs) + if do_word_tokenize: + if word_tokenizer_type == "basic": + self.basic_tokenizer = BasicTokenizer( + do_lower_case=do_lower_case, + never_split=never_split, + tokenize_chinese_chars=False) + elif word_tokenizer_type == "mecab": + self.basic_tokenizer = MecabTokenizer( + do_lower_case=do_lower_case, + never_split=never_split, + **(mecab_kwargs or {})) + else: + raise ValueError( + f"Invalid word_tokenizer_type '{word_tokenizer_type}' is specified." + ) + + self.do_subword_tokenize = do_subword_tokenize + self.subword_tokenizer_type = subword_tokenizer_type + if do_subword_tokenize: + if subword_tokenizer_type == "wordpiece": + self.wordpiece_tokenizer = WordpieceTokenizer( + vocab=self.vocab, unk_token=unk_token) + elif subword_tokenizer_type == "character": + self.wordpiece_tokenizer = CharacterTokenizer( + vocab=self.vocab, unk_token=unk_token) + else: + raise ValueError( + f"Invalid subword_tokenizer_type '{subword_tokenizer_type}' is specified." 
+ ) + + @property + def do_lower_case(self): + return self.lower_case + + def __getstate__(self): + state = dict(self.__dict__) + if self.word_tokenizer_type == "mecab": + del state["basic_tokenizer"] + return state + + def __setstate__(self, state): + self.__dict__ = state + if self.word_tokenizer_type == "mecab": + self.basic_tokenizer = MecabTokenizer( + do_lower_case=self.do_lower_case, + never_split=self.never_split, + **(self.mecab_kwargs or {})) + + def _tokenize(self, text): + if self.do_word_tokenize: + tokens = self.basic_tokenizer.tokenize( + text, never_split=self.all_special_tokens) + else: + tokens = [text] + + if self.do_subword_tokenize: + split_tokens = [ + sub_token + for token in tokens + for sub_token in self.wordpiece_tokenizer.tokenize(token) + ] + else: + split_tokens = tokens + + return split_tokens + + +class MecabTokenizer: + """Runs basic tokenization with MeCab morphological parser.""" + + def __init__( + self, + do_lower_case=False, + never_split=None, + normalize_text=True, + mecab_dic="ipadic", + mecab_option=None, ): + """ + Constructs a MecabTokenizer. + + Args: + do_lower_case (bool): + Whether to lowercase the input. Defaults to`True`. + never_split: (list): + Kept for backward compatibility purposes. Defaults to`None`. + normalize_text (bool): + Whether to apply unicode normalization to text before tokenization. Defaults to`True`. + mecab_dic (string): + Name of dictionary to be used for MeCab initialization. If you are using a system-installed dictionary, + set this option to `None` and modify `mecab_option`. Defaults to`ipadic`. + mecab_option (string): + String passed to MeCab constructor. Defaults to`None`. + """ + self.do_lower_case = do_lower_case + self.never_split = never_split if never_split is not None else [] + self.normalize_text = normalize_text + + try: + import fugashi + except ModuleNotFoundError as error: + raise error.__class__( + "You need to install fugashi to use MecabTokenizer. " + "See https://pypi.org/project/fugashi/ for installation.") + + mecab_option = mecab_option or "" + + if mecab_dic is not None: + if mecab_dic == "ipadic": + try: + import ipadic + except ModuleNotFoundError as error: + raise error.__class__( + "The ipadic dictionary is not installed. " + "See https://github.com/polm/ipadic-py for installation." + ) + + dic_dir = ipadic.DICDIR + + elif mecab_dic == "unidic_lite": + try: + import unidic_lite + except ModuleNotFoundError as error: + raise error.__class__( + "The unidic_lite dictionary is not installed. " + "See https://github.com/polm/unidic-lite for installation." + ) + + dic_dir = unidic_lite.DICDIR + + elif mecab_dic == "unidic": + try: + import unidic + except ModuleNotFoundError as error: + raise error.__class__( + "The unidic dictionary is not installed. " + "See https://github.com/polm/unidic-py for installation." + ) + + dic_dir = unidic.DICDIR + if not os.path.isdir(dic_dir): + raise RuntimeError( + "The unidic dictionary itself is not found." + "See https://github.com/polm/unidic-py for installation." 
+ ) + else: + raise ValueError("Invalid mecab_dic is specified.") + + mecabrc = os.path.join(dic_dir, "mecabrc") + mecab_option = f'-d "{dic_dir}" -r "{mecabrc}" ' + mecab_option + + self.mecab = fugashi.GenericTagger(mecab_option) + + def tokenize(self, text, never_split=None, **kwargs): + """Tokenizes a piece of text.""" + if self.normalize_text: + text = unicodedata.normalize("NFKC", text) + + never_split = self.never_split + (never_split + if never_split is not None else []) + tokens = [] + + for word in self.mecab(text): + token = word.surface + + if self.do_lower_case and token not in never_split: + token = token.lower() + + tokens.append(token) + + return tokens + + +class CharacterTokenizer: + """Runs Character tokenization.""" + + def __init__(self, vocab, unk_token, normalize_text=True): + """ + Constructs a CharacterTokenizer. + + Args: + vocab: + Vocabulary object. + unk_token (str): + A special symbol for out-of-vocabulary token. + normalize_text (boolean): + Whether to apply unicode normalization to text before tokenization. Defaults to True. + """ + self.vocab = vocab + self.unk_token = unk_token + self.normalize_text = normalize_text + + def tokenize(self, text): + """ + Tokenizes a piece of text into characters. + + For example, `input = "apple""` wil return as output `["a", "p", "p", "l", "e"]`. + + Args: + text: A single token or whitespace separated tokens. + This should have already been passed through `BasicTokenizer`. + + Returns: + A list of characters. + """ + if self.normalize_text: + text = unicodedata.normalize("NFKC", text) + + output_tokens = [] + for char in text: + if char not in self.vocab: + output_tokens.append(self.unk_token) + continue + + output_tokens.append(char) + + return output_tokens diff --git a/tests/transformers/bert/test_tokenizer.py b/tests/transformers/bert/test_tokenizer.py index 451770f2c6d7..c0c1658f5e90 100644 --- a/tests/transformers/bert/test_tokenizer.py +++ b/tests/transformers/bert/test_tokenizer.py @@ -15,7 +15,7 @@ import numpy as np import os import unittest -from paddlenlp.transformers import BertTokenizer, BasicTokenizer, WordpieceTokenizer +from paddlenlp.transformers import BertTokenizer, BasicTokenizer, WordpieceTokenizer, BertJapaneseTokenizer from paddlenlp.data import Vocab from common_test import CpuCommonTest @@ -357,5 +357,88 @@ def test_from_pretrained_pad_left(self): expected_token_type_ids) +class TestBertJapaneseTokenizerFromPretrained(CpuCommonTest): + @slow + def test_from_pretrained(self): + tokenizer = BertJapaneseTokenizer.from_pretrained("bert-base-japanese") + text1 = "こんにちは" + text2 = "櫓を飛ばす" + # test batch_encode + expected_input_ids = [ + 2, 10350, 25746, 28450, 3, 20301, 11, 787, 12222, 3, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0 + ] + expected_token_type_ids = [ + 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + ] + expected_attention_mask = [ + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + ] + expected_special_tokens_mask = [ + 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 + ] + results = tokenizer( + [text1], [text2], + 20, + stride=1, + pad_to_max_seq_len=True, + return_attention_mask=True, + return_special_tokens_mask=True) + + self.check_output_equal(results[0]['input_ids'], expected_input_ids) + self.check_output_equal(results[0]['token_type_ids'], + expected_token_type_ids) + self.check_output_equal(results[0]['attention_mask'], + expected_attention_mask) + self.check_output_equal(results[0]['special_tokens_mask'], + expected_special_tokens_mask) + # test encode + 
results = tokenizer(text1, text2, 20, stride=1, pad_to_max_seq_len=True) + self.check_output_equal(results['input_ids'], expected_input_ids) + self.check_output_equal(results['token_type_ids'], + expected_token_type_ids) + + @slow + def test_from_pretrained_pad_left(self): + tokenizer = BertJapaneseTokenizer.from_pretrained("bert-base-japanese") + tokenizer.padding_side = "left" + text1 = "こんにちは" + text2 = "櫓を飛ばす" + # test batch_encode + expected_input_ids = [ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 10350, 25746, 28450, 3, 20301, 11, + 787, 12222, 3 + ] + expected_token_type_ids = [ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1 + ] + expected_attention_mask = [ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 + ] + expected_special_tokens_mask = [ + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1 + ] + results = tokenizer( + [text1], [text2], + 20, + stride=1, + pad_to_max_seq_len=True, + return_attention_mask=True, + return_special_tokens_mask=True) + + self.check_output_equal(results[0]['input_ids'], expected_input_ids) + self.check_output_equal(results[0]['token_type_ids'], + expected_token_type_ids) + self.check_output_equal(results[0]['attention_mask'], + expected_attention_mask) + self.check_output_equal(results[0]['special_tokens_mask'], + expected_special_tokens_mask) + # test encode + results = tokenizer(text1, text2, 20, stride=1, pad_to_max_seq_len=True) + self.check_output_equal(results['input_ids'], expected_input_ids) + self.check_output_equal(results['token_type_ids'], + expected_token_type_ids) + + if __name__ == "__main__": unittest.main() From ff5db0c6eb2778b3e70eb27dac83e79638fc8f75 Mon Sep 17 00:00:00 2001 From: spade Date: Tue, 19 Oct 2021 14:02:18 +0800 Subject: [PATCH 02/23] fix model-weight files position --- community/iverxin/bert-base-japanese/README.md | 0 community/iverxin/bert-base-japanese/files.json | 6 ++++++ paddlenlp/transformers/bert/modeling.py | 9 +++++---- 3 files changed, 11 insertions(+), 4 deletions(-) create mode 100644 community/iverxin/bert-base-japanese/README.md create mode 100644 community/iverxin/bert-base-japanese/files.json diff --git a/community/iverxin/bert-base-japanese/README.md b/community/iverxin/bert-base-japanese/README.md new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/community/iverxin/bert-base-japanese/files.json b/community/iverxin/bert-base-japanese/files.json new file mode 100644 index 000000000000..bdc2dea8a098 --- /dev/null +++ b/community/iverxin/bert-base-japanese/files.json @@ -0,0 +1,6 @@ +{ + "bert-base-japanese": "https://paddlenlp.bj.bcebos.com/models/transformers/community/iverxin/bert-base-japanese/bert-base-japanese.pdparams", + "bert-base-japanese-char": "https://paddlenlp.bj.bcebos.com/models/transformers/community/iverxin/bert-base-japanese/bert-base-japanese-char.pdparams", + "bert-base-japanese-char-whole-word-masking": "https://paddlenlp.bj.bcebos.com/models/transformers/community/iverxin/bert-base-japanese/bert-base-japanese-char-whole-word-masking.pdparams", + "bert-base-japanese-whole-word-masking": "https://paddlenlp.bj.bcebos.com/models/transformers/community/iverxin/bert-base-japanese/bert-base-japanese-whole-word-masking.pdparams" +} \ No newline at end of file diff --git a/paddlenlp/transformers/bert/modeling.py b/paddlenlp/transformers/bert/modeling.py index 587bd995ddd8..de330db7a6d3 100644 --- a/paddlenlp/transformers/bert/modeling.py +++ b/paddlenlp/transformers/bert/modeling.py @@ -354,13 +354,14 @@ class BertPretrainedModel(PretrainedModel): 
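
The tests above exercise BertJapaneseTokenizer end to end, and the files.json entries point the community weights at hosted .pdparams files. A minimal usage sketch of the same flow (separate from the diff itself), assuming the pretrained name "bert-base-japanese" registered in this series resolves to the hosted weights and that fugashi plus ipadic are installed for the MeCab word tokenizer:

    import paddle
    from paddlenlp.transformers import BertModel, BertJapaneseTokenizer

    # Load the converted Japanese checkpoint and its vocab by the registered name.
    tokenizer = BertJapaneseTokenizer.from_pretrained("bert-base-japanese")
    model = BertModel.from_pretrained("bert-base-japanese")

    # MeCab word tokenization followed by WordPiece, then the usual BERT encoding.
    encoded = tokenizer("こんにちは")
    input_ids = paddle.to_tensor([encoded["input_ids"]])
    sequence_output, pooled_output = model(input_ids)
    print(sequence_output.shape)  # roughly [1, seq_len, 768] for the base model
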
"https://paddlenlp.bj.bcebos.com/models/transformers/macbert/macbert-large-chinese.pdparams", "simbert-base-chinese": "https://paddlenlp.bj.bcebos.com/models/transformers/simbert/simbert-base-chinese-v1.pdparams", - "bert-base-japanese": "bert-base-japanese.pdparams", # 从百度网盘下载 + "bert-base-japanese": + "https://paddlenlp.bj.bcebos.com/models/transformers/community/iverxin/bert-base-japanese/bert-base-japanese.pdparams", "bert-base-japanese-whole-word-masking": - "bert-base-japanese-whole-word-masking.pdparams", # 从百度网盘下载 + "https://paddlenlp.bj.bcebos.com/models/transformers/community/iverxin/bert-base-japanese/bert-base-japanese-whole-word-masking.pdparams", "bert-base-japanese-char": - "bert-base-japanese-char.pdparams", # 从百度网盘下载 + "https://paddlenlp.bj.bcebos.com/models/transformers/community/iverxin/bert-base-japanese/bert-base-japanese-char.pdparams", "bert-base-japanese-char-whole-word-masking": - "bert-base-japanese-char-whole-word-masking.pdparams" # 从百度网盘下载 + "https://paddlenlp.bj.bcebos.com/models/transformers/community/iverxin/bert-base-japanese/bert-base-japanese-char-whole-word-masking.pdparams", } } base_model_prefix = "bert" From 539294501437995af43bbcc72b7f549056b406d7 Mon Sep 17 00:00:00 2001 From: spade Date: Tue, 19 Oct 2021 18:14:38 +0800 Subject: [PATCH 03/23] add weights files url --- community/iverxin/bert-base-japanese/README.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/community/iverxin/bert-base-japanese/README.md b/community/iverxin/bert-base-japanese/README.md index e69de29bb2d1..511c2af7d2de 100644 --- a/community/iverxin/bert-base-japanese/README.md +++ b/community/iverxin/bert-base-japanese/README.md @@ -0,0 +1,7 @@ +## bert-base-japanese + +bert-base-japanese相关的权重参数,其中包括: +- bert-base-japanese +- bert-base-japanese-char +- bert-base-japanese-char-whole-word-masking +- bert-base-japanese-whole-word-masking From 3e647affca4411dea16572c96a9cf56580207098 Mon Sep 17 00:00:00 2001 From: spade Date: Wed, 20 Oct 2021 16:17:40 +0800 Subject: [PATCH 04/23] create package: bert_japanese --- paddlenlp/transformers/__init__.py | 1 + paddlenlp/transformers/bert/tokenizer.py | 305 +--------------- .../transformers/bert_japanese/__init__.py | 0 .../convert_bert_japanese_params.py | 2 +- .../transformers/bert_japanese/tokenizer.py | 326 ++++++++++++++++++ 5 files changed, 329 insertions(+), 305 deletions(-) create mode 100644 paddlenlp/transformers/bert_japanese/__init__.py rename paddlenlp/transformers/{bert => bert_japanese}/convert_bert_japanese_params.py (97%) create mode 100644 paddlenlp/transformers/bert_japanese/tokenizer.py diff --git a/paddlenlp/transformers/__init__.py b/paddlenlp/transformers/__init__.py index 03fbf00ca653..497f24800a8f 100644 --- a/paddlenlp/transformers/__init__.py +++ b/paddlenlp/transformers/__init__.py @@ -18,6 +18,7 @@ from .bert.modeling import * from .bert.tokenizer import * +from .bert_japanese.tokenizer import * from .ernie.modeling import * from .ernie.tokenizer import * from .gpt.modeling import * diff --git a/paddlenlp/transformers/bert/tokenizer.py b/paddlenlp/transformers/bert/tokenizer.py index ac9e419965e8..c021291b66b4 100644 --- a/paddlenlp/transformers/bert/tokenizer.py +++ b/paddlenlp/transformers/bert/tokenizer.py @@ -23,7 +23,6 @@ __all__ = [ 'BasicTokenizer', 'BertTokenizer', 'WordpieceTokenizer', - 'BertJapaneseTokenizer', 'MecabTokenizer', 'CharacterTokenizer' ] @@ -634,306 +633,4 @@ def get_special_tokens_mask(self, if token_ids_1 is not None: return [1] + ([0] * len(token_ids_0)) + [1] + ( [0] * 
len(token_ids_1)) + [1] - return [1] + ([0] * len(token_ids_0)) + [1] - - -class BertJapaneseTokenizer(BertTokenizer): - """ - Construct a BERT tokenizer for Japanese text, based on a MecabTokenizer. - Args: - vocab_file (str): - The vocabulary file path (ends with '.txt') required to instantiate - a `WordpieceTokenizer`. - do_lower_case (bool): - Whether or not to lowercase the input when tokenizing. - Defaults to`False`. - do_word_tokenize (bool): - Whether to do word tokenization. Defaults to`True`. - do_subword_tokenize (bool): - Whether to do subword tokenization. Defaults to`True`. - word_tokenizer_type (str): - Type of word tokenizer. Defaults to`basic`. - subword_tokenizer_type (str): - Type of subword tokenizer. Defaults to`wordpiece`. - unk_token (str): - A special token representing the *unknown (out-of-vocabulary)* token. - An unknown token is set to be `unk_token` inorder to be converted to an ID. - Defaults to "[UNK]". - sep_token (str): - A special token separating two different sentences in the same input. - Defaults to "[SEP]". - pad_token (str): - A special token used to make arrays of tokens the same size for batching purposes. - Defaults to "[PAD]". - cls_token (str): - A special token used for sequence classification. It is the last token - of the sequence when built with special tokens. Defaults to "[CLS]". - mask_token (str): - A special token representing a masked token. This is the token used - in the masked language modeling task which the model tries to predict the original unmasked ones. - Defaults to "[MASK]". - mecab_kwargs (str): - Dictionary passed to the `MecabTokenizer` constructor. - - Examples: - .. code-block:: - - from paddlenlp.transformers import BertJapaneseTokenizer - berttokenizer = BertJapaneseTokenizer.from_pretrained('bert-base-japanese') - - inputs = berttokenizer.tokenize('こんにちは') - print(inputs) - - ''' - ['こん', '##にち', '##は'] - ''' - - """ - - def __init__(self, - vocab_file, - do_lower_case=False, - do_word_tokenize=True, - do_subword_tokenize=True, - word_tokenizer_type="basic", - subword_tokenizer_type="wordpiece", - never_split=None, - mecab_kwargs=None, - unk_token="[UNK]", - sep_token="[SEP]", - pad_token="[PAD]", - cls_token="[CLS]", - mask_token="[MASK]"): - - if not os.path.isfile(vocab_file): - raise ValueError( - "Can't find a vocabulary file at path '{}'. To load the " - "vocabulary from a pretrained model please use " - "`tokenizer = BertJapaneseTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`" - .format(vocab_file)) - - self.vocab = self.load_vocabulary(vocab_file, unk_token=unk_token) - self.ids_to_tokens = collections.OrderedDict( - [(ids, tok) for tok, ids in self.vocab.idx_to_token.items()]) - - self.do_word_tokenize = do_word_tokenize - self.word_tokenizer_type = word_tokenizer_type - self.lower_case = do_lower_case - self.never_split = never_split - self.mecab_kwargs = copy.deepcopy(mecab_kwargs) - if do_word_tokenize: - if word_tokenizer_type == "basic": - self.basic_tokenizer = BasicTokenizer( - do_lower_case=do_lower_case, - never_split=never_split, - tokenize_chinese_chars=False) - elif word_tokenizer_type == "mecab": - self.basic_tokenizer = MecabTokenizer( - do_lower_case=do_lower_case, - never_split=never_split, - **(mecab_kwargs or {})) - else: - raise ValueError( - f"Invalid word_tokenizer_type '{word_tokenizer_type}' is specified." 
- ) - - self.do_subword_tokenize = do_subword_tokenize - self.subword_tokenizer_type = subword_tokenizer_type - if do_subword_tokenize: - if subword_tokenizer_type == "wordpiece": - self.wordpiece_tokenizer = WordpieceTokenizer( - vocab=self.vocab, unk_token=unk_token) - elif subword_tokenizer_type == "character": - self.wordpiece_tokenizer = CharacterTokenizer( - vocab=self.vocab, unk_token=unk_token) - else: - raise ValueError( - f"Invalid subword_tokenizer_type '{subword_tokenizer_type}' is specified." - ) - - @property - def do_lower_case(self): - return self.lower_case - - def __getstate__(self): - state = dict(self.__dict__) - if self.word_tokenizer_type == "mecab": - del state["basic_tokenizer"] - return state - - def __setstate__(self, state): - self.__dict__ = state - if self.word_tokenizer_type == "mecab": - self.basic_tokenizer = MecabTokenizer( - do_lower_case=self.do_lower_case, - never_split=self.never_split, - **(self.mecab_kwargs or {})) - - def _tokenize(self, text): - if self.do_word_tokenize: - tokens = self.basic_tokenizer.tokenize( - text, never_split=self.all_special_tokens) - else: - tokens = [text] - - if self.do_subword_tokenize: - split_tokens = [ - sub_token - for token in tokens - for sub_token in self.wordpiece_tokenizer.tokenize(token) - ] - else: - split_tokens = tokens - - return split_tokens - - -class MecabTokenizer: - """Runs basic tokenization with MeCab morphological parser.""" - - def __init__( - self, - do_lower_case=False, - never_split=None, - normalize_text=True, - mecab_dic="ipadic", - mecab_option=None, ): - """ - Constructs a MecabTokenizer. - - Args: - do_lower_case (bool): - Whether to lowercase the input. Defaults to`True`. - never_split: (list): - Kept for backward compatibility purposes. Defaults to`None`. - normalize_text (bool): - Whether to apply unicode normalization to text before tokenization. Defaults to`True`. - mecab_dic (string): - Name of dictionary to be used for MeCab initialization. If you are using a system-installed dictionary, - set this option to `None` and modify `mecab_option`. Defaults to`ipadic`. - mecab_option (string): - String passed to MeCab constructor. Defaults to`None`. - """ - self.do_lower_case = do_lower_case - self.never_split = never_split if never_split is not None else [] - self.normalize_text = normalize_text - - try: - import fugashi - except ModuleNotFoundError as error: - raise error.__class__( - "You need to install fugashi to use MecabTokenizer. " - "See https://pypi.org/project/fugashi/ for installation.") - - mecab_option = mecab_option or "" - - if mecab_dic is not None: - if mecab_dic == "ipadic": - try: - import ipadic - except ModuleNotFoundError as error: - raise error.__class__( - "The ipadic dictionary is not installed. " - "See https://github.com/polm/ipadic-py for installation." - ) - - dic_dir = ipadic.DICDIR - - elif mecab_dic == "unidic_lite": - try: - import unidic_lite - except ModuleNotFoundError as error: - raise error.__class__( - "The unidic_lite dictionary is not installed. " - "See https://github.com/polm/unidic-lite for installation." - ) - - dic_dir = unidic_lite.DICDIR - - elif mecab_dic == "unidic": - try: - import unidic - except ModuleNotFoundError as error: - raise error.__class__( - "The unidic dictionary is not installed. " - "See https://github.com/polm/unidic-py for installation." - ) - - dic_dir = unidic.DICDIR - if not os.path.isdir(dic_dir): - raise RuntimeError( - "The unidic dictionary itself is not found." 
- "See https://github.com/polm/unidic-py for installation." - ) - else: - raise ValueError("Invalid mecab_dic is specified.") - - mecabrc = os.path.join(dic_dir, "mecabrc") - mecab_option = f'-d "{dic_dir}" -r "{mecabrc}" ' + mecab_option - - self.mecab = fugashi.GenericTagger(mecab_option) - - def tokenize(self, text, never_split=None, **kwargs): - """Tokenizes a piece of text.""" - if self.normalize_text: - text = unicodedata.normalize("NFKC", text) - - never_split = self.never_split + (never_split - if never_split is not None else []) - tokens = [] - - for word in self.mecab(text): - token = word.surface - - if self.do_lower_case and token not in never_split: - token = token.lower() - - tokens.append(token) - - return tokens - - -class CharacterTokenizer: - """Runs Character tokenization.""" - - def __init__(self, vocab, unk_token, normalize_text=True): - """ - Constructs a CharacterTokenizer. - - Args: - vocab: - Vocabulary object. - unk_token (str): - A special symbol for out-of-vocabulary token. - normalize_text (boolean): - Whether to apply unicode normalization to text before tokenization. Defaults to True. - """ - self.vocab = vocab - self.unk_token = unk_token - self.normalize_text = normalize_text - - def tokenize(self, text): - """ - Tokenizes a piece of text into characters. - - For example, `input = "apple""` wil return as output `["a", "p", "p", "l", "e"]`. - - Args: - text: A single token or whitespace separated tokens. - This should have already been passed through `BasicTokenizer`. - - Returns: - A list of characters. - """ - if self.normalize_text: - text = unicodedata.normalize("NFKC", text) - - output_tokens = [] - for char in text: - if char not in self.vocab: - output_tokens.append(self.unk_token) - continue - - output_tokens.append(char) - - return output_tokens + return [1] + ([0] * len(token_ids_0)) + [1] \ No newline at end of file diff --git a/paddlenlp/transformers/bert_japanese/__init__.py b/paddlenlp/transformers/bert_japanese/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/paddlenlp/transformers/bert/convert_bert_japanese_params.py b/paddlenlp/transformers/bert_japanese/convert_bert_japanese_params.py similarity index 97% rename from paddlenlp/transformers/bert/convert_bert_japanese_params.py rename to paddlenlp/transformers/bert_japanese/convert_bert_japanese_params.py index 1b0ab452d456..75a63df67a49 100644 --- a/paddlenlp/transformers/bert/convert_bert_japanese_params.py +++ b/paddlenlp/transformers/bert_japanese/convert_bert_japanese_params.py @@ -10,7 +10,7 @@ for model_name in model_names: torch_model_url = "https://huggingface.co/cl-tohoku/%s/resolve/main/pytorch_model.bin" % model_name - torch_model_path = get_path_from_url(torch_model_url, '.') + torch_model_path = get_path_from_url(torch_model_url, '../bert') torch_state_dict = torch.load(torch_model_path) paddle_model_path = "%s.pdparams" % model_name diff --git a/paddlenlp/transformers/bert_japanese/tokenizer.py b/paddlenlp/transformers/bert_japanese/tokenizer.py new file mode 100644 index 000000000000..e429467323ea --- /dev/null +++ b/paddlenlp/transformers/bert_japanese/tokenizer.py @@ -0,0 +1,326 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import os +import unicodedata +import collections + +from .. import PretrainedTokenizer, BertTokenizer, BasicTokenizer, WordpieceTokenizer + +__all__ = [ + 'BertJapaneseTokenizer', 'MecabTokenizer', 'CharacterTokenizer' +] + +class BertJapaneseTokenizer(BertTokenizer): + """ + Construct a BERT tokenizer for Japanese text, based on a MecabTokenizer. + Args: + vocab_file (str): + The vocabulary file path (ends with '.txt') required to instantiate + a `WordpieceTokenizer`. + do_lower_case (bool): + Whether or not to lowercase the input when tokenizing. + Defaults to`False`. + do_word_tokenize (bool): + Whether to do word tokenization. Defaults to`True`. + do_subword_tokenize (bool): + Whether to do subword tokenization. Defaults to`True`. + word_tokenizer_type (str): + Type of word tokenizer. Defaults to`basic`. + subword_tokenizer_type (str): + Type of subword tokenizer. Defaults to`wordpiece`. + unk_token (str): + A special token representing the *unknown (out-of-vocabulary)* token. + An unknown token is set to be `unk_token` inorder to be converted to an ID. + Defaults to "[UNK]". + sep_token (str): + A special token separating two different sentences in the same input. + Defaults to "[SEP]". + pad_token (str): + A special token used to make arrays of tokens the same size for batching purposes. + Defaults to "[PAD]". + cls_token (str): + A special token used for sequence classification. It is the last token + of the sequence when built with special tokens. Defaults to "[CLS]". + mask_token (str): + A special token representing a masked token. This is the token used + in the masked language modeling task which the model tries to predict the original unmasked ones. + Defaults to "[MASK]". + mecab_kwargs (str): + Dictionary passed to the `MecabTokenizer` constructor. + + Examples: + .. code-block:: + + from paddlenlp.transformers import BertJapaneseTokenizer + berttokenizer = BertJapaneseTokenizer.from_pretrained('bert-base-japanese') + + inputs = berttokenizer.tokenize('こんにちは') + print(inputs) + + ''' + ['こん', '##にち', '##は'] + ''' + + """ + + def __init__(self, + vocab_file, + do_lower_case=False, + do_word_tokenize=True, + do_subword_tokenize=True, + word_tokenizer_type="basic", + subword_tokenizer_type="wordpiece", + never_split=None, + mecab_kwargs=None, + unk_token="[UNK]", + sep_token="[SEP]", + pad_token="[PAD]", + cls_token="[CLS]", + mask_token="[MASK]"): + + if not os.path.isfile(vocab_file): + raise ValueError( + "Can't find a vocabulary file at path '{}'. 
To load the " + "vocabulary from a pretrained model please use " + "`tokenizer = BertJapaneseTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`" + .format(vocab_file)) + + self.vocab = self.load_vocabulary(vocab_file, unk_token=unk_token) + self.ids_to_tokens = collections.OrderedDict( + [(ids, tok) for tok, ids in self.vocab.idx_to_token.items()]) + + self.do_word_tokenize = do_word_tokenize + self.word_tokenizer_type = word_tokenizer_type + self.lower_case = do_lower_case + self.never_split = never_split + self.mecab_kwargs = copy.deepcopy(mecab_kwargs) + if do_word_tokenize: + if word_tokenizer_type == "basic": + self.basic_tokenizer = BasicTokenizer( + do_lower_case=do_lower_case, + never_split=never_split, + tokenize_chinese_chars=False) + elif word_tokenizer_type == "mecab": + self.basic_tokenizer = MecabTokenizer( + do_lower_case=do_lower_case, + never_split=never_split, + **(mecab_kwargs or {})) + else: + raise ValueError( + f"Invalid word_tokenizer_type '{word_tokenizer_type}' is specified." + ) + + self.do_subword_tokenize = do_subword_tokenize + self.subword_tokenizer_type = subword_tokenizer_type + if do_subword_tokenize: + if subword_tokenizer_type == "wordpiece": + self.wordpiece_tokenizer = WordpieceTokenizer( + vocab=self.vocab, unk_token=unk_token) + elif subword_tokenizer_type == "character": + self.wordpiece_tokenizer = CharacterTokenizer( + vocab=self.vocab, unk_token=unk_token) + else: + raise ValueError( + f"Invalid subword_tokenizer_type '{subword_tokenizer_type}' is specified." + ) + + @property + def do_lower_case(self): + return self.lower_case + + def __getstate__(self): + state = dict(self.__dict__) + if self.word_tokenizer_type == "mecab": + del state["basic_tokenizer"] + return state + + def __setstate__(self, state): + self.__dict__ = state + if self.word_tokenizer_type == "mecab": + self.basic_tokenizer = MecabTokenizer( + do_lower_case=self.do_lower_case, + never_split=self.never_split, + **(self.mecab_kwargs or {})) + + def _tokenize(self, text): + if self.do_word_tokenize: + tokens = self.basic_tokenizer.tokenize( + text, never_split=self.all_special_tokens) + else: + tokens = [text] + + if self.do_subword_tokenize: + split_tokens = [ + sub_token + for token in tokens + for sub_token in self.wordpiece_tokenizer.tokenize(token) + ] + else: + split_tokens = tokens + + return split_tokens + + +class MecabTokenizer: + """Runs basic tokenization with MeCab morphological parser.""" + + def __init__( + self, + do_lower_case=False, + never_split=None, + normalize_text=True, + mecab_dic="ipadic", + mecab_option=None, ): + """ + Constructs a MecabTokenizer. + + Args: + do_lower_case (bool): + Whether to lowercase the input. Defaults to`True`. + never_split: (list): + Kept for backward compatibility purposes. Defaults to`None`. + normalize_text (bool): + Whether to apply unicode normalization to text before tokenization. Defaults to`True`. + mecab_dic (string): + Name of dictionary to be used for MeCab initialization. If you are using a system-installed dictionary, + set this option to `None` and modify `mecab_option`. Defaults to`ipadic`. + mecab_option (string): + String passed to MeCab constructor. Defaults to`None`. + """ + self.do_lower_case = do_lower_case + self.never_split = never_split if never_split is not None else [] + self.normalize_text = normalize_text + + try: + import fugashi + except ModuleNotFoundError as error: + raise error.__class__( + "You need to install fugashi to use MecabTokenizer. 
" + "See https://pypi.org/project/fugashi/ for installation.") + + mecab_option = mecab_option or "" + + if mecab_dic is not None: + if mecab_dic == "ipadic": + try: + import ipadic + except ModuleNotFoundError as error: + raise error.__class__( + "The ipadic dictionary is not installed. " + "See https://github.com/polm/ipadic-py for installation." + ) + + dic_dir = ipadic.DICDIR + + elif mecab_dic == "unidic_lite": + try: + import unidic_lite + except ModuleNotFoundError as error: + raise error.__class__( + "The unidic_lite dictionary is not installed. " + "See https://github.com/polm/unidic-lite for installation." + ) + + dic_dir = unidic_lite.DICDIR + + elif mecab_dic == "unidic": + try: + import unidic + except ModuleNotFoundError as error: + raise error.__class__( + "The unidic dictionary is not installed. " + "See https://github.com/polm/unidic-py for installation." + ) + + dic_dir = unidic.DICDIR + if not os.path.isdir(dic_dir): + raise RuntimeError( + "The unidic dictionary itself is not found." + "See https://github.com/polm/unidic-py for installation." + ) + else: + raise ValueError("Invalid mecab_dic is specified.") + + mecabrc = os.path.join(dic_dir, "mecabrc") + mecab_option = f'-d "{dic_dir}" -r "{mecabrc}" ' + mecab_option + + self.mecab = fugashi.GenericTagger(mecab_option) + + def tokenize(self, text, never_split=None, **kwargs): + """Tokenizes a piece of text.""" + if self.normalize_text: + text = unicodedata.normalize("NFKC", text) + + never_split = self.never_split + (never_split + if never_split is not None else []) + tokens = [] + + for word in self.mecab(text): + token = word.surface + + if self.do_lower_case and token not in never_split: + token = token.lower() + + tokens.append(token) + + return tokens + + +class CharacterTokenizer: + """Runs Character tokenization.""" + + def __init__(self, vocab, unk_token, normalize_text=True): + """ + Constructs a CharacterTokenizer. + + Args: + vocab: + Vocabulary object. + unk_token (str): + A special symbol for out-of-vocabulary token. + normalize_text (boolean): + Whether to apply unicode normalization to text before tokenization. Defaults to True. + """ + self.vocab = vocab + self.unk_token = unk_token + self.normalize_text = normalize_text + + def tokenize(self, text): + """ + Tokenizes a piece of text into characters. + + For example, `input = "apple""` wil return as output `["a", "p", "p", "l", "e"]`. + + Args: + text: A single token or whitespace separated tokens. + This should have already been passed through `BasicTokenizer`. + + Returns: + A list of characters. 
+ """ + if self.normalize_text: + text = unicodedata.normalize("NFKC", text) + + output_tokens = [] + for char in text: + if char not in self.vocab: + output_tokens.append(self.unk_token) + continue + + output_tokens.append(char) + + return output_tokens From 2ca28b48d56744fd3822c593d3100882678c2d8c Mon Sep 17 00:00:00 2001 From: spade Date: Wed, 20 Oct 2021 19:21:04 +0800 Subject: [PATCH 05/23] update weights readme --- community/iverxin/bert-base-japanese/README.md | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/community/iverxin/bert-base-japanese/README.md b/community/iverxin/bert-base-japanese/README.md index 511c2af7d2de..82ac43d5672d 100644 --- a/community/iverxin/bert-base-japanese/README.md +++ b/community/iverxin/bert-base-japanese/README.md @@ -1,7 +1,11 @@ ## bert-base-japanese -bert-base-japanese相关的权重参数,其中包括: -- bert-base-japanese -- bert-base-japanese-char -- bert-base-japanese-char-whole-word-masking -- bert-base-japanese-whole-word-masking +基于bert的日语训练模型的相关权重参数,其中包括: + +| Pretrained Weight | Language | Details of the model | +| ------------------------------------------ | -------- | ------------------------------------------------------------ | +| bert-base-japanese | Japanese | 12 repeating layers, 768-hidden, 12-heads. This version of the model processes input texts with word-level tokenization based on the IPA dictionary, followed by the WordPiece subword tokenization. [reference](https://huggingface.co/cl-tohoku/bert-base-japanese) | +| bert-base-japanese-char | Japanese | 12 repeating layers, 768-hidden, 12-heads. This version of the model processes input texts with word-level tokenization based on the IPA dictionary, followed by character-level tokenization. [reference](https://huggingface.co/cl-tohoku/bert-base-japanese-char) | +| bert-base-japanese-char-whole-word-masking | Japanese | 12 repeating layers, 768-hidden, 12-heads. This version of the model processes input texts with word-level tokenization based on the IPA dictionary, followed by character-level tokenization. Additionally, the model is trained with the whole word masking enabled for the masked language modeling (MLM) objective..[reference](https://huggingface.co/cl-tohoku/bert-base-japanese-char-whole-word-masking) | +| bert-base-japanese-whole-word-masking | Japanese | 12 repeating layers, 768-hidden, 12-heads. This version of the model processes input texts with word-level tokenization based on the IPA dictionary, followed by the WordPiece subword tokenization. Additionally, the model is trained with the whole word masking enabled for the masked language modeling (MLM) objective. 
[reference](https://huggingface.co/cl-tohoku/bert-base-japanese-whole-word-masking) | + From dff2c5c1bd4755cd609a278f45408b7f334bfc68 Mon Sep 17 00:00:00 2001 From: spade Date: Wed, 20 Oct 2021 19:47:13 +0800 Subject: [PATCH 06/23] update weights files --- .../README.md | 5 +++++ .../files.json | 3 +++ community/iverxin/bert-base-japanese-char/README.md | 5 +++++ community/iverxin/bert-base-japanese-char/files.json | 3 +++ .../bert-base-japanese-whole-word-masking/README.md | 5 +++++ .../bert-base-japanese-whole-word-masking/files.json | 3 +++ community/iverxin/bert-base-japanese/README.md | 11 +++-------- community/iverxin/bert-base-japanese/files.json | 5 +---- 8 files changed, 28 insertions(+), 12 deletions(-) create mode 100644 community/iverxin/bert-base-japanese-char-whole-word-masking/README.md create mode 100644 community/iverxin/bert-base-japanese-char-whole-word-masking/files.json create mode 100644 community/iverxin/bert-base-japanese-char/README.md create mode 100644 community/iverxin/bert-base-japanese-char/files.json create mode 100644 community/iverxin/bert-base-japanese-whole-word-masking/README.md create mode 100644 community/iverxin/bert-base-japanese-whole-word-masking/files.json diff --git a/community/iverxin/bert-base-japanese-char-whole-word-masking/README.md b/community/iverxin/bert-base-japanese-char-whole-word-masking/README.md new file mode 100644 index 000000000000..3f2a11ed5cf1 --- /dev/null +++ b/community/iverxin/bert-base-japanese-char-whole-word-masking/README.md @@ -0,0 +1,5 @@ +## bert-base-japanese +12 repeating layers, 768-hidden, 12-heads. + +This version of the model processes input texts with word-level tokenization based on the IPA dictionary, followed by character-level tokenization. Additionally, the model is trained with the whole word masking enabled for the masked language modeling (MLM) objective. +[reference](https://huggingface.co/cl-tohoku/bert-base-japanese-char-whole-word-masking) diff --git a/community/iverxin/bert-base-japanese-char-whole-word-masking/files.json b/community/iverxin/bert-base-japanese-char-whole-word-masking/files.json new file mode 100644 index 000000000000..f817f685b6ef --- /dev/null +++ b/community/iverxin/bert-base-japanese-char-whole-word-masking/files.json @@ -0,0 +1,3 @@ +{ + "bert-base-japanese-char-whole-word-masking": "https://paddlenlp.bj.bcebos.com/models/transformers/community/iverxin/bert-base-japanese/bert-base-japanese-char-whole-word-masking.pdparams" +} \ No newline at end of file diff --git a/community/iverxin/bert-base-japanese-char/README.md b/community/iverxin/bert-base-japanese-char/README.md new file mode 100644 index 000000000000..f538d756a16f --- /dev/null +++ b/community/iverxin/bert-base-japanese-char/README.md @@ -0,0 +1,5 @@ +## bert-base-japanese +12 repeating layers, 768-hidden, 12-heads. + +This version of the model processes input texts with word-level tokenization based on the IPA dictionary, followed by character-level tokenization. 
+[reference](https://huggingface.co/cl-tohoku/bert-base-japanese-char) diff --git a/community/iverxin/bert-base-japanese-char/files.json b/community/iverxin/bert-base-japanese-char/files.json new file mode 100644 index 000000000000..4cdae70b3bd1 --- /dev/null +++ b/community/iverxin/bert-base-japanese-char/files.json @@ -0,0 +1,3 @@ +{ + "bert-base-japanese-char": "https://paddlenlp.bj.bcebos.com/models/transformers/community/iverxin/bert-base-japanese/bert-base-japanese-char.pdparams" +} \ No newline at end of file diff --git a/community/iverxin/bert-base-japanese-whole-word-masking/README.md b/community/iverxin/bert-base-japanese-whole-word-masking/README.md new file mode 100644 index 000000000000..ae1217d911f0 --- /dev/null +++ b/community/iverxin/bert-base-japanese-whole-word-masking/README.md @@ -0,0 +1,5 @@ +## bert-base-japanese +12 repeating layers, 768-hidden, 12-heads. + +This version of the model processes input texts with word-level tokenization based on the IPA dictionary, followed by the WordPiece subword tokenization. Additionally, the model is trained with the whole word masking enabled for the masked language modeling (MLM) objective. +[reference](https://huggingface.co/cl-tohoku/bert-base-japanese-whole-word-masking) diff --git a/community/iverxin/bert-base-japanese-whole-word-masking/files.json b/community/iverxin/bert-base-japanese-whole-word-masking/files.json new file mode 100644 index 000000000000..4f6761876b85 --- /dev/null +++ b/community/iverxin/bert-base-japanese-whole-word-masking/files.json @@ -0,0 +1,3 @@ +{ + "bert-base-japanese-whole-word-masking": "https://paddlenlp.bj.bcebos.com/models/transformers/community/iverxin/bert-base-japanese/bert-base-japanese-whole-word-masking.pdparams" +} \ No newline at end of file diff --git a/community/iverxin/bert-base-japanese/README.md b/community/iverxin/bert-base-japanese/README.md index 82ac43d5672d..671c9e33b90a 100644 --- a/community/iverxin/bert-base-japanese/README.md +++ b/community/iverxin/bert-base-japanese/README.md @@ -1,11 +1,6 @@ ## bert-base-japanese +12 repeating layers, 768-hidden, 12-heads. -基于bert的日语训练模型的相关权重参数,其中包括: - -| Pretrained Weight | Language | Details of the model | -| ------------------------------------------ | -------- | ------------------------------------------------------------ | -| bert-base-japanese | Japanese | 12 repeating layers, 768-hidden, 12-heads. This version of the model processes input texts with word-level tokenization based on the IPA dictionary, followed by the WordPiece subword tokenization. [reference](https://huggingface.co/cl-tohoku/bert-base-japanese) | -| bert-base-japanese-char | Japanese | 12 repeating layers, 768-hidden, 12-heads. This version of the model processes input texts with word-level tokenization based on the IPA dictionary, followed by character-level tokenization. [reference](https://huggingface.co/cl-tohoku/bert-base-japanese-char) | -| bert-base-japanese-char-whole-word-masking | Japanese | 12 repeating layers, 768-hidden, 12-heads. This version of the model processes input texts with word-level tokenization based on the IPA dictionary, followed by character-level tokenization. Additionally, the model is trained with the whole word masking enabled for the masked language modeling (MLM) objective..[reference](https://huggingface.co/cl-tohoku/bert-base-japanese-char-whole-word-masking) | -| bert-base-japanese-whole-word-masking | Japanese | 12 repeating layers, 768-hidden, 12-heads. 
This version of the model processes input texts with word-level tokenization based on the IPA dictionary, followed by the WordPiece subword tokenization. Additionally, the model is trained with the whole word masking enabled for the masked language modeling (MLM) objective. [reference](https://huggingface.co/cl-tohoku/bert-base-japanese-whole-word-masking) | +This version of the model processes input texts with word-level tokenization based on the IPA dictionary, followed by the WordPiece subword tokenization. +[reference](https://huggingface.co/cl-tohoku/bert-base-japanese) diff --git a/community/iverxin/bert-base-japanese/files.json b/community/iverxin/bert-base-japanese/files.json index bdc2dea8a098..344fce03988a 100644 --- a/community/iverxin/bert-base-japanese/files.json +++ b/community/iverxin/bert-base-japanese/files.json @@ -1,6 +1,3 @@ { - "bert-base-japanese": "https://paddlenlp.bj.bcebos.com/models/transformers/community/iverxin/bert-base-japanese/bert-base-japanese.pdparams", - "bert-base-japanese-char": "https://paddlenlp.bj.bcebos.com/models/transformers/community/iverxin/bert-base-japanese/bert-base-japanese-char.pdparams", - "bert-base-japanese-char-whole-word-masking": "https://paddlenlp.bj.bcebos.com/models/transformers/community/iverxin/bert-base-japanese/bert-base-japanese-char-whole-word-masking.pdparams", - "bert-base-japanese-whole-word-masking": "https://paddlenlp.bj.bcebos.com/models/transformers/community/iverxin/bert-base-japanese/bert-base-japanese-whole-word-masking.pdparams" + "bert-base-japanese": "https://paddlenlp.bj.bcebos.com/models/transformers/community/iverxin/bert-base-japanese/bert-base-japanese.pdparams" } \ No newline at end of file From 22bd7c4f142f127b33b1fa9c328fdbe84013b6a6 Mon Sep 17 00:00:00 2001 From: spade Date: Wed, 20 Oct 2021 19:52:27 +0800 Subject: [PATCH 07/23] update config pretrain weights https --- .../bert-base-japanese-char-whole-word-masking/files.json | 2 +- community/iverxin/bert-base-japanese-char/files.json | 2 +- .../bert-base-japanese-whole-word-masking/files.json | 2 +- paddlenlp/transformers/bert/modeling.py | 6 +++--- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/community/iverxin/bert-base-japanese-char-whole-word-masking/files.json b/community/iverxin/bert-base-japanese-char-whole-word-masking/files.json index f817f685b6ef..e955795fd06b 100644 --- a/community/iverxin/bert-base-japanese-char-whole-word-masking/files.json +++ b/community/iverxin/bert-base-japanese-char-whole-word-masking/files.json @@ -1,3 +1,3 @@ { - "bert-base-japanese-char-whole-word-masking": "https://paddlenlp.bj.bcebos.com/models/transformers/community/iverxin/bert-base-japanese/bert-base-japanese-char-whole-word-masking.pdparams" + "bert-base-japanese-char-whole-word-masking": "https://paddlenlp.bj.bcebos.com/models/transformers/community/iverxin/bert-base-japanese-char-whole-word-masking/bert-base-japanese-char-whole-word-masking.pdparams" } \ No newline at end of file diff --git a/community/iverxin/bert-base-japanese-char/files.json b/community/iverxin/bert-base-japanese-char/files.json index 4cdae70b3bd1..7834320ee184 100644 --- a/community/iverxin/bert-base-japanese-char/files.json +++ b/community/iverxin/bert-base-japanese-char/files.json @@ -1,3 +1,3 @@ { - "bert-base-japanese-char": "https://paddlenlp.bj.bcebos.com/models/transformers/community/iverxin/bert-base-japanese/bert-base-japanese-char.pdparams" + "bert-base-japanese-char": 
"https://paddlenlp.bj.bcebos.com/models/transformers/community/iverxin/bert-base-japanese-char/bert-base-japanese-char.pdparams" } \ No newline at end of file diff --git a/community/iverxin/bert-base-japanese-whole-word-masking/files.json b/community/iverxin/bert-base-japanese-whole-word-masking/files.json index 4f6761876b85..de4bfd454ef8 100644 --- a/community/iverxin/bert-base-japanese-whole-word-masking/files.json +++ b/community/iverxin/bert-base-japanese-whole-word-masking/files.json @@ -1,3 +1,3 @@ { - "bert-base-japanese-whole-word-masking": "https://paddlenlp.bj.bcebos.com/models/transformers/community/iverxin/bert-base-japanese/bert-base-japanese-whole-word-masking.pdparams" + "bert-base-japanese-whole-word-masking": "https://paddlenlp.bj.bcebos.com/models/transformers/community/iverxin/bert-base-japanese-whole-word-masking/bert-base-japanese-whole-word-masking.pdparams" } \ No newline at end of file diff --git a/paddlenlp/transformers/bert/modeling.py b/paddlenlp/transformers/bert/modeling.py index 523296c3ccdf..7a19f56658a7 100644 --- a/paddlenlp/transformers/bert/modeling.py +++ b/paddlenlp/transformers/bert/modeling.py @@ -357,11 +357,11 @@ class BertPretrainedModel(PretrainedModel): "bert-base-japanese": "https://paddlenlp.bj.bcebos.com/models/transformers/community/iverxin/bert-base-japanese/bert-base-japanese.pdparams", "bert-base-japanese-whole-word-masking": - "https://paddlenlp.bj.bcebos.com/models/transformers/community/iverxin/bert-base-japanese/bert-base-japanese-whole-word-masking.pdparams", + "https://paddlenlp.bj.bcebos.com/models/transformers/community/iverxin/bert-base-japanese-whole-word-masking/bert-base-japanese-whole-word-masking.pdparams", "bert-base-japanese-char": - "https://paddlenlp.bj.bcebos.com/models/transformers/community/iverxin/bert-base-japanese/bert-base-japanese-char.pdparams", + "https://paddlenlp.bj.bcebos.com/models/transformers/community/iverxin/bert-base-japanese-char/bert-base-japanese-char.pdparams", "bert-base-japanese-char-whole-word-masking": - "https://paddlenlp.bj.bcebos.com/models/transformers/community/iverxin/bert-base-japanese/bert-base-japanese-char-whole-word-masking.pdparams", + "https://paddlenlp.bj.bcebos.com/models/transformers/community/iverxin/bert-base-japanese-char-whole-word-masking/bert-base-japanese-char-whole-word-masking.pdparams", } } base_model_prefix = "bert" From de0f7bfb574d35253194bd5df2faa050c9b2104f Mon Sep 17 00:00:00 2001 From: spade Date: Thu, 21 Oct 2021 13:28:53 +0800 Subject: [PATCH 08/23] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E6=9D=83=E9=87=8D?= =?UTF-8?q?=E9=85=8D=E7=BD=AE=E6=96=87=E4=BB=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../bert-base-japanese-char-whole-word-masking/files.json | 3 ++- community/iverxin/bert-base-japanese-char/files.json | 3 ++- .../bert-base-japanese-whole-word-masking/files.json | 3 ++- community/iverxin/bert-base-japanese/files.json | 3 ++- paddlenlp/transformers/bert/modeling.py | 8 ++++---- paddlenlp/transformers/bert/tokenizer.py | 8 ++++---- paddlenlp/transformers/bert_japanese/tokenizer.py | 2 +- 7 files changed, 17 insertions(+), 13 deletions(-) diff --git a/community/iverxin/bert-base-japanese-char-whole-word-masking/files.json b/community/iverxin/bert-base-japanese-char-whole-word-masking/files.json index e955795fd06b..8ec4e20939a0 100644 --- a/community/iverxin/bert-base-japanese-char-whole-word-masking/files.json +++ b/community/iverxin/bert-base-japanese-char-whole-word-masking/files.json @@ -1,3 +1,4 @@ { - 
"bert-base-japanese-char-whole-word-masking": "https://paddlenlp.bj.bcebos.com/models/transformers/community/iverxin/bert-base-japanese-char-whole-word-masking/bert-base-japanese-char-whole-word-masking.pdparams" + "model_state": "https://paddlenlp.bj.bcebos.com/models/transformers/community/iverxin/bert-base-japanese-char-whole-word-masking/model_state.pdparams", + "vocab_file": "https://paddlenlp.bj.bcebos.com/models/transformers/community/iverxin/bert-base-japanese-char-whole-word-masking/vocab.txt" } \ No newline at end of file diff --git a/community/iverxin/bert-base-japanese-char/files.json b/community/iverxin/bert-base-japanese-char/files.json index 7834320ee184..672121ce21d7 100644 --- a/community/iverxin/bert-base-japanese-char/files.json +++ b/community/iverxin/bert-base-japanese-char/files.json @@ -1,3 +1,4 @@ { - "bert-base-japanese-char": "https://paddlenlp.bj.bcebos.com/models/transformers/community/iverxin/bert-base-japanese-char/bert-base-japanese-char.pdparams" + "model_state": "https://paddlenlp.bj.bcebos.com/models/transformers/community/iverxin/bert-base-japanese-char/model_state.pdparams", + "vocab_file": "https://paddlenlp.bj.bcebos.com/models/transformers/community/iverxin/bert-base-japanese-char/vocab.txt" } \ No newline at end of file diff --git a/community/iverxin/bert-base-japanese-whole-word-masking/files.json b/community/iverxin/bert-base-japanese-whole-word-masking/files.json index de4bfd454ef8..3e3d11ebee0b 100644 --- a/community/iverxin/bert-base-japanese-whole-word-masking/files.json +++ b/community/iverxin/bert-base-japanese-whole-word-masking/files.json @@ -1,3 +1,4 @@ { - "bert-base-japanese-whole-word-masking": "https://paddlenlp.bj.bcebos.com/models/transformers/community/iverxin/bert-base-japanese-whole-word-masking/bert-base-japanese-whole-word-masking.pdparams" + "model_state": "https://paddlenlp.bj.bcebos.com/models/transformers/community/iverxin/bert-base-japanese-whole-word-masking/model_state.pdparams", + "vocab_file": "https://paddlenlp.bj.bcebos.com/models/transformers/community/iverxin/bert-base-japanese-whole-word-masking/vocab.txt" } \ No newline at end of file diff --git a/community/iverxin/bert-base-japanese/files.json b/community/iverxin/bert-base-japanese/files.json index 344fce03988a..c5800878042a 100644 --- a/community/iverxin/bert-base-japanese/files.json +++ b/community/iverxin/bert-base-japanese/files.json @@ -1,3 +1,4 @@ { - "bert-base-japanese": "https://paddlenlp.bj.bcebos.com/models/transformers/community/iverxin/bert-base-japanese/bert-base-japanese.pdparams" + "model_state": "https://paddlenlp.bj.bcebos.com/models/transformers/community/iverxin/bert-base-japanese/model_state.pdparams", + "vocab_file": "https://paddlenlp.bj.bcebos.com/models/transformers/community/iverxin/bert-base-japanese/vocab.txt" } \ No newline at end of file diff --git a/paddlenlp/transformers/bert/modeling.py b/paddlenlp/transformers/bert/modeling.py index 7a19f56658a7..d92602308e97 100644 --- a/paddlenlp/transformers/bert/modeling.py +++ b/paddlenlp/transformers/bert/modeling.py @@ -355,13 +355,13 @@ class BertPretrainedModel(PretrainedModel): "simbert-base-chinese": "https://paddlenlp.bj.bcebos.com/models/transformers/simbert/simbert-base-chinese-v1.pdparams", "bert-base-japanese": - "https://paddlenlp.bj.bcebos.com/models/transformers/community/iverxin/bert-base-japanese/bert-base-japanese.pdparams", + "https://paddlenlp.bj.bcebos.com/models/transformers/community/iverxin/bert-base-japanese/model_state.pdparams", 
"bert-base-japanese-whole-word-masking": - "https://paddlenlp.bj.bcebos.com/models/transformers/community/iverxin/bert-base-japanese-whole-word-masking/bert-base-japanese-whole-word-masking.pdparams", + "https://paddlenlp.bj.bcebos.com/models/transformers/community/iverxin/bert-base-japanese-whole-word-masking/model_state.pdparams", "bert-base-japanese-char": - "https://paddlenlp.bj.bcebos.com/models/transformers/community/iverxin/bert-base-japanese-char/bert-base-japanese-char.pdparams", + "https://paddlenlp.bj.bcebos.com/models/transformers/community/iverxin/bert-base-japanese-char/model_state.pdparams", "bert-base-japanese-char-whole-word-masking": - "https://paddlenlp.bj.bcebos.com/models/transformers/community/iverxin/bert-base-japanese-char-whole-word-masking/bert-base-japanese-char-whole-word-masking.pdparams", + "https://paddlenlp.bj.bcebos.com/models/transformers/community/iverxin/bert-base-japanese-char-whole-word-masking/model_state.pdparams", } } base_model_prefix = "bert" diff --git a/paddlenlp/transformers/bert/tokenizer.py b/paddlenlp/transformers/bert/tokenizer.py index c021291b66b4..ab6a1a44197b 100644 --- a/paddlenlp/transformers/bert/tokenizer.py +++ b/paddlenlp/transformers/bert/tokenizer.py @@ -328,13 +328,13 @@ class BertTokenizer(PretrainedTokenizer): "simbert-base-chinese": "https://paddlenlp.bj.bcebos.com/models/transformers/simbert/vocab.txt", "bert-base-japanese": - "https://huggingface.co/cl-tohoku/bert-base-japanese/resolve/main/vocab.txt", + "https://paddlenlp.bj.bcebos.com/models/transformers/community/iverxin/bert-base-japanese/vocab.txt", "bert-base-japanese-whole-word-masking": - "https://huggingface.co/cl-tohoku/bert-base-japanese-whole-word-masking/resolve/main/vocab.txt", + "https://paddlenlp.bj.bcebos.com/models/transformers/community/iverxin/bert-base-japanese-whole-word-masking/vocab.txt", "bert-base-japanese-char": - "https://huggingface.co/cl-tohoku/bert-base-japanese-char/resolve/main/vocab.txt", + "https://paddlenlp.bj.bcebos.com/models/transformers/community/iverxin/bert-base-japanese-char/vocab.txt", "bert-base-japanese-char-whole-word-masking": - "https://huggingface.co/cl-tohoku/bert-base-japanese-char-whole-word-masking/resolve/main/vocab.txt" + "https://paddlenlp.bj.bcebos.com/models/transformers/community/iverxin/bert-base-japanese-char-whole-word-masking/vocab.txt" } } pretrained_init_configuration = { diff --git a/paddlenlp/transformers/bert_japanese/tokenizer.py b/paddlenlp/transformers/bert_japanese/tokenizer.py index e429467323ea..d6bce8763a54 100644 --- a/paddlenlp/transformers/bert_japanese/tokenizer.py +++ b/paddlenlp/transformers/bert_japanese/tokenizer.py @@ -18,7 +18,7 @@ import unicodedata import collections -from .. import PretrainedTokenizer, BertTokenizer, BasicTokenizer, WordpieceTokenizer +from .. import BertTokenizer, BasicTokenizer, WordpieceTokenizer __all__ = [ 'BertJapaneseTokenizer', 'MecabTokenizer', 'CharacterTokenizer' From c74338a91751724a44819b5212bdb2e853367f9c Mon Sep 17 00:00:00 2001 From: spade Date: Fri, 22 Oct 2021 10:20:07 +0800 Subject: [PATCH 09/23] retest CI --- community/iverxin/bert-base-japanese/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/community/iverxin/bert-base-japanese/README.md b/community/iverxin/bert-base-japanese/README.md index 671c9e33b90a..39d9c9f3dcf3 100644 --- a/community/iverxin/bert-base-japanese/README.md +++ b/community/iverxin/bert-base-japanese/README.md @@ -1,6 +1,6 @@ ## bert-base-japanese 12 repeating layers, 768-hidden, 12-heads. 
-This version of the model processes input texts with word-level tokenization based on the IPA dictionary, followed by the WordPiece subword tokenization. +This version of the model processes input texts with word-level tokenization based on the IPA dictionary, followed by the WordPiece subword tokenization. [reference](https://huggingface.co/cl-tohoku/bert-base-japanese) From 919fcb6bf596fed6b77ad55615cdca7e5d71018a Mon Sep 17 00:00:00 2001 From: spade Date: Sun, 24 Oct 2021 22:01:49 +0800 Subject: [PATCH 10/23] update --- .../transformers/bert_japanese/tokenizer.py | 20 ++----------------- 1 file changed, 2 insertions(+), 18 deletions(-) diff --git a/paddlenlp/transformers/bert_japanese/tokenizer.py b/paddlenlp/transformers/bert_japanese/tokenizer.py index d6bce8763a54..37143e3bcad5 100644 --- a/paddlenlp/transformers/bert_japanese/tokenizer.py +++ b/paddlenlp/transformers/bert_japanese/tokenizer.py @@ -1,18 +1,3 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - import copy import os import unicodedata @@ -20,9 +5,8 @@ from .. import BertTokenizer, BasicTokenizer, WordpieceTokenizer -__all__ = [ - 'BertJapaneseTokenizer', 'MecabTokenizer', 'CharacterTokenizer' -] +__all__ = ['BertJapaneseTokenizer', 'MecabTokenizer', 'CharacterTokenizer'] + class BertJapaneseTokenizer(BertTokenizer): """ From c3942ce5f7e984ed2bca496a1c9af78a40864f84 Mon Sep 17 00:00:00 2001 From: spade Date: Sun, 24 Oct 2021 22:05:09 +0800 Subject: [PATCH 11/23] update --- .../transformers/bert_japanese/tokenizer.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/paddlenlp/transformers/bert_japanese/tokenizer.py b/paddlenlp/transformers/bert_japanese/tokenizer.py index 37143e3bcad5..99fbb18d8b68 100644 --- a/paddlenlp/transformers/bert_japanese/tokenizer.py +++ b/paddlenlp/transformers/bert_japanese/tokenizer.py @@ -1,3 +1,18 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import copy import os import unicodedata @@ -11,6 +26,7 @@ class BertJapaneseTokenizer(BertTokenizer): """ Construct a BERT tokenizer for Japanese text, based on a MecabTokenizer. 
+ Args: vocab_file (str): The vocabulary file path (ends with '.txt') required to instantiate From 6177a033c9744e4dd58d4a02f6fdc1f44366923f Mon Sep 17 00:00:00 2001 From: spade Date: Mon, 25 Oct 2021 14:55:07 +0800 Subject: [PATCH 12/23] fix docstring --- .../transformers/bert_japanese/tokenizer.py | 21 ++++++++++--------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/paddlenlp/transformers/bert_japanese/tokenizer.py b/paddlenlp/transformers/bert_japanese/tokenizer.py index 99fbb18d8b68..a36bd21b7fe6 100644 --- a/paddlenlp/transformers/bert_japanese/tokenizer.py +++ b/paddlenlp/transformers/bert_japanese/tokenizer.py @@ -31,17 +31,21 @@ class BertJapaneseTokenizer(BertTokenizer): vocab_file (str): The vocabulary file path (ends with '.txt') required to instantiate a `WordpieceTokenizer`. - do_lower_case (bool): + do_lower_case (bool, optional): Whether or not to lowercase the input when tokenizing. Defaults to`False`. - do_word_tokenize (bool): + do_word_tokenize (bool, optional): Whether to do word tokenization. Defaults to`True`. - do_subword_tokenize (bool): + do_subword_tokenize (bool, optional): Whether to do subword tokenization. Defaults to`True`. - word_tokenizer_type (str): + word_tokenizer_type (str, optional): Type of word tokenizer. Defaults to`basic`. - subword_tokenizer_type (str): + subword_tokenizer_type (str, optional): Type of subword tokenizer. Defaults to`wordpiece`. + never_split (bool, optional): + Kept for backward compatibility purposes. Defaults to`None`. + mecab_kwargs (str, optional): + Dictionary passed to the `MecabTokenizer` constructor. unk_token (str): A special token representing the *unknown (out-of-vocabulary)* token. An unknown token is set to be `unk_token` inorder to be converted to an ID. @@ -59,8 +63,7 @@ class BertJapaneseTokenizer(BertTokenizer): A special token representing a masked token. This is the token used in the masked language modeling task which the model tries to predict the original unmasked ones. Defaults to "[MASK]". - mecab_kwargs (str): - Dictionary passed to the `MecabTokenizer` constructor. + Examples: .. 
code-block:: @@ -111,9 +114,7 @@ def __init__(self, if do_word_tokenize: if word_tokenizer_type == "basic": self.basic_tokenizer = BasicTokenizer( - do_lower_case=do_lower_case, - never_split=never_split, - tokenize_chinese_chars=False) + do_lower_case=do_lower_case, ) elif word_tokenizer_type == "mecab": self.basic_tokenizer = MecabTokenizer( do_lower_case=do_lower_case, From ed9c933c2481630d9431405d3558872085e25b70 Mon Sep 17 00:00:00 2001 From: spade Date: Mon, 25 Oct 2021 16:47:46 +0800 Subject: [PATCH 13/23] update --- paddlenlp/transformers/bert/tokenizer.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/paddlenlp/transformers/bert/tokenizer.py b/paddlenlp/transformers/bert/tokenizer.py index ab6a1a44197b..53f703a9aa91 100644 --- a/paddlenlp/transformers/bert/tokenizer.py +++ b/paddlenlp/transformers/bert/tokenizer.py @@ -22,7 +22,9 @@ from ..tokenizer_utils import convert_to_unicode, whitespace_tokenize, _is_whitespace, _is_control, _is_punctuation __all__ = [ - 'BasicTokenizer', 'BertTokenizer', 'WordpieceTokenizer', + 'BasicTokenizer', + 'BertTokenizer', + 'WordpieceTokenizer', ] @@ -633,4 +635,4 @@ def get_special_tokens_mask(self, if token_ids_1 is not None: return [1] + ([0] * len(token_ids_0)) + [1] + ( [0] * len(token_ids_1)) + [1] - return [1] + ([0] * len(token_ids_0)) + [1] \ No newline at end of file + return [1] + ([0] * len(token_ids_0)) + [1] From b494f71ca65e6d9c561a5f27e66c925ff038c915 Mon Sep 17 00:00:00 2001 From: spade Date: Mon, 25 Oct 2021 20:46:54 +0800 Subject: [PATCH 14/23] =?UTF-8?q?=E9=A2=84=E8=AE=AD=E7=BB=83=E6=9D=83?= =?UTF-8?q?=E9=87=8D=E6=9B=B4=E6=96=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../files.json | 2 + .../bert-base-japanese-char/files.json | 2 + .../files.json | 2 + .../iverxin/bert-base-japanese/files.json | 2 + paddlenlp/transformers/bert/modeling.py | 56 ------------------- paddlenlp/transformers/bert/tokenizer.py | 8 --- 6 files changed, 8 insertions(+), 64 deletions(-) diff --git a/community/iverxin/bert-base-japanese-char-whole-word-masking/files.json b/community/iverxin/bert-base-japanese-char-whole-word-masking/files.json index 8ec4e20939a0..a3cbf1572cd8 100644 --- a/community/iverxin/bert-base-japanese-char-whole-word-masking/files.json +++ b/community/iverxin/bert-base-japanese-char-whole-word-masking/files.json @@ -1,4 +1,6 @@ { + "model_config_file":"https://paddlenlp.bj.bcebos.com/models/transformers/community/iverxin/bert-base-japanese-char-whole-word-masking/model_config.json", "model_state": "https://paddlenlp.bj.bcebos.com/models/transformers/community/iverxin/bert-base-japanese-char-whole-word-masking/model_state.pdparams", + "tokenizer_config_file": "https://paddlenlp.bj.bcebos.com/models/transformers/community/iverxin/bert-base-japanese-char-whole-word-masking/tokenizer_config.pdparams", "vocab_file": "https://paddlenlp.bj.bcebos.com/models/transformers/community/iverxin/bert-base-japanese-char-whole-word-masking/vocab.txt" } \ No newline at end of file diff --git a/community/iverxin/bert-base-japanese-char/files.json b/community/iverxin/bert-base-japanese-char/files.json index 672121ce21d7..7248ecfdff0d 100644 --- a/community/iverxin/bert-base-japanese-char/files.json +++ b/community/iverxin/bert-base-japanese-char/files.json @@ -1,4 +1,6 @@ { + "model_config_file":"https://paddlenlp.bj.bcebos.com/models/transformers/community/iverxin/bert-base-japanese-char/model_config.json", "model_state": 
"https://paddlenlp.bj.bcebos.com/models/transformers/community/iverxin/bert-base-japanese-char/model_state.pdparams", + "tokenizer_config_file": "https://paddlenlp.bj.bcebos.com/models/transformers/community/iverxin/bert-base-japanese-char/tokenizer_config.pdparams", "vocab_file": "https://paddlenlp.bj.bcebos.com/models/transformers/community/iverxin/bert-base-japanese-char/vocab.txt" } \ No newline at end of file diff --git a/community/iverxin/bert-base-japanese-whole-word-masking/files.json b/community/iverxin/bert-base-japanese-whole-word-masking/files.json index 3e3d11ebee0b..fa70ef2381a2 100644 --- a/community/iverxin/bert-base-japanese-whole-word-masking/files.json +++ b/community/iverxin/bert-base-japanese-whole-word-masking/files.json @@ -1,4 +1,6 @@ { + "model_config_file":"https://paddlenlp.bj.bcebos.com/models/transformers/community/iverxin/bert-base-japanese-whole-word-masking/model_config.json", "model_state": "https://paddlenlp.bj.bcebos.com/models/transformers/community/iverxin/bert-base-japanese-whole-word-masking/model_state.pdparams", + "tokenizer_config_file": "https://paddlenlp.bj.bcebos.com/models/transformers/community/iverxin/bert-base-japanese-whole-word-masking/tokenizer_config.pdparams", "vocab_file": "https://paddlenlp.bj.bcebos.com/models/transformers/community/iverxin/bert-base-japanese-whole-word-masking/vocab.txt" } \ No newline at end of file diff --git a/community/iverxin/bert-base-japanese/files.json b/community/iverxin/bert-base-japanese/files.json index c5800878042a..e27cee4963d1 100644 --- a/community/iverxin/bert-base-japanese/files.json +++ b/community/iverxin/bert-base-japanese/files.json @@ -1,4 +1,6 @@ { + "model_config_file":"https://paddlenlp.bj.bcebos.com/models/transformers/community/iverxin/bert-base-japanese/model_config.json", "model_state": "https://paddlenlp.bj.bcebos.com/models/transformers/community/iverxin/bert-base-japanese/model_state.pdparams", + "tokenizer_config_file": "https://paddlenlp.bj.bcebos.com/models/transformers/community/iverxin/bert-base-japanese/tokenizer_config.pdparams", "vocab_file": "https://paddlenlp.bj.bcebos.com/models/transformers/community/iverxin/bert-base-japanese/vocab.txt" } \ No newline at end of file diff --git a/paddlenlp/transformers/bert/modeling.py b/paddlenlp/transformers/bert/modeling.py index 233a5c26b515..dca3dc702d17 100644 --- a/paddlenlp/transformers/bert/modeling.py +++ b/paddlenlp/transformers/bert/modeling.py @@ -271,62 +271,6 @@ class BertPretrainedModel(PretrainedModel): "initializer_range": 0.02, "pad_token_id": 0, }, - "bert-base-japanese": { - "vocab_size": 32000, - "hidden_size": 768, - "num_hidden_layers": 12, - "num_attention_heads": 12, - "intermediate_size": 3072, - "hidden_act": "gelu", - "hidden_dropout_prob": 0.1, - "attention_probs_dropout_prob": 0.1, - "max_position_embeddings": 512, - "type_vocab_size": 2, - "initializer_range": 0.02, - "pad_token_id": 0, - }, - "bert-base-japanese-whole-word-masking": { - "vocab_size": 30522, - "hidden_size": 768, - "num_hidden_layers": 12, - "num_attention_heads": 12, - "intermediate_size": 3072, - "hidden_act": "gelu", - "hidden_dropout_prob": 0.1, - "attention_probs_dropout_prob": 0.1, - "max_position_embeddings": 512, - "type_vocab_size": 2, - "initializer_range": 0.02, - "pad_token_id": 0, - }, - "bert-base-japanese-char ": { - "vocab_size": 4000, - "hidden_size": 768, - "num_hidden_layers": 12, - "num_attention_heads": 12, - "intermediate_size": 3072, - "hidden_act": "gelu", - "hidden_dropout_prob": 0.1, - 
"attention_probs_dropout_prob": 0.1, - "max_position_embeddings": 512, - "type_vocab_size": 2, - "initializer_range": 0.02, - "pad_token_id": 0, - }, - "bert-base-japanese-char-whole-word-masking": { - "vocab_size": 4000, - "hidden_size": 768, - "num_hidden_layers": 12, - "num_attention_heads": 12, - "intermediate_size": 3072, - "hidden_act": "gelu", - "hidden_dropout_prob": 0.1, - "attention_probs_dropout_prob": 0.1, - "max_position_embeddings": 512, - "type_vocab_size": 2, - "initializer_range": 0.02, - "pad_token_id": 0, - } } resource_files_names = {"model_state": "model_state.pdparams"} pretrained_resource_files_map = { diff --git a/paddlenlp/transformers/bert/tokenizer.py b/paddlenlp/transformers/bert/tokenizer.py index 53f703a9aa91..c875584a2f96 100644 --- a/paddlenlp/transformers/bert/tokenizer.py +++ b/paddlenlp/transformers/bert/tokenizer.py @@ -329,14 +329,6 @@ class BertTokenizer(PretrainedTokenizer): "https://paddle-hapi.bj.bcebos.com/models/bert/bert-base-chinese-vocab.txt", "simbert-base-chinese": "https://paddlenlp.bj.bcebos.com/models/transformers/simbert/vocab.txt", - "bert-base-japanese": - "https://paddlenlp.bj.bcebos.com/models/transformers/community/iverxin/bert-base-japanese/vocab.txt", - "bert-base-japanese-whole-word-masking": - "https://paddlenlp.bj.bcebos.com/models/transformers/community/iverxin/bert-base-japanese-whole-word-masking/vocab.txt", - "bert-base-japanese-char": - "https://paddlenlp.bj.bcebos.com/models/transformers/community/iverxin/bert-base-japanese-char/vocab.txt", - "bert-base-japanese-char-whole-word-masking": - "https://paddlenlp.bj.bcebos.com/models/transformers/community/iverxin/bert-base-japanese-char-whole-word-masking/vocab.txt" } } pretrained_init_configuration = { From b096712468a01f8d4404de2881e8297383af45ca Mon Sep 17 00:00:00 2001 From: spade Date: Tue, 26 Oct 2021 14:11:34 +0800 Subject: [PATCH 15/23] update weights readme --- .../README.md | 67 +++++++++++++++++-- .../iverxin/bert-base-japanese-char/README.md | 63 +++++++++++++++-- .../README.md | 66 ++++++++++++++++-- .../iverxin/bert-base-japanese/README.md | 61 +++++++++++++++-- 4 files changed, 241 insertions(+), 16 deletions(-) diff --git a/community/iverxin/bert-base-japanese-char-whole-word-masking/README.md b/community/iverxin/bert-base-japanese-char-whole-word-masking/README.md index 3f2a11ed5cf1..06377f7c1d81 100644 --- a/community/iverxin/bert-base-japanese-char-whole-word-masking/README.md +++ b/community/iverxin/bert-base-japanese-char-whole-word-masking/README.md @@ -1,5 +1,64 @@ -## bert-base-japanese -12 repeating layers, 768-hidden, 12-heads. -This version of the model processes input texts with word-level tokenization based on the IPA dictionary, followed by character-level tokenization. Additionally, the model is trained with the whole word masking enabled for the masked language modeling (MLM) objective. -[reference](https://huggingface.co/cl-tohoku/bert-base-japanese-char-whole-word-masking) + +# BERT base Japanese (character tokenization, whole word masking enabled) + +This is a [BERT](https://github.com/google-research/bert) model pretrained on texts in the Japanese language. + +This version of the model processes input texts with word-level tokenization based on the IPA dictionary, followed by character-level tokenization. + +Additionally, the model is trained with the whole word masking enabled for the masked language modeling (MLM) objective. 
+ +The codes for the pretraining are available at [cl-tohoku/bert-japanese](https://github.com/cl-tohoku/bert-japanese/tree/v1.0). + +## Model architecture + +The model architecture is the same as the original BERT base model; 12 layers, 768 dimensions of hidden states, and 12 attention heads. + +## Training Data + +The model is trained on Japanese Wikipedia as of September 1, 2019. + +To generate the training corpus, [WikiExtractor](https://github.com/attardi/wikiextractor) is used to extract plain texts from a dump file of Wikipedia articles. + +The text files used for the training are 2.6GB in size, consisting of approximately 17M sentences. + +## Tokenization + +The texts are first tokenized by [MeCab](https://taku910.github.io/mecab/) morphological parser with the IPA dictionary and then split into characters. + +The vocabulary size is 4000. + +## Training + +The model is trained with the same configuration as the original BERT; 512 tokens per instance, 256 instances per batch, and 1M training steps. + +For the training of the MLM (masked language modeling) objective, we introduced the **Whole Word Masking** in which all of the subword tokens corresponding to a single word (tokenized by MeCab) are masked at once. + +## Licenses + +The pretrained models are distributed under the terms of the [Creative Commons Attribution-ShareAlike 3.0](https://creativecommons.org/licenses/by-sa/3.0/). + +## Acknowledgments + +For training models, we used Cloud TPUs provided by [TensorFlow Research Cloud](https://www.tensorflow.org/tfrc/) program. + +## Usage +```python +import paddle +from paddlenlp.transformers import BertJapaneseTokenizer, BertForMaskedLM + +path = "iverxin/bert-base-japanese-char-whole-word-masking" +tokenizer = BertJapaneseTokenizer.from_pretrained(path) +model = BertForMaskedLM.from_pretrained(path) +text1 = "こんにちは" + +model.eval() +inputs = tokenizer(text1) +inputs = {k: paddle.to_tensor([v]) for (k, v) in inputs.items()} +output = model(**inputs) +print(output.shape) +# [1, 5, 32000] +``` + +## Weights source +https://huggingface.co/cl-tohoku/bert-base-japanese-char-whole-word-masking diff --git a/community/iverxin/bert-base-japanese-char/README.md b/community/iverxin/bert-base-japanese-char/README.md index f538d756a16f..4e37c27036a3 100644 --- a/community/iverxin/bert-base-japanese-char/README.md +++ b/community/iverxin/bert-base-japanese-char/README.md @@ -1,5 +1,60 @@ -## bert-base-japanese -12 repeating layers, 768-hidden, 12-heads. -This version of the model processes input texts with word-level tokenization based on the IPA dictionary, followed by character-level tokenization. -[reference](https://huggingface.co/cl-tohoku/bert-base-japanese-char) + +# BERT base Japanese (character tokenization) + +This is a [BERT](https://github.com/google-research/bert) model pretrained on texts in the Japanese language. + +This version of the model processes input texts with word-level tokenization based on the IPA dictionary, followed by character-level tokenization. + +The codes for the pretraining are available at [cl-tohoku/bert-japanese](https://github.com/cl-tohoku/bert-japanese/tree/v1.0). + +## Model architecture + +The model architecture is the same as the original BERT base model; 12 layers, 768 dimensions of hidden states, and 12 attention heads. + +## Training Data + +The model is trained on Japanese Wikipedia as of September 1, 2019. 
+ +To generate the training corpus, [WikiExtractor](https://github.com/attardi/wikiextractor) is used to extract plain texts from a dump file of Wikipedia articles. + +The text files used for the training are 2.6GB in size, consisting of approximately 17M sentences. + +## Tokenization + +The texts are first tokenized by [MeCab](https://taku910.github.io/mecab/) morphological parser with the IPA dictionary and then split into characters. + +The vocabulary size is 4000. + +## Training + +The model is trained with the same configuration as the original BERT; 512 tokens per instance, 256 instances per batch, and 1M training steps. + +## Licenses + +The pretrained models are distributed under the terms of the [Creative Commons Attribution-ShareAlike 3.0](https://creativecommons.org/licenses/by-sa/3.0/). + +## Acknowledgments + +For training models, we used Cloud TPUs provided by [TensorFlow Research Cloud](https://www.tensorflow.org/tfrc/) program. + +## Usage +```python +import paddle +from paddlenlp.transformers import BertJapaneseTokenizer, BertForMaskedLM, MecabTokenizer + +path = "iverxin/bert-base-japanese-char" +tokenizer = BertJapaneseTokenizer.from_pretrained(path) +model = BertForMaskedLM.from_pretrained(path) +text1 = "こんにちは" +text2 = "櫓を飛ばす" + +model.eval() +inputs = tokenizer(text1) +inputs = {k: paddle.to_tensor([v]) for (k, v) in inputs.items()} +output = model(**inputs) +print(output.shape) +``` + +## Weights source +https://huggingface.co/cl-tohoku/bert-base-japanese-char diff --git a/community/iverxin/bert-base-japanese-whole-word-masking/README.md b/community/iverxin/bert-base-japanese-whole-word-masking/README.md index ae1217d911f0..a341670bc5c9 100644 --- a/community/iverxin/bert-base-japanese-whole-word-masking/README.md +++ b/community/iverxin/bert-base-japanese-whole-word-masking/README.md @@ -1,5 +1,63 @@ -## bert-base-japanese -12 repeating layers, 768-hidden, 12-heads. -This version of the model processes input texts with word-level tokenization based on the IPA dictionary, followed by the WordPiece subword tokenization. Additionally, the model is trained with the whole word masking enabled for the masked language modeling (MLM) objective. -[reference](https://huggingface.co/cl-tohoku/bert-base-japanese-whole-word-masking) + +# BERT base Japanese (IPA dictionary, whole word masking enabled) + +This is a [BERT](https://github.com/google-research/bert) model pretrained on texts in the Japanese language. + +This version of the model processes input texts with word-level tokenization based on the IPA dictionary, followed by the WordPiece subword tokenization. + +Additionally, the model is trained with the whole word masking enabled for the masked language modeling (MLM) objective. + +The codes for the pretraining are available at [cl-tohoku/bert-japanese](https://github.com/cl-tohoku/bert-japanese/tree/v1.0). + +## Model architecture + +The model architecture is the same as the original BERT base model; 12 layers, 768 dimensions of hidden states, and 12 attention heads. + +## Training Data + +The model is trained on Japanese Wikipedia as of September 1, 2019. + +To generate the training corpus, [WikiExtractor](https://github.com/attardi/wikiextractor) is used to extract plain texts from a dump file of Wikipedia articles. + +The text files used for the training are 2.6GB in size, consisting of approximately 17M sentences. 
+ +## Tokenization + +The texts are first tokenized by [MeCab](https://taku910.github.io/mecab/) morphological parser with the IPA dictionary and then split into subwords by the WordPiece algorithm. + +The vocabulary size is 32000. + +## Training + +The model is trained with the same configuration as the original BERT; 512 tokens per instance, 256 instances per batch, and 1M training steps. + +For the training of the MLM (masked language modeling) objective, we introduced the **Whole Word Masking** in which all of the subword tokens corresponding to a single word (tokenized by MeCab) are masked at once. + +## Licenses + +The pretrained models are distributed under the terms of the [Creative Commons Attribution-ShareAlike 3.0](https://creativecommons.org/licenses/by-sa/3.0/). + +## Acknowledgments + +For training models, we used Cloud TPUs provided by [TensorFlow Research Cloud](https://www.tensorflow.org/tfrc/) program. + +## Usage +```python +import paddle +from paddlenlp.transformers import BertJapaneseTokenizer, BertForMaskedLM + +path = "iverxin/bert-base-japanese-whole-word-masking" +tokenizer = BertJapaneseTokenizer.from_pretrained(path) +model = BertForMaskedLM.from_pretrained(path) +text1 = "こんにちは" + +model.eval() +inputs = tokenizer(text1) +inputs = {k: paddle.to_tensor([v]) for (k, v) in inputs.items()} +output = model(**inputs) +print(output.shape) +``` + +## Weights source +https://huggingface.co/cl-tohoku/bert-base-japanese-whole-word-masking diff --git a/community/iverxin/bert-base-japanese/README.md b/community/iverxin/bert-base-japanese/README.md index 39d9c9f3dcf3..85d6c561b341 100644 --- a/community/iverxin/bert-base-japanese/README.md +++ b/community/iverxin/bert-base-japanese/README.md @@ -1,6 +1,59 @@ -## bert-base-japanese -12 repeating layers, 768-hidden, 12-heads. +# BERT base Japanese (IPA dictionary) -This version of the model processes input texts with word-level tokenization based on the IPA dictionary, followed by the WordPiece subword tokenization. -[reference](https://huggingface.co/cl-tohoku/bert-base-japanese) +This is a [BERT](https://github.com/google-research/bert) model pretrained on texts in the Japanese language. +This version of the model processes input texts with word-level tokenization based on the IPA dictionary, followed by the WordPiece subword tokenization. + +The codes for the pretraining are available at [cl-tohoku/bert-japanese](https://github.com/cl-tohoku/bert-japanese/tree/v1.0). + +## Model architecture + +The model architecture is the same as the original BERT base model; 12 layers, 768 dimensions of hidden states, and 12 attention heads. + +## Training Data + +The model is trained on Japanese Wikipedia as of September 1, 2019. + +To generate the training corpus, [WikiExtractor](https://github.com/attardi/wikiextractor) is used to extract plain texts from a dump file of Wikipedia articles. + +The text files used for the training are 2.6GB in size, consisting of approximately 17M sentences. + +## Tokenization + +The texts are first tokenized by [MeCab](https://taku910.github.io/mecab/) morphological parser with the IPA dictionary and then split into subwords by the WordPiece algorithm. + +The vocabulary size is 32000. + +## Training + +The model is trained with the same configuration as the original BERT; 512 tokens per instance, 256 instances per batch, and 1M training steps. 
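+
+As a quick check of the two-step tokenization described above (MeCab word segmentation followed by WordPiece subword splitting), a minimal sketch; the pieces shown are illustrative and depend on the released vocabulary:
+
+```python
+from paddlenlp.transformers import BertJapaneseTokenizer
+
+# MeCab splits the sentence into words; WordPiece then splits each word into subword pieces.
+tokenizer = BertJapaneseTokenizer.from_pretrained("iverxin/bert-base-japanese/")
+print(tokenizer.tokenize("こんにちは"))
+# Illustrative output: ['こん', '##にち', '##は']
+```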
+ +## Licenses + +The pretrained models are distributed under the terms of the [Creative Commons Attribution-ShareAlike 3.0](https://creativecommons.org/licenses/by-sa/3.0/). + +## Acknowledgments + +For training models, we used Cloud TPUs provided by [TensorFlow Research Cloud](https://www.tensorflow.org/tfrc/) program. + + +## Usage +```python +import paddle +from paddlenlp.transformers import BertJapaneseTokenizer, BertForMaskedLM + +path = "iverxin/bert-base-japanese" +tokenizer = BertJapaneseTokenizer.from_pretrained(path) +model = BertForMaskedLM.from_pretrained(path) +text1 = "こんにちは" + +model.eval() +inputs = tokenizer(text1) +inputs = {k: paddle.to_tensor([v]) for (k, v) in inputs.items()} +output = model(**inputs) +print(output.shape) +``` + + +## Weights source +https://huggingface.co/cl-tohoku/bert-base-japanese From f85adb5b89e29ca935d21a545a2fc9109f14305e Mon Sep 17 00:00:00 2001 From: spade Date: Tue, 26 Oct 2021 14:14:17 +0800 Subject: [PATCH 16/23] remove weights url in codes --- paddlenlp/transformers/bert/modeling.py | 10 +--------- paddlenlp/transformers/bert/tokenizer.py | 22 +--------------------- 2 files changed, 2 insertions(+), 30 deletions(-) diff --git a/paddlenlp/transformers/bert/modeling.py b/paddlenlp/transformers/bert/modeling.py index dca3dc702d17..fec3d1d9a159 100644 --- a/paddlenlp/transformers/bert/modeling.py +++ b/paddlenlp/transformers/bert/modeling.py @@ -298,15 +298,7 @@ class BertPretrainedModel(PretrainedModel): "macbert-large-chinese": "https://paddlenlp.bj.bcebos.com/models/transformers/macbert/macbert-large-chinese.pdparams", "simbert-base-chinese": - "https://paddlenlp.bj.bcebos.com/models/transformers/simbert/simbert-base-chinese-v1.pdparams", - "bert-base-japanese": - "https://paddlenlp.bj.bcebos.com/models/transformers/community/iverxin/bert-base-japanese/model_state.pdparams", - "bert-base-japanese-whole-word-masking": - "https://paddlenlp.bj.bcebos.com/models/transformers/community/iverxin/bert-base-japanese-whole-word-masking/model_state.pdparams", - "bert-base-japanese-char": - "https://paddlenlp.bj.bcebos.com/models/transformers/community/iverxin/bert-base-japanese-char/model_state.pdparams", - "bert-base-japanese-char-whole-word-masking": - "https://paddlenlp.bj.bcebos.com/models/transformers/community/iverxin/bert-base-japanese-char-whole-word-masking/model_state.pdparams", + "https://paddlenlp.bj.bcebos.com/models/transformers/simbert/simbert-base-chinese-v1.pdparams" } } base_model_prefix = "bert" diff --git a/paddlenlp/transformers/bert/tokenizer.py b/paddlenlp/transformers/bert/tokenizer.py index c875584a2f96..6fcf40b02578 100644 --- a/paddlenlp/transformers/bert/tokenizer.py +++ b/paddlenlp/transformers/bert/tokenizer.py @@ -367,27 +367,7 @@ class BertTokenizer(PretrainedTokenizer): }, "simbert-base-chinese": { "do_lower_case": True - }, - "bert-base-japanese": { - "do_lower_case": False, - "word_tokenizer_type": "mecab", - "subword_tokenizer_type": "wordpiece", - }, - "bert-base-japanese-whole-word-masking": { - "do_lower_case": False, - "word_tokenizer_type": "mecab", - "subword_tokenizer_type": "wordpiece", - }, - "bert-base-japanese-char": { - "do_lower_case": False, - "word_tokenizer_type": "mecab", - "subword_tokenizer_type": "character", - }, - "bert-base-japanese-char-whole-word-masking": { - "do_lower_case": False, - "word_tokenizer_type": "mecab", - "subword_tokenizer_type": "character", - }, + } } padding_side = 'right' From 1fb1e1e98c3773a6073c3c4d7192192ddcb8abc7 Mon Sep 17 00:00:00 2001 From: spade Date: Tue, 26 Oct 
2021 14:23:50 +0800 Subject: [PATCH 17/23] update... --- paddlenlp/transformers/bert/tokenizer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddlenlp/transformers/bert/tokenizer.py b/paddlenlp/transformers/bert/tokenizer.py index 6fcf40b02578..690de285b6b1 100644 --- a/paddlenlp/transformers/bert/tokenizer.py +++ b/paddlenlp/transformers/bert/tokenizer.py @@ -292,9 +292,9 @@ class BertTokenizer(PretrainedTokenizer): .. code-block:: from paddlenlp.transformers import BertTokenizer - berttokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') - inputs = berttokenizer.tokenize('He was a puppeteer') + inputs = tokenizer.tokenize('He was a puppeteer') print(inputs) ''' From bc55f1e843b47b1c956248f8792a4bc170c83c34 Mon Sep 17 00:00:00 2001 From: spade Date: Tue, 26 Oct 2021 14:25:01 +0800 Subject: [PATCH 18/23] update... --- .../bert_japanese/test_tokenizer.py | 110 ++++++++++++++++++ 1 file changed, 110 insertions(+) create mode 100644 tests/transformers/bert_japanese/test_tokenizer.py diff --git a/tests/transformers/bert_japanese/test_tokenizer.py b/tests/transformers/bert_japanese/test_tokenizer.py new file mode 100644 index 000000000000..8db04a288b44 --- /dev/null +++ b/tests/transformers/bert_japanese/test_tokenizer.py @@ -0,0 +1,110 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import numpy as np +import os +import unittest +from paddlenlp.transformers import BertTokenizer, BertJapaneseTokenizer +from paddlenlp.data import Vocab + +from common_test import CpuCommonTest +from util import slow, assert_raises +import unittest + + +class TestBertJapaneseTokenizerFromPretrained(CpuCommonTest): + @slow + def test_from_pretrained(self): + tokenizer = BertJapaneseTokenizer.from_pretrained("bert-base-japanese") + text1 = "こんにちは" + text2 = "櫓を飛ばす" + # test batch_encode + expected_input_ids = [ + 2, 10350, 25746, 28450, 3, 20301, 11, 787, 12222, 3, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0 + ] + expected_token_type_ids = [ + 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + ] + expected_attention_mask = [ + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + ] + expected_special_tokens_mask = [ + 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 + ] + results = tokenizer( + [text1], [text2], + 20, + stride=1, + pad_to_max_seq_len=True, + return_attention_mask=True, + return_special_tokens_mask=True) + + self.check_output_equal(results[0]['input_ids'], expected_input_ids) + self.check_output_equal(results[0]['token_type_ids'], + expected_token_type_ids) + self.check_output_equal(results[0]['attention_mask'], + expected_attention_mask) + self.check_output_equal(results[0]['special_tokens_mask'], + expected_special_tokens_mask) + # test encode + results = tokenizer(text1, text2, 20, stride=1, pad_to_max_seq_len=True) + self.check_output_equal(results['input_ids'], expected_input_ids) + self.check_output_equal(results['token_type_ids'], + expected_token_type_ids) + + @slow + def test_from_pretrained_pad_left(self): + tokenizer = BertJapaneseTokenizer.from_pretrained("bert-base-japanese") + tokenizer.padding_side = "left" + text1 = "こんにちは" + text2 = "櫓を飛ばす" + # test batch_encode + expected_input_ids = [ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 10350, 25746, 28450, 3, 20301, 11, + 787, 12222, 3 + ] + expected_token_type_ids = [ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1 + ] + expected_attention_mask = [ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 + ] + expected_special_tokens_mask = [ + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1 + ] + results = tokenizer( + [text1], [text2], + 20, + stride=1, + pad_to_max_seq_len=True, + return_attention_mask=True, + return_special_tokens_mask=True) + + self.check_output_equal(results[0]['input_ids'], expected_input_ids) + self.check_output_equal(results[0]['token_type_ids'], + expected_token_type_ids) + self.check_output_equal(results[0]['attention_mask'], + expected_attention_mask) + self.check_output_equal(results[0]['special_tokens_mask'], + expected_special_tokens_mask) + # test encode + results = tokenizer(text1, text2, 20, stride=1, pad_to_max_seq_len=True) + self.check_output_equal(results['input_ids'], expected_input_ids) + self.check_output_equal(results['token_type_ids'], + expected_token_type_ids) + + +if __name__ == "__main__": + unittest.main() From eec73dd0fcb301297dd15d8061727c2e9bacd9ef Mon Sep 17 00:00:00 2001 From: spade Date: Wed, 27 Oct 2021 09:47:19 +0800 Subject: [PATCH 19/23] update weights readme --- .../bert-base-japanese-char-whole-word-masking/README.md | 2 +- community/iverxin/bert-base-japanese-char/README.md | 2 +- .../iverxin/bert-base-japanese-whole-word-masking/README.md | 2 +- community/iverxin/bert-base-japanese/README.md | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git 
a/community/iverxin/bert-base-japanese-char-whole-word-masking/README.md b/community/iverxin/bert-base-japanese-char-whole-word-masking/README.md index 06377f7c1d81..2e5839e58e8b 100644 --- a/community/iverxin/bert-base-japanese-char-whole-word-masking/README.md +++ b/community/iverxin/bert-base-japanese-char-whole-word-masking/README.md @@ -47,7 +47,7 @@ For training models, we used Cloud TPUs provided by [TensorFlow Research Cloud]( import paddle from paddlenlp.transformers import BertJapaneseTokenizer, BertForMaskedLM -path = "iverxin/bert-base-japanese-char-whole-word-masking" +path = "iverxin/bert-base-japanese-char-whole-word-masking/" tokenizer = BertJapaneseTokenizer.from_pretrained(path) model = BertForMaskedLM.from_pretrained(path) text1 = "こんにちは" diff --git a/community/iverxin/bert-base-japanese-char/README.md b/community/iverxin/bert-base-japanese-char/README.md index 4e37c27036a3..8083d5ec774e 100644 --- a/community/iverxin/bert-base-japanese-char/README.md +++ b/community/iverxin/bert-base-japanese-char/README.md @@ -43,7 +43,7 @@ For training models, we used Cloud TPUs provided by [TensorFlow Research Cloud]( import paddle from paddlenlp.transformers import BertJapaneseTokenizer, BertForMaskedLM, MecabTokenizer -path = "iverxin/bert-base-japanese-char" +path = "iverxin/bert-base-japanese-char/" tokenizer = BertJapaneseTokenizer.from_pretrained(path) model = BertForMaskedLM.from_pretrained(path) text1 = "こんにちは" diff --git a/community/iverxin/bert-base-japanese-whole-word-masking/README.md b/community/iverxin/bert-base-japanese-whole-word-masking/README.md index a341670bc5c9..0945f9c51634 100644 --- a/community/iverxin/bert-base-japanese-whole-word-masking/README.md +++ b/community/iverxin/bert-base-japanese-whole-word-masking/README.md @@ -47,7 +47,7 @@ For training models, we used Cloud TPUs provided by [TensorFlow Research Cloud]( import paddle from paddlenlp.transformers import BertJapaneseTokenizer, BertForMaskedLM -path = "iverxin/bert-base-japanese-whole-word-masking" +path = "iverxin/bert-base-japanese-whole-word-masking/" tokenizer = BertJapaneseTokenizer.from_pretrained(path) model = BertForMaskedLM.from_pretrained(path) text1 = "こんにちは" diff --git a/community/iverxin/bert-base-japanese/README.md b/community/iverxin/bert-base-japanese/README.md index 85d6c561b341..2d9089e95805 100644 --- a/community/iverxin/bert-base-japanese/README.md +++ b/community/iverxin/bert-base-japanese/README.md @@ -42,7 +42,7 @@ For training models, we used Cloud TPUs provided by [TensorFlow Research Cloud]( import paddle from paddlenlp.transformers import BertJapaneseTokenizer, BertForMaskedLM -path = "iverxin/bert-base-japanese" +path = "iverxin/bert-base-japanese/" tokenizer = BertJapaneseTokenizer.from_pretrained(path) model = BertForMaskedLM.from_pretrained(path) text1 = "こんにちは" From 2858749be536313d9aef6e6ef3c4cfe7586f8435 Mon Sep 17 00:00:00 2001 From: spade Date: Wed, 27 Oct 2021 10:59:24 +0800 Subject: [PATCH 20/23] update --- tests/transformers/bert_japanese/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 tests/transformers/bert_japanese/__init__.py diff --git a/tests/transformers/bert_japanese/__init__.py b/tests/transformers/bert_japanese/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 From 00b6b1dcc5a39a1e37bf2dfb73c04f45b62c87cc Mon Sep 17 00:00:00 2001 From: spade Date: Wed, 27 Oct 2021 13:00:38 +0800 Subject: [PATCH 21/23] update --- .../bert-base-japanese-char-whole-word-masking/files.json | 2 +- 
community/iverxin/bert-base-japanese-char/files.json | 2 +- .../bert-base-japanese-whole-word-masking/files.json | 2 +- community/iverxin/bert-base-japanese/files.json | 2 +- paddlenlp/transformers/bert/modeling.py | 2 +- paddlenlp/transformers/bert/tokenizer.py | 7 +++---- 6 files changed, 8 insertions(+), 9 deletions(-) diff --git a/community/iverxin/bert-base-japanese-char-whole-word-masking/files.json b/community/iverxin/bert-base-japanese-char-whole-word-masking/files.json index a3cbf1572cd8..1c799cd70cdf 100644 --- a/community/iverxin/bert-base-japanese-char-whole-word-masking/files.json +++ b/community/iverxin/bert-base-japanese-char-whole-word-masking/files.json @@ -3,4 +3,4 @@ "model_state": "https://paddlenlp.bj.bcebos.com/models/transformers/community/iverxin/bert-base-japanese-char-whole-word-masking/model_state.pdparams", "tokenizer_config_file": "https://paddlenlp.bj.bcebos.com/models/transformers/community/iverxin/bert-base-japanese-char-whole-word-masking/tokenizer_config.pdparams", "vocab_file": "https://paddlenlp.bj.bcebos.com/models/transformers/community/iverxin/bert-base-japanese-char-whole-word-masking/vocab.txt" -} \ No newline at end of file +} diff --git a/community/iverxin/bert-base-japanese-char/files.json b/community/iverxin/bert-base-japanese-char/files.json index 7248ecfdff0d..3d10c499bda0 100644 --- a/community/iverxin/bert-base-japanese-char/files.json +++ b/community/iverxin/bert-base-japanese-char/files.json @@ -3,4 +3,4 @@ "model_state": "https://paddlenlp.bj.bcebos.com/models/transformers/community/iverxin/bert-base-japanese-char/model_state.pdparams", "tokenizer_config_file": "https://paddlenlp.bj.bcebos.com/models/transformers/community/iverxin/bert-base-japanese-char/tokenizer_config.pdparams", "vocab_file": "https://paddlenlp.bj.bcebos.com/models/transformers/community/iverxin/bert-base-japanese-char/vocab.txt" -} \ No newline at end of file +} diff --git a/community/iverxin/bert-base-japanese-whole-word-masking/files.json b/community/iverxin/bert-base-japanese-whole-word-masking/files.json index fa70ef2381a2..04e128b55d8d 100644 --- a/community/iverxin/bert-base-japanese-whole-word-masking/files.json +++ b/community/iverxin/bert-base-japanese-whole-word-masking/files.json @@ -3,4 +3,4 @@ "model_state": "https://paddlenlp.bj.bcebos.com/models/transformers/community/iverxin/bert-base-japanese-whole-word-masking/model_state.pdparams", "tokenizer_config_file": "https://paddlenlp.bj.bcebos.com/models/transformers/community/iverxin/bert-base-japanese-whole-word-masking/tokenizer_config.pdparams", "vocab_file": "https://paddlenlp.bj.bcebos.com/models/transformers/community/iverxin/bert-base-japanese-whole-word-masking/vocab.txt" -} \ No newline at end of file +} diff --git a/community/iverxin/bert-base-japanese/files.json b/community/iverxin/bert-base-japanese/files.json index e27cee4963d1..053deb857bbf 100644 --- a/community/iverxin/bert-base-japanese/files.json +++ b/community/iverxin/bert-base-japanese/files.json @@ -3,4 +3,4 @@ "model_state": "https://paddlenlp.bj.bcebos.com/models/transformers/community/iverxin/bert-base-japanese/model_state.pdparams", "tokenizer_config_file": "https://paddlenlp.bj.bcebos.com/models/transformers/community/iverxin/bert-base-japanese/tokenizer_config.pdparams", "vocab_file": "https://paddlenlp.bj.bcebos.com/models/transformers/community/iverxin/bert-base-japanese/vocab.txt" -} \ No newline at end of file +} diff --git a/paddlenlp/transformers/bert/modeling.py b/paddlenlp/transformers/bert/modeling.py index 
b8d93cc9f9fb..f1ca1d509d00 100644 --- a/paddlenlp/transformers/bert/modeling.py +++ b/paddlenlp/transformers/bert/modeling.py @@ -298,7 +298,7 @@ class BertPretrainedModel(PretrainedModel): "macbert-large-chinese": "https://paddlenlp.bj.bcebos.com/models/transformers/macbert/macbert-large-chinese.pdparams", "simbert-base-chinese": - "https://paddlenlp.bj.bcebos.com/models/transformers/simbert/simbert-base-chinese-v1.pdparams" + "https://paddlenlp.bj.bcebos.com/models/transformers/simbert/simbert-base-chinese-v1.pdparams", } } base_model_prefix = "bert" diff --git a/paddlenlp/transformers/bert/tokenizer.py b/paddlenlp/transformers/bert/tokenizer.py index 690de285b6b1..1086df17358b 100644 --- a/paddlenlp/transformers/bert/tokenizer.py +++ b/paddlenlp/transformers/bert/tokenizer.py @@ -16,7 +16,6 @@ import copy import os import unicodedata -import collections from .. import PretrainedTokenizer from ..tokenizer_utils import convert_to_unicode, whitespace_tokenize, _is_whitespace, _is_control, _is_punctuation @@ -294,11 +293,11 @@ class BertTokenizer(PretrainedTokenizer): from paddlenlp.transformers import BertTokenizer tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') - inputs = tokenizer.tokenize('He was a puppeteer') + inputs = tokenizer('He was a puppeteer') print(inputs) ''' - ['he', 'was', 'a', 'puppet', '##eer'] + {'input_ids': [101, 2002, 2001, 1037, 13997, 11510, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0]} ''' """ @@ -367,7 +366,7 @@ class BertTokenizer(PretrainedTokenizer): }, "simbert-base-chinese": { "do_lower_case": True - } + }, } padding_side = 'right' From c47c4733729bc4210f7d3cee685d7bf55f1176b0 Mon Sep 17 00:00:00 2001 From: spade Date: Wed, 27 Oct 2021 13:12:00 +0800 Subject: [PATCH 22/23] update docstring --- paddlenlp/transformers/bert_japanese/tokenizer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/paddlenlp/transformers/bert_japanese/tokenizer.py b/paddlenlp/transformers/bert_japanese/tokenizer.py index a36bd21b7fe6..db5161c1bd71 100644 --- a/paddlenlp/transformers/bert_japanese/tokenizer.py +++ b/paddlenlp/transformers/bert_japanese/tokenizer.py @@ -69,13 +69,13 @@ class BertJapaneseTokenizer(BertTokenizer): .. 
code-block:: from paddlenlp.transformers import BertJapaneseTokenizer - berttokenizer = BertJapaneseTokenizer.from_pretrained('bert-base-japanese') + tokenizer = BertJapaneseTokenizer.from_pretrained('iverxin/bert-base-japanese/') - inputs = berttokenizer.tokenize('こんにちは') + inputs = tokenizer('こんにちは') print(inputs) ''' - ['こん', '##にち', '##は'] + {'input_ids': [2, 10350, 25746, 28450, 3], 'token_type_ids': [0, 0, 0, 0, 0]} ''' """ From 5734afd0f0d299fd29d0b3fefcd10d4bbdafd4f3 Mon Sep 17 00:00:00 2001 From: spade Date: Wed, 27 Oct 2021 13:25:16 +0800 Subject: [PATCH 23/23] =?UTF-8?q?=E6=B8=85=E7=90=86=E5=86=97=E4=BD=99?= =?UTF-8?q?=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/transformers/bert/test_tokenizer.py | 85 +---------------------- 1 file changed, 1 insertion(+), 84 deletions(-) diff --git a/tests/transformers/bert/test_tokenizer.py b/tests/transformers/bert/test_tokenizer.py index c0c1658f5e90..451770f2c6d7 100644 --- a/tests/transformers/bert/test_tokenizer.py +++ b/tests/transformers/bert/test_tokenizer.py @@ -15,7 +15,7 @@ import numpy as np import os import unittest -from paddlenlp.transformers import BertTokenizer, BasicTokenizer, WordpieceTokenizer, BertJapaneseTokenizer +from paddlenlp.transformers import BertTokenizer, BasicTokenizer, WordpieceTokenizer from paddlenlp.data import Vocab from common_test import CpuCommonTest @@ -357,88 +357,5 @@ def test_from_pretrained_pad_left(self): expected_token_type_ids) -class TestBertJapaneseTokenizerFromPretrained(CpuCommonTest): - @slow - def test_from_pretrained(self): - tokenizer = BertJapaneseTokenizer.from_pretrained("bert-base-japanese") - text1 = "こんにちは" - text2 = "櫓を飛ばす" - # test batch_encode - expected_input_ids = [ - 2, 10350, 25746, 28450, 3, 20301, 11, 787, 12222, 3, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0 - ] - expected_token_type_ids = [ - 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 - ] - expected_attention_mask = [ - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 - ] - expected_special_tokens_mask = [ - 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 - ] - results = tokenizer( - [text1], [text2], - 20, - stride=1, - pad_to_max_seq_len=True, - return_attention_mask=True, - return_special_tokens_mask=True) - - self.check_output_equal(results[0]['input_ids'], expected_input_ids) - self.check_output_equal(results[0]['token_type_ids'], - expected_token_type_ids) - self.check_output_equal(results[0]['attention_mask'], - expected_attention_mask) - self.check_output_equal(results[0]['special_tokens_mask'], - expected_special_tokens_mask) - # test encode - results = tokenizer(text1, text2, 20, stride=1, pad_to_max_seq_len=True) - self.check_output_equal(results['input_ids'], expected_input_ids) - self.check_output_equal(results['token_type_ids'], - expected_token_type_ids) - - @slow - def test_from_pretrained_pad_left(self): - tokenizer = BertJapaneseTokenizer.from_pretrained("bert-base-japanese") - tokenizer.padding_side = "left" - text1 = "こんにちは" - text2 = "櫓を飛ばす" - # test batch_encode - expected_input_ids = [ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 10350, 25746, 28450, 3, 20301, 11, - 787, 12222, 3 - ] - expected_token_type_ids = [ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1 - ] - expected_attention_mask = [ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 - ] - expected_special_tokens_mask = [ - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1 - ] - results = tokenizer( - [text1], [text2], 
- 20, - stride=1, - pad_to_max_seq_len=True, - return_attention_mask=True, - return_special_tokens_mask=True) - - self.check_output_equal(results[0]['input_ids'], expected_input_ids) - self.check_output_equal(results[0]['token_type_ids'], - expected_token_type_ids) - self.check_output_equal(results[0]['attention_mask'], - expected_attention_mask) - self.check_output_equal(results[0]['special_tokens_mask'], - expected_special_tokens_mask) - # test encode - results = tokenizer(text1, text2, 20, stride=1, pad_to_max_seq_len=True) - self.check_output_equal(results['input_ids'], expected_input_ids) - self.check_output_equal(results['token_type_ids'], - expected_token_type_ids) - - if __name__ == "__main__": unittest.main()