Convert SentencePieceTokenizer and associated models to new assets paradigm (#1323)

* Convert SentencePiece tokenizer to save_assets/load_assets

* Convert albert to new assets paradigm

* Convert DebertaV3 to new assets paradigm

* Fix formatting issues

* Convert FNet to new assets paradigm

* Convert XLMRoberta to new assets paradigm

* Convert T5 Tokenizer to new assets paradigm

* Fix sentencepiece tokenizer config test

* Change set_vocabulary to set_proto

* Change proto to raw proto

* Change to proto_bytes
nkovela1 authored Nov 21, 2023
1 parent 4f88a16 commit 4ca2516
Showing 23 changed files with 350 additions and 189 deletions.
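
Under the new paradigm the SentencePiece proto is no longer serialized into the layer config; it travels as a saved-model asset and is re-attached through `set_proto`. A minimal sketch of the shape of that contract (the `vocabulary.spm` filename and the file handling here are assumptions for illustration, not the exact KerasNLP implementation):

import os


class SentencePieceTokenizerSketch:
    """Minimal sketch of the save_assets/load_assets paradigm."""

    def __init__(self, proto=None, **kwargs):
        # `proto` (serialized SentencePiece model bytes or a file path) is no
        # longer baked into the config; it is attached via `set_proto`.
        self.set_proto(proto)

    def set_proto(self, proto):
        if proto is None:
            self.proto = None
            return
        if isinstance(proto, str):
            with open(proto, "rb") as file:
                proto = file.read()
        self.proto = proto  # raw proto bytes

    def save_assets(self, dir_path):
        # Called when saving a model; writes the vocabulary as an asset file.
        with open(os.path.join(dir_path, "vocabulary.spm"), "wb") as file:
            file.write(self.proto)

    def load_assets(self, dir_path):
        # Called when restoring a model; re-attaches the proto bytes.
        with open(os.path.join(dir_path, "vocabulary.spm"), "rb") as file:
            self.set_proto(file.read())
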
2 changes: 1 addition & 1 deletion keras_nlp/models/albert/albert_classifier_test.py
@@ -42,7 +42,7 @@ def setUp(self):
hidden_dim=2,
embedding_dim=2,
intermediate_dim=4,
-max_sequence_length=self.preprocessor.packer.sequence_length,
+max_sequence_length=self.preprocessor.sequence_length,
)
self.init_kwargs = {
"preprocessor": self.preprocessor,
37 changes: 23 additions & 14 deletions keras_nlp/models/albert/albert_masked_lm_preprocessor.py
@@ -131,29 +131,38 @@ def __init__(
truncate=truncate,
**kwargs,
)

+self.mask_selection_rate = mask_selection_rate
+self.mask_selection_length = mask_selection_length
+self.mask_token_rate = mask_token_rate
+self.random_token_rate = random_token_rate
+self.masker = None
+
+def build(self, input_shape):
+super().build(input_shape)
+# Defer masker creation to `build()` so that we can be sure tokenizer
+# assets have loaded when restoring a saved model.
self.masker = MaskedLMMaskGenerator(
-mask_selection_rate=mask_selection_rate,
-mask_selection_length=mask_selection_length,
-mask_token_rate=mask_token_rate,
-random_token_rate=random_token_rate,
-vocabulary_size=tokenizer.vocabulary_size(),
-mask_token_id=tokenizer.mask_token_id,
+mask_selection_rate=self.mask_selection_rate,
+mask_selection_length=self.mask_selection_length,
+mask_token_rate=self.mask_token_rate,
+random_token_rate=self.random_token_rate,
+vocabulary_size=self.tokenizer.vocabulary_size(),
+mask_token_id=self.tokenizer.mask_token_id,
unselectable_token_ids=[
-tokenizer.cls_token_id,
-tokenizer.sep_token_id,
-tokenizer.pad_token_id,
+self.tokenizer.cls_token_id,
+self.tokenizer.sep_token_id,
+self.tokenizer.pad_token_id,
],
)

def get_config(self):
config = super().get_config()
config.update(
{
"mask_selection_rate": self.masker.mask_selection_rate,
"mask_selection_length": self.masker.mask_selection_length,
"mask_token_rate": self.masker.mask_token_rate,
"random_token_rate": self.masker.random_token_rate,
"mask_selection_rate": self.mask_selection_rate,
"mask_selection_length": self.mask_selection_length,
"mask_token_rate": self.mask_token_rate,
"random_token_rate": self.random_token_rate,
}
)
return config
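The same deferral pattern appears again in the DeBERTaV3 and FNet preprocessors below. The reason `get_config()` switches to the plain attributes is that a freshly deserialized preprocessor has not been built yet, so `self.masker` is still `None`. A hedged illustration of that state (not taken from the test suite; it assumes constructing with a proto-less tokenizer is valid, per the tokenizer changes in this commit):

from keras_nlp.models import AlbertMaskedLMPreprocessor, AlbertTokenizer

preprocessor = AlbertMaskedLMPreprocessor(
    tokenizer=AlbertTokenizer(proto=None),
    mask_selection_rate=0.15,
)
# No masker exists until the layer is built, but the config round-trips
# because it reads the stored constructor arguments directly.
assert preprocessor.masker is None
assert preprocessor.get_config()["mask_selection_rate"] == 0.15
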
2 changes: 1 addition & 1 deletion keras_nlp/models/albert/albert_masked_lm_test.py
@@ -50,7 +50,7 @@ def setUp(self):
hidden_dim=2,
embedding_dim=2,
intermediate_dim=4,
-max_sequence_length=self.preprocessor.packer.sequence_length,
+max_sequence_length=self.preprocessor.sequence_length,
)
self.init_kwargs = {
"preprocessor": self.preprocessor,
16 changes: 12 additions & 4 deletions keras_nlp/models/albert/albert_preprocessor.py
@@ -158,20 +158,28 @@ def __init__(
):
super().__init__(**kwargs)
self.tokenizer = tokenizer
+self.truncate = truncate
+self.sequence_length = sequence_length
+self.packer = None
+
+def build(self, input_shape):
+# Defer packer creation to `build()` so that we can be sure tokenizer
+# assets have loaded when restoring a saved model.
self.packer = MultiSegmentPacker(
start_value=self.tokenizer.cls_token_id,
end_value=self.tokenizer.sep_token_id,
pad_value=self.tokenizer.pad_token_id,
-truncate=truncate,
-sequence_length=sequence_length,
+truncate=self.truncate,
+sequence_length=self.sequence_length,
)
+self.built = True

def get_config(self):
config = super().get_config()
config.update(
{
"sequence_length": self.packer.sequence_length,
"truncate": self.packer.truncate,
"sequence_length": self.sequence_length,
"truncate": self.truncate,
}
)
return config
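This move is also why the test files in this commit switch from `preprocessor.packer.sequence_length` to `preprocessor.sequence_length`: the configured length is a plain attribute that exists before the packer does. A small usage sketch (the preset name is illustrative; any ALBERT preset would do):

import keras_nlp

preprocessor = keras_nlp.models.AlbertPreprocessor.from_preset(
    "albert_base_en_uncased",
    sequence_length=128,
)
# The hyperparameter is available immediately; the MultiSegmentPacker is
# only constructed when the layer is built.
print(preprocessor.sequence_length)  # 128
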
47 changes: 30 additions & 17 deletions keras_nlp/models/albert/albert_tokenizer.py
@@ -87,25 +87,38 @@ class AlbertTokenizer(SentencePieceTokenizer):
"""

def __init__(self, proto, **kwargs):
self.cls_token = "[CLS]"
self.sep_token = "[SEP]"
self.pad_token = "<pad>"
self.mask_token = "[MASK]"

super().__init__(proto=proto, **kwargs)

-# Check for necessary special tokens.
-cls_token = "[CLS]"
-sep_token = "[SEP]"
-pad_token = "<pad>"
-mask_token = "[MASK]"
-for token in [cls_token, sep_token, pad_token, mask_token]:
-if token not in self.get_vocabulary():
-raise ValueError(
-f"Cannot find token `'{token}'` in the provided "
-f"`vocabulary`. Please provide `'{token}'` in your "
-"`vocabulary` or use a pretrained `vocabulary` name."
-)
-
-self.cls_token_id = self.token_to_id(cls_token)
-self.sep_token_id = self.token_to_id(sep_token)
-self.pad_token_id = self.token_to_id(pad_token)
-self.mask_token_id = self.token_to_id(mask_token)
+def set_proto(self, proto):
+super().set_proto(proto)
+if proto is not None:
+for token in [
+self.cls_token,
+self.sep_token,
+self.pad_token,
+self.mask_token,
+]:
+if token not in self.get_vocabulary():
+raise ValueError(
+f"Cannot find token `'{token}'` in the provided "
+f"`vocabulary`. Please provide `'{token}'` in your "
+"`vocabulary` or use a pretrained `vocabulary` name."
+)
+
+self.cls_token_id = self.token_to_id(self.cls_token)
+self.sep_token_id = self.token_to_id(self.sep_token)
+self.pad_token_id = self.token_to_id(self.pad_token)
+self.mask_token_id = self.token_to_id(self.mask_token)
+else:
+self.cls_token_id = None
+self.sep_token_id = None
+self.pad_token_id = None
+self.mask_token_id = None

@classproperty
def presets(cls):
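A brief, hedged sketch of the behavior this `set_proto` hook gives the tokenizer (the `.spm` path is a placeholder; constructing without a proto and attaching one later is the scenario the `None` branch exists for):

from keras_nlp.models import AlbertTokenizer

# A tokenizer can now exist before its vocabulary assets are available,
# e.g. while a saved model is being restored.
tokenizer = AlbertTokenizer(proto=None)
assert tokenizer.cls_token_id is None

# Once a proto is attached (directly, or via `load_assets` during model
# restoration), the special token ids resolve.
tokenizer.set_proto("path/to/albert_vocabulary.spm")  # placeholder path
assert tokenizer.cls_token_id == tokenizer.token_to_id("[CLS]")
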
2 changes: 1 addition & 1 deletion keras_nlp/models/deberta_v3/deberta_v3_classifier_test.py
@@ -45,7 +45,7 @@ def setUp(self):
num_heads=2,
hidden_dim=2,
intermediate_dim=4,
-max_sequence_length=self.preprocessor.packer.sequence_length,
+max_sequence_length=self.preprocessor.sequence_length,
)
self.init_kwargs = {
"preprocessor": self.preprocessor,
36 changes: 23 additions & 13 deletions keras_nlp/models/deberta_v3/deberta_v3_masked_lm_preprocessor.py
@@ -133,28 +133,38 @@ def __init__(
**kwargs,
)

+self.mask_selection_rate = mask_selection_rate
+self.mask_selection_length = mask_selection_length
+self.mask_token_rate = mask_token_rate
+self.random_token_rate = random_token_rate
+self.masker = None
+
+def build(self, input_shape):
+super().build(input_shape)
+# Defer masker creation to `build()` so that we can be sure tokenizer
+# assets have loaded when restoring a saved model.
self.masker = MaskedLMMaskGenerator(
-mask_selection_rate=mask_selection_rate,
-mask_selection_length=mask_selection_length,
-mask_token_rate=mask_token_rate,
-random_token_rate=random_token_rate,
-vocabulary_size=tokenizer.vocabulary_size(),
-mask_token_id=tokenizer.mask_token_id,
+mask_selection_rate=self.mask_selection_rate,
+mask_selection_length=self.mask_selection_length,
+mask_token_rate=self.mask_token_rate,
+random_token_rate=self.random_token_rate,
+vocabulary_size=self.tokenizer.vocabulary_size(),
+mask_token_id=self.tokenizer.mask_token_id,
unselectable_token_ids=[
-tokenizer.cls_token_id,
-tokenizer.sep_token_id,
-tokenizer.pad_token_id,
+self.tokenizer.cls_token_id,
+self.tokenizer.sep_token_id,
+self.tokenizer.pad_token_id,
],
)

def get_config(self):
config = super().get_config()
config.update(
{
"mask_selection_rate": self.masker.mask_selection_rate,
"mask_selection_length": self.masker.mask_selection_length,
"mask_token_rate": self.masker.mask_token_rate,
"random_token_rate": self.masker.random_token_rate,
"mask_selection_rate": self.mask_selection_rate,
"mask_selection_length": self.mask_selection_length,
"mask_token_rate": self.mask_token_rate,
"random_token_rate": self.random_token_rate,
}
)
return config
2 changes: 1 addition & 1 deletion keras_nlp/models/deberta_v3/deberta_v3_masked_lm_test.py
@@ -48,7 +48,7 @@ def setUp(self):
num_heads=2,
hidden_dim=2,
intermediate_dim=4,
-max_sequence_length=self.preprocessor.packer.sequence_length,
+max_sequence_length=self.preprocessor.sequence_length,
)
self.init_kwargs = {
"preprocessor": self.preprocessor,
16 changes: 12 additions & 4 deletions keras_nlp/models/deberta_v3/deberta_v3_preprocessor.py
@@ -156,20 +156,28 @@ def __init__(
):
super().__init__(**kwargs)
self.tokenizer = tokenizer
+self.truncate = truncate
+self.sequence_length = sequence_length
+self.packer = None
+
+def build(self, input_shape):
+# Defer packer creation to `build()` so that we can be sure tokenizer
+# assets have loaded when restoring a saved model.
self.packer = MultiSegmentPacker(
start_value=self.tokenizer.cls_token_id,
end_value=self.tokenizer.sep_token_id,
pad_value=self.tokenizer.pad_token_id,
-truncate=truncate,
-sequence_length=sequence_length,
+truncate=self.truncate,
+sequence_length=self.sequence_length,
)
+self.built = True

def get_config(self):
config = super().get_config()
config.update(
{
"sequence_length": self.packer.sequence_length,
"truncate": self.packer.truncate,
"sequence_length": self.sequence_length,
"truncate": self.truncate,
}
)
return config
53 changes: 29 additions & 24 deletions keras_nlp/models/deberta_v3/deberta_v3_tokenizer.py
@@ -93,33 +93,38 @@ class DebertaV3Tokenizer(SentencePieceTokenizer):
"""

def __init__(self, proto, **kwargs):
self.cls_token = "[CLS]"
self.sep_token = "[SEP]"
self.pad_token = "[PAD]"
self.mask_token = "[MASK]"

super().__init__(proto=proto, **kwargs)

-# Check for necessary special tokens.
-cls_token = "[CLS]"
-sep_token = "[SEP]"
-pad_token = "[PAD]"
-mask_token = "[MASK]"
-
-# We do not throw an error if `mask_token` is not present in the
-# vocabulary.
-for token in [cls_token, pad_token, sep_token]:
-if token not in super().get_vocabulary():
-raise ValueError(
-f"Cannot find token `'{token}'` in the provided "
-f"`vocabulary`. Please provide `'{token}'` in your "
-"`vocabulary` or use a pretrained `vocabulary` name."
-)
-
-self.cls_token_id = self.token_to_id(cls_token)
-self.sep_token_id = self.token_to_id(sep_token)
-self.pad_token_id = self.token_to_id(pad_token)
-# If the mask token is not in the vocabulary, add it to the end of the
-# vocabulary.
-if mask_token in super().get_vocabulary():
-self.mask_token_id = super().token_to_id(mask_token)
+def set_proto(self, proto):
+super().set_proto(proto)
+if proto is not None:
+for token in [self.cls_token, self.pad_token, self.sep_token]:
+if token not in super().get_vocabulary():
+raise ValueError(
+f"Cannot find token `'{token}'` in the provided "
+f"`vocabulary`. Please provide `'{token}'` in your "
+"`vocabulary` or use a pretrained `vocabulary` name."
+)
+
+self.cls_token_id = self.token_to_id(self.cls_token)
+self.sep_token_id = self.token_to_id(self.sep_token)
+self.pad_token_id = self.token_to_id(self.pad_token)
+# If the mask token is not in the vocabulary, add it to the end of the
+# vocabulary.
+if self.mask_token in super().get_vocabulary():
+self.mask_token_id = super().token_to_id(self.mask_token)
+else:
+self.mask_token_id = super().vocabulary_size()
else:
-self.mask_token_id = super().vocabulary_size()
+self.cls_token_id = None
+self.sep_token_id = None
+self.pad_token_id = None
+self.mask_token_id = None

def vocabulary_size(self):
sentence_piece_size = super().vocabulary_size()
2 changes: 1 addition & 1 deletion keras_nlp/models/f_net/f_net_classifier_test.py
@@ -40,7 +40,7 @@ def setUp(self):
num_layers=2,
hidden_dim=2,
intermediate_dim=4,
-max_sequence_length=self.preprocessor.packer.sequence_length,
+max_sequence_length=self.preprocessor.sequence_length,
)
self.init_kwargs = {
"preprocessor": self.preprocessor,
37 changes: 23 additions & 14 deletions keras_nlp/models/f_net/f_net_masked_lm_preprocessor.py
@@ -136,29 +136,38 @@ def __init__(
truncate=truncate,
**kwargs,
)

+self.mask_selection_rate = mask_selection_rate
+self.mask_selection_length = mask_selection_length
+self.mask_token_rate = mask_token_rate
+self.random_token_rate = random_token_rate
+self.masker = None
+
+def build(self, input_shape):
+super().build(input_shape)
+# Defer masker creation to `build()` so that we can be sure tokenizer
+# assets have loaded when restoring a saved model.
self.masker = MaskedLMMaskGenerator(
-mask_selection_rate=mask_selection_rate,
-mask_selection_length=mask_selection_length,
-mask_token_rate=mask_token_rate,
-random_token_rate=random_token_rate,
-vocabulary_size=tokenizer.vocabulary_size(),
-mask_token_id=tokenizer.mask_token_id,
+mask_selection_rate=self.mask_selection_rate,
+mask_selection_length=self.mask_selection_length,
+mask_token_rate=self.mask_token_rate,
+random_token_rate=self.random_token_rate,
+vocabulary_size=self.tokenizer.vocabulary_size(),
+mask_token_id=self.tokenizer.mask_token_id,
unselectable_token_ids=[
-tokenizer.cls_token_id,
-tokenizer.sep_token_id,
-tokenizer.pad_token_id,
+self.tokenizer.cls_token_id,
+self.tokenizer.sep_token_id,
+self.tokenizer.pad_token_id,
],
)

def get_config(self):
config = super().get_config()
config.update(
{
"mask_selection_rate": self.masker.mask_selection_rate,
"mask_selection_length": self.masker.mask_selection_length,
"mask_token_rate": self.masker.mask_token_rate,
"random_token_rate": self.masker.random_token_rate,
"mask_selection_rate": self.mask_selection_rate,
"mask_selection_length": self.mask_selection_length,
"mask_token_rate": self.mask_token_rate,
"random_token_rate": self.random_token_rate,
}
)
return config
2 changes: 1 addition & 1 deletion keras_nlp/models/f_net/f_net_masked_lm_test.py
@@ -47,7 +47,7 @@ def setUp(self):
num_layers=2,
hidden_dim=2,
intermediate_dim=4,
-max_sequence_length=self.preprocessor.packer.sequence_length,
+max_sequence_length=self.preprocessor.sequence_length,
)
self.init_kwargs = {
"preprocessor": self.preprocessor,
