diff --git a/ivy_models/__init__.py b/ivy_models/__init__.py
index 4f1c1d2b..7e1a6d6f 100644
--- a/ivy_models/__init__.py
+++ b/ivy_models/__init__.py
@@ -23,4 +23,8 @@ from . import bert
 from .bert import *
+
+from . import roberta
+from .roberta import *
+
 from .vit import *
diff --git a/ivy_models/roberta/__init__.py b/ivy_models/roberta/__init__.py
new file mode 100644
index 00000000..e1af40c4
--- /dev/null
+++ b/ivy_models/roberta/__init__.py
@@ -0,0 +1 @@
+from .roberta import RobertaModel, roberta_base
diff --git a/ivy_models/roberta/layers.py b/ivy_models/roberta/layers.py
new file mode 100644
index 00000000..7149d516
--- /dev/null
+++ b/ivy_models/roberta/layers.py
@@ -0,0 +1,56 @@
+import ivy
+from ivy_models.bert.layers import BertEmbedding
+
+
+class RobertaEmbeddings(BertEmbedding):
+    """Same as BertEmbedding, but positions are indexed from the padding index."""
+
+    def __init__(
+        self,
+        vocab_size,
+        hidden_size,
+        max_position_embeddings,
+        type_vocab_size=1,
+        pad_token_id=None,
+        embd_drop_rate=0.1,
+        layer_norm_eps=1e-5,
+        position_embedding_type="absolute",
+        v=None,
+    ):
+        super(RobertaEmbeddings, self).__init__(
+            vocab_size,
+            hidden_size,
+            max_position_embeddings,
+            type_vocab_size,
+            pad_token_id,
+            embd_drop_rate,
+            layer_norm_eps,
+            position_embedding_type,
+            v,
+        )
+        self.padding_idx = 1
+
+    def _forward(
+        self,
+        input_ids,
+        token_type_ids=None,
+        position_ids=None,
+        past_key_values_length: int = 0,
+    ):
+        input_shape = input_ids.shape
+        seq_length = input_shape[1]
+
+        if position_ids is None:  # positions start after the padding index
+            position_ids = ivy.expand_dims(
+                ivy.arange(self.padding_idx + 1, seq_length + self.padding_idx + 1),
+                axis=0,
+            )
+            position_ids = position_ids[
+                :, past_key_values_length : seq_length + past_key_values_length
+            ]
+        return super(RobertaEmbeddings, self)._forward(
+            input_ids,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            past_key_values_length=past_key_values_length,
+        )
diff --git a/ivy_models/roberta/roberta.py b/ivy_models/roberta/roberta.py
new file mode 100644
index 00000000..c224f83b
--- /dev/null
+++ b/ivy_models/roberta/roberta.py
@@ -0,0 +1,91 @@
+from ivy_models.helpers import load_transformers_weights
+from ivy_models.bert import BertConfig, BertModel
+from .layers import RobertaEmbeddings
+
+
+class RobertaModel(BertModel):
+    def __init__(self, config: BertConfig, pooler_out=False):
+        super(RobertaModel, self).__init__(config, pooler_out=pooler_out)
+
+    @classmethod
+    def get_spec_class(cls):
+        return BertConfig
+
+    def _build(self, *args, **kwargs):
+        self.embeddings = RobertaEmbeddings(**self.config.get_embd_attrs())
+        super(RobertaModel, self)._build(*args, **kwargs)
+
+    def _forward(
+        self,
+        input_ids,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        past_key_values=None,
+        use_cache=None,
+        output_attentions=None,
+    ):
+        if input_ids[:, 0].sum().item() != 0:  # sequences must start with <s> (id 0)
+            print("Input sequences should start with the <s> token (id 0).")
+        return super(RobertaModel, self)._forward(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+        )
+
+
+def _roberta_weights_mapping(name):
+    key_map = [(f"__v{i}__", f"__{i}__") for i in range(12)]
+    key_map = key_map + [
+        ("attention__dense", "attention.output.dense"),
+        ("attention__LayerNorm", "attention.output.LayerNorm"),
"attention.output.LayerNorm"), + ] + key_map = key_map + [ + ("ffd__dense1", "intermediate.dense"), + ("ffd__dense2", "output.dense"), + ("ffd__LayerNorm", "output.LayerNorm"), + ] + name = name.replace("__w", ".weight").replace("__b", ".bias") + name = ( + name.replace("biasias", "bias") + .replace("weighteight", "weight") + .replace(".weightord", ".word") + ) + for ref, new in key_map: + name = name.replace(ref, new) + name = name.replace("__", ".") + return name + + +def roberta_base(pretrained=True): + # instantiate the hyperparameters same as bert + # set the dropout rate to 0.0 to avoid stochasticity in the output + + config = BertConfig( + vocab_size=50265, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout=0.0, + attn_drop_rate=0.0, + max_position_embeddings=514, + type_vocab_size=1, + ) + + model = RobertaModel(config, pooler_out=True) + if pretrained: + w_clean = load_transformers_weights( + "roberta-base", model, _roberta_weights_mapping + ) + model.v = w_clean + return model diff --git a/ivy_models_tests/roberta/roberta_inputs.npy b/ivy_models_tests/roberta/roberta_inputs.npy new file mode 100644 index 00000000..f5abe506 Binary files /dev/null and b/ivy_models_tests/roberta/roberta_inputs.npy differ diff --git a/ivy_models_tests/roberta/roberta_pooled_output.npy b/ivy_models_tests/roberta/roberta_pooled_output.npy new file mode 100644 index 00000000..3cde87af Binary files /dev/null and b/ivy_models_tests/roberta/roberta_pooled_output.npy differ diff --git a/ivy_models_tests/roberta/test_roberta.py b/ivy_models_tests/roberta/test_roberta.py new file mode 100644 index 00000000..4a8064b2 --- /dev/null +++ b/ivy_models_tests/roberta/test_roberta.py @@ -0,0 +1,26 @@ +import os +import ivy +import pytest +import numpy as np +from ivy_models import roberta_base + + +@pytest.mark.parametrize("batch_shape", [[1]]) +@pytest.mark.parametrize("load_weights", [False, True]) +def test_roberta(device, fw, batch_shape, load_weights): + """Test RoBerta Base Sequence Classification""" + + num_dims = 768 + this_dir = os.path.dirname(os.path.realpath(__file__)) + input_path = os.path.join(this_dir, "roberta_inputs.npy") + inputs = np.load(input_path, allow_pickle=True).tolist() + model = roberta_base(load_weights) + + inputs = {k: ivy.asarray(v) for k, v in inputs.items()} + logits = model(**inputs)["pooler_output"] + assert logits.shape == tuple([ivy.to_scalar(batch_shape), num_dims]) + + if load_weights: + ref_logits_path = os.path.join(this_dir, "roberta_pooled_output.npy") + ref_logits = np.load(ref_logits_path) + assert np.allclose(ref_logits, ivy.to_numpy(logits), rtol=0.005, atol=0.005)