
Adds HuggingFace pretrained encoder unit tests #2962

Merged (17 commits) on Jan 24, 2023
Changes from 8 commits
2 changes: 1 addition & 1 deletion ludwig/encoders/text_encoders.py
@@ -1338,7 +1338,7 @@ def input_dtype(self):
@DeveloperAPI
@register_encoder("camembert", TEXT)
class CamemBERTEncoder(HFTextEncoder):
DEFAULT_MODEL_NAME = "jplu/camembert-base"
DEFAULT_MODEL_NAME = "camembert-base"

def __init__(
self,
2 changes: 1 addition & 1 deletion ludwig/schema/metadata/configs/encoders.yaml
@@ -1008,7 +1008,7 @@ CamemBERT:
  short_description:
    Language model trained on large French text corpus.
  long_description:
-    The camembert encoder loads a pretrained CamemBERT (default jplu/tf-camembert-base) model using
+    The camembert encoder loads a pretrained CamemBERT (default camembert-base) model using
    the Hugging Face transformers package. CamemBERT is pre-trained on 138GB of French text.
  literature_references:
    - https://arxiv.org/abs/1911.03894
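For context on how this default is reached in practice: a Ludwig text feature only names the encoder type, and the encoder falls back to DEFAULT_MODEL_NAME ("camembert-base") when no explicit model is configured. Below is a minimal sketch of such a config; the feature names are invented for illustration, the nested encoder schema is an assumption about Ludwig's text-feature config, and nothing in it is part of this PR's diff.

config = {
    "input_features": [
        {
            "name": "review_fr",  # hypothetical French text column
            "type": "text",
            "encoder": {
                "type": "camembert",     # matches @register_encoder("camembert", TEXT)
                "use_pretrained": True,  # loads DEFAULT_MODEL_NAME ("camembert-base") from the HF Hub
            },
        }
    ],
    "output_features": [{"name": "label", "type": "category"}],  # hypothetical target
}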
81 changes: 56 additions & 25 deletions tests/ludwig/encoders/test_text_encoders.py
@@ -3,7 +3,61 @@

from ludwig.encoders import text_encoders
from tests.integration_tests.parameter_update_utils import check_module_parameters_updated
from tests.integration_tests.utils import slow


@pytest.mark.parametrize(
"encoder_cls",
[
text_encoders.ALBERTEncoder,
text_encoders.BERTEncoder,
text_encoders.XLMEncoder,
text_encoders.GPTEncoder,
text_encoders.RoBERTaEncoder,
text_encoders.GPT2Encoder,
text_encoders.DistilBERTEncoder,
text_encoders.TransformerXLEncoder,
text_encoders.CTRLEncoder,
text_encoders.CamemBERTEncoder,
text_encoders.MT5Encoder,
text_encoders.XLMRoBERTaEncoder,
text_encoders.LongformerEncoder,
text_encoders.ELECTRAEncoder,
text_encoders.FlauBERTEncoder,
text_encoders.T5Encoder,
text_encoders.XLNetEncoder,
text_encoders.DistilBERTEncoder,
],
)
def test_hf_pretrained_default_exists(tmpdir, encoder_cls: text_encoders.HFTextEncoder):
"""Test that the default pretrained model exists on the HuggingFace Hub.

This test merely checks that the default model name is valid. It does not check
the model end-to-end, as that would require downloading the model weights, which
can cause problems in the CI due to memory/runtime constraints.

TODO: add an end-to-end test for pretrained HF encoders.
"""
from huggingface_hub import HfApi

hf_api = HfApi()
hf_api.model_info(encoder_cls.DEFAULT_MODEL_NAME)
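

# A hedged follow-up idea for the TODO above (not part of this PR's diff): a slightly
# stronger check that still avoids downloading weights is to load only the model
# configuration, since transformers.AutoConfig.from_pretrained fetches just config.json.
# The test name below is hypothetical; it reuses this module's pytest/text_encoders imports.
@pytest.mark.parametrize(
    "encoder_cls",
    [text_encoders.BERTEncoder, text_encoders.CamemBERTEncoder],  # illustrative subset
)
def test_hf_pretrained_default_config_loads(tmpdir, encoder_cls: text_encoders.HFTextEncoder):
    from transformers import AutoConfig

    # Validates the repo id and model type without pulling the full weights.
    AutoConfig.from_pretrained(encoder_cls.DEFAULT_MODEL_NAME, cache_dir=tmpdir)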


@pytest.mark.parametrize("pretrained_model_name_or_path", ["bert-base-uncased"])
@pytest.mark.parametrize("reduce_output", [None, "sum", "cls_pooled"])
@pytest.mark.parametrize("max_sequence_length", [20])
def test_auto_transformer_encoder(
    tmpdir, pretrained_model_name_or_path: str, reduce_output: str, max_sequence_length: int
):
    encoder = text_encoders.AutoTransformerEncoder(
        pretrained_model_name_or_path=pretrained_model_name_or_path,
        reduce_output=reduce_output,
        max_sequence_length=max_sequence_length,
        pretrained_kwargs=dict(cache_dir=tmpdir),
    )
    inputs = torch.rand((2, max_sequence_length)).type(encoder.input_dtype)
    outputs = encoder(inputs)
    assert outputs["encoder_output"].shape[1:] == encoder.output_shape


@pytest.mark.parametrize("use_pretrained", [False])
@@ -34,7 +88,6 @@ def test_bert_encoder(use_pretrained: bool, reduce_output: str, max_sequence_len
assert outputs["encoder_output"].shape[1:] == bert.output_shape


-@slow
@pytest.mark.parametrize("use_pretrained", [False])
@pytest.mark.parametrize("reduce_output", ["last", "sum", "mean"])
@pytest.mark.parametrize("max_sequence_length", [20])
@@ -49,7 +102,6 @@ def test_xlm_encoder(use_pretrained: bool, reduce_output: str, max_sequence_leng
assert outputs["encoder_output"].shape[1:] == xlm_encoder.output_shape


-@slow
@pytest.mark.parametrize("use_pretrained", [False])
@pytest.mark.parametrize("reduce_output", [None, "sum"])
@pytest.mark.parametrize("max_sequence_length", [20])
@@ -78,8 +130,7 @@ def test_roberta_encoder(use_pretrained: bool, reduce_output: str, max_sequence_
assert outputs["encoder_output"].shape[1:] == roberta_encoder.output_shape


-@slow
-@pytest.mark.parametrize("use_pretrained", [True, False])
+@pytest.mark.parametrize("use_pretrained", [False])
@pytest.mark.parametrize("reduce_output", [None, "sum"])
@pytest.mark.parametrize("max_sequence_length", [20])
def test_gpt2_encoder(use_pretrained: bool, reduce_output: str, max_sequence_length: int):
@@ -121,7 +172,6 @@ def test_transfoxl_encoder(use_pretrained: bool, reduce_output: str, max_sequenc
assert outputs["encoder_output"].shape[1:] == transfo.output_shape


-@slow
@pytest.mark.parametrize("use_pretrained", [False])
@pytest.mark.parametrize("reduce_output", [None, "sum"])
@pytest.mark.parametrize("max_sequence_length", [20])
@@ -136,7 +186,6 @@ def test_ctrl_encoder(use_pretrained: bool, reduce_output: str, max_sequence_len
assert outputs["encoder_output"].shape[1:] == encoder.output_shape


-@slow
@pytest.mark.parametrize("use_pretrained", [False])
@pytest.mark.parametrize("reduce_output", [None, "cls_pooled"])
@pytest.mark.parametrize("max_sequence_length", [20])
@@ -165,7 +214,6 @@ def test_mt5_encoder(use_pretrained: bool, reduce_output: str, max_sequence_leng
assert outputs["encoder_output"].shape[1:] == mt5_encoder.output_shape


-@slow
@pytest.mark.parametrize("use_pretrained", [False])
@pytest.mark.parametrize("reduce_output", [None, "sum"])
@pytest.mark.parametrize("max_sequence_length", [20])
@@ -192,7 +240,6 @@ def test_longformer_encoder(use_pretrained: bool, reduce_output: str, max_sequen
assert outputs["encoder_output"].shape[1:] == encoder.output_shape


-@slow
@pytest.mark.parametrize("use_pretrained", [False])
@pytest.mark.parametrize("reduce_output", [None, "sum"])
@pytest.mark.parametrize("max_sequence_length", [20])
@@ -205,21 +252,6 @@ def test_electra_encoder(use_pretrained: bool, reduce_output: str, max_sequence_
assert outputs["encoder_output"].shape[1:] == encoder.output_shape


@pytest.mark.parametrize("pretrained_model_name_or_path", ["bert-base-uncased"])
@pytest.mark.parametrize("reduce_output", [None, "sum", "cls_pooled"])
@pytest.mark.parametrize("max_sequence_length", [20])
def test_auto_transformer_encoder(pretrained_model_name_or_path: str, reduce_output: str, max_sequence_length: int):
encoder = text_encoders.AutoTransformerEncoder(
pretrained_model_name_or_path=pretrained_model_name_or_path,
reduce_output=reduce_output,
max_sequence_length=max_sequence_length,
)
inputs = torch.rand((2, max_sequence_length)).type(encoder.input_dtype)
outputs = encoder(inputs)
assert outputs["encoder_output"].shape[1:] == encoder.output_shape


@slow
@pytest.mark.parametrize("use_pretrained", [False])
@pytest.mark.parametrize("reduce_output", [None, "sum"])
@pytest.mark.parametrize("max_sequence_length", [20])
@@ -244,7 +276,6 @@ def test_t5_encoder(use_pretrained: bool, reduce_output: str, max_sequence_lengt
assert outputs["encoder_output"].shape[1:] == encoder.output_shape


-@slow
@pytest.mark.parametrize("use_pretrained", [False])
@pytest.mark.parametrize("reduce_output", [None, "sum"])
@pytest.mark.parametrize("max_sequence_length", [20])