From 476f40a1cab2b432fa5cc206cb48934e941ba1c3 Mon Sep 17 00:00:00 2001
From: saattrupdan
Date: Thu, 31 Oct 2024 16:47:00 +0100
Subject: [PATCH] tests: Add tests for adapt_tokenizer and convert_token_to_string

---
 tests/models/test_vllm.py | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)
 create mode 100644 tests/models/test_vllm.py

diff --git a/tests/models/test_vllm.py b/tests/models/test_vllm.py
new file mode 100644
index 000000000..ecbbd5a00
--- /dev/null
+++ b/tests/models/test_vllm.py
@@ -0,0 +1,32 @@
+"""Tests for the `vllm` module."""
+
+import pytest
+from transformers import AutoTokenizer, SPIECE_UNDERLINE
+
+from outlines.models.vllm import adapt_tokenizer, convert_token_to_string
+
+TEST_MODEL = "hf-internal-testing/tiny-random-GPTJForCausalLM"
+
+
+def test_adapt_tokenizer():
+    """`adapt_tokenizer` should expose the attributes Outlines relies on."""
+    tokenizer = AutoTokenizer.from_pretrained(TEST_MODEL, padding_side="left")
+    adapted_tokenizer = adapt_tokenizer(tokenizer=tokenizer)
+    assert hasattr(adapted_tokenizer, "vocabulary")
+    assert hasattr(adapted_tokenizer, "special_tokens")
+    assert adapted_tokenizer.convert_token_to_string == convert_token_to_string
+
+
+@pytest.mark.parametrize(
+    "token, expected",
+    [
+        ("baz", "baz"),
+        ("<0x20>", " <0x20>"),
+        (SPIECE_UNDERLINE, f" {SPIECE_UNDERLINE}"),
+    ],
+)
+def test_convert_token_to_string(token, expected):
+    """Whitespace-marker tokens gain a leading space; plain tokens pass through."""
+    tokenizer = AutoTokenizer.from_pretrained(TEST_MODEL, padding_side="left")
+    output = convert_token_to_string(token=token, tokenizer=tokenizer)
+    assert output == expected