Fix custom tokenizers test (huggingface#19052)
* Fix CI for custom tokenizers

* Add nightly tests

* Run CI, run!

* Fix paths

* Typos

* Fix test
sgugger authored and oneraghavan committed Sep 26, 2022
1 parent c16e39c commit 7655fa0
Showing 3 changed files with 74 additions and 6 deletions.
74 changes: 69 additions & 5 deletions .circleci/config.yml
@@ -578,19 +578,45 @@ jobs:
          key: v0.5-custom_tokenizers-{{ checksum "setup.py" }}
          paths:
            - '~/.cache/pip'
      - run: python utils/tests_fetcher.py | tee test_preparation.txt
      - store_artifacts:
          path: ~/transformers/test_preparation.txt
      - run: |
          if [ -f test_list.txt ]; then
-           python -m pytest --max-worker-restart=0 -s --make-reports=tests_custom_tokenizers ./tests/test_tokenization_bert_japanese.py ./tests/test_tokenization_openai.py | tee tests_output.txt
-         fi
-     - run: |
-         if [ -f test_list.txt ]; then
-           python -m pytest -n 1 --max-worker-restart=0 tests/test_tokenization_clip.py --dist=loadfile -s --make-reports=tests_tokenization_clip --durations=100 | tee tests_output.txt
+           python -m pytest --max-worker-restart=0 -s --make-reports=tests_custom_tokenizers ./tests/models/bert_japanese/test_tokenization_bert_japanese.py ./tests/models/openai/test_tokenization_openai.py ./tests/models/clip/test_tokenization_clip.py | tee tests_output.txt
          fi
      - store_artifacts:
          path: ~/transformers/tests_output.txt
      - store_artifacts:
          path: ~/transformers/reports

+  run_tests_custom_tokenizers_all:
+      working_directory: ~/transformers
+      docker:
+          - image: cimg/python:3.7.12
+      environment:
+          RUN_CUSTOM_TOKENIZERS: yes
+          TRANSFORMERS_IS_CI: yes
+          PYTEST_TIMEOUT: 120
+      steps:
+          - checkout
+          - restore_cache:
+                keys:
+                    - v0.5-custom_tokenizers-{{ checksum "setup.py" }}
+                    - v0.5-{{ checksum "setup.py" }}
+          - run: pip install --upgrade pip
+          - run: pip install .[ja,testing,sentencepiece,jieba,spacy,ftfy,rjieba]
+          - run: python -m unidic download
+          - save_cache:
+                key: v0.5-custom_tokenizers-{{ checksum "setup.py" }}
+                paths:
+                    - '~/.cache/pip'
+          - run: python -m pytest --max-worker-restart=0 -s --make-reports=tests_custom_tokenizers ./tests/models/bert_japanese/test_tokenization_bert_japanese.py ./tests/models/openai/test_tokenization_openai.py ./tests/models/clip/test_tokenization_clip.py | tee tests_output.txt
+          - store_artifacts:
+                path: ~/transformers/tests_output.txt
+          - store_artifacts:
+                path: ~/transformers/reports

run_examples_torch:
working_directory: ~/transformers
docker:
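Context for the two custom-tokenizer jobs above: on pull requests, utils/tests_fetcher.py writes the impacted test modules to test_list.txt, and the pytest step is skipped when that file is absent; the new run_tests_custom_tokenizers_all variant (added for the nightly workflow) drops that guard and always runs the full suite. A minimal Python sketch of the gating logic, using the file names from the diff (run_suite is a hypothetical helper, and the tee redirection is omitted):

    # Sketch of the CI gating: PR jobs run only fetched tests; nightly runs everything.
    import os
    import subprocess

    CUSTOM_TOKENIZER_TESTS = [
        "./tests/models/bert_japanese/test_tokenization_bert_japanese.py",
        "./tests/models/openai/test_tokenization_openai.py",
        "./tests/models/clip/test_tokenization_clip.py",
    ]

    def run_suite():
        # Mirrors the pytest invocation from the config above (minus the tee).
        subprocess.run(
            ["python", "-m", "pytest", "--max-worker-restart=0", "-s",
             "--make-reports=tests_custom_tokenizers", *CUSTOM_TOKENIZER_TESTS],
            check=True,
        )

    if os.path.isfile("test_list.txt"):  # written by utils/tests_fetcher.py on PRs
        run_suite()
    # The *_all nightly job calls run_suite() unconditionally.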
@@ -1026,6 +1052,42 @@ jobs:
- store_artifacts:
path: ~/transformers/reports

+  run_tests_layoutlmv2_and_v3_all:
+      working_directory: ~/transformers
+      docker:
+          - image: cimg/python:3.7.12
+      environment:
+          OMP_NUM_THREADS: 1
+          TRANSFORMERS_IS_CI: yes
+          PYTEST_TIMEOUT: 120
+      resource_class: xlarge
+      parallelism: 1
+      steps:
+          - checkout
+          - restore_cache:
+                keys:
+                    - v0.5-torch-{{ checksum "setup.py" }}
+                    - v0.5-{{ checksum "setup.py" }}
+          - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
+          - run: pip install --upgrade pip
+          - run: pip install .[torch,testing,vision]
+          - run: pip install torchvision
+          # The commit `36a65a0907d90ed591479b2ebaa8b61cfa0b4ef0` in `detectron2` breaks things.
+          # See https://github.com/facebookresearch/detectron2/commit/36a65a0907d90ed591479b2ebaa8b61cfa0b4ef0#comments.
+          # TODO: Revert this change back once the above issue is fixed.
+          - run: python -m pip install 'git+https://github.com/facebookresearch/detectron2.git'
+          - run: sudo apt install tesseract-ocr
+          - run: pip install pytesseract
+          - save_cache:
+                key: v0.5-torch-{{ checksum "setup.py" }}
+                paths:
+                    - '~/.cache/pip'
+          - run: python -m pytest -n 1 --max-worker-restart=0 tests/models/*layoutlmv* --dist=loadfile -s --make-reports=tests_layoutlmv2_and_v3 --durations=100
+          - store_artifacts:
+                path: ~/transformers/tests_output.txt
+          - store_artifacts:
+                path: ~/transformers/reports

# TPU JOBS
run_examples_tpu:
docker:
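The pytest target tests/models/*layoutlmv* in the new job relies on shell globbing, which under the post-migration test layout should expand to the LayoutLMv2 and LayoutLMv3 test directories. A quick sketch to preview the expansion (assuming the standard tests/models layout):

    # Preview what the CI glob will match before handing it to pytest.
    import glob

    matches = sorted(glob.glob("tests/models/*layoutlmv*"))
    print(matches)  # expected: ['tests/models/layoutlmv2', 'tests/models/layoutlmv3']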
@@ -1094,6 +1156,7 @@ workflows:
     - run_examples_torch_all
     - run_examples_tensorflow_all
     - run_examples_flax_all
+    - run_tests_custom_tokenizers_all
     - run_tests_torch_and_tf_all
     - run_tests_torch_and_flax_all
     - run_tests_torch_all
@@ -1103,6 +1166,7 @@
     - run_tests_pipelines_tf_all
     - run_tests_onnxruntime_all
     - run_tests_hub_all
+    - run_tests_layoutlmv2_and_v3_all

# tpu_testing_jobs:
# triggers:
1 change: 1 addition & 0 deletions setup.py
@@ -236,6 +236,7 @@ def run(self):


extras = {}
extras["blob"] = []

extras["ja"] = deps_list("fugashi", "ipadic", "unidic_lite", "unidic")
extras["sklearn"] = deps_list("scikit-learn")
5 changes: 4 additions & 1 deletion tests/test_tokenization_common.py
@@ -45,6 +45,7 @@
     SpecialTokensMixin,
     Trainer,
     TrainingArguments,
+    is_flax_available,
     is_tf_available,
     is_tokenizers_available,
     is_torch_available,
@@ -2928,8 +2929,10 @@ def test_batch_encode_dynamic_overflowing(self):
             returned_tensor = "pt"
         elif is_tf_available():
             returned_tensor = "tf"
-        else:
+        elif is_flax_available():
             returned_tensor = "jax"
+        else:
+            return
 
         if not tokenizer.pad_token or tokenizer.pad_token_id < 0:
             return
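The hunk above fixes the tensor-type fallback in test_batch_encode_dynamic_overflowing: previously an environment with neither PyTorch nor TensorFlow fell through to "jax" even when Flax was not installed; now "jax" is chosen only when Flax is available, and the test exits early otherwise. The selection logic, restated as a self-contained sketch:

    # Sketch of the fixed framework selection; None signals "skip the test".
    from transformers import is_flax_available, is_tf_available, is_torch_available

    def pick_returned_tensor():
        if is_torch_available():
            return "pt"
        if is_tf_available():
            return "tf"
        if is_flax_available():
            return "jax"
        return None  # no framework installed: return early instead of assuming jax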
