Fix custom tokenizers test (huggingface#19052)
* Fix CI for custom tokenizers

* Add nightly tests

* Run CI, run!

* Fix paths

* Typos

* Fix test
sgugger authored and oneraghavan committed Sep 26, 2022
1 parent c16e39c commit 7655fa0
Showing 3 changed files with 74 additions and 6 deletions.
74 changes: 69 additions & 5 deletions .circleci/config.yml
@@ -578,19 +578,45 @@ jobs:
          key: v0.5-custom_tokenizers-{{ checksum "setup.py" }}
          paths:
            - '~/.cache/pip'
      - run: python utils/tests_fetcher.py | tee test_preparation.txt
      - store_artifacts:
          path: ~/transformers/test_preparation.txt
      - run: |
          if [ -f test_list.txt ]; then
-           python -m pytest --max-worker-restart=0 -s --make-reports=tests_custom_tokenizers ./tests/test_tokenization_bert_japanese.py ./tests/test_tokenization_openai.py | tee tests_output.txt
-         fi
-     - run: |
-         if [ -f test_list.txt ]; then
-           python -m pytest -n 1 --max-worker-restart=0 tests/test_tokenization_clip.py --dist=loadfile -s --make-reports=tests_tokenization_clip --durations=100 | tee tests_output.txt
+           python -m pytest --max-worker-restart=0 -s --make-reports=tests_custom_tokenizers ./tests/models/bert_japanese/test_tokenization_bert_japanese.py ./tests/models/openai/test_tokenization_openai.py ./tests/models/clip/test_tokenization_clip.py | tee tests_output.txt
          fi
      - store_artifacts:
          path: ~/transformers/tests_output.txt
      - store_artifacts:
          path: ~/transformers/reports

+  run_tests_custom_tokenizers_all:
+      working_directory: ~/transformers
+      docker:
+          - image: cimg/python:3.7.12
+      environment:
+          RUN_CUSTOM_TOKENIZERS: yes
+          TRANSFORMERS_IS_CI: yes
+          PYTEST_TIMEOUT: 120
+      steps:
+          - checkout
+          - restore_cache:
+                keys:
+                    - v0.5-custom_tokenizers-{{ checksum "setup.py" }}
+                    - v0.5-{{ checksum "setup.py" }}
+          - run: pip install --upgrade pip
+          - run: pip install .[ja,testing,sentencepiece,jieba,spacy,ftfy,rjieba]
+          - run: python -m unidic download
+          - save_cache:
+                key: v0.5-custom_tokenizers-{{ checksum "setup.py" }}
+                paths:
+                    - '~/.cache/pip'
+          - run: python -m pytest --max-worker-restart=0 -s --make-reports=tests_custom_tokenizers ./tests/models/bert_japanese/test_tokenization_bert_japanese.py ./tests/models/openai/test_tokenization_openai.py ./tests/models/clip/test_tokenization_clip.py | tee tests_output.txt
+          - store_artifacts:
+                path: ~/transformers/tests_output.txt
+          - store_artifacts:
+                path: ~/transformers/reports

run_examples_torch:
working_directory: ~/transformers
docker:
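Context for the two custom-tokenizer jobs above: on pull requests, utils/tests_fetcher.py writes the impacted test modules to test_list.txt, and the pytest step is skipped when that file is absent; the new run_tests_custom_tokenizers_all variant (added for the nightly workflow) drops that guard and always runs the full suite. A minimal Python sketch of the gating logic, using the file names from the diff (run_suite is a hypothetical helper, and the tee redirection is omitted):

    # Sketch of the CI gating: PR jobs run only fetched tests; nightly runs everything.
    import os
    import subprocess

    CUSTOM_TOKENIZER_TESTS = [
        "./tests/models/bert_japanese/test_tokenization_bert_japanese.py",
        "./tests/models/openai/test_tokenization_openai.py",
        "./tests/models/clip/test_tokenization_clip.py",
    ]

    def run_suite():
        # Mirrors the pytest invocation from the config above (minus the tee).
        subprocess.run(
            ["python", "-m", "pytest", "--max-worker-restart=0", "-s",
             "--make-reports=tests_custom_tokenizers", *CUSTOM_TOKENIZER_TESTS],
            check=True,
        )

    if os.path.isfile("test_list.txt"):  # written by utils/tests_fetcher.py on PRs
        run_suite()
    # The *_all nightly job calls run_suite() unconditionally.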
@@ -1026,6 +1052,42 @@ jobs:
- store_artifacts:
path: ~/transformers/reports

+  run_tests_layoutlmv2_and_v3_all:
+      working_directory: ~/transformers
+      docker:
+          - image: cimg/python:3.7.12
+      environment:
+          OMP_NUM_THREADS: 1
+          TRANSFORMERS_IS_CI: yes
+          PYTEST_TIMEOUT: 120
+      resource_class: xlarge
+      parallelism: 1
+      steps:
+          - checkout
+          - restore_cache:
+                keys:
+                    - v0.5-torch-{{ checksum "setup.py" }}
+                    - v0.5-{{ checksum "setup.py" }}
+          - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
+          - run: pip install --upgrade pip
+          - run: pip install .[torch,testing,vision]
+          - run: pip install torchvision
+          # The commit `36a65a0907d90ed591479b2ebaa8b61cfa0b4ef0` in `detectron2` breaks things.
+          # See https://github.com/facebookresearch/detectron2/commit/36a65a0907d90ed591479b2ebaa8b61cfa0b4ef0#comments.
+          # TODO: Revert this change back once the above issue is fixed.
+          - run: python -m pip install 'git+https://github.com/facebookresearch/detectron2.git'
+          - run: sudo apt install tesseract-ocr
+          - run: pip install pytesseract
+          - save_cache:
+                key: v0.5-torch-{{ checksum "setup.py" }}
+                paths:
+                    - '~/.cache/pip'
+          - run: python -m pytest -n 1 --max-worker-restart=0 tests/models/*layoutlmv* --dist=loadfile -s --make-reports=tests_layoutlmv2_and_v3 --durations=100
+          - store_artifacts:
+                path: ~/transformers/tests_output.txt
+          - store_artifacts:
+                path: ~/transformers/reports

# TPU JOBS
run_examples_tpu:
docker:
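The pytest target tests/models/*layoutlmv* in the new job relies on shell globbing, which under the post-migration test layout should expand to the LayoutLMv2 and LayoutLMv3 test directories. A quick sketch to preview the expansion (assuming the standard tests/models layout):

    # Preview what the CI glob will match before handing it to pytest.
    import glob

    matches = sorted(glob.glob("tests/models/*layoutlmv*"))
    print(matches)  # expected: ['tests/models/layoutlmv2', 'tests/models/layoutlmv3']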
@@ -1094,6 +1156,7 @@ workflows:
     - run_examples_torch_all
     - run_examples_tensorflow_all
     - run_examples_flax_all
+    - run_tests_custom_tokenizers_all
     - run_tests_torch_and_tf_all
     - run_tests_torch_and_flax_all
     - run_tests_torch_all
@@ -1103,6 +1166,7 @@
     - run_tests_pipelines_tf_all
     - run_tests_onnxruntime_all
     - run_tests_hub_all
+    - run_tests_layoutlmv2_and_v3_all

# tpu_testing_jobs:
# triggers:
1 change: 1 addition & 0 deletions setup.py
@@ -236,6 +236,7 @@ def run(self):


extras = {}
extras["blob"] = []

extras["ja"] = deps_list("fugashi", "ipadic", "unidic_lite", "unidic")
extras["sklearn"] = deps_list("scikit-learn")
5 changes: 4 additions & 1 deletion tests/test_tokenization_common.py
@@ -45,6 +45,7 @@
     SpecialTokensMixin,
     Trainer,
     TrainingArguments,
+    is_flax_available,
     is_tf_available,
     is_tokenizers_available,
     is_torch_available,
@@ -2928,8 +2929,10 @@ def test_batch_encode_dynamic_overflowing(self):
             returned_tensor = "pt"
         elif is_tf_available():
             returned_tensor = "tf"
-        else:
+        elif is_flax_available():
             returned_tensor = "jax"
+        else:
+            return
 
         if not tokenizer.pad_token or tokenizer.pad_token_id < 0:
             return
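The hunk above fixes the tensor-type fallback in test_batch_encode_dynamic_overflowing: previously an environment with neither PyTorch nor TensorFlow fell through to "jax" even when Flax was not installed; now "jax" is chosen only when Flax is available, and the test exits early otherwise. The selection logic, restated as a self-contained sketch:

    # Sketch of the fixed framework selection; None signals "skip the test".
    from transformers import is_flax_available, is_tf_available, is_torch_available

    def pick_returned_tensor():
        if is_torch_available():
            return "pt"
        if is_tf_available():
            return "tf"
        if is_flax_available():
            return "jax"
        return None  # no framework installed: return early instead of assuming jax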
