diff --git a/.circleci/create_circleci_config.py b/.circleci/create_circleci_config.py index 30898f9e1c2a4d..31627dbe594a75 100644 --- a/.circleci/create_circleci_config.py +++ b/.circleci/create_circleci_config.py @@ -432,10 +432,13 @@ def job_name(self): tests_to_run="tests/repo_utils", ) -# At this moment, only the files that are in `utils/documentation_tests.txt` will be kept (together with a dummy file). -py_command = 'import os; import json; fp = open("pr_documentation_tests.txt"); data_1 = fp.read().strip().split("\\n"); fp = open("utils/documentation_tests.txt"); data_2 = fp.read().strip().split("\\n"); to_test = [x for x in data_1 if x in set(data_2)] + ["dummy.py"]; to_test = " ".join(to_test); print(to_test)' + +# We also include a `dummy.py` file in the files to be doc-tested to prevent edge case failure. Otherwise, the pytest +# hangs forever during test collection while showing `collecting 0 items / 21 errors`. (To see this, we have to remove +# the bash output redirection.) +py_command = 'from utils.tests_fetcher import get_doctest_files; to_test = get_doctest_files() + ["dummy.py"]; to_test = " ".join(to_test); print(to_test)' py_command = f"$(python3 -c '{py_command}')" -command = f'echo "{py_command}" > pr_documentation_tests_filtered.txt' +command = f'echo "{py_command}" > pr_documentation_tests_temp.txt' doc_test_job = CircleCIJob( "pr_documentation_tests", additional_env={"TRANSFORMERS_VERBOSITY": "error", "DATASETS_VERBOSITY": "error", "SKIP_CUDA_DOCTEST": "1"}, @@ -451,27 +454,20 @@ def job_name(self): "touch dummy.py", { "name": "Get files to test", - "command": - "git remote add upstream https://github.com/huggingface/transformers.git && git fetch upstream \n" - "git diff --name-only --relative --diff-filter=AMR refs/remotes/upstream/main...HEAD | grep -E '\.(py|mdx)$' | grep -Ev '^\..*|/\.' | grep -Ev '__' > pr_documentation_tests.txt" - }, - { - "name": "List files beings changed: pr_documentation_tests.txt", - "command": - "cat pr_documentation_tests.txt" + "command": command, }, { - "name": "Filter pr_documentation_tests.txt", + "name": "Show information in `Get files to test`", "command": - command + "cat pr_documentation_tests_temp.txt" }, { - "name": "List files beings tested: pr_documentation_tests_filtered.txt", + "name": "Get the last line in `pr_documentation_tests.txt`", "command": - "cat pr_documentation_tests_filtered.txt" + "tail -n1 pr_documentation_tests_temp.txt | tee pr_documentation_tests.txt" }, ], - tests_to_run="$(cat pr_documentation_tests_filtered.txt)", # noqa + tests_to_run="$(cat pr_documentation_tests.txt)", # noqa pytest_options={"-doctest-modules": None, "doctest-glob": "*.mdx", "dist": "loadfile", "rvsA": None}, command_timeout=1200, # test cannot run longer than 1200 seconds pytest_num_workers=1, diff --git a/.github/workflows/doctests.yml b/.github/workflows/doctests.yml index 55c09b1acc829b..76eeb001dfd2b3 100644 --- a/.github/workflows/doctests.yml +++ b/.github/workflows/doctests.yml @@ -25,11 +25,17 @@ jobs: image: huggingface/transformers-all-latest-gpu options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ steps: + - name: uninstall transformers (installed during docker image build) + run: python3 -m pip uninstall -y transformers + - uses: actions/checkout@v3 - name: NVIDIA-SMI run: | nvidia-smi + - name: Install transformers in edit mode + run: python3 -m pip install -e . + - name: GPU visibility run: | python3 utils/print_env.py diff --git a/utils/tests_fetcher.py b/utils/tests_fetcher.py index 702b1e126ab468..8aa1015f394798 100644 --- a/utils/tests_fetcher.py +++ b/utils/tests_fetcher.py @@ -216,6 +216,66 @@ def get_modified_python_files(diff_with_last_commit=False): return get_diff(repo, repo.head.commit, parent_commits) +def get_diff_for_py_and_mdx_files(repo, base_commit, commits): + """ + Get's the diff between one or several commits and the head of the repository. + """ + print("\n### DIFF ###\n") + code_diff = [] + for commit in commits: + for diff_obj in commit.diff(base_commit): + # We always add new python files + if diff_obj.change_type in ["A", "M", "R"] and ( + diff_obj.b_path.endswith(".py") or diff_obj.b_path.endswith(".mdx") + ): + code_diff.append(diff_obj.b_path) + + return code_diff + + +def get_modified_python_and_mdx_files(diff_with_last_commit=False): + """ + Return a list of python and mdx files that have been modified between: + + - the current head and the main branch if `diff_with_last_commit=False` (default) + - the current head and its parent commit otherwise. + """ + repo = Repo(PATH_TO_REPO) + + if not diff_with_last_commit: + print(f"main is at {repo.refs.main.commit}") + print(f"Current head is at {repo.head.commit}") + + branching_commits = repo.merge_base(repo.refs.main, repo.head) + for commit in branching_commits: + print(f"Branching commit: {commit}") + return get_diff_for_py_and_mdx_files(repo, repo.head.commit, branching_commits) + else: + print(f"main is at {repo.head.commit}") + parent_commits = repo.head.commit.parents + for commit in parent_commits: + print(f"Parent commit: {commit}") + return get_diff_for_py_and_mdx_files(repo, repo.head.commit, parent_commits) + + +def get_doctest_files(diff_with_last_commit=False): + """ + Return a list of python and mdx files that have been modified between: + + - the current head and the main branch if `diff_with_last_commit=False` (default) + - the current head and its parent commit otherwise. + """ + test_files_to_run = get_modified_python_and_mdx_files(diff_with_last_commit) + with open("utils/documentation_tests.txt") as fp: + documentation_tests = set(fp.read().strip().split("\n")) + # So far we don't have 100% coverage for doctest. This line will be removed once we achieve 100%. + test_files_to_run = [x for x in test_files_to_run if x in documentation_tests] + # Make sure we did not end up with a test file that was removed + test_files_to_run = [f for f in test_files_to_run if (PATH_TO_REPO / f).exists()] + + return test_files_to_run + + # (:?^|\n) -> Non-catching group for the beginning of the doc or a new line. # \s*from\s+(\.+\S+)\s+import\s+([^\n]+) -> Line only contains from .xxx import yyy and we catch .xxx and yyy # (?=\n) -> Look-ahead to a new line. We can't just put \n here or using find_all on this re will only catch every