Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix doctest files fetch issue #23277

Merged
merged 8 commits into from
May 11, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 12 additions & 16 deletions .circleci/create_circleci_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -432,10 +432,13 @@ def job_name(self):
tests_to_run="tests/repo_utils",
)

# At this moment, only the files that are in `utils/documentation_tests.txt` will be kept (together with a dummy file).
py_command = 'import os; import json; fp = open("pr_documentation_tests.txt"); data_1 = fp.read().strip().split("\\n"); fp = open("utils/documentation_tests.txt"); data_2 = fp.read().strip().split("\\n"); to_test = [x for x in data_1 if x in set(data_2)] + ["dummy.py"]; to_test = " ".join(to_test); print(to_test)'

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

put the logic to some new functions in utils/tests_fetcher.py.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's cleaner thanks!

# We also include a `dummy.py` file in the files to be doc-tested to prevent edge case failure. Otherwise, the pytest
# hangs forever during test collection while showing `collecting 0 items / 21 errors`. (To see this, we have to remove
# the bash output redirection.)
py_command = 'from utils.tests_fetcher import get_doctest_files; to_test = get_doctest_files() + ["dummy.py"]; to_test = " ".join(to_test); print(to_test)'
py_command = f"$(python3 -c '{py_command}')"
command = f'echo "{py_command}" > pr_documentation_tests_filtered.txt'
command = f'echo "{py_command}" > pr_documentation_tests_temp.txt'
doc_test_job = CircleCIJob(
"pr_documentation_tests",
additional_env={"TRANSFORMERS_VERBOSITY": "error", "DATASETS_VERBOSITY": "error", "SKIP_CUDA_DOCTEST": "1"},
Expand All @@ -451,27 +454,20 @@ def job_name(self):
"touch dummy.py",
{
"name": "Get files to test",
"command":
"git remote add upstream https://github.com/huggingface/transformers.git && git fetch upstream \n"
"git diff --name-only --relative --diff-filter=AMR refs/remotes/upstream/main...HEAD | grep -E '\.(py|mdx)$' | grep -Ev '^\..*|/\.' | grep -Ev '__' > pr_documentation_tests.txt"
Comment on lines -455 to -456
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is the stuff that probably needs to be different on main vs a PR branch. The test fetcher has this test that can be reused/adapted.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The problems comes from the fact grep gives exit code 1 if the input is empty.

circleci@9576b4afebd1:~/transformers$ echo "" | grep -E '\.(mdx)$'
circleci@9576b4afebd1:~/transformers$ echo $?
1

Considering the chain of 3 grep in the 2nd line here, I think it's best to have a python function to do things.

},
{
"name": "List files beings changed: pr_documentation_tests.txt",
"command":
"cat pr_documentation_tests.txt"
"command": command,
},
{
"name": "Filter pr_documentation_tests.txt",
"name": "Show information in `Get files to test`",
"command":
command
"cat pr_documentation_tests_temp.txt"
},
{
"name": "List files beings tested: pr_documentation_tests_filtered.txt",
"name": "Get the last line in `pr_documentation_tests.txt`",
"command":
"cat pr_documentation_tests_filtered.txt"
"tail -n1 pr_documentation_tests_temp.txt | tee pr_documentation_tests.txt"
},
],
tests_to_run="$(cat pr_documentation_tests_filtered.txt)", # noqa
tests_to_run="$(cat pr_documentation_tests.txt)", # noqa
pytest_options={"-doctest-modules": None, "doctest-glob": "*.mdx", "dist": "loadfile", "rvsA": None},
command_timeout=1200, # test cannot run longer than 1200 seconds
pytest_num_workers=1,
Expand Down
6 changes: 6 additions & 0 deletions .github/workflows/doctests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,17 @@ jobs:
image: huggingface/transformers-all-latest-gpu
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
steps:
- name: uninstall transformers (installed during docker image build)
run: python3 -m pip uninstall -y transformers

- uses: actions/checkout@v3
- name: NVIDIA-SMI
run: |
nvidia-smi

- name: Install transformers in edit mode
run: python3 -m pip install -e .

- name: GPU visibility
run: |
python3 utils/print_env.py
Expand Down
60 changes: 60 additions & 0 deletions utils/tests_fetcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,6 +216,66 @@ def get_modified_python_files(diff_with_last_commit=False):
return get_diff(repo, repo.head.commit, parent_commits)


def get_diff_for_py_and_mdx_files(repo, base_commit, commits):
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

just copies of existing functions with some modifications for doctest test files.

"""
Get's the diff between one or several commits and the head of the repository.
"""
print("\n### DIFF ###\n")
code_diff = []
for commit in commits:
for diff_obj in commit.diff(base_commit):
# We always add new python files
if diff_obj.change_type in ["A", "M", "R"] and (
diff_obj.b_path.endswith(".py") or diff_obj.b_path.endswith(".mdx")
):
code_diff.append(diff_obj.b_path)

return code_diff


def get_modified_python_and_mdx_files(diff_with_last_commit=False):
"""
Return a list of python and mdx files that have been modified between:

- the current head and the main branch if `diff_with_last_commit=False` (default)
- the current head and its parent commit otherwise.
"""
repo = Repo(PATH_TO_REPO)

if not diff_with_last_commit:
print(f"main is at {repo.refs.main.commit}")
print(f"Current head is at {repo.head.commit}")

branching_commits = repo.merge_base(repo.refs.main, repo.head)
for commit in branching_commits:
print(f"Branching commit: {commit}")
return get_diff_for_py_and_mdx_files(repo, repo.head.commit, branching_commits)
else:
print(f"main is at {repo.head.commit}")
parent_commits = repo.head.commit.parents
for commit in parent_commits:
print(f"Parent commit: {commit}")
return get_diff_for_py_and_mdx_files(repo, repo.head.commit, parent_commits)


def get_doctest_files(diff_with_last_commit=False):
"""
Return a list of python and mdx files that have been modified between:

- the current head and the main branch if `diff_with_last_commit=False` (default)
- the current head and its parent commit otherwise.
"""
test_files_to_run = get_modified_python_and_mdx_files(diff_with_last_commit)
with open("utils/documentation_tests.txt") as fp:
documentation_tests = set(fp.read().strip().split("\n"))
# So far we don't have 100% coverage for doctest. This line will be removed once we achieve 100%.
test_files_to_run = [x for x in test_files_to_run if x in documentation_tests]
# Make sure we did not end up with a test file that was removed
test_files_to_run = [f for f in test_files_to_run if (PATH_TO_REPO / f).exists()]

return test_files_to_run


# (:?^|\n) -> Non-catching group for the beginning of the doc or a new line.
# \s*from\s+(\.+\S+)\s+import\s+([^\n]+) -> Line only contains from .xxx import yyy and we catch .xxx and yyy
# (?=\n) -> Look-ahead to a new line. We can't just put \n here or using find_all on this re will only catch every
Expand Down