-
Notifications
You must be signed in to change notification settings - Fork 27.4k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fix doctest files fetch issue #23277
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -432,10 +432,13 @@ def job_name(self): | |
tests_to_run="tests/repo_utils", | ||
) | ||
|
||
# At this moment, only the files that are in `utils/documentation_tests.txt` will be kept (together with a dummy file). | ||
py_command = 'import os; import json; fp = open("pr_documentation_tests.txt"); data_1 = fp.read().strip().split("\\n"); fp = open("utils/documentation_tests.txt"); data_2 = fp.read().strip().split("\\n"); to_test = [x for x in data_1 if x in set(data_2)] + ["dummy.py"]; to_test = " ".join(to_test); print(to_test)' | ||
|
||
# We also include a `dummy.py` file in the files to be doc-tested to prevent edge case failure. Otherwise, the pytest | ||
# hangs forever during test collection while showing `collecting 0 items / 21 errors`. (To see this, we have to remove | ||
# the bash output redirection.) | ||
py_command = 'from utils.tests_fetcher import get_doctest_files; to_test = get_doctest_files() + ["dummy.py"]; to_test = " ".join(to_test); print(to_test)' | ||
py_command = f"$(python3 -c '{py_command}')" | ||
command = f'echo "{py_command}" > pr_documentation_tests_filtered.txt' | ||
command = f'echo "{py_command}" > pr_documentation_tests_temp.txt' | ||
doc_test_job = CircleCIJob( | ||
"pr_documentation_tests", | ||
additional_env={"TRANSFORMERS_VERBOSITY": "error", "DATASETS_VERBOSITY": "error", "SKIP_CUDA_DOCTEST": "1"}, | ||
|
@@ -451,27 +454,20 @@ def job_name(self): | |
"touch dummy.py", | ||
{ | ||
"name": "Get files to test", | ||
"command": | ||
"git remote add upstream https://github.com/huggingface/transformers.git && git fetch upstream \n" | ||
"git diff --name-only --relative --diff-filter=AMR refs/remotes/upstream/main...HEAD | grep -E '\.(py|mdx)$' | grep -Ev '^\..*|/\.' | grep -Ev '__' > pr_documentation_tests.txt" | ||
Comment on lines
-455
to
-456
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is the stuff that probably needs to be different on main vs a PR branch. The test fetcher has this test that can be reused/adapted. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The problems comes from the fact circleci@9576b4afebd1:~/transformers$ echo "" | grep -E '\.(mdx)$'
circleci@9576b4afebd1:~/transformers$ echo $?
1 Considering the chain of 3 grep in the 2nd line here, I think it's best to have a python function to do things. |
||
}, | ||
{ | ||
"name": "List files beings changed: pr_documentation_tests.txt", | ||
"command": | ||
"cat pr_documentation_tests.txt" | ||
"command": command, | ||
}, | ||
{ | ||
"name": "Filter pr_documentation_tests.txt", | ||
"name": "Show information in `Get files to test`", | ||
"command": | ||
command | ||
"cat pr_documentation_tests_temp.txt" | ||
}, | ||
{ | ||
"name": "List files beings tested: pr_documentation_tests_filtered.txt", | ||
"name": "Get the last line in `pr_documentation_tests.txt`", | ||
"command": | ||
"cat pr_documentation_tests_filtered.txt" | ||
"tail -n1 pr_documentation_tests_temp.txt | tee pr_documentation_tests.txt" | ||
}, | ||
], | ||
tests_to_run="$(cat pr_documentation_tests_filtered.txt)", # noqa | ||
tests_to_run="$(cat pr_documentation_tests.txt)", # noqa | ||
pytest_options={"-doctest-modules": None, "doctest-glob": "*.mdx", "dist": "loadfile", "rvsA": None}, | ||
command_timeout=1200, # test cannot run longer than 1200 seconds | ||
pytest_num_workers=1, | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -216,6 +216,66 @@ def get_modified_python_files(diff_with_last_commit=False): | |
return get_diff(repo, repo.head.commit, parent_commits) | ||
|
||
|
||
def get_diff_for_py_and_mdx_files(repo, base_commit, commits): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. just copies of existing functions with some modifications for doctest test files. |
||
""" | ||
Get's the diff between one or several commits and the head of the repository. | ||
""" | ||
print("\n### DIFF ###\n") | ||
code_diff = [] | ||
for commit in commits: | ||
for diff_obj in commit.diff(base_commit): | ||
# We always add new python files | ||
if diff_obj.change_type in ["A", "M", "R"] and ( | ||
diff_obj.b_path.endswith(".py") or diff_obj.b_path.endswith(".mdx") | ||
): | ||
code_diff.append(diff_obj.b_path) | ||
|
||
return code_diff | ||
|
||
|
||
def get_modified_python_and_mdx_files(diff_with_last_commit=False): | ||
""" | ||
Return a list of python and mdx files that have been modified between: | ||
|
||
- the current head and the main branch if `diff_with_last_commit=False` (default) | ||
- the current head and its parent commit otherwise. | ||
""" | ||
repo = Repo(PATH_TO_REPO) | ||
|
||
if not diff_with_last_commit: | ||
print(f"main is at {repo.refs.main.commit}") | ||
print(f"Current head is at {repo.head.commit}") | ||
|
||
branching_commits = repo.merge_base(repo.refs.main, repo.head) | ||
for commit in branching_commits: | ||
print(f"Branching commit: {commit}") | ||
return get_diff_for_py_and_mdx_files(repo, repo.head.commit, branching_commits) | ||
else: | ||
print(f"main is at {repo.head.commit}") | ||
parent_commits = repo.head.commit.parents | ||
for commit in parent_commits: | ||
print(f"Parent commit: {commit}") | ||
return get_diff_for_py_and_mdx_files(repo, repo.head.commit, parent_commits) | ||
|
||
|
||
def get_doctest_files(diff_with_last_commit=False): | ||
""" | ||
Return a list of python and mdx files that have been modified between: | ||
|
||
- the current head and the main branch if `diff_with_last_commit=False` (default) | ||
- the current head and its parent commit otherwise. | ||
""" | ||
test_files_to_run = get_modified_python_and_mdx_files(diff_with_last_commit) | ||
with open("utils/documentation_tests.txt") as fp: | ||
documentation_tests = set(fp.read().strip().split("\n")) | ||
# So far we don't have 100% coverage for doctest. This line will be removed once we achieve 100%. | ||
test_files_to_run = [x for x in test_files_to_run if x in documentation_tests] | ||
# Make sure we did not end up with a test file that was removed | ||
test_files_to_run = [f for f in test_files_to_run if (PATH_TO_REPO / f).exists()] | ||
|
||
return test_files_to_run | ||
|
||
|
||
# (:?^|\n) -> Non-catching group for the beginning of the doc or a new line. | ||
# \s*from\s+(\.+\S+)\s+import\s+([^\n]+) -> Line only contains from .xxx import yyy and we catch .xxx and yyy | ||
# (?=\n) -> Look-ahead to a new line. We can't just put \n here or using find_all on this re will only catch every | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
put the logic to some new functions in
utils/tests_fetcher.py
.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That's cleaner thanks!