Skip to content

Commit

Permalink
Document toc check and doctest check scripts (huggingface#25319)
Browse files Browse the repository at this point in the history
* Clean doc toc check and make doctest list better

* Add to Makefile
  • Loading branch information
sgugger authored Aug 4, 2023
1 parent ce6d153 commit fdaef33
Show file tree
Hide file tree
Showing 3 changed files with 83 additions and 9 deletions.
1 change: 1 addition & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ fix-copies:
python utils/check_copies.py --fix_and_overwrite
python utils/check_table.py --fix_and_overwrite
python utils/check_dummies.py --fix_and_overwrite
python utils/check_doctest_list.py --fix_and_overwrite
python utils/check_task_guides.py --fix_and_overwrite

# Run tests for the library
Expand Down
38 changes: 37 additions & 1 deletion utils/check_doc_toc.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,25 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This script is responsible for cleaning the model section of the table of contents by removing duplicates and sorting
the entries in alphabetical order.
Usage (from the root of the repo):
Check that the table of content is properly sorted (used in `make quality`):
```bash
python utils/check_doc_toc.py
```
Auto-sort the table of content if it is not properly sorted (used in `make style`):
```bash
python utils/check_doc_toc.py --fix_and_overwrite
```
"""


import argparse
from collections import defaultdict
Expand All @@ -24,7 +43,15 @@

def clean_model_doc_toc(model_doc):
"""
Cleans the table of content of the model documentation by removing duplicates and sorting models alphabetically.
Cleans a section of the table of contents of the model documentation (one specific modality) by removing duplicates
and sorting models alphabetically.
Args:
model_doc (`List[dict]`):
The list of dictionaries extracted from the `_toctree.yml` file for this specific modality.
Returns:
`List[dict]`: List of dictionaries like the input, but cleaned up and sorted.
"""
counts = defaultdict(int)
for doc in model_doc:
Expand All @@ -51,6 +78,14 @@ def clean_model_doc_toc(model_doc):


def check_model_doc(overwrite=False):
"""
Check that the table of contents in `_toctree.yml` is clean (no duplicates and sorted for the model API doc) and
potentially auto-clean it.
Args:
overwrite (`bool`, *optional*, defaults to `False`):
Whether to just check if the TOC is clean or to auto-clean it (when `overwrite=True`).
"""
with open(PATH_TO_TOC, encoding="utf-8") as f:
content = yaml.safe_load(f.read())

Expand All @@ -67,6 +102,7 @@ def check_model_doc(overwrite=False):

model_doc = api_doc[model_idx]["sections"]

# Extract the modalities and clean them one by one.
modalities_docs = [(idx, section) for idx, section in enumerate(model_doc) if "sections" in section]
diff = False
for idx, modality_doc in modalities_docs:
Expand Down
53 changes: 45 additions & 8 deletions utils/check_doctest_list.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,28 +12,65 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This script is responsible for cleaning the list of doctests by making sure the entries all exist and are in
alphabetical order.
Usage (from the root of the repo):
Check that the doctest list is properly sorted and all files exist (used in `make repo-consistency`):
```bash
python utils/check_doctest_list.py
```
Auto-sort the doctest list if it is not properly sorted (used in `make fix-copies`):
```bash
python utils/check_doctest_list.py --fix_and_overwrite
```
"""
import argparse
import os


# All paths are set with the intent you should run this script from the root of the repo with the command
# python utils/check_doctest_list.py
REPO_PATH = "."
DOCTEST_FILE_PATHS = ["documentation_tests.txt", "slow_documentation_tests.txt"]


if __name__ == "__main__":
doctest_file_path = os.path.join(REPO_PATH, "utils/documentation_tests.txt")
def clean_doctest_list(doctest_file, overwrite=False):
    """
    Cleans the doctest list stored in a given file by checking that every entry exists and that the entries are in
    alphabetical order.

    Args:
        doctest_file (`str`):
            The path to the doctest list file to check (one entry per line).
        overwrite (`bool`, *optional*, defaults to `False`):
            Whether to rewrite the file with its entries sorted (when `overwrite=True`) or to raise an error if the
            entries are not sorted.

    Raises:
        `ValueError`: If an entry is neither a file nor a directory once joined with `REPO_PATH`, or if the entries
        are not in alphabetical order and `overwrite=False`.
    """
    non_existent_paths = []
    all_paths = []
    with open(doctest_file, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            path = os.path.join(REPO_PATH, line)
            if not (os.path.isfile(path) or os.path.isdir(path)):
                non_existent_paths.append(line)
            # Keep the entry as written in the file (not the joined path) so the file can be rewritten verbatim.
            all_paths.append(line)

    if non_existent_paths:
        missing_entries = "\n".join(f"- {entry}" for entry in non_existent_paths)
        # Report the file actually being checked: this function is called for several doctest lists.
        raise ValueError(f"`{doctest_file}` contains non-existent paths:\n{missing_entries}")

    sorted_paths = sorted(all_paths)
    if all_paths != sorted_paths:
        if not overwrite:
            raise ValueError(
                f"Files in `{doctest_file}` are not in alphabetical order, run `make fix-copies` to fix "
                "this automatically."
            )
        with open(doctest_file, "w", encoding="utf-8") as f:
            f.write("\n".join(sorted_paths) + "\n")


if __name__ == "__main__":
    # Command-line entry point: check every doctest list, fixing the sort order in place when
    # `--fix_and_overwrite` is passed.
    cli = argparse.ArgumentParser()
    cli.add_argument("--fix_and_overwrite", action="store_true", help="Whether to fix inconsistencies.")
    options = cli.parse_args()

    # The doctest lists all live in the `utils` folder of the repo.
    utils_dir = os.path.join(REPO_PATH, "utils")
    for file_name in DOCTEST_FILE_PATHS:
        clean_doctest_list(os.path.join(utils_dir, file_name), options.fix_and_overwrite)

0 comments on commit fdaef33

Please sign in to comment.