[Enhance] update dev_scripts for link checking #1164

Merged · 1 commit · Oct 8, 2022
22 changes: 19 additions & 3 deletions .dev_scripts/README.md
@@ -9,6 +9,7 @@
- [4. Monitor your training](#4-monitor-your-training)
- [5. Train with a list of models](#5-train-with-a-list-of-models)
- [6. Train with skipping a list of models](#6-train-with-skipping-a-list-of-models)
- [7. Automatically check links](#automatically-check-links)

## 1. Check UT

@@ -128,7 +129,7 @@
python .dev_scripts/train_benchmark.py mm_lol \
--quotatype=auto
```

## 4. Monitor your training

After submitting jobs as described in [3-Train-all-the-models](#3-train-all-the-models), you will find an `xxx.log` file.
This log file lists the names and IDs of all the jobs you have submitted. With it, you can monitor your training by running `.dev_scripts/job_watcher.py`.
@@ -141,7 +142,7 @@
python .dev_scripts/job_watcher.py --work-dir work_dirs/benchmark_fp32/ --log 20

Then, you will find `20220923-140317.csv`, which reports the status and recent log of each job.
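The CSV report can also be consumed programmatically; a minimal sketch (the `name`/`status` column names here are assumptions, not the watcher's documented schema):

```python
import csv
import io

# Hypothetical sample of the watcher's CSV report; real columns may differ.
report = io.StringIO('name,status\njob_a,RUNNING\njob_b,FAILED\n')

# Collect the jobs that need attention.
failed = [row['name'] for row in csv.DictReader(report) if row['status'] == 'FAILED']
print(failed)
```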

## 5. Train with a list of models

If you only need to run some of the models, you can list all the model names in a file and specify them when running `train_benchmark.py`.

@@ -162,7 +163,7 @@
python .dev_scripts/train_benchmark.py mm_lol \

Specifically, you need to enable `--rerun` and specify the list of models to rerun with `--rerun-list`.
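The rerun list is just a plain-text file of model names; parsing one might look like this (the model names below are made up for illustration):

```python
import io

# Hypothetical contents of a rerun list: one model name per line.
rerun_list = io.StringIO('model_a\n\nmodel_b\n')

# Keep non-empty lines, stripped of surrounding whitespace.
models = [line.strip() for line in rerun_list if line.strip()]
print(models)
```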

## 6. Train with skipping a list of models

If you want to train all the models while skipping some, you can likewise list the model names to skip in a file and specify them when running `train_benchmark.py`.

@@ -182,3 +183,18 @@
python .dev_scripts/train_benchmark.py mm_lol \
```

Specifically, you need to enable `--skip` and specify the list of models to skip with `--skip-list`.

## Automatically check links

Use the following script to check whether the links in the documentation are valid:

```shell
python3 .github/scripts/doc_link_checker.py --target docs/zh_cn
python3 .github/scripts/doc_link_checker.py --target README_zh-CN.md
python3 .github/scripts/doc_link_checker.py --target docs/en
python3 .github/scripts/doc_link_checker.py --target README.md
```

You can point `--target` at either a file or a directory.

**Note:** DO NOT use it in CI: issuing too many HTTP requests from CI will cause 503 errors and the CI job will probably fail.
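The checker is built around a single non-greedy regex over markdown links; the extraction step can be sketched in isolation like this (the sample line is invented for illustration):

```python
import re

# Non-greedy match of markdown links: [text](target)
pattern = re.compile(r'\[.*?\]\(.*?\)')

line = 'See [the guide](docs/en/guide.md) or [the site](https://example.com).'
targets = []
for item in pattern.findall(line):
    # Take everything between '](' and the trailing ')'
    targets.append(item[item.find('](') + 2:-1])
print(targets)
```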
85 changes: 85 additions & 0 deletions .dev_scripts/doc_link_checker.py
@@ -0,0 +1,85 @@
# Copyright (c) MegFlow. All rights reserved.
# /bin/python3

import argparse
import os
import re


def make_parser():
    parser = argparse.ArgumentParser('Doc link checker')
    parser.add_argument(
        '--http',
        default=False,
        action='store_true',
        help='check http links or not')
    parser.add_argument(
        '--target',
        default='./docs',
        type=str,
        help='the directory or file to check')
    return parser


# non-greedy match of markdown links: [text](target)
pattern = re.compile(r'\[.*?\]\(.*?\)')


def analyze_doc(home, path):
    print('analyze {}'.format(path))
    problem_list = []
    code_block = 0
    with open(path) as f:
        lines = f.readlines()
        for line in lines:
            line = line.strip()
            if line.startswith('```'):
                # toggle fenced-code state; links inside code blocks are skipped
                code_block = 1 - code_block

            if code_block > 0:
                continue

            if '[' in line and ']' in line and '(' in line and ')' in line:
                all_links = pattern.findall(line)
                for item in all_links:
                    # skip image links ![]()
                    if item.find('[') == item.find(']') - 1:
                        continue

                    # process the case [text()]()
                    offset = item.find('](')
                    if offset == -1:
                        continue
                    item = item[offset:]
                    start = item.find('(')
                    end = item.find(')')
                    ref = item[start + 1:end]

                    # http links and in-page anchors are not checked here
                    if ref.startswith('http') or ref.startswith('#'):
                        continue
                    # drop the '#section' fragment so only the file is checked
                    if '.md#' in ref:
                        ref = ref[:ref.find('#')]
                    fullpath = os.path.join(home, ref)
                    if not os.path.exists(fullpath):
                        problem_list.append(ref)
    if len(problem_list) > 0:
        print(f'{path}:')
        for item in problem_list:
            print(f'\t {item}')
        print('\n')
        raise Exception('found link error')


def traverse(target):
    if os.path.isfile(target):
        analyze_doc(os.path.dirname(target), target)
        return
    for home, dirs, files in os.walk(target):
        for filename in files:
            if filename.endswith('.md'):
                path = os.path.join(home, filename)
                if not os.path.islink(path):
                    analyze_doc(home, path)


if __name__ == '__main__':
    args = make_parser().parse_args()
    traverse(args.target)
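To see the existence check in action without touching the repository, the core logic can be exercised against a scratch directory. This is a condensed re-statement of what `analyze_doc` does, not the script itself:

```python
import os
import re
import tempfile

# Same non-greedy markdown-link pattern as the checker uses.
pattern = re.compile(r'\[.*?\]\(.*?\)')


def find_broken_links(home, text):
    """Return relative link targets in `text` that do not resolve under `home`."""
    broken = []
    for item in pattern.findall(text):
        ref = item[item.find('](') + 2:-1]
        if ref.startswith('http') or ref.startswith('#'):
            continue  # external links and in-page anchors are not checked
        if not os.path.exists(os.path.join(home, ref)):
            broken.append(ref)
    return broken


with tempfile.TemporaryDirectory() as tmp:
    # One link target exists, one does not.
    open(os.path.join(tmp, 'present.md'), 'w').close()
    broken = find_broken_links(tmp, '[ok](present.md) [bad](missing.md)')
print(broken)
```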