Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
7b8aab5
Script to check links on .md files
dashohoxha Oct 10, 2019
3c78e17
Small fix
dashohoxha Oct 10, 2019
efbaa24
Remove debug code etc.
dashohoxha Oct 12, 2019
f448a34
test:scripts: update check-links
casperdcl Jan 29, 2020
4e53a6c
test:pre-commit:add hooks and config
casperdcl Jan 29, 2020
0a03cdd
test:ci:check diff links for 404
casperdcl Jan 29, 2020
ccc6211
test:check-links:minor url detector fix
casperdcl Jan 29, 2020
220cd2a
test:check-links:exclude list
casperdcl Jan 29, 2020
ac0b426
test:check-links:move exclusions to external file
casperdcl Jan 29, 2020
fd2ece5
test:check-links:fix spaces and backticks
casperdcl Jan 30, 2020
63ce875
test:check-links:more status codes and warnings
casperdcl Jan 30, 2020
8fb1075
test:check-links:fix status code display
casperdcl Jan 30, 2020
34d3640
test:check-links:tidy output whitespace
casperdcl Jan 30, 2020
0b56a18
test:check-links:add bonus html hrefs
casperdcl Jan 30, 2020
f585d13
test:yarn:add link-check
casperdcl Jan 30, 2020
beb1aaa
test:ci:add full yarn url-check
casperdcl Jan 30, 2020
6c697c4
test:check-links:comments
casperdcl Jan 30, 2020
d5aa06b
test:check-links:pre-commit checks md & js
casperdcl Jan 30, 2020
7752ee5
test:check-links:add yarn/husky pre-commit diff
casperdcl Jan 30, 2020
774d8b3
test:ci:check-links revert to only diff for now
casperdcl Jan 30, 2020
f3c05aa
test:yarn:check-links js & md
casperdcl Jan 30, 2020
eef515b
test:check-links:remove .pre-commit-config.yaml conflict with husky, …
casperdcl Jan 30, 2020
979ecf9
test:check-links:remove silly } warning
casperdcl Jan 30, 2020
73a4317
test:yarn:link-check all files
casperdcl Jan 30, 2020
60f7350
lint:add .github/ to link checking
casperdcl Feb 1, 2020
3580e23
scripts:set -euxo pipefail
casperdcl Feb 1, 2020
d4a1933
scripts:remove annoying -x from one script
casperdcl Feb 1, 2020
816bcf7
lint:check-links with pcre recursion
casperdcl Feb 2, 2020
175e694
lint:check-links:updare exclusions
casperdcl Feb 2, 2020
e92317b
lint:rename check-links => link-check, misc tidy
casperdcl Feb 2, 2020
1ab8a8c
scripts:add link-check-git-all concurrent helper
casperdcl Feb 2, 2020
1613146
scripts:link-check missing exclude {} from js
casperdcl Feb 2, 2020
ba42034
scripts:minor link-check ';' fix
casperdcl Feb 2, 2020
a53b6aa
lint:link-check:remove pre-commit hook, tidy CI
casperdcl Feb 2, 2020
2b55fe6
ci:daily full test
casperdcl Feb 2, 2020
b3a6b2a
CI:missing pcregrep dependency
casperdcl Feb 2, 2020
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 61 additions & 19 deletions .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,20 +2,21 @@
#
# Check https://circleci.com/docs/2.0/language-javascript/ for more details
#
version: 2
jobs:
build:
docker:
# specify the version you desire here
- image: circleci/node:10.16.3
version: 2.1

# Specify service dependencies here if necessary
# CircleCI maintains a library of pre-built images
# documented at https://circleci.com/docs/2.0/circleci-images/
# - image: circleci/mongo:3.4.4
defaults: &defaults
working_directory: ~/repo
docker:
# specify the version you desire here
- image: circleci/node:10.16.3

working_directory: ~/repo
# Specify service dependencies here if necessary
# CircleCI maintains a library of pre-built images
# documented at https://circleci.com/docs/2.0/circleci-images/
# - image: circleci/mongo:3.4.4

commands:
install:
steps:
- checkout

Expand All @@ -28,24 +29,65 @@ jobs:
git reset --hard origin/master
git checkout -

# Download and cache dependencies
# Download cached dependencies
- restore_cache:
keys:
- v1-dependencies-{{ checksum "yarn.lock" }}
# fallback to using the latest cache if no exact match is found
- v1-dependencies-

- run: yarn
- run:
    name: apt dependencies
    command: |
      # pcregrep is required by scripts/link-check.sh.
      # -y: the CI shell is non-interactive, so apt must not stop to ask
      # for confirmation before installing.
      sudo apt-get update
      sudo apt-get install -y pcregrep

cache:
steps:
# Upload dependencies cache
- save_cache:
paths:
- node_modules
key: v1-dependencies-{{ checksum "yarn.lock" }}
# Two jobs share the same steps and differ only in the link-check step.
jobs:
# `test`: checks links via the `link-check-diff` package.json script
# (scripts/link-check-git-diff.sh).
test:
<<: *defaults
steps:
- install
- run: yarn
- run: yarn build

- run: yarn test

- run: yarn format-check
- run: yarn lint-check
- run: yarn link-check-diff
- cache

# `test_full`: same as `test`, but runs the `link-check` package.json script
# (scripts/link-check-git-all.sh) instead of the diff-only check.
test_full:
<<: *defaults
steps:
- install
- run: yarn
- run: yarn build
- run: yarn test
- run: yarn format-check
- run: yarn lint-check
- run: yarn link-check
- cache

- save_cache:
paths:
- node_modules
key: v1-dependencies-{{ checksum "yarn.lock" }}
workflows:
version: 2

# `commit`: runs the `test` job; it has no `triggers` key of its own.
commit:
jobs:
- test

# `daily`: runs the `test_full` job on a cron schedule, master branch only.
daily:
triggers:
- schedule:
# minute 0, hour 0, every day
# NOTE(review): CircleCI documents cron schedules as UTC - confirm.
cron: '0 0 * * *'
filters:
branches:
only:
- master
jobs:
- test_full
6 changes: 6 additions & 0 deletions .pre-commit-hooks.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
- id: dead-url
name: Dead URL Checker
entry: scripts/link-check.sh
language: script
types: [text]
description: This hook searches for problematic URLs.
4 changes: 3 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,9 @@
"format-check": "prettier --check '{.,pages/**,public/static/docs/**,src/**}/*.{js,md}'",
"lint-check": "eslint src pages",
"format-all": "prettier --write '{.,pages/**,public/static/docs/**,src/**}/*.{js,md}'",
"format": "prettier --write"
"format": "prettier --write",
"link-check": "scripts/link-check-git-all.sh",
"link-check-diff": "scripts/link-check-git-diff.sh"
},
"repository": {
"type": "git",
Expand Down
20 changes: 20 additions & 0 deletions scripts/exclude-links.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
http://127.0.0.1:10000/devstoreaccount1;
http://localhost:3000/
https://$
https://api.github.com/repos/$
https://blog.$
https://discuss.$
https://dvc.org/some.link
https://example.com/data.txt
https://example.com/path/to/data
https://example.com/path/to/data.csv
https://example.com/path/to/dir
https://github.com/$
https://github.com/dataversioncontrol/myrepo.git
https://github.com/example/registry
https://github.com/iterative/dvc.org/blob/master/public$
https://github.com/iterative/dvc/releases/download/$
Comment on lines +15 to +16
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes my regexes match only up to the literal $. It seems a good compromise between blacklisting false negatives and programming false negatives detection.

https://github.com/myaccount/myproject.git
https://myendpoint.com
https://object-storage.example.com
https://www.youtube.com/embed/$
3 changes: 3 additions & 0 deletions scripts/link-check-git-all.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#!/usr/bin/env bash
# Run link-check.sh on every Markdown/JavaScript file under the content
# directories and in the current directory, up to 8 files in parallel.
# The paths below are relative, so run this from the repository root.
# Exit status is that of xargs (non-zero if any link-check.sh run failed).
set -euo pipefail
shopt -s nullglob # an unmatched *.md / *.js expands to nothing, not itself

# Top-level files, collected by globbing instead of parsing `ls` output
# (which also errored out when one of the two patterns matched nothing).
root_files=(*.md *.js)

{
  # The explicit -print0 needs the parentheses so that it applies to both
  # -name tests; NUL separators keep odd filenames intact through xargs.
  find pages/ public/static/docs/ src/ .github/ \
    -type f \( -name '*.md' -o -name '*.js' \) -print0
  if ((${#root_files[@]} > 0)); then
    printf '%s\0' "${root_files[@]}"
  fi
} | xargs -0 -n1 -P8 "$(dirname "$0")/link-check.sh"
3 changes: 3 additions & 0 deletions scripts/link-check-git-diff.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#!/usr/bin/env bash
# Check only the links that the working tree adds relative to origin/master.
set -euxo pipefail

# Pass link-check.sh just the added lines of the diff. Lines starting with
# '-' hold text that was removed, so checking them would make the removal of
# a dead link fail the check. The '+++ <path>' file headers are dropped too.
"$(dirname "$0")/link-check.sh" \
  <(git diff origin/master -U0 | grep '^+' | grep -v '^+++ ')
53 changes: 53 additions & 0 deletions scripts/link-check.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
#!/usr/bin/env bash
# Check HTTP status codes of links in the given files.
# Success: 2xx, Errors: 4xx/5xx, Warnings: anything else.
# Redirects (3xx) are followed.
# Usage:
#   link-check.sh [<files>]
# Environment:
#   CHECK_LINKS_RELATIVE_URL  prefix for relative links
#                             (default: https://dvc.org)
#   CHECK_LINKS_EXCLUDE_LIST  file of URLs to skip, one per line
#                             (default: exclude-links.txt next to this
#                             script); a value that is not an existing file
#                             is used verbatim as the list itself
set -euo pipefail

base_url="${CHECK_LINKS_RELATIVE_URL:-https://dvc.org}"
# "$0" and "$exclude" are quoted so a checkout path with spaces still works.
exclude="${CHECK_LINKS_EXCLUDE_LIST:-$(dirname "$0")/exclude-links.txt}"
if [[ -f "$exclude" ]]; then
  # From here on $exclude holds the list contents, not the file name.
  exclude="$(<"$exclude")"
fi

#######################################
# Print every link found in the given files, one URL per line.
# Globals:   base_url (read) - prefix put in front of relative links
# Arguments: files to scan
# Outputs:   URLs on stdout, unsorted and possibly duplicated
# Returns:   0 (finding no links is not an error)
#######################################
finder() { # expects list of files
  local path
  # explicit links not in markdown
  # (-h: no "file:" prefix on matches when several files are given)
  pcregrep -h -o '(?<!\]\()https?://[^\s<>{}"'"'"'`]+' "$@"
  # explicit links in markdown; the recursive pattern keeps balanced
  # parentheses that belong to the URL itself
  pcregrep -h -o '(?<=\])\(https?://[^[\]\s]+\)' "$@" \
    | pcregrep -o '\((?:[^)(]*(?R)?)*+\)' \
    | pcregrep -o '(?<=\().*(?=\))'
  {
    # relative links in markdown: "](/path" -> "/path"
    # grep -o reports every link on a line, not only the last one.
    grep -h -oE '\]\(/[^)[:space:]]+' -- "$@" | cut -c3-
    # relative links in html: href="/path" or href='/path' -> /path
    grep -h -oE "href=[\"']/[^\"']*[\"']" -- "$@" \
      | sed -E 's/^href=.//; s/.$//'
  } | while IFS= read -r path; do
    # Plain read/printf instead of `xargs -I`, which treats quotes and
    # backslashes in its input specially and fails on e.g. /it's.
    printf '%s%s\n' "$base_url" "$path"
  done
  return 0
}
#######################################
# Request each URL with curl (HEAD, following redirects) and report problems.
# Arguments: URLs to check
# Outputs:   for each 4xx/5xx status an " ERROR:<status>:<url>" line on
#            stderr; for any other non-2xx status a " WARNING:..." line on
#            stderr; each is preceded by a blank line on stdout
# Returns:   number of ERROR urls, capped at 255
#######################################
checker() { # expects list of urls
  local url status
  local errors=0
  for url in "$@"; do
    # curl still writes %{http_code} (000) when the transfer itself fails;
    # `|| true` lets that surface as a WARNING below instead of aborting a
    # caller that runs with `set -e`.
    status="$(curl -IL -w '%{http_code}' -so /dev/null "$url")" || true
    case "$status" in
      2??)
        # success
        ;;
      [45]??)
        echo
        echo " ERROR:$status:$url" >&2
        errors=$((errors + 1))
        ;;
      *)
        echo
        echo " WARNING:$status:$url" >&2
        ;;
    esac
  done
  # A return status is taken modulo 256, so an uncapped count of exactly
  # 256 errors would be reported as success.
  if ((errors > 255)); then
    errors=255
  fi
  return "$errors"
}

# Check each file given on the command line; the exit status is the number
# of files with at least one ERROR link (capped at 255).
fails=0
for file in "$@"; do
  echo -n "$file:"
  # Unique URLs of this file minus the exclusion list, read line by line
  # into an array so that '?' and '*' in URLs are never glob-expanded.
  urls=()
  while IFS= read -r url; do
    if [[ -n "$url" ]]; then
      urls+=("$url")
    fi
  done < <(finder "$file" | sort -u \
    | comm -23 - <(printf '%s\n' "$exclude" | sort -u))
  # ${urls[@]+...} keeps `set -u` quiet about an empty array on older bash.
  if checker ${urls[@]+"${urls[@]}"}; then
    echo OK
  else
    fails=$((fails + 1))
  fi
done
[ "$fails" -eq 0 ] || echo -e "ERROR:$fails failures\n---" >&2
# Exit statuses wrap modulo 256; cap so 256 failures cannot look like success.
exit $((fails > 255 ? 255 : fails))