Recursive folders, aliases, parallel requests, simplifications #289

Closed · wants to merge 4 commits
19 changes: 12 additions & 7 deletions README.md
@@ -155,15 +155,10 @@ markdown-link-check ./README.md

#### Check links from a local markdown folder (recursive)

Avoid using `find -exec` because it will swallow the error from each consecutive run.
Instead, use `xargs`:
```shell
find . -name \*.md -print0 | xargs -0 -n1 markdown-link-check
markdown-link-check ./docs
```
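This change also adds a `-n, --parallel <number>` CLI flag (see the `markdown-link-check` script changes below). A minimal usage sketch combining it with the existing progress bar option:

```shell
# check every .md file under ./docs recursively, with 4 concurrent requests
markdown-link-check --parallel 4 --progress ./docs
```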

There is an [open issue](https://github.com/tcort/markdown-link-check/issues/78) for allowing the tool to specify
multiple files on the command line.

#### Usage

```shell
@@ -186,12 +181,14 @@ Options:

* `ignorePatterns`: An array of objects holding regular expressions which a link is checked against and skipped for checking in case of a match.
* `replacementPatterns`: An array of objects holding regular expressions which are replaced in a link with their corresponding replacement string. This behavior allows (for example) to adapt to certain platform conventions hosting the Markdown. The special replacement `{{BASEURL}}` can be used to dynamically link to the current working directory (for example that `/` points to the root of your current working directory).
* `aliases`: An object holding an `aliasBasePath` string and an `alias` object. The `aliasBasePath` is prefixed to every alias pattern. The `alias` object maps regular expressions to replacement strings; the replacement strings may contain `$1`, `$2`, etc. to reference capture groups in the regular expression (this is required for checking links to documentation hosted with [docsify aliases](https://docsify.js.org/#/configuration?id=alias)). A worked sketch of the rewriting follows the example configuration below.
* `httpHeaders`: The headers are only applied to links where the link **starts with** one of the supplied URLs in the `urls` section.
* `timeout` timeout in [zeit/ms](https://www.npmjs.com/package/ms) format. (e.g. `"2000ms"`, `20s`, `1m`). Default `10s`.
* `retryOn429` if this is `true`, requests that receive an HTTP 429 response are retried after the duration indicated by the `retry-after` header.
* `retryCount` the number of retries to be made on a 429 response. Default `2`.
* `fallbackRetryDelay` the delay in [zeit/ms](https://www.npmjs.com/package/ms) format. (e.g. `"2000ms"`, `20s`, `1m`) for retries on a 429 response when no `retry-after` header is returned or when it has an invalid value. Default is `60s`.
* `aliveStatusCodes` a list of HTTP codes to consider as alive.
* `parallel` the number of parallel requests to be made. Default `2`.

**Example:**

@@ -221,11 +218,19 @@
}
}
],
"aliases": {
"aliasBasePath": "https://example.com/#",
"alias": {
"/sub-project-one/(.*)": "https://raw.githubusercontent.com/my-org/project-one/main/docs/$1",
"/sub-project-two/(.*)": "https://raw.githubusercontent.com/my-org/project-two/main/docs/$1",
}
},
"timeout": "20s",
"retryOn429": true,
"retryCount": 5,
"fallbackRetryDelay": "30s",
"aliveStatusCodes": [200, 206]
"aliveStatusCodes": [200, 206],
"parallel": 2
}
```
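
To make the alias behavior concrete, here is a minimal, standalone sketch of the rewriting step. The helper name `applyAliases` is hypothetical; it mirrors the logic added in `index.js`, including appending `.md` when the rewritten path has no file extension:

```js
// Hypothetical helper mirroring the alias rewriting added in index.js.
function applyAliases(link, aliases) {
    for (const alias of Object.keys(aliases.alias)) {
        const regex = new RegExp(aliases.aliasBasePath + alias);
        if (regex.test(link)) {
            link = link.replace(regex, aliases.alias[alias]);
            const filename = link.split('/').pop();
            if (filename.indexOf('.') < 0) {
                link += '.md'; // bare paths are assumed to point at Markdown files
            }
        }
    }
    return link;
}

// With the aliases from the example config above:
// "https://example.com/#/sub-project-one/getting-started"
// -> "https://raw.githubusercontent.com/my-org/project-one/main/docs/getting-started.md"
console.log(applyAliases('https://example.com/#/sub-project-one/getting-started', {
    aliasBasePath: 'https://example.com/#',
    alias: { '/sub-project-one/(.*)': 'https://raw.githubusercontent.com/my-org/project-one/main/docs/$1' }
}));
```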

25 changes: 19 additions & 6 deletions index.js
@@ -1,9 +1,8 @@
'use strict';

const _ = require('lodash');
const async = require('async');
const linkCheck = require('link-check');
const LinkCheckResult = require('link-check').LinkCheckResult;
const linkCheck = require('./link-check');
const LinkCheckResult = require('./link-check').LinkCheckResult;
const markdownLinkExtractor = require('markdown-link-extractor');
const ProgressBar = require('progress');

@@ -62,7 +61,7 @@ module.exports = function markdownLinkCheck(markdown, opts, callback) {
}

const { links, anchors } = markdownLinkExtractor(markdown);
const linksCollection = _.uniq(links);
const linksCollection = [...new Set(links)];
const bar = (opts.showProgressBar) ?
new ProgressBar('Checking... [:bar] :percent', {
complete: '=',
@@ -72,8 +71,8 @@
}) : undefined;

opts.anchors = anchors;

async.mapLimit(linksCollection, 2, function (link, callback) {
const parallel = opts.parallel || 2;
async.mapLimit(linksCollection, parallel, function (link, callback) {
if (opts.ignorePatterns) {
const shouldIgnore = opts.ignorePatterns.some(function(ignorePattern) {
return ignorePattern.pattern instanceof RegExp ? ignorePattern.pattern.test(link) : (new RegExp(ignorePattern.pattern)).test(link) ? true : false;
@@ -94,6 +93,20 @@
}
}

if (opts.aliases) {
// Rewrite links that match a configured alias: prefix each alias pattern with the
// documented `aliasBasePath`, apply the regex replacement, and append '.md' when the
// rewritten path has no file extension.
for (const alias of Object.keys(opts.aliases.alias)) {
const regex = new RegExp(opts.aliases.aliasBasePath + alias);
if (regex.test(link)) {
link = link.replace(regex, opts.aliases.alias[alias]);
const filename = link.split('/').pop();
if (filename.indexOf('.') < 0) {
link += '.md';
}
}
}
}


// Make sure it is not undefined and that the appropriate headers are always recalculated for a given link.
opts.headers = {};

117 changes: 117 additions & 0 deletions link-check.js
@@ -0,0 +1,117 @@
const fs = require('fs');
const processModule = require('process');
const url = require('url');
const Isemail = require('isemail');
const { URL } = require('url');

class LinkCheckResult {
constructor(opts, link, statusCode, err) {
opts.aliveStatusCodes = opts.aliveStatusCodes || [200];

this.link = link;
this.statusCode = statusCode || 0;
this.err = err || null;
this.status = opts.aliveStatusCodes.some((statusCode) => (statusCode instanceof RegExp) ? statusCode.test(this.statusCode) : statusCode === this.statusCode) ? 'alive' : 'dead';
}
}

function checkLink(link, opts, callback, attempts = 0) {
let retryOn429 = opts.retryOn429 || false;

// the retry count defaults to 2 if not provided in options
let retryCount = opts.retryCount || 2;
const url = encodeURI(decodeURIComponent(new URL(link, opts.baseUrl).toString()));

fetch(url, { method: 'HEAD', headers: opts.headers }).then(res => {
if (res.status === 200) {

callback(null, new LinkCheckResult(opts, link, res ? res.status : 0, null)); // alive, returned 200 OK
} else {
if (res.status === 429) {
if (attempts >= retryCount || !retryOn429) {
callback(null, new LinkCheckResult(opts, link, res ? res.status : 0, null));
return;
}

// retry after a fixed one-second delay
setTimeout(() => {
checkLink(link, opts, callback, attempts + 1);
}, 1000);
}
else {
// HEAD did not return 200; retry with GET and report whatever status it returns
fetch(url).then(res => {
callback(null, new LinkCheckResult(opts, link, res ? res.status : 0, null));
res.text(); // consume the response body
})
}
}
}).catch(err => {
// console.log("ERROR", err);
callback(err, null);
});
}

function checkFile(link, opts, callback) {

// force baseUrl to end with '/' for proper treatment by WHATWG URL API
if (typeof opts.baseUrl === 'string' && !opts.baseUrl.endsWith('/')) {
opts.baseUrl = opts.baseUrl + '/';
} // without the ending '/', the final component is dropped

const loc = new URL(link || '', opts.baseUrl || processModule.cwd());
// eslint-disable-next-line no-prototype-builtins
fs.access(url.fileURLToPath(loc) || '', fs.hasOwnProperty('R_OK') ? fs.R_OK : fs.constants.R_OK, function (err) {
callback(null, new LinkCheckResult(opts, link, !err ? 200 : 400, err));
});
}

function checkHash(link, opts, callback) {
const anchors = opts.anchors || [];
callback(null, new LinkCheckResult(opts, link, anchors.includes(link) ? 200 : 404, null));
}

function checkMailTo(link, opts, callback) {
const address = link
.substr(7) // strip "mailto:"
.split('?')[0]; // trim ?subject=blah hfields

/* per RFC6068, the '?' is a reserved delimiter and email addresses containing '?' must be encoded,
* so it's safe to split on '?' and pick [0].
*/

callback(null, new LinkCheckResult(opts, link, Isemail.validate(address) ? 200 : 400, null));
}

const protocolChecker = {
hash: checkHash,
file: checkFile,
http: checkLink,
https: checkLink,
mailto: checkMailTo,
};

module.exports = function linkCheck(link, opts, callback) {

if (arguments.length === 2 && typeof opts === 'function') {
// optional 'opts' not supplied.
callback = opts;
opts = {};
}
let url;
try {
url = link.startsWith('#') ? link : new URL(link, opts.baseUrl);

} catch (err) {
console.log(link, opts.baseUrl, err);
return;
}
const protocol = link.startsWith('#') ? 'hash' : url.protocol.replace(/:$/, '');
// eslint-disable-next-line no-prototype-builtins
if (!protocolChecker.hasOwnProperty(protocol)) {
callback(new Error('Unsupported Protocol'), null);
return;
}
protocolChecker[protocol](link, opts, callback);
};

module.exports.LinkCheckResult = LinkCheckResult;
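
For orientation, a minimal sketch of calling the vendored module directly; the URL and options here are placeholders:

```js
const linkCheck = require('./link-check');

// note: the module relies on the global fetch API (Node 18+)
// the callback receives (err, LinkCheckResult)
linkCheck('https://example.com/', { retryOn429: true, retryCount: 3 }, (err, result) => {
    if (err) {
        console.error(err);
        return;
    }
    console.log(`${result.link} is ${result.status} (HTTP ${result.statusCode})`);
});
```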
68 changes: 45 additions & 23 deletions markdown-link-check
@@ -5,7 +5,6 @@
let chalk;
const fs = require('fs');
const markdownLinkCheck = require('./');
const needle = require('needle');
const path = require('path');
const pkg = require('./package.json');
const program = require('commander');
@@ -25,12 +24,32 @@ function commaSeparatedCodesList(value, dummyPrevious) {
});
}

/**
* Recursively collect all files ending in .md under rootFolder and its subfolders.
*/
function loadAllMarkdownFiles(rootFolder = '.') {
// fs and path are already required at the top of this file
const files = [];
fs.readdirSync(rootFolder).forEach(file => {
const fullPath = path.join(rootFolder, file);
if (fs.lstatSync(fullPath).isDirectory()) {
files.push(...loadAllMarkdownFiles(fullPath));
} else if (fullPath.endsWith('.md')) {
files.push(fullPath);
}
});
return files;
}


function getInputs() {
const inputs = [];

program
.version(pkg.version)
.option('-p, --progress', 'show progress bar')
.option('-n, --parallel <number>', 'number of parallel requests (default: 2)')
.option('-c, --config [config]', 'apply a config file (JSON), holding e.g. url specific header configuration')
.option('-q, --quiet', 'displays errors only')
.option('-v, --verbose', 'displays detailed error information')
@@ -62,9 +81,7 @@
filenameForOutput = filenameOrUrl;
let baseUrl = '';
if (/https?:/.test(filenameOrUrl)) {
stream = needle.get(filenameOrUrl);
stream.on('error', onError);
stream.on('response', onResponse);

try { // extract baseUrl from supplied URL
const parsed = url.parse(filenameOrUrl);
delete parsed.search;
@@ -73,19 +90,25 @@
parsed.pathname = parsed.pathname.substr(0, parsed.pathname.lastIndexOf('/') + 1);
}
baseUrl = url.format(parsed);
console.log('baseUrl: ' + baseUrl)
inputs.push(new Input(filenameForOutput, null, {baseUrl: baseUrl}));
} catch (err) { /* ignore error */
}
} else {
const stats = fs.statSync(filenameOrUrl);
if (stats.isDirectory()){
console.error(chalk.red('\nERROR: ' + filenameOrUrl + ' is a directory! Please provide a valid filename as an argument.'));
process.exit(1);
let files = loadAllMarkdownFiles(filenameOrUrl);
for (let file of files) {
filenameForOutput = file;
baseUrl = 'file://' + path.dirname(path.resolve(file));
inputs.push(new Input(filenameForOutput, stream, {baseUrl: baseUrl}));
}
} else {
baseUrl = 'file://' + path.dirname(path.resolve(filenameOrUrl));
inputs.push(new Input(filenameForOutput, null, {baseUrl: baseUrl}));
}
baseUrl = 'file://' + path.dirname(path.resolve(filenameOrUrl));
stream = fs.createReadStream(filenameOrUrl);
}

inputs.push(new Input(filenameForOutput, stream, {baseUrl: baseUrl}));
}
}
).parse(process.argv);
Expand All @@ -95,6 +118,7 @@ function getInputs() {
input.opts.quiet = (program.opts().quiet === true);
input.opts.verbose = (program.opts().verbose === true);
input.opts.retryOn429 = (program.opts().retry === true);
input.opts.parallel = program.opts().parallel;
input.opts.aliveStatusCodes = program.opts().alive;
const config = program.opts().config;
if (config) {
@@ -135,18 +159,12 @@ async function loadConfig(config) {

async function processInput(filenameForOutput, stream, opts) {
let markdown = ''; // collect the markdown data, then process it

stream.on('error', function(error) {
if (error.code === 'ENOENT') {
console.error(chalk.red('\nERROR: File not found! Please provide a valid filename as an argument.'));
} else {
console.error(chalk.red(error));
}
return process.exit(1);
});

for await (const chunk of stream) {
markdown += chunk.toString();

if (/https?:/.test(filenameForOutput)) {
let res = await fetch(filenameForOutput);
markdown = await res.text();
} else {
markdown = fs.readFileSync(filenameForOutput, 'utf8');
}

if (!opts.quiet && filenameForOutput) {
@@ -165,6 +183,7 @@
opts.retryCount = config.retryCount;
opts.fallbackRetryDelay = config.fallbackRetryDelay;
opts.aliveStatusCodes = config.aliveStatusCodes;
opts.parallel = opts.parallel || config.parallel;
}

await runMarkdownLinkCheck(filenameForOutput, markdown, opts);
@@ -231,9 +250,11 @@ async function runMarkdownLinkCheck(filenameForOutput, markdown, opts) {

async function main() {
chalk = (await import('chalk')).default;

const inputs = getInputs();

// start time
console.time("Links checked in")

let isOk = true;
for await (const input of inputs) {
try {
@@ -242,7 +263,8 @@
isOk = false;
}
}

console.timeEnd("Links checked in")
console.log('Exit code ' + (isOk ? 0 : 1));
process.exit(isOk ? 0 : 1);
}
