diff --git a/.gitignore b/.gitignore index 504afef8..e6acbdf0 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,5 @@ +coverage/ +lib-cjs/ node_modules/ +.nyc_output/ package-lock.json diff --git a/.travis.yml b/.travis.yml index aa5bbfba..d1544612 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,9 +1,5 @@ language: node_js node_js: -- "0.10" -- "0.12" -- 4 -- 6 -- 8 -- node -script: npm test + - 12 + - node +script: npm run ci diff --git a/README.md b/README.md index c7c1e1ac..bdc63434 100644 --- a/README.md +++ b/README.md @@ -1,23 +1,25 @@ -# broken-link-checker [![NPM Version][npm-image]][npm-url] [![Build Status][travis-image]][travis-url] [![Dependency Status][david-image]][david-url] +# broken-link-checker [![NPM Version][npm-image]][npm-url] [![Build Status][travis-image]][travis-url] [![Coverage Status][coveralls-image]][coveralls-url] [![Dependency Monitor][greenkeeper-image]][greenkeeper-url] -> Find broken links, missing images, etc in your HTML. +> Find broken links, missing images, etc within your HTML. -Features: -* Stream-parses local and remote HTML pages -* Concurrently checks multiple links -* Supports various HTML elements/attributes, not just `` -* Supports redirects, absolute URLs, relative URLs and `` -* Honors robot exclusions -* Provides detailed information about each link (HTTP and HTML) -* URL keyword filtering with wildcards -* Pause/Resume at any time +* ✅ **Complete**: Unicode, redirects, compression, basic auth, absolute/relative/local URLs. +* ⚡️ **Fast**: Concurrent, streamed and cached. +* 🍰 **Easy**: Convenient defaults and very configurable. +Other features: +* Support for many HTML elements and attributes; not only `` and ``. +* Support for relative URLs with ``. +* WHATWG specifications-compliant [HTML](https://html.spec.whatwg.org) and [URL](https://url.spec.whatwg.org) parsing. +* Honor robot exclusions (robots.txt, headers and `rel`), optionally. +* Detailed information for reporting and maintenance. 
+* URL keyword filtering with simple wildcards. +* Pause/Resume at any time. +* 🖕 -## Installation -[Node.js](http://nodejs.org/) `>= 0.10` is required; `< 4.0` will need `Promise` and `Object.assign` polyfills. +## Installation -There're two ways to use it: +[Node.js](http://nodejs.org) `>= 12` is required. There're two ways to use it: ### Command Line Usage To install, type this at the command line: @@ -38,205 +40,280 @@ To install, type this at the command line: ```shell npm install broken-link-checker ``` -The rest of this document will assist you with how to use the API. +The remainder of this document will assist you in using the API. ## Classes +While all classes have been exposed for custom use, the one that you need will most likely be [`SiteChecker`](#sitechecker). -### `blc.HtmlChecker(options, handlers)` -Scans an HTML document to find broken links. +### `HtmlChecker` +Scans an HTML document to find broken links. All methods from [`EventEmitter`](https://nodejs.org/api/events.html#events_class_eventemitter) are available. -* `handlers.complete` is fired after the last result or zero results. -* `handlers.html` is fired after the HTML document has been fully parsed. - * `tree` is supplied by [parse5](https://npmjs.com/parse5) - * `robots` is an instance of [robot-directives](https://npmjs.com/robot-directives) containing any `` robot exclusions. -* `handlers.junk` is fired with data on each skipped link, as configured in options. -* `handlers.link` is fired with the result of each discovered link (broken or not). +```js +const {HtmlChecker} = require('broken-link-checker'); + +const htmlChecker = new HtmlChecker(options) + .on('error', (error) => {}) + .on('html', (tree, robots) => {}) + .on('queue', () => {}) + .on('junk', (result) => {}) + .on('link', (result) => {}) + .on('complete', () => {}); -* `.clearCache()` will remove any cached URL responses. This is only relevant if the `cacheResponses` option is enabled. 
-* `.numActiveLinks()` returns the number of links with active requests. -* `.numQueuedLinks()` returns the number of links that currently have no active requests. +htmlChecker.scan(html, baseURL); +``` + +#### Methods & Properties +* `.clearCache()` will remove any cached URL responses. +* `.isPaused` returns `true` if the internal link queue is paused and `false` if not. +* `.numActiveLinks` returns the number of links with active requests. +* `.numQueuedLinks` returns the number of links that currently have no active requests. * `.pause()` will pause the internal link queue, but will not pause any active requests. * `.resume()` will resume the internal link queue. -* `.scan(html, baseUrl)` parses & scans a single HTML document. Returns `false` when there is a previously incomplete scan (and `true` otherwise). - * `html` can be a stream or a string. - * `baseUrl` is the address to which all relative URLs will be made absolute. Without a value, links to relative URLs will output an "Invalid URL" error. +* `.scan(html, baseURL)` parses & scans a single HTML document and returns a `Promise`. Calling this function while a previous scan is in progress will result in a thrown error. Arguments: + * `html` must be either a [`Stream`](https://nodejs.org/api/stream.html) or a string. + * `baseURL` must be a [`URL`](https://mdn.io/URL). Without this value, links to relative URLs will output a "BLC_INVALID" error (unless an absolute `` is found). + +#### Events +* `'complete'` is emitted after the last result or zero results. +* `'error'` is emitted when an error occurs within any of your event handlers and will prevent the current scan from failing. Arguments: + * `error` is the `Error`. +* `'html'` is emitted after the HTML document has been fully parsed. Arguments: + * `tree` is supplied by [parse5](https://npmjs.com/parse5). + * `robots` is an instance of [robot-directives](https://npmjs.com/robot-directives) containing any `` robot exclusions. 
+* `'junk'` is emitted on each skipped/unchecked link, as configured in options. Arguments: + * `result` is a [`Link`](https://github.com/stevenvachon/broken-link-checker/blob/master/lib/internal/Link.js). +* `'link'` is emitted with the result of each checked/unskipped link (broken or not). Arguments: + * `result` is a [`Link`](https://github.com/stevenvachon/broken-link-checker/blob/master/lib/internal/Link.js). +* `'queue'` is emitted when a link is internally queued, dequeued or made active. + + +### `HtmlUrlChecker` +Scans the HTML content at each queued URL to find broken links. All methods from [`EventEmitter`](https://nodejs.org/api/events.html#events_class_eventemitter) are available. ```js -var htmlChecker = new blc.HtmlChecker(options, { - html: function(tree, robots){}, - junk: function(result){}, - link: function(result){}, - complete: function(){} -}); - -htmlChecker.scan(html, baseUrl); +const {HtmlUrlChecker} = require('broken-link-checker'); + +const htmlUrlChecker = new HtmlUrlChecker(options) + .on('error', (error) => {}) + .on('html', (tree, robots, response, pageURL, customData) => {}) + .on('queue', () => {}) + .on('junk', (result, customData) => {}) + .on('link', (result, customData) => {}) + .on('page', (error, pageURL, customData) => {}) + .on('end', () => {}); + +htmlUrlChecker.enqueue(pageURL, customData); ``` -### `blc.HtmlUrlChecker(options, handlers)` -Scans the HTML content at each queued URL to find broken links. +#### Methods & Properties +* `.clearCache()` will remove any cached URL responses. +* `.dequeue(id)` removes a page from the queue. Returns `true` on success or `false` on failure. +* `.enqueue(pageURL, customData)` adds a page to the queue. Queue items are auto-dequeued when their requests are complete. Returns a queue ID on success. Arguments: + * `pageURL` must be a [`URL`](https://mdn.io/URL). + * `customData` is optional data (of any type) that is stored in the queue item for the page. 
+* `.has(id)` returns `true` if the queue contains an active or queued page tagged with `id` and `false` if not. +* `.isPaused` returns `true` if the queue is paused and `false` if not. +* `.numActiveLinks` returns the number of links with active requests. +* `.numPages` returns the total number of pages in the queue. +* `.numQueuedLinks` returns the number of links that currently have no active requests. +* `.pause()` will pause the queue, but will not pause any active requests. +* `.resume()` will resume the queue. -* `handlers.end` is fired when the end of the queue has been reached. -* `handlers.html` is fired after a page's HTML document has been fully parsed. +#### Events +* `'end'` is emitted when the end of the queue has been reached. +* `'error'` is emitted when an error occurs within any of your event handlers and will prevent the current scan from failing. Arguments: + * `error` is the `Error`. +* `'html'` is emitted after a page's HTML document has been fully parsed. Arguments: * `tree` is supplied by [parse5](https://npmjs.com/parse5). * `robots` is an instance of [robot-directives](https://npmjs.com/robot-directives) containing any `` and `X-Robots-Tag` robot exclusions. -* `handlers.junk` is fired with data on each skipped link, as configured in options. -* `handlers.link` is fired with the result of each discovered link (broken or not) within the current page. -* `handlers.page` is fired after a page's last result, on zero results, or if the HTML could not be retrieved. - -* `.clearCache()` will remove any cached URL responses. This is only relevant if the `cacheResponses` option is enabled. -* `.dequeue(id)` removes a page from the queue. Returns `true` on success or an `Error` on failure. -* `.enqueue(pageUrl, customData)` adds a page to the queue. Queue items are auto-dequeued when their requests are complete. Returns a queue ID on success or an `Error` on failure. - * `customData` is optional data that is stored in the queue item for the page. 
-* `.numActiveLinks()` returns the number of links with active requests. -* `.numPages()` returns the total number of pages in the queue. -* `.numQueuedLinks()` returns the number of links that currently have no active requests. -* `.pause()` will pause the queue, but will not pause any active requests. -* `.resume()` will resume the queue. + * `response` is the full HTTP response for the page, excluding the body. + * `pageURL` is the `URL` to the current page being scanned. + * `customData` is whatever was queued. +* `'junk'` is emitted on each skipped/unchecked link, as configured in options. Arguments: + * `result` is a [`Link`](https://github.com/stevenvachon/broken-link-checker/blob/master/lib/internal/Link.js). + * `customData` is whatever was queued. +* `'link'` is emitted with the result of each checked/unskipped link (broken or not) within the current page. Arguments: + * `result` is a [`Link`](https://github.com/stevenvachon/broken-link-checker/blob/master/lib/internal/Link.js). + * `customData` is whatever was queued. +* `'page'` is emitted after a page's last result, on zero results, or if the HTML could not be retrieved. Arguments: + * `error` will be an `Error` if such occurred or `null` if not. + * `pageURL` is the `URL` to the current page being scanned. + * `customData` is whatever was queued. +* `'queue'` is emitted when a URL (link or page) is queued, dequeued or made active. + + +### `SiteChecker` +Recursively scans (crawls) the HTML content at each queued URL to find broken links. All methods from [`EventEmitter`](https://nodejs.org/api/events.html#events_class_eventemitter) are available. 
```js -var htmlUrlChecker = new blc.HtmlUrlChecker(options, { - html: function(tree, robots, response, pageUrl, customData){}, - junk: function(result, customData){}, - link: function(result, customData){}, - page: function(error, pageUrl, customData){}, - end: function(){} -}); - -htmlUrlChecker.enqueue(pageUrl, customData); +const {SiteChecker} = require('broken-link-checker'); + +const siteChecker = new SiteChecker(options) + .on('error', (error) => {}) + .on('robots', (robots, customData) => {}) + .on('html', (tree, robots, response, pageURL, customData) => {}) + .on('queue', () => {}) + .on('junk', (result, customData) => {}) + .on('link', (result, customData) => {}) + .on('page', (error, pageURL, customData) => {}) + .on('site', (error, siteURL, customData) => {}) + .on('end', () => {}); + +siteChecker.enqueue(siteURL, customData); ``` -### `blc.SiteChecker(options, handlers)` -Recursively scans (crawls) the HTML content at each queued URL to find broken links. - -* `handlers.end` is fired when the end of the queue has been reached. -* `handlers.html` is fired after a page's HTML document has been fully parsed. - * `tree` is supplied by [parse5](https://npmjs.com/parse5). - * `robots` is an instance of [robot-directives](https://npmjs.com/robot-directives) containing any `` and `X-Robots-Tag` robot exclusions. -* `handlers.junk` is fired with data on each skipped link, as configured in options. -* `handlers.link` is fired with the result of each discovered link (broken or not) within the current page. -* `handlers.page` is fired after a page's last result, on zero results, or if the HTML could not be retrieved. -* `handlers.robots` is fired after a site's robots.txt has been downloaded and provides an instance of [robots-txt-guard](https://npmjs.com/robots-txt-guard). -* `handlers.site` is fired after a site's last result, on zero results, or if the *initial* HTML could not be retrieved. - -* `.clearCache()` will remove any cached URL responses. 
This is only relevant if the `cacheResponses` option is enabled. -* `.dequeue(id)` removes a site from the queue. Returns `true` on success or an `Error` on failure. -* `.enqueue(siteUrl, customData)` adds [the first page of] a site to the queue. Queue items are auto-dequeued when their requests are complete. Returns a queue ID on success or an `Error` on failure. - * `customData` is optional data that is stored in the queue item for the site. -* `.numActiveLinks()` returns the number of links with active requests. -* `.numPages()` returns the total number of pages in the queue. -* `.numQueuedLinks()` returns the number of links that currently have no active requests. -* `.numSites()` returns the total number of sites in the queue. +#### Methods & Properties +* `.clearCache()` will remove any cached URL responses. +* `.dequeue(id)` removes a site from the queue. Returns `true` on success or `false` on failure. +* `.enqueue(siteURL, customData)` adds [the first page of] a site to the queue. Queue items are auto-dequeued when their requests are complete. Returns a queue ID on success. Arguments: + * `siteURL` must be a [`URL`](https://mdn.io/URL). + * `customData` is optional data (of any type) that is stored in the queue item for the site. +* `.has(id)` returns `true` if the queue contains an active or queued site tagged with `id` and `false` if not. +* `.isPaused` returns `true` if the queue is paused and `false` if not. +* `.numActiveLinks` returns the number of links with active requests. +* `.numPages` returns the total number of pages in the queue. +* `.numQueuedLinks` returns the number of links that currently have no active requests. +* `.numSites` returns the total number of sites in the queue. * `.pause()` will pause the queue, but will not pause any active requests. * `.resume()` will resume the queue. -**Note:** `options.filterLevel` is used for determining which links are recursive. 
+#### Events +* `'end'` is emitted when the end of the queue has been reached. +* `'error'` is emitted when an error occurs within any of your event handlers and will prevent the current scan from failing. Arguments: + * `error` is the `Error`. +* `'html'` is emitted after a page's HTML document has been fully parsed. Arguments: + * `tree` is supplied by [parse5](https://npmjs.com/parse5). + * `robots` is an instance of [robot-directives](https://npmjs.com/robot-directives) containing any `` and `X-Robots-Tag` robot exclusions. + * `response` is the full HTTP response for the page, excluding the body. + * `pageURL` is the `URL` to the current page being scanned. + * `customData` is whatever was queued. +* `'junk'` is emitted on each skipped/unchecked link, as configured in options. Arguments: + * `result` is a [`Link`](https://github.com/stevenvachon/broken-link-checker/blob/master/lib/internal/Link.js). + * `customData` is whatever was queued. +* `'link'` is emitted with the result of each checked/unskipped link (broken or not) within the current page. Arguments: + * `result` is a [`Link`](https://github.com/stevenvachon/broken-link-checker/blob/master/lib/internal/Link.js). + * `customData` is whatever was queued. +* `'page'` is emitted after a page's last result, on zero results, or if the HTML could not be retrieved. Arguments: + * `error` will be an `Error` if such occurred or `null` if not. + * `pageURL` is the `URL` to the current page being scanned. + * `customData` is whatever was queued. +* `'queue'` is emitted when a URL (link, page or site) is queued, dequeued or made active. +* `'robots'` is emitted after a site's robots.txt has been downloaded. Arguments: + * `robots` is an instance of [robots-txt-guard](https://npmjs.com/robots-txt-guard). + * `customData` is whatever was queued. +* `'site'` is emitted after a site's last result, on zero results, or if the *initial* HTML could not be retrieved. 
Arguments: + * `error` will be an `Error` if such occurred or `null` if not. + * `siteURL` is the `URL` to the current site being crawled. + * `customData` is whatever was queued. + +**Note:** the `filterLevel` option is used for determining which links are recursive. + + +### `UrlChecker` +Requests each queued URL to determine if they are broken. All methods from [`EventEmitter`](https://nodejs.org/api/events.html#events_class_eventemitter) are available. ```js -var siteChecker = new blc.SiteChecker(options, { - robots: function(robots, customData){}, - html: function(tree, robots, response, pageUrl, customData){}, - junk: function(result, customData){}, - link: function(result, customData){}, - page: function(error, pageUrl, customData){}, - site: function(error, siteUrl, customData){}, - end: function(){} -}); - -siteChecker.enqueue(siteUrl, customData); -``` +const {UrlChecker} = require('broken-link-checker'); -### `blc.UrlChecker(options, handlers)` -Requests each queued URL to determine if they are broken. +const urlChecker = new UrlChecker(options) + .on('error', (error) => {}) + .on('queue', () => {}) + .on('link', (result, customData) => {}) + .on('end', () => {}); -* `handlers.end` is fired when the end of the queue has been reached. -* `handlers.link` is fired for each result (broken or not). +urlChecker.enqueue(url, customData); +``` -* `.clearCache()` will remove any cached URL responses. This is only relevant if the `cacheResponses` option is enabled. -* `.dequeue(id)` removes a URL from the queue. Returns `true` on success or an `Error` on failure. -* `.enqueue(url, baseUrl, customData)` adds a URL to the queue. Queue items are auto-dequeued when their requests are completed. Returns a queue ID on success or an `Error` on failure. - * `baseUrl` is the address to which all relative URLs will be made absolute. Without a value, links to relative URLs will output an "Invalid URL" error. 
- * `customData` is optional data that is stored in the queue item for the URL. -* `.numActiveLinks()` returns the number of links with active requests. -* `.numQueuedLinks()` returns the number of links that currently have no active requests. +#### Methods & Properties +* `.clearCache()` will remove any cached URL responses. +* `.dequeue(id)` removes a URL from the queue. Returns `true` on success or `false` on failure. +* `.enqueue(url, customData)` adds a URL to the queue. Queue items are auto-dequeued when their requests are completed. Returns a queue ID on success. Arguments: + * `url` must be a [`URL`](https://mdn.io/URL). + * `customData` is optional data (of any type) that is stored in the queue item for the URL. +* `.has(id)` returns `true` if the queue contains an active or queued URL tagged with `id` and `false` if not. +* `.isPaused` returns `true` if the queue is paused and `false` if not. +* `.numActiveLinks` returns the number of links with active requests. +* `.numQueuedLinks` returns the number of links that currently have no active requests. * `.pause()` will pause the queue, but will not pause any active requests. * `.resume()` will resume the queue. -```js -var urlChecker = new blc.UrlChecker(options, { - link: function(result, customData){}, - end: function(){} -}); +#### Events +* `'end'` is emitted when the end of the queue has been reached. +* `'error'` is emitted when an error occurs within any of your event handlers and will prevent the current scan from failing. Arguments: + * `error` is the `Error`. +* `'link'` is emitted for each result (broken or not). Arguments: + * `result` is a [`Link`](https://github.com/stevenvachon/broken-link-checker/blob/master/lib/internal/Link.js). + * `customData` is whatever was queued. +* `'queue'` is emitted when a URL is queued, dequeued or made active. 
-urlChecker.enqueue(url, baseUrl, customData); -``` ## Options -### `options.acceptedSchemes` +### `acceptedSchemes` Type: `Array` -Default value: `["http","https"]` +Default value: `['http:','https:']` Will only check links with schemes/protocols mentioned in this list. Any others (except those in `excludedSchemes`) will output an "Invalid URL" error. -### `options.cacheExpiryTime` +### `cacheMaxAge` Type: `Number` -Default Value: `3600000` (1 hour) +Default Value: `3_600_000` (1 hour) The number of milliseconds in which a cached response should be considered valid. This is only relevant if the `cacheResponses` option is enabled. -### `options.cacheResponses` +### `cacheResponses` Type: `Boolean` Default Value: `true` URL request results will be cached when `true`. This will ensure that each unique URL will only be checked once. -### `options.excludedKeywords` +### `excludedKeywords` Type: `Array` Default value: `[]` -Will not check or output links that match the keywords and glob patterns in this list. The only wildcard supported is `*`. +A blacklist. Will not check links that match the keywords and glob patterns in this list. The only wildcards supported are [`*` and `!`](https://npmjs.com/matcher). This option does *not* apply to `UrlChecker`. -### `options.excludedSchemes` +### `excludedSchemes` Type: `Array` -Default value: `["data","geo","javascript","mailto","sms","tel"]` -Will not check or output links with schemes/protocols mentioned in this list. This avoids the output of "Invalid URL" errors with links that cannot be checked. +Default value: `['data:','geo:','javascript:','mailto:','sms:','tel:']` +Will not check links with schemes/protocols mentioned in this list. This avoids the output of "Invalid URL" errors with links that cannot be checked. This option does *not* apply to `UrlChecker`. 
-### `options.excludeExternalLinks` +### `excludeExternalLinks` Type: `Boolean` Default value: `false` -Will not check or output external links when `true`; relative links with a remote `` included. +Will not check external links (different protocol and/or host) when `true`; relative links with a remote `` included. This option does *not* apply to `UrlChecker`. -### `options.excludeInternalLinks` +### `excludeInternalLinks` Type: `Boolean` Default value: `false` -Will not check or output internal links when `true`. +Will not check internal links (same protocol and host) when `true`. This option does *not* apply to `UrlChecker` nor `SiteChecker`'s *crawler*. -### `options.excludeLinksToSamePage` +### `excludeLinksToSamePage` Type: `Boolean` -Default value: `true` -Will not check or output links to the same page; relative and absolute fragments/hashes included. +Default value: `false` +Will not check links to the same page; relative and absolute fragments/hashes included. This is only relevant if the `cacheResponses` option is disabled. This option does *not* apply to `UrlChecker`. -### `options.filterLevel` +### `filterLevel` Type: `Number` Default value: `1` The tags and attributes that are considered links for checking, split into the following levels: * `0`: clickable links -* `1`: clickable links, media, iframes, meta refreshes -* `2`: clickable links, media, iframes, meta refreshes, stylesheets, scripts, forms -* `3`: clickable links, media, iframes, meta refreshes, stylesheets, scripts, forms, metadata +* `1`: clickable links, media, frames, meta refreshes +* `2`: clickable links, media, frames, meta refreshes, stylesheets, scripts, forms +* `3`: clickable links, media, frames, meta refreshes, stylesheets, scripts, forms, metadata -Recursive links have a slightly different filter subset. To see the exact breakdown of both, check out the [tag map](https://github.com/stevenvachon/broken-link-checker/blob/master/lib/internal/tags.js). 
`` is not listed because it is not a link, though it is always parsed. +Recursive links have a slightly different filter subset. To see the exact breakdown of both, check out the [tag map](https://github.com/stevenvachon/broken-link-checker/blob/master/lib/internal/tags.js). `` is not listed because it is not a link, though it is always parsed. This option does *not* apply to `UrlChecker`. -### `options.honorRobotExclusions` +### `honorRobotExclusions` Type: `Boolean` Default value: `true` Will not scan pages that search engine crawlers would not follow. Such will have been specified with any of the following: @@ -253,104 +330,132 @@ Will not scan pages that search engine crawlers would not follow. Such will have This option does *not* apply to `UrlChecker`. -### `options.maxSockets` +### `includedKeywords` +Type: `Array` +Default value: `[]` +A whitelist. Will only check links that match the keywords and glob patterns in this list, if any. The only wildcard supported is `*`. + +This option does *not* apply to `UrlChecker`. + +### `includeLink` +Type: `Function` +Default value: `link => true` +A synchronous callback that is called after all other filters have been performed. Return `true` to include `link` (a [`Link`](https://github.com/stevenvachon/broken-link-checker/blob/master/lib/internal/Link.js)) in the list of links to be checked, or return `false` to have it skipped. + +This option does *not* apply to `UrlChecker`. + +### `includePage` +Type: `Function` +Default value: `url => true` +A synchronous callback that is called after all other filters have been performed. Return `true` to include `url` (a [`URL`](https://mdn.io/URL)) in the list of pages to be crawled, or return `false` to have it skipped. + +This option does *not* apply to `UrlChecker` nor `HtmlUrlChecker`. + +### `maxSockets` Type: `Number` Default value: `Infinity` The maximum number of links to check at any given time. 
-### `options.maxSocketsPerHost` +### `maxSocketsPerHost` Type: `Number` -Default value: `1` +Default value: `2` The maximum number of links per host/port to check at any given time. This avoids overloading a single target host with too many concurrent requests. This will not limit concurrent requests to other hosts. -### `options.rateLimit` +### `rateLimit` Type: `Number` Default value: `0` The number of milliseconds to wait before each request. -### `options.requestMethod` +### `requestMethod` Type: `String` -Default value: `"head"` -The HTTP request method used in checking links. If you experience problems, try using `"get"`, however `options.retry405Head` should have you covered. +Default value: `'head'` +The HTTP request method used in checking links. If you experience problems, try using `'get'`, however the `retryHeadFail` option should have you covered. + +### `retryHeadCodes` +Type: `Array` +Default value: `[405]` +The list of HTTP status codes for the `retryHeadFail` option to reference. -### `options.retry405Head` +### `retryHeadFail` Type: `Boolean` Default value: `true` -Some servers do not respond correctly to a `"head"` request method. When `true`, a link resulting in an HTTP 405 "Method Not Allowed" error will be re-requested using a `"get"` method before deciding that it is broken. +Some servers do not respond correctly to a `'head'` request method. When `true`, a link resulting in an HTTP status code listed within the `retryHeadCodes` option will be re-requested using a `'get'` method before deciding that it is broken. This is only relevant if the `requestMethod` option is set to `'head'`. -### `options.userAgent` +### `userAgent` Type: `String` -Default value: `"broken-link-checker/0.7.0 Node.js/5.5.0 (OS X El Capitan; x64)"` (or similar) +Default value: `'broken-link-checker/0.8.0 Node.js/8.9.4 (OS X; x64)'` (or similar) The HTTP user-agent to use when checking links as well as retrieving pages and robot exclusions. 
## Handling Broken/Excluded Links -A broken link will have a `broken` value of `true` and a reason code defined in `brokenReason`. A link that was not checked (emitted as `"junk"`) will have an `excluded` value of `true` and a reason code defined in `excludedReason`. +A broken link will have an `isBroken` value of `true` and a reason code defined in `brokenReason`. A link that was not checked (emitted as `'junk'`) will have an `wasExcluded` value of `true` and a reason code defined in `excludedReason`. ```js -if (result.broken) { - console.log(result.brokenReason); - //=> HTTP_404 -} else if (result.excluded) { - console.log(result.excludedReason); - //=> BLC_ROBOTS +if (link.isBroken) { + console.log(link.brokenReason); + //-> HTTP_406 +} else if (link.wasExcluded) { + console.log(link.excludedReason); + //-> BLC_ROBOTS } ``` Additionally, more descriptive messages are available for each reason code: ```js -console.log(blc.BLC_ROBOTS); //=> Robots Exclusion -console.log(blc.ERRNO_ECONNRESET); //=> connection reset by peer (ECONNRESET) -console.log(blc.HTTP_404); //=> Not Found (404) +const blc = require('broken-link-checker'); + +console.log(blc.reasons.BLC_ROBOTS); //-> Robots Exclusion +console.log(blc.reasons.ERRNO_ECONNRESET); //-> connection reset by peer (ECONNRESET) +console.log(blc.reasons.HTTP_404); //-> Not Found (404) // List all -console.log(blc); +console.log(blc.reasons); ``` Putting it all together: ```js -if (result.broken) { - console.log(blc[result.brokenReason]); -} else if (result.excluded) { - console.log(blc[result.excludedReason]); +if (link.isBroken) { + console.log(blc.reasons[link.brokenReason]); +} else if (link.wasExcluded) { + console.log(blc.reasons[link.excludedReason]); } ``` -## HTML and HTTP information -Detailed information for each link result is provided. 
Check out the [schema](https://github.com/stevenvachon/broken-link-checker/blob/master/lib/internal/linkObj.js#L16-L64) or: -```js -console.log(result); -``` - ## Roadmap Features -* fix issue where same-page links are not excluded when cache is enabled, despite `excludeLinksToSamePage===true` -* publicize filter handlers +* `'info'` event with messaging such as 'Site does not support HTTP HEAD method' (regarding `retryHeadFail` option) * add cheerio support by using parse5's htmlparser2 tree adaptor? -* add `rejectUnauthorized:false` option to avoid `UNABLE_TO_VERIFY_LEAF_SIGNATURE` -* load sitemap.xml at end of each `SiteChecker` site to possibly check pages that were not linked to +* load sitemap.xml at *start* of each `SiteChecker` site (since cache can expire) to possibly check pages that were not linked to, removing from list as *discovered* links are checked * remove `options.excludedSchemes` and handle schemes not in `options.acceptedSchemes` as junk? * change order of checking to: tcp error, 4xx code (broken), 5xx code (undetermined), 200 -* abort download of body when `options.retry405Head===true` +* abort download of body when `options.retryHeadFail===true` * option to retry broken links a number of times (default=0) * option to scrape `response.body` for erroneous sounding text (using [fathom](https://npmjs.com/fathom-web)?), since an error page could be presented but still have code 200 +* option to detect parked domain (302 with no redirect?) * option to check broken link on archive.org for archived version (using [this lib](https://npmjs.com/archive.org)) * option to run `HtmlUrlChecker` checks on page load (using [jsdom](https://npmjs.com/jsdom)) to include links added with JavaScript? * option to check if hashes exist in target URL document? 
* option to parse Markdown in `HtmlChecker` for links -* option to play sound when broken link is found -* option to hide unbroken links * option to check plain text URLs * add throttle profiles (0–9, -1 for "custom") for easy configuring -* check [ftp:](https://nmjs.com/ftp), [sftp:](https://npmjs.com/ssh2) (for downloadable files) +* check [ftp:](https://npmjs.com/ftp), [sftp:](https://npmjs.com/ssh2) (for downloadable files) * check ~~mailto:~~, news:, nntp:, telnet:? -* check local files if URL is relative and has no base URL? -* cli json mode -- streamed or not? -* cli non-tty mode -- change nesting ASCII artwork to time stamps? +* check that data URLs are valid (with [valid-data-url](https://www.npmjs.com/valid-data-url))? +* supply CORS error for file:// links on sites with a different protocol +* create an example with http://astexplorer.net +* swap [calmcard](https://npmjs.com/calmcard) for [matcher](https://npmjs.com/matcher) or both [minimatch](https://npmjs.com/minimatch) and `RegExp` +* use [debug](https://npmjs.com/debug) +* use [got](https://npmjs.com/got) ? +* use [bunyan](https://npmjs.com/bunyan) with JSON output for CLI +* store request object/headers (or just auth) in `Link.http`? +* supply basic auth for "page" events? 
+* add option for `URLCache` normalization profiles [npm-image]: https://img.shields.io/npm/v/broken-link-checker.svg [npm-url]: https://npmjs.org/package/broken-link-checker [travis-image]: https://img.shields.io/travis/stevenvachon/broken-link-checker.svg [travis-url]: https://travis-ci.org/stevenvachon/broken-link-checker -[david-image]: https://img.shields.io/david/stevenvachon/broken-link-checker.svg -[david-url]: https://david-dm.org/stevenvachon/broken-link-checker +[coveralls-image]: https://img.shields.io/coveralls/stevenvachon/broken-link-checker.svg +[coveralls-url]: https://coveralls.io/github/stevenvachon/broken-link-checker +[greenkeeper-image]: https://badges.greenkeeper.io/stevenvachon/broken-link-checker.svg +[greenkeeper-url]: https://greenkeeper.io/ diff --git a/bin/blc b/bin/blc old mode 100644 new mode 100755 index 86433d87..f985af0c --- a/bin/blc +++ b/bin/blc @@ -1,3 +1,3 @@ #!/usr/bin/env node -new (require("../lib/cli"))().input(); +require("../lib-cjs/cli")(); diff --git a/changelog.md b/changelog.md index 52f13c09..2f191036 100644 --- a/changelog.md +++ b/changelog.md @@ -1,3 +1,21 @@ +* 0.8.0 + * dropped support for Node versions below 12.0 + * API changes + * `linkObj` renamed to `Link`, and schema changed + * options added: `includedKeywords`, `includePage`, `retryHeadCodes` + * options changed: `acceptedSchemes`, `excludeLinksToSamePage`, `excludedSchemes` + * options renamed: `cacheExpiryTime`→`cacheMaxAge`, `customFilter`→`includeLink`, `retry405Head`→`retryHeadFail` + * CLI options added: `--include`, `--verbosity` + * CLI options removed: `--verbose` + * added [WHATWG URL specification](https://url.spec.whatwg.org)-compliance + * added support for HTTP basic auth, compression, proxies + * added support for file:// protocol + * added missing tags/attributes + * added support for pausing/resuming the CLI + * added progress bar and OS notification to the CLI + * refactored to ES2015+ + * test suite improvements + * bug fixes * 0.7.8 
fix for Node.js v9 * 0.7.7 added `--host-requests`, `--requests` CLI options * 0.7.6 bug fix @@ -10,11 +28,11 @@ * added `SiteChecker` * methods added: `numPages()`, `numSites()` * methods removed: `numActiveItems()` - * methods renamed: `length()`->`numQueuedLinks()` + * methods renamed: `length()`→`numQueuedLinks()` * options added: `honorRobotExclusions` * options removed: `excludeResponseData` * handlers added: `html`, `robots` - * handlers renamed: `item`->`page` + * handlers renamed: `item`→`page` * CLI options added: `--follow`, `--recursive`, `--user-agent` * linkObj added: `brokenReason`, `excludedReason`, `html.location` * linkObj removed: `error`, `http.redirects`, `http.statusCode` @@ -52,9 +70,9 @@ * API change * CLI options * options added: `excludeExternalLinks`, `excludeResponseData`, `maxSockets` - * options renamed: `maxSockets`->`maxSocketsPerHost` + * options renamed: `maxSockets`→`maxSocketsPerHost` * linkObj added: `http` - * linkObj moved: `response`->`http.response` + * linkObj moved: `response`→`http.response` * linkObj changed: `internal` and `samePage` now compares the base URL (ignoring ``) with links that may have `` applied * switched from [request](https://npmjs.com/request) to [bhttp](https://npmjs.com/bhttp) * 0.4.3 added `rateLimit` option, cleanup @@ -68,7 +86,7 @@ * linkObj added: `html.selector` * 0.3.0 * options added: `maxSockets` - * options renamed: `site`->`base` + * options renamed: `site`→`base` * `` supported * requesting links now only downloads the response header * faster test suite diff --git a/lib/cli.js b/lib/cli.js index 70a0ad51..e088ac68 100644 --- a/lib/cli.js +++ b/lib/cli.js @@ -1,510 +1,607 @@ -"use strict"; -var blc = require("./"); -var defaultOptions = require("./internal/defaultOptions"); -var pkg = require("../package.json"); - -var chalk = require("chalk"); -var humanizeDuration = require("humanize-duration"); -var nopter = require("nopter"); -var spinner = require("char-spinner"); - 
-process.on("unhandledRejection", function(reason, p) { - log( nopter.error.fatal("Unhandled Rejection", reason, "Error") ); - process.exit(1); -}); - -function cli() -{ - var filterLevel = "The types of tags and attributes that are considered links.\n"; - filterLevel += " 0: clickable links\n"; - filterLevel += " 1: 0 + media, iframes, meta refreshes\n"; - filterLevel += " 2: 1 + stylesheets, scripts, forms\n"; - filterLevel += " 3: 2 + metadata\n"; - filterLevel += " Default: "+defaultOptions.filterLevel; - - this.nopter = new nopter(); - - this.nopter.config( - { - title: "Broken Link Checker", - description: pkg.description, - version: pkg.version, - name: "blc", - options: - { - "exclude": - { - rename: "excludedKeywords", - info: "A keyword/glob to match links against. Can be used multiple times.", - type: [String, Array], - default: defaultOptions.excludedKeywords - }, - "exclude-external": - { - rename: "excludeExternalLinks", - short: "e", - info: "Will not check external links.", - type: Boolean - }, - "exclude-internal": - { - rename: "excludeInternalLinks", - short: "i", - info: "Will not check internal links.", - type: Boolean - }, - "filter-level": - { - info: filterLevel, - type: Number, - default: defaultOptions.filterLevel - }, - "follow": - { - rename: "followRobotExclusions", - short: "f", - info: "Force-follow robot exclusions.", - type: Boolean - }, - "get": - { - short: "g", - info: "Change request method to GET.", - type: Boolean - }, - "help": - { - short: ["h","?"], - info: "Display this help text.", - type: Boolean - }, - "input": - { - info: "URL to an HTML document.", - type: require("url") - }, - "host-requests": - { - rename: "maxSocketsPerHost", - info: "Concurrent requests limit per host.", - type: Number, - default: defaultOptions.maxSocketsPerHost - }, - "ordered": - { - rename: "maintainLinkOrder", - short: "o", - info: "Maintain the order of links as they appear in their HTML document.", - type: Boolean - }, - "recursive": - { - 
short: "r", - info: "Recursively scan (\"crawl\") the HTML document(s).", - type: Boolean - }, - "requests": - { - rename: "maxSockets", - info: "Concurrent requests limit.", - type: Number, - default: defaultOptions.maxSockets - }, - "user-agent": - { - info: "The user agent to use for link checks.", - type: String, - default: defaultOptions.userAgent - }, - "verbose": - { - short: "v", - info: "Display excluded links.", - type: Boolean - }, - "version": - { - short: "V", - info: "Display the app version.", - type: Boolean - } - }, - aliases: ["input"] - }); -} +/* eslint-disable no-console */ +import {BROKEN_REASON, EXCLUDED_REASON, HTML_INDEX, HTTP_RESPONSE_WAS_CACHED, IS_BROKEN, ORIGINAL_URL, REBASED_URL, WAS_EXCLUDED} from "./internal/Link"; +import DEFAULT_OPTIONS from "./internal/defaultOptions"; +import {END_EVENT, ERROR_EVENT, HTML_EVENT, JUNK_EVENT, LINK_EVENT, PAGE_EVENT, QUEUE_EVENT, SITE_EVENT} from "./internal/events"; +import Gauge from "gauge"; +import {GET_METHOD} from "./internal/methods"; +import {gray, green, red, white, yellow} from "chalk"; +import {HtmlUrlChecker, SiteChecker} from "./"; +import humanizeDuration from "humanize-duration"; +import longest from "longest"; +import {make_scanner as scanKeys} from "keyscan"; +import notifier from "node-notifier"; +import stripAnsi from "strip-ansi"; +import supportsSemigraphics from "supports-semigraphics"; +import {themes as gaugeThemes} from "gauge/themes"; +import {version as packageVersion} from "../package.json"; + + + +const title = "Broken Link Checker"; +let checker,checkerOptions,gauge,keyScanner,logOptions,pauseMessage,spinner,stats,urls; -cli.prototype.input = function(args, showArgs) +const argsToOptions = args => { - //var testing = args !== undefined; - args = this.nopter.input(args); - - //if (testing===true && showArgs===true) return args; - - if (args.help === true) - { - log( this.nopter.help() ); - } - else if (args.version === true) + const renames = { - log(pkg.version); - } - 
else if (args.input != null) + exclude: "excludedKeywords", + excludeExternal: "excludeExternalLinks", + excludeInternal: "excludeInternalLinks", + follow: "followRobotExclusions", + hostRequests: "maxSocketsPerHost", + include: "includedKeywords", + ordered: "maintainLinkOrder", + requests: "maxSockets" + }; + + return Object.entries(args).reduce((opts, [argName, argValue]) => { - // TODO :: remove this when nopter's default values for Arrays are fixed - if (args.excludedKeywords === undefined) + if (argName in renames) { - args.excludedKeywords = defaultOptions.excludedKeywords; + opts[ renames[argName] ] = argValue; } - - run(args.input, + else if (argName in DEFAULT_OPTIONS) { - excludedKeywords: args.excludedKeywords, - excludeExternalLinks: args.excludeExternalLinks===true, - excludeInternalLinks: args.excludeInternalLinks===true, - excludeLinksToSamePage: args.verbose!==true, - filterLevel: args.filterLevel, - honorRobotExclusions: args.followRobotExclusions!==true, - maxSockets: args.maxSockets, - maxSocketsPerHost: args.maxSocketsPerHost, - requestMethod: args.get!==true ? "head" : "get", - userAgent: args.userAgent - }, + opts[argName] = argValue; + } + else if (args.get) { - excludeCachedLinks: args.verbose!==true, - excludeFilteredLinks: args.verbose!==true, - maintainLinkOrder: args.maintainLinkOrder, - recursive: args.recursive - }); - } - else - { - log( nopter.error.fatal("Input URL required", "Use --help for more options", "Error") ); - process.exit(1); - } + opts.requestMethod = GET_METHOD; + } + + return opts; + }, {}); }; -function log() +const log = (...args) => { - // Avoid spinner chars getting stuck in the log - spinner.clear(); - - console.log.apply(null, arguments); -} + // Avoid spinner/progress chars getting stuck in the log + gauge.hide(); + + console.log(...args); + + gauge.show(); +}; -function logMetrics(brokenLinks, excludedLinks, totalLinks, duration, preBreak, exit) +log.page = pageURL => { - var output = preBreak===true ? 
"\n" : ""; - - output += chalk.gray("Finished! "+totalLinks+" links found."); - - if (excludedLinks > 0) - { - output += chalk.gray(" "+excludedLinks+" excluded."); - } - - if (totalLinks > 0) - { - output += chalk.gray(" "); - output += chalk[ brokenLinks>0 ? "red" : "green" ](brokenLinks+" broken"); - output += chalk.gray("."); - } - - if (duration != null) + log( white("\nGetting links from: ") + yellow(pageURL) ); +}; + + + +log.page.metrics = () => +{ + let output = gray(`Finished! ${stats.page.totalLinks} links found.`); + + if (stats.page.skippedLinks > 0) { - output += chalk.gray("\nElapsed time: "); - output += chalk.gray( humanizeDuration(duration, {round:true, largest:2}) ); + output += gray(` ${stats.page.skippedLinks} skipped.`); } - - log(output); - if (exit === true) + if (stats.page.totalLinks > 0) { - process.exit(brokenLinks===0 ? 0 : 1); + output += gray(" "); + + if (stats.page.brokenLinks > 0) + { + output += red(`${stats.page.brokenLinks} broken`); + } + else + { + output += green(`${stats.page.brokenLinks} broken`); + } + + output += gray("."); } -} + + log(output); +}; -/* - Ensure that `logMetrics()` is called after `logResults_delayed()`. -*/ -function logMetrics_delayed(brokenLinks, excludedLinks, totalLinks, duration, preBreak, exit) +log.progress = () => { - setImmediate( function() + const links = checker.numActiveLinks + checker.numQueuedLinks; + + const pageCompletion = links>0 ? 
1/links : 0; + + if (logOptions.recursive) { - logMetrics(brokenLinks, excludedLinks, totalLinks, duration, preBreak, exit); - }); -} + gauge.show(`Links:${links} Pages:${checker.numPages} Sites:${checker.numSites}`, pageCompletion); + } + else + { + gauge.show(`Links:${links} Pages:${checker.numPages}`, pageCompletion); + } +}; -function logPage(data, pageUrl) +log.result = /*(*/result/*, finalResult)*/ => { - var output = ""; - - if (++data.total.pages > 1) output += "\n"; - - output += chalk.white("Getting links from: ") + chalk.yellow(pageUrl); - - log(output); -} - + if (result.displayed) + { + // @todo if the last result is skipped, the last RENDERED result will not be "└─" + let output = gray( /*finalResult!==true ?*/ "├─" /*: "└─"*/ ); + const {link} = result; -function logResult(result, finalResult) -{ - var output = ""; - - if (result.__cli_excluded !== true) - { - // TODO :: if later results are skipped, the last RENDERED result will not be "└─" - output = chalk.gray( finalResult!==true ? "├─" : "└─" ); - - if (result.broken === true) + if (link.get(IS_BROKEN)) { - output += chalk.red("BROKEN"); - output += chalk.gray("─ "); + output += red("BROKEN"); + output += gray("─ "); } - else if (result.excluded === true) + else if (link.get(WAS_EXCLUDED)) { - output += chalk.gray("─SKIP── "); + output += gray("─SKIP── "); } else { - output += chalk.gray("──"); - output += chalk.green("OK"); - output += chalk.gray("─── "); + output += gray("──"); + output += green("OK"); + output += gray("─── "); } - - if (result.url.resolved != null) - { - output += chalk.yellow( result.url.resolved ); - } - else - { - // Excluded scheme - output += chalk.yellow( result.url.original ); - } - - if (result.broken === true) + + // @todo is ORIGINAL_URL only for invalid links? + output += yellow( link.get(REBASED_URL) ?? 
link.get(ORIGINAL_URL) ); + + if (link.get(IS_BROKEN)) { - output += chalk.gray(" ("+ result.brokenReason +")"); + output += gray(` (${link.get(BROKEN_REASON)})`); } - else if (result.excluded === true) + else if (link.get(WAS_EXCLUDED)) { - output += chalk.gray(" ("+ result.excludedReason +")"); + output += gray(` (${link.get(EXCLUDED_REASON)})`); } // Don't display cached message if broken/excluded message is displayed - else if (result.http.cached === true) + else if (link.get(HTTP_RESPONSE_WAS_CACHED)) { - output += chalk.gray(" (CACHED)"); + output += gray(" (CACHED)"); } + + log(output); } - - return output; -} +}; -/* - Logs links in the order that they are found in their containing HTML - document, even if later links receive an earlier response. -*/ -function logResults(data) +/** + * Logs links in the order that they are found in their containing HTML + * document, even if later links receive an earlier response. + */ +log.results = () => { - var done,output,result; - var nextIsReady = true; - - while (nextIsReady) + // eslint-disable-next-line no-constant-condition + while (true) { - result = data.page.results[data.page.currentIndex]; - + const result = stats.page.results[stats.page.currentIndex]; + if (result !== undefined) { - done = data.page.done===true && data.page.currentIndex>=data.page.results.length-1; - - output = logResult(result, done); - - if (output !== "") log(output); - if (done === true) return; - - data.page.currentIndex++; + //const final = stats.page.currentIndex>=stats.page.results.length-1 && checker.numActiveLinks===0 && checker.numQueuedLinks===0; + + log.result(result/*, final*/); + + stats.page.currentIndex++; } else { - nextIsReady = false; + break; } } -} +}; -/* - Ensure that `logResults()` is called after `data.page.done=true`. 
-*/ -function logResults_delayed(data) +log.site = () => { - // Avoid more than one delay via multiple synchronous iterations - if (data.delay === null) + let output = ""; + + if (++stats.site.totalPages > 1) { - data.delay = setImmediate( function() - { - logResults(data); - data.delay = null; - }); + output += "\n"; } -} + output += white("\nStarting recursive scan..."); + + log(output); +}; -function pushResult(data, result, options) + +// @todo number of unique/uncached links +// @todo "excluded links" [from cli] doesn't make sense with a value of 0 when there're skipped links in the log +log.site.metrics = () => { - if (options.maintainLinkOrder === true) + let output = ""; + output += gray(`\nLinks found: ${stats.site.totalLinks}`); + output += gray(`\nLinks skipped: ${stats.site.skippedLinks}`); + output += gray(`\nLinks OK: ${stats.site.totalLinks - stats.site.skippedLinks - stats.site.brokenLinks}`); + + let broken; + + if (stats.site.totalLinks > 0) { - data.page.results[result.html.index] = result; + broken = stats.site.brokenLinks>0 ? 
red : green; } else { - data.page.results.push(result); + broken = gray; } -} + output += broken(`\nLinks broken: ${stats.site.brokenLinks}`); + output += gray("\nTime elapsed: "); + output += gray( humanizeDuration(Date.now() - stats.site.startTime, {largest:2, round:true}) ); + const separator = gray("=".repeat( longest(stripAnsi(output).split("\n")).length )); -function resetPageData(data) -{ - data.page.brokenLinks = 0; - data.page.currentIndex = 0; - data.page.done = false; - data.page.excludedLinks = 0; - data.page.results = []; - //data.page.startTime = Date.now(); - data.page.totalLinks = 0; -} + log(`\n${separator}${output}\n${separator}\n`); +}; -function run(url, checkerOptions, logOptions) +const run = () => { - var handlers,instance; - var data = + Object.values(gaugeThemes).forEach(theme => { - delay: null, - page: {}, - total: - { - brokenLinks: 0, - excludedLinks: 0, - links: 0, - pages: 0, - startTime: Date.now() - } - }; - - // In case first page doesn't call "html" handler - resetPageData(data); - - handlers = + //theme.preProgressbar = `\n\n${theme.preProgressbar}`; + theme.preSubsection = gray("—"); + }); + + gauge = new Gauge(); + stats = new Statistics(); + + if (logOptions.recursive) { - html: function(tree, robots, response, pageUrl) - { - resetPageData(data); - - logPage(data, pageUrl); - }, - junk: function(result) - { - if (logOptions.excludeFilteredLinks === true) - { - result.__cli_excluded = true; - - data.page.excludedLinks++; - data.total.excludedLinks++; - } - - data.page.totalLinks++; - data.total.links++; - - pushResult(data, result, logOptions); - - logResults_delayed(data); - }, - link: function(result) - { - // Exclude cached links only if not broken - if (result.broken===false && result.http.cached===true && logOptions.excludeCachedLinks===true) - { - result.__cli_excluded = true; - - data.page.excludedLinks++; - data.total.excludedLinks++; - } - else if (result.broken === true) - { - data.page.brokenLinks++; - 
data.total.brokenLinks++; - } - - data.page.totalLinks++; - data.total.links++; - - pushResult(data, result, logOptions); - - logResults_delayed(data); - }, - page: function(error, pageUrl) + checker = new SiteChecker(checkerOptions); + } + else + { + checker = new HtmlUrlChecker(checkerOptions); + } + + checker + .on(HTML_EVENT, (tree, robots, response, pageURL) => + { + log.page(pageURL); + }) + .on(QUEUE_EVENT, () => + { + log.progress(); + }) + .on(JUNK_EVENT, link => + { + stats.pushResult(link); + log.progress(); + log.results(); + }) + .on(LINK_EVENT, link => + { + stats.pushResult(link); + log.progress(); + log.results(); + }) + .on(PAGE_EVENT, (error, pageURL) => + { + if (error != null) { - if (error != null) + // HTML_EVENT will not have been called + log.page(pageURL); + + if (error.code<200 || error.code>299) { - // "html" handler will not have been called - logPage(data, pageUrl); - - log( chalk[ error.code!==200 ? "red" : "gray" ](error.name+": "+error.message) ); + log( red(`${error.name}: ${error.message}`) ); } else { - data.page.done = true; - - logMetrics_delayed(data.page.brokenLinks, data.page.excludedLinks, data.page.totalLinks); + log( gray(`${error.name}: ${error.message}`) ); } - }, - end: function() + + process.exitCode = 1; + } + // If more than a total of one page will be scanned + else if (logOptions.recursive || urls.length>1) { - if (data.total.pages <= 0) - { - process.exit(1); - } - else if (data.total.pages === 1) + log.page.metrics(); + log.progress(); + stats.resetPage(); + + // If nothing after current page + if (checker.numPages === 1) { - process.exit(data.page.done===true && data.total.brokenLinks===0 ? 0 : 1); + log.site.metrics(); } - else if (data.total.pages > 1) + } + else + { + log.site.metrics(); + } + }) + .on(SITE_EVENT, () => + { + log.site.metrics(); + stats.resetSite(); + }) + .on(END_EVENT, () => + { + // @todo store multiple site stats in an array and log all site metrics at very end? 
+ + if (supportsSemigraphics()) + { + // Exit gracefully + clearTimeout(spinner); + gauge.disable(); + keyScanner.release(); + + // @todo https://github.com/mikaelbr/node-notifier/issues/174 + notifier.notify({ message:"Finished!", title }); + } + }) + .on(ERROR_EVENT, error => + { + console.error(error); + + // eslint-disable-next-line no-process-exit + process.exit(1); + }); + + if (logOptions.recursive) + { + log.site(); + } + + if (supportsSemigraphics()) + { + // Show pause message + togglePause(false); + + keyScanner = scanKeys(key => + { + if (key.parsed === "space") { - logMetrics_delayed(data.total.brokenLinks, data.total.excludedLinks, data.total.links, Date.now()-data.total.startTime, true, true); + togglePause(); } + }); + } + else + { + gauge.disable(); + } + + try + { + checker.pause(); // avoid auto-start + urls.forEach(url => checker.enqueue(new URL(url))); + checker.resume(); // start, if above didn't throw + } + catch ({message}) + { + console.error(message); + process.exitCode = 1; + } +}; + + + +const spinnerInterval = () => +{ + spinner = setTimeout(() => + { + gauge.pulse(pauseMessage); + spinnerInterval(); + }, 50); +}; + + + +class Statistics +{ + constructor() + { + this.page = {}; + this.site = {}; + this.resetSite(); + } + + pushResult(link) + { + const result = { displayed:true, link }; + + const hideCachedLink = logOptions.hideCachedLinks && link.get(IS_BROKEN)===false && link.get(HTTP_RESPONSE_WAS_CACHED); + const hideSkippedLink = logOptions.hideSkippedLinks && link.get(WAS_EXCLUDED); + const hideUnbrokenLink = logOptions.hideUnbrokenLinks && link.get(IS_BROKEN)===false; + + if (hideCachedLink || hideSkippedLink || hideUnbrokenLink) + { + this.page.hiddenLinks++; + this.site.hiddenLinks++; + result.displayed = false; } - }; - - if (logOptions.recursive !== true) + + if (link.get(IS_BROKEN)) + { + this.page.brokenLinks++; + this.site.brokenLinks++; + process.exitCode = 1; + } + else if (link.get(WAS_EXCLUDED)) + { + 
this.page.skippedLinks++; + this.site.skippedLinks++; + } + + this.page.totalLinks++; + this.site.totalLinks++; + + if (logOptions.maintainLinkOrder) + { + this.page.results[link.get(HTML_INDEX)] = result; + } + else + { + this.page.results.push(result); + } + } + + resetPage() { - instance = new blc.HtmlUrlChecker(checkerOptions, handlers); + this.page.brokenLinks = 0; + this.page.currentIndex = 0; + this.page.hiddenLinks = 0; + this.page.results = []; + this.page.skippedLinks = 0; + //this.page.startTime = Date.now(); + this.page.totalLinks = 0; } - else + + resetSite() { - instance = new blc.SiteChecker(checkerOptions, handlers); + this.resetPage(); + this.site.brokenLinks = 0; + this.site.hiddenLinks = 0; + this.site.skippedLinks = 0; + this.site.startTime = Date.now(); + this.site.totalLinks = 0; + this.site.totalPages = 0; } - - spinner(); - - instance.enqueue(url); } -module.exports = cli; +const togglePause = pause => +{ + if (pause === undefined) + { + pause = !checker.isPaused; + } + + if (pause) + { + checker.pause(); + + pauseMessage = `${yellow("PAUSED")}${gray(" — press space to resume")}`; + gauge.pulse(pauseMessage); + clearTimeout(spinner); + } + else + { + checker.resume(); + + pauseMessage = gray("press space to pause"); + spinner = spinnerInterval(); + } + + log.progress(); +}; + + + +export default (args=process.argv) => +{ + const filterLevel = + [ + "--filter-level:", + " 0: clickable links", + " 1: 0 + media, frames, meta refreshes", + " 2: 1 + stylesheets, scripts, forms", + " 3: 2 + metadata" + ].join("\n"); + + const verbosity = + [ + "--verbosity:", + " 0: broken links", + " 1: 0 + unbroken links", + " 2: 1 + skipped links" + ].join("\n"); + + /* eslint-disable sort-keys */ + const optionator = require("optionator")( + { + prepend: `${yellow(title.toUpperCase())}\n\n${green("Usage:")} blc [options] url1 [url2 ...]`, + append: `${gray(filterLevel)}\n\n${gray(verbosity)}\n`, + options: + [ + { heading:"Common Options" }, + { 
option:"recursive", alias:"r", type:"Boolean", description:`Recursively scan ("crawl") the HTML document(s)`, default:"false" }, + + { heading:"Filtering Options" }, + { option:"exclude", type:"[String]", description:"Skip checking of links that match keywords/glob" }, + { option:"exclude-external", alias:"e", type:"Boolean", description:"Skip checking of external links", default:"false" }, + { option:"exclude-internal", alias:"i", type:"Boolean", description:"Skip checking of internal links", default:"false" }, + { option:"filter-level", type:"Number", description:"Include checking of links by HTML properties", default:`${DEFAULT_OPTIONS.filterLevel}` }, + { option:"follow", alias:"f", type:"Boolean", description:"Force-follow robot exclusions", default:"false" }, + { option:"include", type:"[String]", description:"Only check links that match keywords/glob" }, + + { heading:"Display Options" }, + { option:"help", alias:"h", type:"Boolean", description:"Display this help text", default:"false" }, + { option:"ordered", alias:"o", type:"Boolean", description:"Maintain the order of links as they appear in their HTML document", default:"false" }, + { option:"verbosity", type:"Number", description:"The display verbosity level", default:"1" }, + { option:"version", alias:"v", type:"Boolean", description:"Display the app version", default:"false" }, + + { heading:"Advanced Options" }, + { option:"get", alias:"g", type:"Boolean", description:"Change request method to GET", default:"false" }, + { option:"host-requests", type:"Number", description:"Concurrent requests limit per host", default:`${DEFAULT_OPTIONS.maxSocketsPerHost}` }, + { option:"requests", type:"Number", description:"Concurrent requests limit ", default:`${DEFAULT_OPTIONS.maxSockets}` }, + { option:"user-agent", type:"String", description:"The user agent to use for checking links" } + ] + }); + /* eslint-disable sort-keys */ + + try + { + if (args === process.argv) + { + args = optionator.parseArgv(args); + 
} + else + { + args = optionator.parse(args); + } + } + catch (error) + { + args = error; + } + + if (args instanceof Error) + { + console.error(args.message); + process.exitCode = 1; + } + else if (args.help) + { + console.log( optionator.generateHelp() ); + } + else if (args.version) + { + console.log(packageVersion); + } + else if (args._.length > 0) + { + urls = args._; + checkerOptions = argsToOptions(args); + logOptions = + { + hideCachedLinks: args.verbosity < 2, + hideSkippedLinks: args.verbosity < 2, + hideUnbrokenLinks: args.verbosity < 1, + maintainLinkOrder: args.ordered, + recursive: args.recursive === true // default value is undefined + }; + + run(); + } + else + { + console.error("At least one URL is required - see '--help'"); + process.exitCode = 1; + } +}; diff --git a/lib/index.js b/lib/index.js index a11e1ee2..658035b4 100644 --- a/lib/index.js +++ b/lib/index.js @@ -1,23 +1,16 @@ -"use strict"; -var reasons = require("./internal/messages").reasons; - - - -var blc = -{ - HtmlChecker: require("./public/HtmlChecker"), - HtmlUrlChecker: require("./public/HtmlUrlChecker"), - SiteChecker: require("./public/SiteChecker"), - UrlChecker: require("./public/UrlChecker") -}; - - - -for (var i in reasons) -{ - blc[i] = reasons[i]; -} - - - -module.exports = blc; +import * as reasons from "./internal/reasons"; +import DEFAULT_OPTIONS from "./internal/defaultOptions"; +import HtmlChecker from "./public/HtmlChecker"; +import HtmlUrlChecker from "./public/HtmlUrlChecker"; +import SiteChecker from "./public/SiteChecker"; +import UrlChecker from "./public/UrlChecker"; + +export * from "./internal/events"; +export * from "./internal/methods"; + +// @todo https://github.com/tc39/proposal-export-default-from +export {DEFAULT_OPTIONS}; +export {HtmlChecker, HtmlUrlChecker, SiteChecker, UrlChecker}; + +// @todo https://github.com/tc39/proposal-export-ns-from +export {reasons}; diff --git a/lib/internal/Link.js b/lib/internal/Link.js new file mode 100644 index 
00000000..0c73a6e8 --- /dev/null +++ b/lib/internal/Link.js @@ -0,0 +1,330 @@ +import * as reasons from "./reasons"; +import isURL from "isurl"; +import URLRelation from "url-relation"; + + + +export const ORIGINAL_URL = "originalURL"; // The URL string as it was inputted +export const RESOLVED_URL = "resolvedURL"; // The `URL`, resolved with `RESOLVED_BASE_URL` +export const REBASED_URL = "rebasedURL"; // The `URL`, resolved with `REBASED_BASE_URL` +export const REDIRECTED_URL = "redirectedURL"; // The `URL`, after its last redirection, if any + +export const RESOLVED_BASE_URL = "resolvedBaseURL"; // The base `URL` +export const REBASED_BASE_URL = "rebasedBaseURL"; // The base `URL`, resolved with `HTML_BASE_HREF` + +export const HTML_INDEX = "htmlIndex"; // The order in which the link appeared in its document -- using max-level tag filter +export const HTML_OFFSET_INDEX = "htmlOffsetIndex"; // Sequential (gap-free) indices for skipped and unskipped links +export const HTML_LOCATION = "htmlLocation"; // Source code location of the attribute that the link was found within +export const HTML_SELECTOR = "htmlSelector"; // CSS selector for element in document +export const HTML_TAG_NAME = "htmlTagName"; // Tag name that the link was found on +export const HTML_ATTR_NAME = "htmlAttrName"; // Attribute name that the link was found within +export const HTML_ATTRS = "htmlAttrs"; // All attributes on the element +export const HTML_TEXT = "htmlText"; // TextNodes/innerText of the element +export const HTML_TAG = "htmlTag"; // The entire tag string +export const HTML_BASE_HREF = "htmlBaseHref"; // The document's `` value + +export const HTTP_RESPONSE = "httpResponse"; // The request response +export const HTTP_RESPONSE_WAS_CACHED = "httpResponseWasCached"; // If the response was from cache + +export const IS_BROKEN = "isBroken"; // If the link was determined to be broken or not +export const IS_INTERNAL = "isInternal"; // If the link is to the same host as its base/document 
+export const IS_SAME_PAGE = "isSamePage"; // If the link is to the same page as its base/document +export const WAS_EXCLUDED = "wasExcluded"; // If the link was excluded due to any filtering + +export const BROKEN_REASON = "brokenReason"; // The reason why the link was considered broken, if it indeed is +export const EXCLUDED_REASON = "excludedReason"; // The reason why the link was excluded from being checked, if it indeed was + + + +export default class Link extends Map +{ + /** + * @param {Link} [link] + */ + constructor(link) + { + super(link); + + if (!(link instanceof Link)) + { + // Default values + keysAsList.forEach(key => super.set(key, null)); + } + } + + + /** + * Change state to "broken" with a reason. + * @param {string} reasonKey + * @returns {Link} + */ + break(reasonKey) + { + if (!(reasonKey in reasons)) + { + reasonKey = "BLC_UNKNOWN"; + } + + super.set(IS_BROKEN, true); + super.set(BROKEN_REASON, reasonKey); + return this; + } + + + + /** + * Change state to "excluded" with a reason. + * @param {string} reasonKey + * @returns {Link} + */ + exclude(reasonKey) + { + super.set(WAS_EXCLUDED, true); + super.set(EXCLUDED_REASON, reasonKey); + return this; + } + + + + /** + * Change state to "not excluded" and remove any previous reason for being otherwise. + * @returns {Link} + */ + include() + { + super.set(WAS_EXCLUDED, false); + super.set(EXCLUDED_REASON, null); + return this; + } + + + + /** + * Change state to "not broken" and remove any previous reason for being otherwise. + * @returns {Link} + */ + mend() + { + super.set(IS_BROKEN, false); + super.set(BROKEN_REASON, null); + return this; + } + + + /** + * Assign a redirected URL and change any relative state. + * @param {URL|string} url + * @returns {Link} + */ + redirect(url) + { + super.set(REDIRECTED_URL, parseURL(url)); + + this.#relateWithBase(); + return this; + } + + + + /** + * Reassign properties associated with state relative to the link's environment. 
+ */ + #relateWithBase() + { + const url = super.get(REDIRECTED_URL) ?? super.get(REBASED_URL); + + // If impossible to determine is linked to same server/etc + if (url===null || super.get(RESOLVED_BASE_URL)===null) + { + // Overwrite any previous values + super.set(IS_INTERNAL, null); + super.set(IS_SAME_PAGE, null); + } + else + { + // Rebased base URL not used because `` URL could be remote + // @todo common/careful profile + // @todo auth shouldn't affect this + const relation = new URLRelation(url, super.get(RESOLVED_BASE_URL)); + + super.set(IS_INTERNAL, relation.upTo(URLRelation.HOST)); + super.set(IS_SAME_PAGE, relation.upTo(URLRelation.PATH)); + } + } + + + + /** + * Produce and assign an absolute URL and change any relative state. + * @param {URL|string} url + * @param {URL|string} base + * @returns {Link} + */ + resolve(url, base) + { + if (url != null) + { + // Parse or clone + base = parseURL(base); + + if (isURL.lenient(url)) + { + super.set(ORIGINAL_URL, url.href); + super.set(RESOLVED_URL, url); + } + else + { + super.set(ORIGINAL_URL, url); + super.set(RESOLVED_URL, parseURL(url)); + } + + if (base !== null) + { + // Remove any hash since it's useless in a base -- safe to mutate + base.hash = ""; + + const rebased = parseURL(super.get(HTML_BASE_HREF), base); + + super.set(REBASED_BASE_URL, rebased ?? 
base); + super.set(RESOLVED_BASE_URL, base); + } + else + { + super.set(REBASED_BASE_URL, parseURL(super.get(HTML_BASE_HREF))); + } + + if (super.get(REBASED_BASE_URL) !== null) + { + // Remove any hash since it's useless in a base -- safe to mutate + super.get(REBASED_BASE_URL).hash = ""; + + if (super.get(RESOLVED_URL) === null) + { + super.set(RESOLVED_URL, parseURL(url, super.get(RESOLVED_BASE_URL))); + super.set(REBASED_URL, parseURL(url, super.get(REBASED_BASE_URL))); + } + else + { + super.set(REBASED_URL, super.get(RESOLVED_URL)); + } + } + else + { + super.set(REBASED_URL, super.get(RESOLVED_URL)); + } + + // @todo move relation stuff out of this function -- separation of concerns? + this.#relateWithBase(); + } + + return this; + } + + + + /** + * Assign a value to a supported key. + * @param {symbol} key + * @param {*} value + * @throws {TypeError} unsupported key or undefined value + * @returns {Link} + */ + set(key, value) + { + if (!(key in keysAsKeys)) + { + throw new TypeError("Invalid key"); + } + else if (value === undefined) + { + throw new TypeError("Invalid value"); + } + else + { + return super.set(key, value); + } + } + + + + /** + * Produce a key-value object for `JSON.stringify()`. 
+ * @returns {object} + */ + toJSON() + { + // @todo https://github.com/tc39/proposal-pipeline-operator + return Object.fromEntries(Array.from(super.entries())); + } +} + + + +const keysAsValues = +{ + BROKEN_REASON, + EXCLUDED_REASON, + HTML_ATTR_NAME, + HTML_ATTRS, + HTML_BASE_HREF, + HTML_INDEX, + HTML_LOCATION, + HTML_OFFSET_INDEX, + HTML_SELECTOR, + HTML_TAG, + HTML_TAG_NAME, + HTML_TEXT, + HTTP_RESPONSE, + HTTP_RESPONSE_WAS_CACHED, + IS_BROKEN, + IS_INTERNAL, + IS_SAME_PAGE, + ORIGINAL_URL, + REBASED_BASE_URL, + REBASED_URL, + REDIRECTED_URL, + RESOLVED_BASE_URL, + RESOLVED_URL, + WAS_EXCLUDED +}; + + + +const keysAsList = Object.values(keysAsValues); + + + +const keysAsKeys = keysAsList.reduce((result, value) => +{ + result[value] = true; // memoized value + return result; +}, {}); + + + +/** + * Parse or clone a URL. + * @param {URL|string|null} [url] + * @param {URL|string} [base] + * @returns {URL|null} + */ +const parseURL = (url=null, base) => +{ + if (url !== null) + { + try + { + url = new URL(url, base); + } + catch + { + url = null; + } + } + + return url; +}; + + + +Object.freeze(Link); diff --git a/lib/internal/SafeEventEmitter.js b/lib/internal/SafeEventEmitter.js new file mode 100644 index 00000000..d6e0739a --- /dev/null +++ b/lib/internal/SafeEventEmitter.js @@ -0,0 +1,24 @@ +import {ERROR_EVENT} from "../internal/events"; +import {EventEmitter} from "events"; + + + +export default class SafeEventEmitter extends EventEmitter +{ + /** + * Emit an event while catching any errors within consumer handlers. 
+ * @param {string} type + * @param {...*} args + */ + emit(type, ...args) + { + try + { + super.emit(type, ...args); + } + catch (error) + { + super.emit(ERROR_EVENT, error); + } + } +} diff --git a/lib/internal/checkLink.js b/lib/internal/checkLink.js new file mode 100644 index 00000000..c9eef981 --- /dev/null +++ b/lib/internal/checkLink.js @@ -0,0 +1,164 @@ +import {cloneDeep} from "lodash"; +//import {join as joinPath} from "path"; +import Link, {HTTP_RESPONSE, HTTP_RESPONSE_WAS_CACHED, REBASED_URL} from "./Link"; +//import {promises as fs} from "fs"; +import requestHTTP from "./requestHTTP"; +import URLRelation from "url-relation"; + +//const {stat:statFile} = fs; + + + +/** + * Check a link on the local file system. + * @param {Link} link + * @param {URLCache} cache + * @param {object} options + * @returns {Promise} + */ +/*const checkFile = async (link, cache, options) => +{ + try + { + const {isFile} = await statFile(link.get(REBASED_URL).pathname); + + if (!isFile()) + { + //throw new Error("ERRNOTFOUND"); + } + + link.mend(); + } + catch ({code}) + { + link.break(`ERRNO_${code}`); + } + finally + { + return link; + } +};*/ + + + +/** + * Check a link via HTTP. + * @param {Link} link + * @param {object} auth + * @param {URLCache} cache + * @param {object} options + * @returns {Promise} + */ +const checkHTTP = async (link, auth, cache, options) => +{ + const result = await requestHTTP(link.get(REBASED_URL), auth, options.requestMethod, cache, options) + .then(({response}) => response) // exclude any stream + .catch(error => error); + + copyResponseData(result, link, options); + + link.set(HTTP_RESPONSE_WAS_CACHED, false); + + return link; +}; + + + +/** + * Copy data from a cached or uncached response into a Link. 
+ * @param {object|Error} response + * @param {Link} link + * @param {object} options + */ +const copyResponseData = (response, link, {cacheResponses}) => +{ + if (response instanceof Error) + { + link.break(`ERRNO_${response.code}`); + } + else + { + if (response.status<200 || response.status>299) + { + link.break(`HTTP_${response.status}`); + } + else + { + link.mend(); + } + + // @todo would a string check be sufficient? + if (!URLRelation.match(response.url, link.get(REBASED_URL), { targetComponent:URLRelation.PATH })) + { + // @todo this needs a test + // @todo test if redirected to a different protocol + link.redirect(response.url); + } + + if (cacheResponses) + { + // Avoid potential mutations to cache + response = cloneDeep(response); + } + + link.set(HTTP_RESPONSE, response); + } +}; + + + +/** + * Check a link's URL to see if it is broken or not. + * @param {Link} link + * @param {object} auth + * @param {URLCache} cache + * @param {object} options + * @throws {TypeError} non-Link + * @returns {Promise} + */ +export default async (link, auth, cache, options) => +{ + if (!(link instanceof Link)) + { + throw new TypeError("Invalid Link"); + } + else + { + let output; + + // @todo move out to a `Link::invalidate()` to share with `HtmlChecker()` ? 
+ if (!(link.get(REBASED_URL)?.protocol in options.acceptedSchemes)) + { + link.break("BLC_INVALID"); + output = link; + } + else if (options.cacheResponses) + { + // @todo different auths can have different responses + const result = cache.get(link.get(REBASED_URL)); + + if (result !== undefined) + { + copyResponseData(await result, link, options); + + link.set(HTTP_RESPONSE_WAS_CACHED, true); + output = link; + } + } + + if (output) + { + return output; + } + else + { + /*switch (link.get(REBASED_URL).protocol) + { + "file:": return checkFile(link, cache, options); + + "http:": + "https:":*/ return checkHTTP(link, auth, cache, options); + //} + } + } +}; diff --git a/lib/internal/checkUrl.js b/lib/internal/checkUrl.js deleted file mode 100644 index 5130761f..00000000 --- a/lib/internal/checkUrl.js +++ /dev/null @@ -1,164 +0,0 @@ -"use strict"; -var linkObj = require("./linkObj"); -var reasons = require("./messages").reasons; -var simpleResponse = require("./simpleResponse"); - -var bhttp = require("bhttp"); -var extend = require("extend"); -var isString = require("is-string"); - - - -/* - Checks a URL to see if it's broken or not. -*/ -function checkUrl(link, baseUrl, cache, options, retry) -{ - var cached; - - if (retry === undefined) - { - if (isString(link) === true) - { - link = linkObj(link); - linkObj.resolve(link, baseUrl, options); - } - - // TODO :: move out to an `linkObj.invalidate()` to share with `HtmlChecker()` ? 
- if (link.url.resolved === null) - { - link.broken = true; - link.brokenReason = "BLC_INVALID"; - linkObj.clean(link); - return Promise.resolve(link); - } - - cached = cache.get(link.url.parsed); - - if (cached !== undefined) - { - return Promise.resolve(cached).then( function(response) - { - // Cloned to avoid unexpected mutations as a result of user changes - response = extend({}, response); - - copyResponseData(response, link, options); - - link.http.cached = true; - - return link; - }); - } - } - - var request = bhttp.request(link.url.resolved, // TODO :: https://github.com/joepie91/node-bhttp/issues/3 - { - discardResponse: true, - headers: { "user-agent":options.userAgent }, - method: retry!==405 ? options.requestMethod : "get" - }) - .then( function(response) - { - response = simpleResponse(response); - - if (response.statusCode===405 && options.requestMethod==="head" && options.retry405Head===true && retry!==405) - { - // Retry possibly broken server with "get" - return checkUrl(link, baseUrl, cache, options, 405); - } - - // TODO :: store ALL redirected urls in cache - if (options.cacheResponses===true && response.url!==link.url.resolved) - { - cache.set(response.url, response); // TODO :: store `request` instead to be consistent? - } - - return response; - }) - .catch( function(error) - { - // The error will be stored as a response - return error; - }); - - if (retry === undefined) - { - // Send response to cache -- it will be available to `cache.get()` before being resolved - if (options.cacheResponses === true) - { - cache.set(link.url.parsed, request); - } - - // Send linkObj to caller - return request.then( function(response) - { - copyResponseData(response, link, options); - - link.http.cached = false; - - return link; - }); - } - else - { - return request; - } -} - - - -/* - Copy data from a bhttp response object—either from a request or cache— - into a link object. 
-*/ -function copyResponseData(response, link, options) -{ - if (response instanceof Error === false) - { - if (response.statusCode !== 200) - { - link.broken = true; - link.brokenReason = "HTTP_" + response.statusCode; - } - else - { - link.broken = false; - } - - link.http.response = response; - - if (link.url.resolved !== response.url) - { - link.url.redirected = response.url; - - if (link.base.resolved !== null) - { - // TODO :: this needs a test - linkObj.relation(link, link.url.redirected); - } - } - } - else - { - link.broken = true; - - if (reasons["ERRNO_"+response.code] != null) - { - link.brokenReason = "ERRNO_" + response.code; - } - /*else if (response.message === "Invalid URL") - { - link.brokenReason = "BLC_INVALID"; - }*/ - else - { - link.brokenReason = "BLC_UNKNOWN"; - } - } - - linkObj.clean(link); -} - - - -module.exports = checkUrl; diff --git a/lib/internal/defaultOptions.js b/lib/internal/defaultOptions.js index c8936ce9..3f6af7fc 100644 --- a/lib/internal/defaultOptions.js +++ b/lib/internal/defaultOptions.js @@ -1,29 +1,32 @@ -"use strict"; -var pkg = require("../../package.json"); +import deepFreeze from "deep-freeze-node"; +import {HEAD_METHOD} from "./methods"; +import {name as packageName, version as packageVersion} from "../../package.json"; +import TAGS from "./tags"; +import userAgent from "default-user-agent"; -var userAgent = require("default-user-agent"); -var defaultOptions = + +export default deepFreeze( { - acceptedSchemes: ["http","https"], - cacheExpiryTime: 3600000, + acceptedSchemes: ["http:","https:"], // @todo add "file:" + cacheMaxAge: 3_600_000, cacheResponses: true, excludedKeywords: [], - excludedSchemes: ["data","geo","javascript","mailto","sms","tel"], + excludedSchemes: ["data:","geo:","javascript:","mailto:","sms:","tel:"], excludeExternalLinks: false, excludeInternalLinks: false, - excludeLinksToSamePage: true, + excludeLinksToSamePage: false, filterLevel: 1, honorRobotExclusions: true, - maxSockets: Infinity, - 
maxSocketsPerHost: 1, + includedKeywords: [], + includeLink: () => true, + includePage: () => true, + maxSockets: Infinity, // @todo change to `maxExternalSockets` + maxSocketsPerHost: 1, // @todo separate to `maxInternalSockets=5` and `maxExternalSocketsPerHost=1` rateLimit: 0, - requestMethod: "head", - retry405Head: true, - tags: require("./tags"), - userAgent: userAgent(pkg.name, pkg.version) -}; - - - -module.exports = defaultOptions; + requestMethod: HEAD_METHOD, + retryHeadCodes: [405], + retryHeadFail: true, + tags: TAGS, + userAgent: userAgent(packageName, packageVersion) +}); diff --git a/lib/internal/errors.js b/lib/internal/errors.js new file mode 100644 index 00000000..225fe539 --- /dev/null +++ b/lib/internal/errors.js @@ -0,0 +1,31 @@ +export class ExpectedHTMLError extends TypeError +{ + /** + * @param {string} mimeType + * @param {number|string} statusCode + */ + constructor(mimeType="", statusCode) + { + if (mimeType !== "") + { + mimeType = ` but got "${mimeType}"`; + } + + super(`Expected type "text/html"${mimeType}`); + this.code = statusCode; + } +} + + + +export class HTMLRetrievalError extends Error +{ + /** + * @param {number|string} statusCode + */ + constructor(statusCode) + { + super("HTML could not be retrieved"); + this.code = statusCode; + } +} diff --git a/lib/internal/events.js b/lib/internal/events.js new file mode 100644 index 00000000..67c733c1 --- /dev/null +++ b/lib/internal/events.js @@ -0,0 +1,10 @@ +export const COMPLETE_EVENT = "complete"; +export const END_EVENT = "end"; +export const ERROR_EVENT = "error"; +export const HTML_EVENT = "html"; +export const JUNK_EVENT = "junk"; +export const LINK_EVENT = "link"; +export const PAGE_EVENT = "page"; +export const QUEUE_EVENT = "queue"; +export const ROBOTS_EVENT = "robots"; +export const SITE_EVENT = "site"; diff --git a/lib/internal/getRobotsTxt.js b/lib/internal/getRobotsTxt.js index caca35be..86027fc5 100644 --- a/lib/internal/getRobotsTxt.js +++ 
b/lib/internal/getRobotsTxt.js @@ -1,33 +1,37 @@ -"use strict"; -var guard = require("robots-txt-guard"); -var parse = require("robots-txt-parse"); +import {BLC_INVALID} from "./reasons"; +import {GET_METHOD} from "./methods"; +import guard from "robots-txt-guard"; +import isURL from "isurl"; +import parse from "robots-txt-parse"; +import requestHTTP from "./requestHTTP"; -var bhttp = require("bhttp"); -var urllib = require("url"); -var urlobj = require("urlobj"); - -function getRobotsTxt(url, options) +/** + * Download and parse a robots.txt file from a server's root path. + * @param {URL} url + * @param {object} auth + * @param {URLCache} cache + * @param {object} options + * @throws {TypeError} non-URL + * @returns {Promise} + */ +export default async (url, auth, cache, options) => { - url = urlobj.parse(url); - - // TODO :: this mutates the original (if was an object) - url.hash = null; - url.path = url.pathname = "/robots.txt"; - url.query = null; - url.search = null; - - return bhttp.get(urllib.format(url), // TODO :: https://github.com/joepie91/node-bhttp/issues/3 + if (!isURL.lenient(url)) { - discardResponse: true, - headers: { "user-agent":options.userAgent }, - stream: true - }) - .then(parse) - .then(guard); -} - + throw new TypeError(BLC_INVALID); + } + else + { + url = new URL(url); + url.hash = ""; + url.pathname = "/robots.txt"; + url.search = ""; + const {stream} = await requestHTTP(url, auth, GET_METHOD, cache, options); -module.exports = getRobotsTxt; + // @todo https://github.com/tc39/proposal-pipeline-operator + return guard(await parse(stream)); + } +}; diff --git a/lib/internal/linkObj.js b/lib/internal/linkObj.js deleted file mode 100644 index 545bf473..00000000 --- a/lib/internal/linkObj.js +++ /dev/null @@ -1,281 +0,0 @@ -"use strict"; -var isString = require("is-string"); -var urllib = require("url"); -var urlobj = require("urlobj"); - -var hasOwnProperty = Object.prototype.hasOwnProperty; - - -function linkObj(url) -{ - if 
(url===undefined || isString(url)===false) - { - url = null; - } - - var link = - { - url: - { - original: url, // The URL as it was inputted - resolved: null, // The URL, resolved as a browser would do so - redirected: null // The URL, after its last redirection, if any - }, - - base: - { - original: null, // The base URL as it was inputted - resolved: null // The base URL, resolved as a browser would do so - }, - - html: - { - index: null, // The order in which the link appeared in its document -- using max-level tag filter - offsetIndex: null, // Sequential (gap-free) indicies for skipped and unskipped links - location:null, // Source code location of the attribute that the link was found within - selector: null, // CSS selector for element in document - tagName: null, // Tag name that the link was found on - attrName: null, // Attribute name that the link was found within - attrs: null, // All attributes on the element - text: null, // TextNode/innerText within the element - tag: null, // The entire tag string - - // Temporary keys - base: null - }, - - http: - { - cached: null, // If the response was pulled from cache - response: null // The request response - }, - - broken: null, // If the link was determined to be broken or not - internal: null, // If the link is to the same server as its base/document - samePage: null, // If the link is to the same page as its base/document - excluded: null, // If the link was excluded due to any filtering - - brokenReason: null, // The reason why the link was considered broken, if it indeed is - excludedReason: null, // The reason why the link was excluded from being checked, if it indeed was - - // Temporary keys - broken_link_checker: true, - resolved: false - }; - - // Not enumerable -- hidden from `JSON.stringify()` - Object.defineProperty(link.base, "parsed", { value:null, writable:true }); // Same as `link.base.resolved`, but is an Object - Object.defineProperty(link.url, "parsed", { value:null, writable:true }); // 
Same as `link.url.resolved`, but is an Object - - return link; -} - - - -/* - Remove unnecessary keys for public use. -*/ -linkObj.clean = function(link) -{ - delete link.broken_link_checker; - delete link.html.base; // TODO :: don't clean this? - delete link.resolved; - - return link; -}; - - - -/* - Define relationships with base URL. -*/ -linkObj.relation = function(link, url_parsed) -{ - if (url_parsed === undefined) url_parsed = link.url.parsed; - else if (typeof url_parsed === "string") url_parsed = urlobj.parse(url_parsed); - - var relation; - - // If no protocols, it's impossible to determine if they link to the same server - if (url_parsed.protocol===null || link.base.parsed.protocol===null) - { - // Overwrite any previous values - link.internal = null; - link.samePage = null; - } - else - { - // Resolved base not used because html base could be remote - relation = urlobj.relation(url_parsed, link.base.parsed); - - link.internal = relation >= urlobj.component.AUTH; - link.samePage = link.internal===true && relation>=urlobj.component.PATH; - } - - return link; -}; - - - -/* - Absolute'ize a link based on its base URL and HTML's . -*/ -// TODO :: make similar to `url.resolve(from,to)` ? -linkObj.resolve = function(link, base, options) -{ - // If already resolved - if (link.resolved === true) return; - - // Parity with core `url.resolve()` - var parseOptions = { slashesDenoteHost:true }; - - - - // TODO :: we're constantly re-parsing base and html base -- find way to cache them - var base_parsed = base==null ? "" : base; - base_parsed = urlobj.normalize( urlobj.parse(base_parsed, parseOptions) ); - - var htmlBase_parsed = link.html.base==null ? "" : link.html.base; - htmlBase_parsed = urlobj.normalize( urlobj.parse(htmlBase_parsed, parseOptions) ); - - // TODO :: options.normalize=false - // TODO :: options.clone=true ? 
- var resolvedBase_parsed = urlobj.resolve(base_parsed, htmlBase_parsed); - - if (resolvedBase_parsed.hash !== null) - { - // Hashes are useless in a base - resolvedBase_parsed.hash = null; - resolvedBase_parsed.href = urllib.format(resolvedBase_parsed); // TODO :: use urlobj.format() when available - } - - // TODO :: is this necessary if `link.base.parsed` is cleaned? - if (base_parsed.hash !== null) - { - // Hashes are useless in a base - base_parsed.hash = null; - base_parsed.href = urllib.format(base_parsed); // TODO :: use urlobj.format() when available - } - - - - // `link.url.original` should only ever not have a value within internal tests - var linkOrg_parsed = link.url.original==null ? "" : link.url.original; - linkOrg_parsed = urlobj.parse(linkOrg_parsed, parseOptions); - urlobj.normalize(linkOrg_parsed); - - // `linkOrg_parsed` is cloned to avoid it being mutated - // TODO :: options.clone=true - var resolvedUrl_parsed = urlobj.resolve( resolvedBase_parsed, cloneObject(linkOrg_parsed) ); - - - - if (base !== undefined) - { - link.base.original = base; - } - - // TODO :: use url types (>UNKNOWN && !=EMPTY ... not simple enough) - if (resolvedBase_parsed.href !== "") - { - link.base.resolved = parity(resolvedBase_parsed.href); - } - - link.base.parsed = base_parsed; - - // If resolved link has accepted scheme - if (options.acceptedSchemes[ resolvedUrl_parsed.extra.protocolTruncated ] === true) - { - link.url.resolved = parity(resolvedUrl_parsed.href); - link.url.parsed = resolvedUrl_parsed; - - // TODO :: move relation stuff out of this function -- separation of concerns? 
- linkObj.relation(link); - } - // Else could not be properly resolved - else - { - link.url.parsed = linkOrg_parsed; - - // If at least resolved to absolute - if (resolvedUrl_parsed.extra.type === urlobj.type.ABSOLUTE) - { - // If base is accepted scheme - if (options.acceptedSchemes[ base_parsed.extra.protocolTruncated ] === true) - { - link.internal = false; - link.samePage = false; - } - } - } - - - - // Avoid future resolving - link.resolved = true; - - return link; -}; - - - -//::: PRIVATE FUNCTIONS - - - -/* - Clones an object and its prototype while maintaining enumerable - keys and support for `instanceof`. -*/ -// TODO :: this may not be necessary if linkObj.base.parsed and linkObj.url.parsed are cleaned out -// TODO :: move this into urlobj -function cloneObject(source) -{ - var clone,key,value; - - if (Array.isArray(source) === true) - { - clone = []; - } - else - { - // Only clone the prototype -- more efficient as it will not convert keys to prototype keys - clone = Object.create( Object.getPrototypeOf(source) ); - } - - // Clone keys/indexes - // TODO :: use Object.keys() for more speed - for (key in source) - { - if (hasOwnProperty.call(source, key) === true) - { - value = source[key]; - - if (value!==null && typeof value==="object") - { - clone[key] = cloneObject(value); - } - else - { - clone[key] = value; - } - } - } - - return clone; -} - - - -/* - Maintain parity with core `url.resolve()`. -*/ -// TODO :: remove this? -function parity(url) -{ - return (url !== "http://") ? url : "http:///"; -} - - - -module.exports = linkObj; diff --git a/lib/internal/matchURL.js b/lib/internal/matchURL.js new file mode 100644 index 00000000..652ce429 --- /dev/null +++ b/lib/internal/matchURL.js @@ -0,0 +1,23 @@ +import {isMatch} from "matcher"; + + + +/** + * Determine if a URL contains at least one—possibly glob'bed—keyword. 
+ * @param {string} url + * @param {Array} keywords + * @returns {boolean} + */ +export default (url, keywords) => keywords.some(keyword => +{ + // Check for literal keyword + if (url.includes(keyword)) + { + return true; + } + else + { + // Check for glob + return isMatch(url, keyword); + } +}); diff --git a/lib/internal/matchUrl.js b/lib/internal/matchUrl.js deleted file mode 100644 index ab76ecc2..00000000 --- a/lib/internal/matchUrl.js +++ /dev/null @@ -1,34 +0,0 @@ -"use strict"; -var calmcard = require("calmcard"); - - - -function matchUrl(url, keywords) -{ - var i,numKeywords; - - if (url != null) - { - numKeywords = keywords.length; - - for (i=0; i -1) - { - return true; - } - // Check for glob'bed keyword - else if ( calmcard(keywords[i], url) === true ) - { - return true; - } - } - } - - return false; -} - - - -module.exports = matchUrl; diff --git a/lib/internal/messages.js b/lib/internal/messages.js deleted file mode 100644 index 983d3b20..00000000 --- a/lib/internal/messages.js +++ /dev/null @@ -1,54 +0,0 @@ -"use strict"; -var errno = require("errno").code; -var statusCodes = require("http").STATUS_CODES; - -var i; - -var errors = -{ - EXPECTED_HTML: function(type) - { - type = type==null ? 
type : '"'+type+'"'; - return 'Expected type "text/html" but got '+type; - }, - HTML_RETRIEVAL: "HTML could not be retrieved" -}; - -var reasons = -{ - //BLC_CUSTOM: "Custom Exclusion", - BLC_EXTERNAL: "External URL Exclusion", - BLC_INTERNAL: "Internal URL Exclusion", - BLC_HTML: "HTML Exclusion", - BLC_INVALID: "Invalid URL", - BLC_KEYWORD: "Keyword Exclusion", - //BLC_LOCALPATH: "Local File Path Exclusion", - BLC_ROBOTS: "Robots Exclusion", - BLC_SAMEPAGE: "Same-page URL Exclusion", - BLC_SCHEME: "Scheme Exclusion", - BLC_UNKNOWN: "Unknown Error", - - ERRNO_ENOTFOUND: "no matching dns record (ENOTFOUND)" -}; - - - -for (i in errno) -{ - reasons["ERRNO_"+i] = errno[i].description +" ("+i+")"; -} - - - -for (i in statusCodes) -{ - reasons["HTTP_"+i] = statusCodes[i] +" ("+i+")"; -} - - - -module.exports = -{ - errors: errors, - reasons: reasons -}; diff --git a/lib/internal/methods.js b/lib/internal/methods.js new file mode 100644 index 00000000..65e63ae7 --- /dev/null +++ b/lib/internal/methods.js @@ -0,0 +1,2 @@ +export const GET_METHOD = "get"; +export const HEAD_METHOD = "head"; diff --git a/lib/internal/parseHTML.js b/lib/internal/parseHTML.js new file mode 100644 index 00000000..14eddde8 --- /dev/null +++ b/lib/internal/parseHTML.js @@ -0,0 +1,72 @@ +import defaultTreeAdapter from "parse5/lib/tree-adapters/default"; +import isStream from "is-stream"; +import isString from "is-string"; +import {parse} from "parse5"; +import ParserStream from "parse5-parser-stream"; +import {PassThrough} from "stream"; + + + +const FINISH_EVENT = "finish"; + + + +const OPTIONS = +{ + sourceCodeLocationInfo: true, + treeAdapter: + { + ...defaultTreeAdapter, + + createElement: (...args) => + { + const result = defaultTreeAdapter.createElement(...args); + result.attrMap = memoizeAttrs(result.attrs); + return result; + } + } +}; + + + +/** + * Convert a list of parse5 attributes into key-value pairs. + * Note: spec-compliant HTML cannot have multiple attrs of the same name. 
+ * @param {Array} attrs + * @returns {object} + */ +const memoizeAttrs = attrs => attrs.reduce((result, {name, value}) => +{ + result[name] = value; + return result; +}, {}); + + + +/** + * Parse an HTML stream/string and return a tree. + * @param {Stream|string} input + * @throws {TypeError} non-Stream or non-string + * @returns {Promise} + */ +export default input => new Promise((resolve, reject) => +{ + if (isStream(input)) + { + const parser = new ParserStream(OPTIONS) + .once(FINISH_EVENT, () => resolve(parser.document)); + + // @todo https://github.com/sindresorhus/got/issues/834 + const toStringChunks = new PassThrough({ encoding:"utf8" }); + + input.pipe(toStringChunks).pipe(parser); + } + else if (isString(input)) + { + resolve( parse(input, OPTIONS) ); + } + else + { + reject( new TypeError("Invalid input") ); + } +}); diff --git a/lib/internal/parseHtml.js b/lib/internal/parseHtml.js deleted file mode 100644 index 0bf24a1d..00000000 --- a/lib/internal/parseHtml.js +++ /dev/null @@ -1,77 +0,0 @@ -"use strict"; -var isStream = require("is-stream"); -var isString = require("is-string"); -var parse5 = require("parse5"); - -var treeAdapter = Object.create( parse5.treeAdapters.default ); -treeAdapter.createElement_old = treeAdapter.createElement; -treeAdapter.createElement = function(tagName, namespaceURI, attrs) -{ - var result = treeAdapter.createElement_old(tagName, namespaceURI, attrs); - - if (result.attrs != null) - { - result.attrMap = getAttrMap(result.attrs); - } - - return result; -}; - -var options = { locationInfo:true, treeAdapter:treeAdapter }; - - - -/* - Convert attributes array to a map. - - Note: parse5 will have already handled multiple attrs of the - same name. 
-*/ -function getAttrMap(attrs) -{ - var i; - var map = {}; - var numAttrs = attrs.length; - - for (i=0; i { asdf1, asdf2 } + */ +const memoizeArray = array => array.reduce((map, value) => { - var i,map,numElements; - - if (Array.isArray(array) === true) - { - map = {}; - numElements = array.length; - - for (i=0; i { - if (options==null || options.__parsed!==true) + if (options.__parsed !== HAS_BEEN_PARSED_VALUE) { - options = Object.assign({}, defaultOptions, options); - - // Maps have better search performance, but are not friendly for options - options.acceptedSchemes = array2booleanMap(options.acceptedSchemes); - options.excludedSchemes = array2booleanMap(options.excludedSchemes); - - // Undocumented -- avoids reparsing pass-thru options from class to class - options.__parsed = true; - } - - return options; -} + options = { ...DEFAULT_OPTIONS, ...options }; + // Maps of this kind are easier to work with, but are not consumer-friendly + options.acceptedSchemes = memoizeArray(options.acceptedSchemes); + options.excludedSchemes = memoizeArray(options.excludedSchemes); + options.requestMethod = options.requestMethod.toLowerCase(); -module.exports = parseOptions; + // Undocumented -- avoids reparsing options passed through from class to class + options.__parsed = HAS_BEEN_PARSED_VALUE; + } + + return options; +}; diff --git a/lib/internal/reasons.js b/lib/internal/reasons.js new file mode 100644 index 00000000..5f9f817b --- /dev/null +++ b/lib/internal/reasons.js @@ -0,0 +1,33 @@ +import {code as ERRNO} from "errno"; +import {STATUS_CODES as HTTP} from "http"; + + + +export default Object.freeze( +{ + BLC_CUSTOM: "Custom Exclusion", + BLC_EXTERNAL: "External URL Exclusion", + //BLC_LOCAL_EXCLUSION: "Local File System Path Exclusion", + BLC_HTML: "HTML Exclusion", + BLC_INTERNAL: "Internal URL Exclusion", + BLC_INVALID: "Invalid URL", + BLC_KEYWORD: "Keyword Exclusion", + BLC_ROBOTS: "Robots Exclusion", + BLC_SAMEPAGE: "Same-page URL Exclusion", + BLC_SCHEME: "Scheme 
Exclusion", + BLC_UNKNOWN: "Unknown Error", + + ERRNO_ENOTFOUND: "no matching dns record (ENOTFOUND)", + + // @todo https://github.com/tc39/proposal-pipeline-operator + ...Object.fromEntries + ( + Object.entries(ERRNO).map(([key, {description}]) => [`ERRNO_${key}`, `${description} (${key})`]) + ), + + // @todo https://github.com/tc39/proposal-pipeline-operator + ...Object.fromEntries + ( + Object.entries(HTTP).map(([key, value]) => [`HTTP_${key}`, `${value} (${key})`]) + ) +}); diff --git a/lib/internal/requestHTTP.js b/lib/internal/requestHTTP.js new file mode 100644 index 00000000..70053fd8 --- /dev/null +++ b/lib/internal/requestHTTP.js @@ -0,0 +1,156 @@ +import {BLC_INVALID} from "./reasons"; +import {GET_METHOD, HEAD_METHOD} from "./methods"; +import isURL from "isurl"; +import {stream as streamHTTP} from "got"; +import tunnel from "auto-tunnel"; + + + +const ERROR_EVENT = "error"; +const REDIRECT_EVENT = "redirect"; +const RESPONSE_EVENT = "response"; + + + +/** + * Create an HTTP request. 
+ * @param {URL} url + * @param {object} auth + * @param {string} method + * @param {object} options + * @param {boolean} [retry] + * @returns {Promise} + */ +const createRequest = (url, auth, method, options, retry=false) => new Promise((resolve, reject) => +{ + const headers = { "user-agent":options.userAgent }; + const redirects = []; + + streamHTTP(url, + { + agent: tunnel(url, { proxyHeaders:headers }), + auth: stringifyAuth(url, auth), + headers, + method, + rejectUnauthorized: false, // accept self-signed SSL certificates + retries: 0, // explicit; they're already disabled for streams + throwHttpErrors: false + }) + .on(ERROR_EVENT, reject) + .on(REDIRECT_EVENT, stream => redirects.push( simplifyResponse(stream) )) + .on(RESPONSE_EVENT, stream => + { + const response = simplifyResponse(stream, redirects); + + if (!retry && method===HEAD_METHOD && options.retryHeadFail && options.retryHeadCodes.includes(response.status)) + { + // Retry potentially broken server with GET_METHOD + resolve( createRequest(url, auth, GET_METHOD, options, true) ); + } + else if (method===GET_METHOD && response.status>=200 && response.status<=299) + { + resolve({ response, stream }); + } + else + { + resolve({ response }); + } + }); +}); + + + +/** + * Create a simple response object from that of the "http" module. + * @param {object|Stream} response + * @param {Array} [redirects] + * @returns {object} + * @todo add response time -- https://github.com/sindresorhus/got/issues/874 + */ +const simplifyResponse = ({headers, statusCode, statusMessage, url}, redirects) => +({ + headers, + status: statusCode, + statusText: statusMessage, + url: new URL(url), + ...(redirects && {redirects}) +}); + + + +/** + * Convert an HTTP authentication URL or object into a string. 
+ * @param {URL} url + * @param {object} auth + * @returns {string} + */ +const stringifyAuth = (url, auth) => +{ + if (url.password!=="" || url.username!=="") + { + return `${url.username}:${url.password}`; + } + else if (auth.password!=="" || auth.username!=="") + { + return `${auth.username}:${auth.password}`; + } +}; + + + +/** + * Create an HTTP request and optionally cache the response. + * @param {URL} url + * @param {object} auth + * @param {string} method + * @param {URLCache} cache + * @param {object} options + * @throws {TypeError} non-URL + * @returns {Promise} + * @todo use `Promise.try()` instead of `async` + */ +export default async (url, auth, method, cache, options) => +{ + if (!isURL.lenient(url)) + { + throw new TypeError(BLC_INVALID); + } + else + { + const promise = createRequest(url, auth, method.toLowerCase(), options); + + if (options.cacheResponses) + { + const cachedPromise = promise + .then(({response}) => + { + // Replace cached promise + // @todo store in a "response" key, so that we can also store a list of all element IDs in the document + cache.set(url, response); + + // Any final redirect + // @todo store in a "response" key, so that we can also store a list of all element IDs in the document + cache.set(response.url, response); + + // Any intermediary redirects + response.redirects.forEach((redirect, i) => + { + const subsequentRedirects = response.redirects.slice(i + 1); + + // @todo store in a "response" key, so that we can also store a list of all element IDs in the document + cache.set(redirect.url, {...response, redirects:subsequentRedirects}); + }); + + return response; + }) + .catch(error => error); // pass-through + + // Make future response available to other requests before completion + // Will always overwrite previous value + // @todo store in a "response" key, so that we can also store a list of all element IDs in the document + cache.set(url, cachedPromise); + } + + return promise; + } +}; diff --git 
a/lib/internal/scrapeHTML.js b/lib/internal/scrapeHTML.js new file mode 100644 index 00000000..8a959937 --- /dev/null +++ b/lib/internal/scrapeHTML.js @@ -0,0 +1,326 @@ +import condenseWhitespace from "condense-whitespace"; +import Link, {HTML_ATTR_NAME, HTML_ATTRS, HTML_BASE_HREF, HTML_INDEX, HTML_LOCATION, HTML_SELECTOR, HTML_TAG, HTML_TAG_NAME, HTML_TEXT} from "./Link"; +import list2Array from "list-to-array"; +import parseMetaRefresh from "http-equiv-refresh"; +import parseSrcset from "parse-srcset"; +import RobotDirectives from "robot-directives"; +import TAGS from "./tags"; +import walk from "walk-parse5"; + + + +const MAX_FILTER_LEVEL = TAGS[TAGS.length - 1]; +const ALL_NODE_ATTRS = MAX_FILTER_LEVEL["*"]; + +const SPECIAL_NODE_NAME_PREFIX = "#"; + +const BASE_NODE_NAME = "base"; +const BODY_NODE_NAME = "body"; +const COMMENT_NODE_NAME = `${SPECIAL_NODE_NAME_PREFIX}comment`; +const DOCUMENT_NODE_NAME = `${SPECIAL_NODE_NAME_PREFIX}document`; +const HEAD_NODE_NAME = "head"; +const HTML_NODE_NAME = "html"; +const META_NODE_NAME = "meta"; +const TEXT_NODE_NAME = `${SPECIAL_NODE_NAME_PREFIX}text`; + +const CONTENT_ATTR_NAME = "content"; +const HREF_ATTR_NAME = "href"; +const HTTP_EQUIV_ATTR_NAME = "http-equiv"; +const NAME_ATTR_NAME = "name"; +const PING_ATTR_NAME = "ping"; +const SRCSET_ATTR_NAME = "srcset"; + +const REFRESH_ATTR_VALUE = "refresh"; +const ROBOTS_ATTR_VALUE = "robots"; + + + +/** + * Traverse the root node and return located links via a callback function. + * @param {object} rootNode + * @param {Function} callback + */ +const findLinks = (rootNode, callback) => +{ + walk(rootNode, node => + { + if (node.nodeName!==COMMENT_NODE_NAME && node.nodeName!==TEXT_NODE_NAME) + { + const filteredNodeAttrs = MAX_FILTER_LEVEL[node.nodeName] ?? 
{}; + + node.attrs.forEach(({name:attrName, value:attrValue}) => + { + let url = null; + + // If a supported attribute + if (attrName in filteredNodeAttrs || attrName in ALL_NODE_ATTRS) + { + switch (attrName) + { + case CONTENT_ATTR_NAME: + { + // Special case for `` + if (node.attrMap[HTTP_EQUIV_ATTR_NAME]?.toLowerCase() === REFRESH_ATTR_VALUE) + { + url = parseMetaRefresh(attrValue).url; + } + + break; + } + case PING_ATTR_NAME: + { + url = list2Array(attrValue, ","); + break; + } + case SRCSET_ATTR_NAME: + { + url = parseSrcset(attrValue).map(image => image.url); + break; + } + default: + { + // https://html.spec.whatwg.org/multipage/infrastructure.html#valid-url-potentially-surrounded-by-spaces + url = attrValue.trim(); + } + } + + if (Array.isArray(url)) + { + url.forEach(_url => callback(node, attrName, _url)); + } + else if (url != null) + { + callback(node, attrName, url); + } + } + }); + } + }); +}; + + + +/** + * Traverse the root node to locate preliminary elements/data. + * + * + * + * Looks for the first instance. If no `href` attribute exists, + * the element is ignored and possible successors are considered. + * + * + * + * Looks for all robot instances and cascades the values. 
+ * + * @param {object} rootNode + * @param {RobotDirectives} robots + * @returns {object} + */ +const findPreliminaries = (rootNode, robots) => +{ + const result = { base:null }; + + walk(rootNode, ({attrMap, nodeName}) => + { + switch (nodeName) + { + // `` can be anywhere, not just within `` + case BASE_NODE_NAME: + { + if (result.base===null && HREF_ATTR_NAME in attrMap) + { + // https://html.spec.whatwg.org/multipage/infrastructure.html#valid-url-potentially-surrounded-by-spaces + result.base = attrMap[HREF_ATTR_NAME].trim(); + } + + break; + } + // `` can be anywhere + case META_NODE_NAME: + { + if (robots && NAME_ATTR_NAME in attrMap && CONTENT_ATTR_NAME in attrMap) + { + const name = attrMap[NAME_ATTR_NAME].trim().toLowerCase(); + + if (name===ROBOTS_ATTR_VALUE || RobotDirectives.isBot(name)) + { + robots.meta(name, attrMap[CONTENT_ATTR_NAME]); + } + } + + break; + } + } + + if (result.base!==null && !robots) + { + // Kill walk + return false; + } + }); + + return result; +}; + + + +/** + * Find the `` element. + * @param {object} document + * @returns {object} + */ +const findRootNode = document => document.childNodes.find(childNode => +{ + // Doctypes have no `childNodes` property + // HTML can only have one true root node + if (childNode.childNodes != null) + { + return childNode; + } +}); + + + +/** + * Find a node's `:nth-child()` index among its siblings. + * @param {object} node + * @returns {number} + */ +const getNthIndex = node => +{ + const parentsChildren = node.parentNode.childNodes; + let count = 0; + + parentsChildren.every(child => + { + if (child !== node) + { + // Exclude non-element nodes + if (!child.nodeName.startsWith(SPECIAL_NODE_NAME_PREFIX)) + { + count++; + } + + return true; + } + else + { + return false; + } + }); + + // `:nth-child()` indices don't start at 0 + return count + 1; +}; + + + +/** + * Produces a CSS selector that matches an element. 
+ * @param {object} node + * @returns {string} + */ +const getSelector = node => +{ + const selector = []; + + while (node.nodeName !== DOCUMENT_NODE_NAME) + { + let name = node.nodeName; + + // Only one of these are ever allowed per document -- so, index is unnecessary + if (name!==HTML_NODE_NAME && name!==BODY_NODE_NAME & name!==HEAD_NODE_NAME) + { + name += `:nth-child(${getNthIndex(node)})`; + } + + // Building backwards + selector.push(name); + + node = node.parentNode; + } + + return selector.reverse().join(" > "); +}; + + + +/** + * Produces an `innerText` value for text nodes within an element. + * @param {object} node + * @returns {string|null} + */ +const getText = node => +{ + let text = null; + + if (node.childNodes.length > 0) + { + text = ""; + + walk(node, ({nodeName, value}) => + { + if (nodeName === TEXT_NODE_NAME) + { + text += value; + } + }); + + // @todo don't normalize if within
 ? use "normalize-html-whitespace" package if so
+		text = condenseWhitespace(text);
+	}
+
+	return text;
+};
+
+
+
+/**
+ * Serialize an HTML element into a string.
+ * @param {object} node
+ * @returns {string}
+ */
+const stringifyNode = ({attrs, nodeName}) =>
+{
+	const attrsString = attrs.reduce((result, {name, value}) => `${result} ${name}="${value}"`, "");
+
+	return `<${nodeName}${attrsString}>`;
+};
+
+
+
+/**
+ * Scrape a parsed HTML document/tree for links, recording HTML context on each.
+ * @param {object} document - parse5 document tree (root of the parsed HTML)
+ * @param {URL|string} pageURL - base URL against which each link URL is resolved
+ * @param {RobotDirectives} robots - receives any `<meta name="robots">` directives found
+ * @returns {Array} `Link` instances, in document source order
+ */
+export default (document, pageURL, robots) =>
+{
+	const links = [];
+	const rootNode = findRootNode(document);
+	const {base} = findPreliminaries(rootNode, robots);
+
+	findLinks(rootNode, (node, attrName, url) =>
+	{
+		// Elements added for compliance (not from HTML source) have no location
+		const location = node.sourceCodeLocation?.attrs[attrName] ?? null;
+
+		const link = new Link()
+		.set(HTML_ATTR_NAME, attrName)
+		.set(HTML_ATTRS, node.attrMap)
+		.set(HTML_BASE_HREF, base)
+		.set(HTML_INDEX, links.length)
+		.set(HTML_LOCATION, location)
+		.set(HTML_SELECTOR, getSelector(node))
+		.set(HTML_TAG, stringifyNode(node))
+		.set(HTML_TAG_NAME, node.nodeName)
+		.set(HTML_TEXT, getText(node))
+		.resolve(url, pageURL);
+
+		links.push(link);
+	});
+
+	return links;
+};
diff --git a/lib/internal/scrapeHtml.js b/lib/internal/scrapeHtml.js
deleted file mode 100644
index 911b1774..00000000
--- a/lib/internal/scrapeHtml.js
+++ /dev/null
@@ -1,360 +0,0 @@
-"use strict";
-var linkObj = require("./linkObj");
-var tags    = require("./tags");
-
-var condenseWhitespace = require("condense-whitespace");
-var parseMetaRefresh = require("http-equiv-refresh");
-var RobotDirectives = require("robot-directives");
-
-var maxFilterLevel = tags[tags.length - 1];
-
-
-
-/*
-	Scrape a parsed HTML document/tree for links.
-*/
-function scrapeHtml(document, robots)
-{
-	var link,links,preliminaries,rootNode;
-	
-	rootNode = findRootNode(document);
-	
-	if (rootNode != null)
-	{
-		preliminaries = findPreliminaries(rootNode, robots);
-		links = [];
-		
-		findLinks(rootNode, function(node, attrName, url)
-		{
-			link = linkObj(url);
-
-			link.html.attrs = node.attrMap;
-			link.html.attrName = attrName;
-			link.html.base = preliminaries.base;
-			link.html.index = links.length;
-			link.html.selector = getSelector(node);
-			link.html.tag = stringifyNode(node);
-			link.html.tagName = node.nodeName;
-			link.html.text = getText(node);
-
-			// If not a "fake" (duplicated) element, as a result of adoption
-			if (node.__location !== undefined)
-			{
-				link.html.location = node.__location.attrs[attrName];
-			}
-			
-			links.push(link);
-		});
-	}
-	
-	return links;
-}
-
-
-
-//::: PRIVATE FUNCTIONS
-
-
-
-/*
-	Traverses the root node to locate links that match filters.
-*/
-function findLinks(rootNode, callback)
-{
-	var attrName,i,link,linkAttrs,numAttrs,url;
-	
-	walk(rootNode, function(node)
-	{
-		linkAttrs = maxFilterLevel[node.nodeName];
-		
-		// If a supported element
-		if (linkAttrs != null)
-		{
-			numAttrs = node.attrs.length;
-			
-			// Faster to loop through Arrays than Objects
-			for (i=0; i`
-					if (node.nodeName==="meta" && attrName==="content")
-					{
-						if (node.attrMap["http-equiv"]!=null && node.attrMap["http-equiv"].toLowerCase()==="refresh")
-						{
-							url = parseMetaRefresh( node.attrMap[attrName] ).url;
-						}
-					}
-					else
-					{
-						// https://html.spec.whatwg.org/multipage/infrastructure.html#valid-url-potentially-surrounded-by-spaces
-						url = node.attrMap[attrName].trim();
-					}
-					
-					if (url != null)
-					{
-						callback(node, attrName, url);
-					}
-				}
-			}
-		}
-	});
-}
-
-
-
-/*
-	Traverses the root node to locate preliminary elements/data.
-	
-	
-		
-		Looks for the first instance. If no `href` attribute exists,
-		the element is ignored and possible successors are considered.
-	
-	
-		
-		Looks for all robot instances and cascades the values.
-*/
-function findPreliminaries(rootNode, robots)
-{
-	var name;
-	var find = {
-		base: true,
-		robots: robots != null
-	};
-	var found = {
-		base: false
-	};
-	var result = {
-		base: null
-	};
-	
-	walk(rootNode, function(node)
-	{
-		switch (node.nodeName)
-		{
-			// `` can be anywhere, not just within ``
-			case "base":
-			{
-				if (find.base===true && found.base===false && node.attrMap.href!=null)
-				{
-					// https://html.spec.whatwg.org/multipage/infrastructure.html#valid-url-potentially-surrounded-by-spaces
-					result.base = node.attrMap.href.trim();
-					
-					found.base = true;
-				}
-				
-				break;
-			}
-			// `` can be anywhere
-			case "meta":
-			{
-				if (find.robots===true && node.attrMap.name!=null && node.attrMap.content!=null)
-				{
-					name = node.attrMap.name.trim().toLowerCase();
-					
-					switch (name)
-					{
-						case "description":
-						case "keywords":
-						{
-							break;
-						}
-						// Catches all because we have "robots" and countless botnames such as "googlebot"
-						default:
-						{
-							if (name==="robots" || RobotDirectives.isBot(name)===true)
-							{
-								robots.meta(name, node.attrMap.content);
-							}
-						}
-					}
-				}
-				
-				break;
-			}
-		}
-		
-		if (found.base===true && find.robots===false)
-		{
-			// Kill walk
-			return false;
-		}
-	});
-	
-	return result;
-}
-
-
-
-/*
-	Find the `` element.
-*/
-function findRootNode(document)
-{
-	var i;
-	var rootNodes = document.childNodes;
-	
-	for (i=0; i ");
-}
-
-
-
-function getText(node)
-{
-	var text = null;
-	
-	if (node.childNodes.length > 0)
-	{
-		text = "";
-		
-		walk(node, function(node)
-		{
-			if (node.nodeName === "#text")
-			{
-				text += node.value;
-			}
-		});
-		
-		// TODO :: don't normalize if within 
 ? use "normalize-html-whitespace" package if so
-		text = condenseWhitespace(text);
-	}
-	
-	return text;
-}
-
-
-
-/*
-	Serialize an HTML node back to a string.
-*/
-function stringifyNode(node)
-{
-	var result = "<"+node.nodeName;
-	var numAttrs = node.attrs.length;
-	
-	for (var i=0; i}
+ */
+export default async (url, auth, cache, options) =>
+{
+	const result = await requestHTTP(url, auth, GET_METHOD, cache, options);
+	const {response: {headers, status}} = result;
+
+	if (status<200 || status>299)
+	{
+		throw new HTMLRetrievalError(status);
+	}
+	else
+	{
+		const type = headers[CONTENT_TYPE];
+
+		// Content-type is not mandatory in HTTP spec
+		if (!type?.startsWith(HTML_MIMETYPE))
+		{
+			throw new ExpectedHTMLError(type, status);
+		}
+	}
+
+	return result;
+};
diff --git a/lib/internal/streamHtml.js b/lib/internal/streamHtml.js
deleted file mode 100644
index 1a4e2926..00000000
--- a/lib/internal/streamHtml.js
+++ /dev/null
@@ -1,95 +0,0 @@
-"use strict";
-var errors         = require("./messages").errors;
-var simpleResponse = require("./simpleResponse");
-
-var bhttp = require("bhttp");
-
-
-
-function checkErrors(response)
-{
-	var error,type;
-	
-	if (response.statusCode !== 200)
-	{
-		error = new Error(errors.HTML_RETRIEVAL);
-		error.code = response.statusCode;
-		return error;
-	}
-	
-	type = response.headers["content-type"];
-	
-	// content-type is not mandatory in HTTP spec
-	if (type==null || type.indexOf("text/html")!==0)
-	{
-		error = new Error(errors.EXPECTED_HTML(type));
-		error.code = response.statusCode;
-		return error;
-	}
-}
-
-
-
-/*
-	Request a URL for its HTML contents and return a stream.
-*/
-function streamHtml(url, cache, options)
-{
-	var result;
-	
-	// Always gets the URL because response bodies are never cached
-	var request = bhttp.get(url,  // TODO :: https://github.com/joepie91/node-bhttp/issues/3
-	{
-		headers: { "user-agent":options.userAgent },
-		stream: true
-	})
-	.then( function(orgResponse)
-	{
-		var response = simpleResponse(orgResponse);
-		
-		result = checkErrors(response);
-		
-		if (result === undefined)
-		{
-			result = 
-			{
-				response: response,
-				stream: orgResponse
-			};
-			
-			// Send response of redirected url to cache
-			if (options.cacheResponses===true && response.url!==url)
-			{
-				// Will always overwrite previous value
-				cache.set(response.url, response);  // TODO :: store `request` instead to be consistent?
-			}
-		}
-		
-		return response;
-	})
-	.catch( function(error)
-	{
-		// The error will be stored as a response
-		return error;
-	});
-	
-	// Send response to cache -- it will be available to `cache.get()` before being resolved
-	if (options.cacheResponses === true)
-	{
-		// Will always overwrite previous value
-		cache.set(url, request);
-	}
-	
-	// Send result to caller
-	return request.then( function(response)
-	{
-		if (response instanceof Error === true) throw response;
-		if (result instanceof Error === true) throw result;
-		
-		return result;
-	});
-}
-
-
-
-module.exports = streamHtml;
diff --git a/lib/internal/tags.js b/lib/internal/tags.js
index a23c502e..c0b3af32 100644
--- a/lib/internal/tags.js
+++ b/lib/internal/tags.js
@@ -1,108 +1,162 @@
-"use strict";
+import deepFreeze from "deep-freeze-node";
 
-var tags = 
+
+
+// Clickable links
+const flatLevel0 =
 {
-	0:  // clickable links
-	{
-		a:      { href:true },
-		area:   { href:true }
-	},
-	1:  // clickable links, media, iframes, meta refreshes
-	{
-		a:        { href:true },
-		area:     { href:true },
-		audio:    { src:true },
-		embed:    { src:true },
-		iframe:   { src:true },
-		img:      { src:true },
-		input:    { src:true },
-		menuitem: { icon:true },
-		meta:     { content:true },
-		object:   { data:true },
-		source:   { src:true },
-		track:    { src:true },
-		video:    { poster:true, src:true }
-	},
-	2:  // clickable links, media, iframes, meta refreshes, stylesheets, scripts, forms
-	{
-		a:        { href:true },
-		area:     { href:true },
-		audio:    { src:true },
-		embed:    { src:true },
-		form:     { action:true },
-		iframe:   { src:true },
-		img:      { src:true },
-		input:    { src:true },
-		link:     { href:true },
-		menuitem: { icon:true },
-		meta:     { content:true },
-		object:   { data:true },
-		script:   { src:true },
-		source:   { src:true },
-		track:    { src:true },
-		video:    { poster:true, src:true }
-	},
-	3:  // clickable links, media, iframes, meta refreshes, stylesheets, scripts, forms, metadata
-	{
-		a:          { href:true },
-		area:       { href:true },
-		audio:      { src:true },
-		blockquote: { cite:true },
-		del:        { cite:true },
-		embed:      { src:true },
-		form:       { action:true },
-		iframe:     { longdesc:true, src:true },
-		img:        { longdesc:true, src:true },
-		input:      { src:true },
-		ins:        { cite:true },
-		link:       { href:true },
-		menuitem:   { icon:true },
-		meta:       { content:true },
-		object:     { data:true },
-		q:          { cite:true },
-		script:     { src:true },
-		source:     { src:true },
-		track:      { src:true },
-		video:      { poster:true, src:true }
-	},
-	
-	length: 4  // simulate Array
+	a:      { href:true },
+	area:   { href:true }
 };
 
+// Clickable links, media, frames, meta refreshes
+const flatLevel1 =
+{
+	a:        { href:true },
+	applet:   { archive:true, code:true, src:true },
+	area:     { href:true },
+	audio:    { src:true },
+	body:     { background:true },
+	embed:    { src:true },
+	frame:    { src:true },
+	iframe:   { src:true },
+	img:      { src:true, srcset:true },
+	input:    { src:true },
+	menuitem: { icon:true },
+	meta:     { content:true },
+	object:   { data:true },
+	source:   { src:true, srcset:true },
+	table:    { background:true },
+	tbody:    { background:true },
+	td:       { background:true },
+	tfoot:    { background:true },
+	th:       { background:true },
+	thead:    { background:true },
+	tr:       { background:true },
+	track:    { src:true },
+	video:    { poster:true, src:true }
+};
 
+// Clickable links, media, frames, meta refreshes, stylesheets, scripts, forms
+const flatLevel2 =
+{
+	a:        { href:true },
+	applet:   { archive:true, code:true, src:true },
+	area:     { href:true },
+	audio:    { src:true },
+	body:     { background:true },
+	button:   { formaction:true },
+	embed:    { src:true },
+	form:     { action:true },
+	frame:    { src:true },
+	iframe:   { src:true },
+	img:      { src:true, srcset:true },
+	input:    { formaction:true, src:true },
+	link:     { href:true },
+	menuitem: { icon:true },
+	meta:     { content:true },
+	object:   { data:true },
+	script:   { src:true },
+	source:   { src:true, srcset:true },
+	table:    { background:true },
+	tbody:    { background:true },
+	td:       { background:true },
+	tfoot:    { background:true },
+	th:       { background:true },
+	thead:    { background:true },
+	tr:       { background:true },
+	track:    { src:true },
+	video:    { poster:true, src:true }
+};
 
-// Only used for `SiteChecker`
-tags.recursive = 
+// Clickable links, media, frames, meta refreshes, stylesheets, scripts, forms, metadata
+const flatLevel3 =
 {
-	0: tags[0],
-	1:
-	{
-		a:        { href:true },
-		area:     { href:true },
-		iframe:   { src:true },
-		meta:     { content:true },
-	},
-	2:
-	{
-		a:        { href:true },
-		area:     { href:true },
-		iframe:   { src:true },
-		meta:     { content:true },
-	},
-	3:
-	{
-		a:          { href:true },
-		area:       { href:true },
-		blockquote: { cite:true },
-		del:        { cite:true },
-		iframe:     { longdesc:true, src:true },
-		img:        { longdesc:true },
-		ins:        { cite:true },
-		meta:       { content:true },
-		q:          { cite:true }
-	}
+	"*":        { itemtype:true },
+	a:          { href:true, ping:true },
+	applet:     { archive:true, code:true, codebase:true, object:true, src:true },
+	area:       { href:true, ping:true },
+	audio:      { src:true },
+	blockquote: { cite:true },
+	body:       { background:true },
+	button:     { formaction:true },
+	del:        { cite:true },
+	embed:      { src:true },
+	form:       { action:true },
+	frame:      { longdesc:true, src:true },
+	head:       { profile:true },
+	html:       { manifest:true },
+	iframe:     { longdesc:true, src:true },
+	img:        { longdesc:true, src:true, srcset:true },
+	input:      { formaction:true, src:true },
+	ins:        { cite:true },
+	link:       { href:true },
+	menuitem:   { icon:true },
+	meta:       { content:true },
+	object:     { codebase:true, data:true },
+	q:          { cite:true },
+	script:     { src:true },
+	source:     { src:true, srcset:true },
+	table:      { background:true },
+	tbody:      { background:true },
+	td:         { background:true },
+	tfoot:      { background:true },
+	th:         { background:true },
+	thead:      { background:true },
+	tr:         { background:true },
+	track:      { src:true },
+	video:      { poster:true, src:true }
+};
+
+
+
+const recursiveLevel0 = flatLevel0;
+
+const recursiveLevel1 =
+{
+	a:        { href:true },
+	area:     { href:true },
+	iframe:   { src:true },
+	meta:     { content:true }
 };
 
+const recursiveLevel2 =
+{
+	a:        { href:true },
+	area:     { href:true },
+	iframe:   { src:true },
+	meta:     { content:true }
+};
+
+const recursiveLevel3 =
+{
+	a:          { href:true },
+	area:       { href:true },
+	blockquote: { cite:true },
+	del:        { cite:true },
+	frame:      { longdesc:true },
+	iframe:     { longdesc:true, src:true },
+	img:        { longdesc:true },
+	ins:        { cite:true },
+	meta:       { content:true },
+	q:          { cite:true }
+};
 
 
-module.exports = tags;
+
+export default deepFreeze(
+{
+	0: flatLevel0,
+	1: flatLevel1,
+	2: flatLevel2,
+	3: flatLevel3,
+	length: 4,  // simulate Array
+
+	recursive:  // only used for `SiteChecker`
+	{
+		0: recursiveLevel0,
+		1: recursiveLevel1,
+		2: recursiveLevel2,
+		3: recursiveLevel3
+	}
+});
diff --git a/lib/internal/transitiveAuth.js b/lib/internal/transitiveAuth.js
new file mode 100644
index 00000000..ee22a46e
--- /dev/null
+++ b/lib/internal/transitiveAuth.js
@@ -0,0 +1,38 @@
+import isURL from "isurl";
+
+
+
+const DEFAULT_AUTH = Object.freeze({ password:"", username:"" });
+
+
+
+/**
+ * Possibly override `auth` with that from `url`.
+ * @param {URL} url
+ * @param {object} [auth]
+ * @returns {object}
+ */
+export default (url, auth=DEFAULT_AUTH) =>
+{
+	if (!isURL.lenient(url))
+	{
+		throw new TypeError("Invalid URL");
+	}
+	else if (url.username!=="" || url.password!=="")
+	{
+		// Clone to avoid mutation
+		url = new URL(url);
+
+		auth =
+		{
+			password: url.password,
+			username: url.username
+		};
+
+		// @todo is this the kind of result we want, with auth stored in `http` ?
+		url.password = "";
+		url.username = "";
+	}
+
+	return { auth, url };
+};
diff --git a/lib/public/HtmlChecker.js b/lib/public/HtmlChecker.js
index 08daae7a..a33ca974 100644
--- a/lib/public/HtmlChecker.js
+++ b/lib/public/HtmlChecker.js
@@ -1,264 +1,314 @@
-"use strict";
-var linkObj      = require("../internal/linkObj");
-var matchUrl     = require("../internal/matchUrl");
-var parseHtml    = require("../internal/parseHtml");
-var parseOptions = require("../internal/parseOptions");
-var scrapeHtml   = require("../internal/scrapeHtml");
+import * as reasons from "../internal/reasons";
+import {COMPLETE_EVENT, END_EVENT, ERROR_EVENT, HTML_EVENT, JUNK_EVENT, LINK_EVENT, QUEUE_EVENT} from "../internal/events";
+import {HTML_ATTR_NAME, HTML_ATTRS, HTML_INDEX, HTML_OFFSET_INDEX, HTML_TAG_NAME, IS_INTERNAL, IS_SAME_PAGE, REBASED_URL} from "../internal/Link";
+import isString from "is-string";
+import {map as linkTypes} from "link-types";
+import matchURL from "../internal/matchURL";
+import parseHTML from "../internal/parseHTML";
+import parseOptions from "../internal/parseOptions";
+import RobotDirectives, {NOFOLLOW, NOIMAGEINDEX, NOINDEX} from "robot-directives";
+import SafeEventEmitter from "../internal/SafeEventEmitter";
+import scrapeHTML from "../internal/scrapeHTML";
+import transitiveAuth from "../internal/transitiveAuth";
+import UrlChecker from "./UrlChecker";
+
+
+
+export default class HtmlChecker extends SafeEventEmitter
+{
+	#auth;
+	#excludedLinks;
+	#options;
+	#resolvePromise;
+	#robots;
+	#scanning;
+	#urlChecker;
 
-var UrlChecker = require("./UrlChecker");
 
-var isString = require("is-string");
-var linkTypes = require("link-types").map;
-var maybeCallback = require("maybe-callback");
-var RobotDirectives = require("robot-directives");
 
+	constructor(options)
+	{
+		super();
+		this.#options = parseOptions(options);
+		this.#reset();
+
+		this.#urlChecker = new UrlChecker(this.#options)
+		.on(ERROR_EVENT, error => this.emit(ERROR_EVENT, error))
+		.on(QUEUE_EVENT, () => this.emit(QUEUE_EVENT))
+		.on(LINK_EVENT, result => this.emit(LINK_EVENT, result))
+		.on(END_EVENT, () => this.#complete());
+	}
 
 
-function HtmlChecker(options, handlers)
-{
-	var thisObj = this;
-	
-	reset(this);
-	
-	this.handlers = handlers || {};
-	this.options = options = parseOptions(options);
-	
-	this.urlChecker = new UrlChecker(this.options,
+
+	clearCache()
 	{
-		link: function(result)
-		{
-			maybeCallback(thisObj.handlers.link)(result);
-		},
-		end: function()
-		{
-			// If stream finished
-			if (thisObj.parsed === true)
-			{
-				complete(thisObj);
-			}
-		}
-	});
-}
+		this.#urlChecker.clearCache();
+		return this;
+	}
 
 
 
-HtmlChecker.prototype.clearCache = function()
-{
-	return this.urlChecker.clearCache();
-};
+	#complete()
+	{
+		const resolvePromise = this.#resolvePromise;
 
+		this.#reset();
 
+		this.emit(COMPLETE_EVENT);
 
-HtmlChecker.prototype.numActiveLinks = function()
-{
-	return this.urlChecker.numActiveLinks();
-};
+		resolvePromise();
+	}
 
 
 
-HtmlChecker.prototype.numQueuedLinks = function()
-{
-	return this.urlChecker.numQueuedLinks();
-};
+	/**
+	 * Determine whether a Link should be excluded from checks, and the reason for such.
+	 * @param {Link} link
+	 * @returns {string|undefined}
+	 */
+	#getExcludeReason(link)
+	{
+		const attrName = link.get(HTML_ATTR_NAME);
+		const attrs = link.get(HTML_ATTRS);
+		const {href, protocol} = link.get(REBASED_URL);
+		const isInternal = link.get(IS_INTERNAL);
+		const tagName = link.get(HTML_TAG_NAME);
 
+		const
+		{
+			excludedKeywords,
+			excludedSchemes,
+			excludeExternalLinks,
+			excludeInternalLinks,
+			excludeLinksToSamePage,
+			honorRobotExclusions,
+			includedKeywords,
+			includeLink
+		} = this.#options;
+
+		if (this.#isExcludedAttribute(attrName, [tagName, "*"]))
+		{
+			return "BLC_HTML";
+		}
+		else if (excludeExternalLinks && isInternal===false)
+		{
+			return "BLC_EXTERNAL";
+		}
+		else if (excludeInternalLinks && isInternal)
+		{
+			return "BLC_INTERNAL";
+		}
+		else if (excludeLinksToSamePage && link.get(IS_SAME_PAGE))
+		{
+			return "BLC_SAMEPAGE";
+		}
+		else if (protocol in excludedSchemes)
+		{
+			return "BLC_SCHEME";
+		}
+		else if (honorRobotExclusions && this.#robots.oneIs([ NOFOLLOW, NOINDEX ]))
+		{
+			return "BLC_ROBOTS";
+		}
+		else if (honorRobotExclusions && this.#robots.is(NOIMAGEINDEX) && isRobotAttr(tagName, attrName))
+		{
+			return "BLC_ROBOTS";
+		}
+		else if (honorRobotExclusions && attrs?.rel!=null && linkTypes(attrs.rel).nofollow)
+		{
+			return "BLC_ROBOTS";
+		}
+		else if (matchURL(href, excludedKeywords))
+		{
+			return "BLC_KEYWORD";
+		}
+		else if (includedKeywords.length>0 && !matchURL(href, includedKeywords))
+		{
+			return "BLC_KEYWORD";
+		}
+		else
+		{
+			const filterResult = includeLink(link);
 
+			// Undocumented support for strings (from `SiteChecker`)
+			if (isString(filterResult) && filterResult in reasons)
+			{
+				return filterResult;
+			}
+			else if (!filterResult)
+			{
+				return "BLC_CUSTOM";
+			}
+			else
+			{
+				// Not excluded
+			}
+		}
+	}
 
-HtmlChecker.prototype.pause = function()
-{
-	return this.urlChecker.pause();
-};
 
 
+	/**
+	 * Determine whether a Link's HTML element and attribute would cause it to be excluded from checks.
+	 * @param {string} attrName
+	 * @param {Array} tagNames
+	 * @returns {boolean}
+	 */
+	#isExcludedAttribute(attrName, tagNames)
+	{
+		const tagGroups = this.#options.tags[this.#options.filterLevel];
 
-HtmlChecker.prototype.resume = function()
-{
-	return this.urlChecker.resume();
-};
+		return tagNames.every(tagName => !(tagName in tagGroups) || !(attrName in tagGroups[tagName]));
+	}
 
 
 
-HtmlChecker.prototype.scan = function(html, baseUrl, robots)
-{
-	var tree;
-	var thisObj = this;
-	
-	if (this.active === false)
+	get isPaused()
+	{
+		return this.#urlChecker.isPaused;
+	}
+
+
+
+	/**
+	 * Enqueue a Link if it is valid and passes filters.
+	 * @param {Link} link
+	 */
+	#maybeEnqueueLink(link)
 	{
-		// Prevent user error with undocumented arugment
-		if (robots instanceof RobotDirectives === false)
+		if (link.get(REBASED_URL) === null)
 		{
-			robots = new RobotDirectives({ userAgent: this.options.userAgent });
+			link.set(HTML_OFFSET_INDEX, link.get(HTML_INDEX) - this.#excludedLinks);
+			link.break("BLC_INVALID");
+			link.include();
+
+			this.emit(LINK_EVENT, link);
 		}
-		
-		this.active = true;
-		this.baseUrl = baseUrl;
-		this.robots = robots;
-		
-		parseHtml(html).then( function(document)
+		else
 		{
-			tree = document;
-			return scrapeHtml(document, thisObj.robots);
-		})
-		.then( function(links)
-		{
-			maybeCallback(thisObj.handlers.html)(tree, thisObj.robots);
-			
-			for (var i=0, numLinks=links.length; i this.#maybeEnqueueLink(link));
+
+			const resolveOnComplete = new Promise(resolve => this.#resolvePromise = resolve);
+
+			// If no links found or all links already checked
+			if (this.#urlChecker.numActiveLinks===0 && this.#urlChecker.numQueuedLinks===0)
+			{
+				this.#complete();
+			}
+
+			return resolveOnComplete;
 		}
 	}
-	
-	if (matchUrl(link.url.resolved, instance.options.excludedKeywords) === true) return "BLC_KEYWORD";
-	
-	// Undocumented handler for custom constraints
-	externalFilter = maybeCallback(instance.handlers._filter)(link);
-	
-	if (isString(externalFilter) === true)
+
+
+
+	get __cache()
 	{
-		return externalFilter;
+		return this.#urlChecker.__cache;
 	}
-	/*else if (externalFilter === false)
-	{
-		return "BLC_CUSTOM";
-	}*/
-	
-	return false;
 }
 
 
 
-function reset(instance)
-{
-	instance.active = false;
-	instance.baseUrl = undefined;
-	instance.excludedLinks = 0;
-	instance.linkEnqueued = null;
-	instance.parsed = false;
-	instance.robots = null;
-}
+//::: PRIVATE FUNCTIONS
 
 
 
-module.exports = HtmlChecker;
+const isRobotAttr = (tagName, attrName) =>
+{
+	return (tagName==="img"      && attrName==="src"   ) ||
+	       (tagName==="input"    && attrName==="src"   ) ||
+	       (tagName==="menuitem" && attrName==="icon"  ) ||
+	       (tagName==="video"    && attrName==="poster");
+};
diff --git a/lib/public/HtmlUrlChecker.js b/lib/public/HtmlUrlChecker.js
index c2a162f1..15938beb 100644
--- a/lib/public/HtmlUrlChecker.js
+++ b/lib/public/HtmlUrlChecker.js
@@ -1,194 +1,223 @@
-"use strict";
-var parseOptions = require("../internal/parseOptions");
-var streamHtml   = require("../internal/streamHtml");
+import {COMPLETE_EVENT, END_EVENT, ERROR_EVENT, HTML_EVENT, JUNK_EVENT, LINK_EVENT, PAGE_EVENT, QUEUE_EVENT} from "../internal/events";
+import HtmlChecker from "./HtmlChecker";
+import parseOptions from "../internal/parseOptions";
+import RequestQueue, {ITEM_EVENT, END_EVENT as REQUEST_QUEUE_END_EVENT} from "limited-request-queue";
+import RobotDirectives from "robot-directives";
+import SafeEventEmitter from "../internal/SafeEventEmitter";
+import streamHTML from "../internal/streamHTML";
+import transitiveAuth from "../internal/transitiveAuth";
 
-var HtmlChecker = require("./HtmlChecker");
 
-var maybeCallback = require("maybe-callback");
-var RequestQueue = require("limited-request-queue");
-var RobotDirectives = require("robot-directives");
 
+export default class HtmlUrlChecker extends SafeEventEmitter
+{
+	#currentAuth;
+	#currentCustomData;
+	#currentDone;
+	#currentPageURL;
+	#currentResponse;
+	#currentRobots;
+	#htmlChecker;
+	#htmlUrlQueue;
+	#options;
 
 
-function HtmlUrlChecker(options, handlers)
-{
-	var thisObj = this;
-	
-	reset(this);
-	
-	this.handlers = handlers || {};
-	this.options = options = parseOptions(options);
-	
-	this.htmlUrlQueue = new RequestQueue(
-	{
-		maxSockets: 1,
-		rateLimit: this.options.rateLimit
-	},
+
+	constructor(options)
 	{
-		item: function(input, done)
+		super();
+		this.#reset();
+
+		this.#options = parseOptions(options);
+
+		this.#htmlUrlQueue = new RequestQueue(
+		{
+			maxSockets: 1,
+			rateLimit: this.#options.rateLimit
+		})
+		.on(ITEM_EVENT, async (url, {auth, customData}, done) =>
 		{
-			thisObj.currentCustomData = input.data.customData;
-			thisObj.currentDone = done;
-			thisObj.currentPageUrl = input.url;
-			
-			streamHtml(thisObj.currentPageUrl, thisObj.__getCache(), thisObj.options).then( function(result)
+			this.#reset();
+
+			this.#currentAuth = auth;
+			this.#currentCustomData = customData;
+			this.#currentDone = done;
+			this.#currentPageURL = url;  // @todo remove hash ?
+
+			try
 			{
-				thisObj.currentResponse = result.response;
-				
-				thisObj.currentRobots = new RobotDirectives({ userAgent: thisObj.options.userAgent });
-				
-				robotHeaders(thisObj);
-				
+				const {response, stream} = await streamHTML(this.#currentPageURL, this.#currentAuth, this.__cache, this.#options);
+
+				this.#currentResponse = response;
+				this.#currentRobots = new RobotDirectives({ userAgent: this.#options.userAgent });
+
+				this.#appendRobotHeaders();
+
 				// Passes robots instance so that headers are included in robot exclusion checks
-				thisObj.htmlChecker.scan(result.stream, result.response.url, thisObj.currentRobots);
-			})
-			.catch( function(error)
+				// @todo does the `await` cause `completedPage` to be called twice (the other call is in COMPLETE_EVENT) if an error occurs?
+				await this.#htmlChecker.scan(stream, response.url, this.#currentRobots, this.#currentAuth);
+			}
+			catch (error)
 			{
-				completedPage(thisObj, error);
-			});
-		},
-		end: function()
+				this.#completedPage(error);
+			}
+		})
+		.on(REQUEST_QUEUE_END_EVENT, () =>
 		{
 			// Clear references for garbage collection
-			reset(thisObj);
-			
-			maybeCallback(thisObj.handlers.end)();
-		}
-	});
-	
-	this.htmlChecker = new HtmlChecker(this.options,
-	{
-		html: function(tree, robots)
-		{
-			maybeCallback(thisObj.handlers.html)(tree, robots, thisObj.currentResponse, thisObj.currentPageUrl, thisObj.currentCustomData);
-		},
-		_filter: function(result)
-		{
-			// Undocumented handler for excluding links via custom constraints
-			return maybeCallback(thisObj.handlers._filter)(result);
-		},
-		junk: function(result)
-		{
-			maybeCallback(thisObj.handlers.junk)(result, thisObj.currentCustomData);
-		},
-		link: function(result)
+			this.#reset();
+
+			this.emit(END_EVENT);
+		});
+
+		this.#htmlChecker = new HtmlChecker(this.#options)
+		.on(ERROR_EVENT, error => this.emit(ERROR_EVENT, error))
+		.on(HTML_EVENT, (tree, robots) =>
 		{
-			maybeCallback(thisObj.handlers.link)(result, thisObj.currentCustomData);
-		},
-		complete: function()
+			this.emit(HTML_EVENT, tree, robots, this.#currentResponse, this.#currentPageURL, this.#currentCustomData);
+		})
+		.on(QUEUE_EVENT, () => this.emit(QUEUE_EVENT))
+		.on(JUNK_EVENT, result => this.emit(JUNK_EVENT, result, this.#currentCustomData))
+		.on(LINK_EVENT, result => this.emit(LINK_EVENT, result, this.#currentCustomData))
+		.on(COMPLETE_EVENT, () => this.#completedPage());
+	}
+
+
+
+	/**
+	 * Append any robot headers.
+	 */
+	#appendRobotHeaders()
+	{
+		const xRobotsTag = this.#currentResponse.headers["x-robots-tag"];
+
+		// @todo https://github.com/nodejs/node/issues/3591
+		if (xRobotsTag != null)
 		{
-			completedPage(thisObj, null);
+			this.#currentRobots.header(xRobotsTag);
 		}
-	});
-}
+	}
 
 
 
-HtmlUrlChecker.prototype.clearCache = function()
-{
-	return this.htmlChecker.clearCache();
-};
+	clearCache()
+	{
+		this.#htmlChecker.clearCache();
+		return this;
+	}
 
 
 
-HtmlUrlChecker.prototype.dequeue = function(id)
-{
-	return this.htmlUrlQueue.dequeue(id);
-};
+	/**
+	 * Emit PAGE_EVENT and continue the queue.
+	 * @param {Error} [error]
+	 */
+	#completedPage(error = null)
+	{
+		// @todo emit page error instead?
+		// @todo include redirected url if there is one?
+		this.emit(PAGE_EVENT, error, this.#currentPageURL, this.#currentCustomData);
+
+		// Auto-starts next queue item, if any
+		// Emits REQUEST_QUEUE_END_EVENT, if not
+		this.#currentDone();
+	}
 
 
 
-HtmlUrlChecker.prototype.enqueue = function(pageUrl, customData)
-{
-	return this.htmlUrlQueue.enqueue(
+	dequeue(id)
 	{
-		url: pageUrl,
-		data: { customData:customData }
-	});
-};
+		const success = this.#htmlUrlQueue.dequeue(id);
 
+		this.emit(QUEUE_EVENT);
 
+		return success;
+	}
 
-HtmlUrlChecker.prototype.numActiveLinks = function()
-{
-	return this.htmlChecker.numActiveLinks();
-};
 
 
+	// `auth` is undocumented and for internal use only
+	enqueue(pageURL, customData, auth)
+	{
+		// @todo this could get messy if there're many different credentials per site (if we cache based on headers)
+		const transitive = transitiveAuth(pageURL, auth);
 
-HtmlUrlChecker.prototype.numPages = function()
-{
-	return this.htmlUrlQueue.length();
-};
+		const id = this.#htmlUrlQueue.enqueue(transitive.url, { auth:transitive.auth, customData });
 
+		this.emit(QUEUE_EVENT);
 
+		return id;
+	}
 
-HtmlUrlChecker.prototype.numQueuedLinks = function()
-{
-	return this.htmlChecker.numQueuedLinks();
-};
 
 
+	has(id)
+	{
+		return this.#htmlUrlQueue.has(id);
+	}
 
-HtmlUrlChecker.prototype.pause = function()
-{
-	this.htmlChecker.pause();
-	return this.htmlUrlQueue.pause();
-};
 
 
+	get isPaused()
+	{
+		return this.#htmlChecker.isPaused;
+	}
+
 
-HtmlUrlChecker.prototype.resume = function()
-{
-	this.htmlChecker.resume();
-	return this.htmlUrlQueue.resume();
-};
 
+	get numActiveLinks()
+	{
+		return this.#htmlChecker.numActiveLinks;
+	}
 
 
-HtmlUrlChecker.prototype.__getCache = function()
-{
-	return this.htmlChecker.__getCache();
-};
+
+	get numPages()
+	{
+		return this.#htmlUrlQueue.length;
+	}
 
 
 
-//::: PRIVATE FUNCTIONS
+	get numQueuedLinks()
+	{
+		return this.#htmlChecker.numQueuedLinks;
+	}
 
 
 
-function completedPage(instance, error)
-{
-	maybeCallback(instance.handlers.page)(error, instance.currentPageUrl, instance.currentCustomData);
-	
-	// Auto-starts next queue item, if any
-	// If not, fires "end"
-	instance.currentDone();
-}
+	pause()
+	{
+		this.#htmlChecker.pause();
+		this.#htmlUrlQueue.pause();
+		return this;
+	}
 
 
 
-function reset(instance)
-{
-	instance.currentCustomData = null;
-	instance.currentDone = null;
-	instance.currentPageUrl = null;
-	instance.currentResponse = null;
-	instance.currentRobots = null;
-}
+	#reset()
+	{
+		this.#currentAuth = null;
+		this.#currentCustomData = null;
+		this.#currentDone = null;
+		this.#currentPageURL = null;
+		this.#currentResponse = null;
+		this.#currentRobots = null;
+	}
 
 
 
-function robotHeaders(instance)
-{
-	// TODO :: https://github.com/joepie91/node-bhttp/issues/20
-	// TODO :: https://github.com/nodejs/node/issues/3591
-	if (instance.currentResponse.headers["x-robots-tag"] != null)
+	resume()
 	{
-		instance.currentRobots.header( instance.currentResponse.headers["x-robots-tag"] );
+		this.#htmlChecker.resume();
+		this.#htmlUrlQueue.resume();
+		return this;
 	}
-}
 
 
 
-module.exports = HtmlUrlChecker;
+	get __cache()
+	{
+		return this.#htmlChecker.__cache;
+	}
+}
diff --git a/lib/public/SiteChecker.js b/lib/public/SiteChecker.js
index 8f94f434..0540314a 100644
--- a/lib/public/SiteChecker.js
+++ b/lib/public/SiteChecker.js
@@ -1,318 +1,389 @@
-"use strict";
-var getRobotsTxt = require("../internal/getRobotsTxt");
-var matchUrl     = require("../internal/matchUrl");
-var parseOptions = require("../internal/parseOptions");
-var reasons      = require("../internal/messages").reasons;
+import {END_EVENT, ERROR_EVENT, HTML_EVENT, JUNK_EVENT, LINK_EVENT, PAGE_EVENT, QUEUE_EVENT, ROBOTS_EVENT, SITE_EVENT} from "../internal/events";
+import {EXCLUDED_REASON, HTML_ATTR_NAME, HTML_TAG_NAME, HTTP_RESPONSE, IS_BROKEN, IS_INTERNAL, REBASED_URL, REDIRECTED_URL, WAS_EXCLUDED} from "../internal/Link";
+import getRobotsTxt from "../internal/getRobotsTxt";
+import HtmlUrlChecker from "./HtmlUrlChecker";
+import parseOptions from "../internal/parseOptions";
+import RequestQueue, {ITEM_EVENT, END_EVENT as REQUEST_QUEUE_END_EVENT} from "limited-request-queue";
+import SafeEventEmitter from "../internal/SafeEventEmitter";
+import transitiveAuth from "../internal/transitiveAuth";
+import URLCache from "urlcache";
 
-var HtmlUrlChecker = require("./HtmlUrlChecker");
 
-var maybeCallback = require("maybe-callback");
-var RequestQueue = require("limited-request-queue");
-var UrlCache = require("urlcache");
 
+// @todo BLC_ROBOTS catches rel=nofollow links but will also catch meta/header excluded links -- fine?
+const PAGE_EXCLUSIONS = ["BLC_KEYWORD", "BLC_ROBOTS", "BLC_SCHEME"];
 
+const PAGE_WAS_CHECKED = true;
 
-function SiteChecker(options, handlers)
+
+
+export default class SiteChecker extends SafeEventEmitter
 {
-	var thisObj = this;
-	
-	reset(this);
-	
-	this.handlers = handlers || {};
-	this.options = options = parseOptions(options);
-	
-	this.sitePagesChecked = new UrlCache({ expiryTime: this.options.cacheExpiryTime });
-	
-	this.siteUrlQueue = new RequestQueue(
-	{
-		maxSockets: 1,
-		rateLimit: this.options.rateLimit
-	},
+	#currentAuth;
+	#currentCustomData;
+	#currentDone;
+	#currentPageError;
+	#currentRobotsTxt;
+	#currentSiteURL;
+	#htmlUrlChecker;
+	#options;
+	#sitePagesChecked;
+	#siteUrlQueue;
+
+
+
+	constructor(options)
 	{
-		item: function(input, done)
+		super();
+		this.#options = this.#overrideOptions(parseOptions(options)); // @todo https://github.com/tc39/proposal-pipeline-operator
+		this.#sitePagesChecked = new URLCache({ maxAge: this.#options.cacheMaxAge });
+		this.#reset();
+
+		this.#siteUrlQueue = new RequestQueue(
+		{
+			maxSockets: 1,
+			rateLimit: this.#options.rateLimit
+		})
+		.on(ITEM_EVENT, async (url, {auth, customData}, done) =>
 		{
-			thisObj.currentCustomData = input.data.customData;
-			thisObj.currentDone = done;
-			thisObj.currentSiteUrl = input.url;  // TODO :: strip after hostname?
-			
-			// Support checking sites multiple times
-			thisObj.sitePagesChecked.clear();
-			
-			if (options.honorRobotExclusions === true)
+			this.#reset();
+
+			this.#currentAuth = auth;
+			this.#currentCustomData = customData;
+			this.#currentDone = done;
+			this.#currentSiteURL = url;  // @todo strip after hostname?
+
+			try
 			{
-				getRobotsTxt(thisObj.currentSiteUrl, options).then( function(robots)
+				if (this.#options.honorRobotExclusions)
 				{
-					thisObj.currentRobotsTxt = robots;
-					
-					maybeCallback(thisObj.handlers.robots)(robots, thisObj.currentCustomData);
-				/*})
-				.catch( function(error)
-				{
-					maybeCallback(thisObj.handlers.robots)(error, null);
-				})
-				.then( function()
-				{*/
-					enqueuePage(thisObj, thisObj.currentSiteUrl, thisObj.currentCustomData);
-				});
+					const robots = await getRobotsTxt(this.#currentSiteURL, this.#currentAuth, this.__cache, this.#options);
+
+					// This receives an instance even if no robots.txt was found
+					this.#currentRobotsTxt = robots;
+
+					this.emit(ROBOTS_EVENT, robots, this.#currentCustomData);
+				}
 			}
-			else
+			catch
+			{
+				// If could not connect to server -- let `HtmlUrlChecker` catch it
+			}
+			finally
 			{
-				enqueuePage(thisObj, thisObj.currentSiteUrl, thisObj.currentCustomData);
+				this.#enqueuePage(this.#currentSiteURL, this.#currentCustomData, this.#currentAuth);
 			}
-		},
-		end: function()
+		})
+		.on(REQUEST_QUEUE_END_EVENT, () =>
 		{
-			// Reduce memory usage
-			thisObj.sitePagesChecked.clear();
-			
 			// Clear references for garbage collection
-			reset(thisObj);
-			
-			maybeCallback(thisObj.handlers.end)();
-		}
-	});
-	
-	this.htmlUrlChecker = new HtmlUrlChecker(this.options,
-	{
-		html: function(tree, robots, response, pageUrl, customData)
+			this.#reset();
+
+			this.emit(END_EVENT);
+		});
+
+		this.#htmlUrlChecker = new HtmlUrlChecker(this.#options)
+		.on(ERROR_EVENT, error => this.emit(ERROR_EVENT, error))
+		.on(HTML_EVENT, (tree, robots, response, pageURL, customData) =>
 		{
 			// If was redirected
-			if (response.url !== pageUrl)
+			if (response.url !== pageURL)
 			{
-				thisObj.sitePagesChecked.set(response.url, true);
-				
-				for (var i=0; i this.#sitePagesChecked.set(redirect.url, PAGE_WAS_CHECKED));
 			}
-			
-			maybeCallback(thisObj.handlers.html)(tree, robots, response, pageUrl, customData);
-		},
-		_filter: function(result)  // undocumented handler
-		{
-			// Additional filters for excluding links
-			return maybeCheckLink(thisObj, result);
-		},
-		junk: function(result, customData)
+
+			this.emit(HTML_EVENT, tree, robots, response, pageURL, customData);
+		})
+		.on(QUEUE_EVENT, () => this.emit(QUEUE_EVENT))
+		.on(JUNK_EVENT, (result, customData) =>
 		{
-			maybeCallback(thisObj.handlers.junk)(result, customData);
-			
-			maybeEnqueuePage(thisObj, result, customData);
-		},
-		link: function(result, customData)
+			this.emit(JUNK_EVENT, result, customData);
+
+			this.#maybeEnqueuePage(result, customData, this.#currentAuth);
+		})
+		.on(LINK_EVENT, (result, customData) =>
 		{
-			maybeCallback(thisObj.handlers.link)(result, customData);
-			
-			maybeEnqueuePage(thisObj, result, customData);
-		},
-		page: function(error, pageUrl, customData)
+			this.emit(LINK_EVENT, result, customData);
+
+			this.#maybeEnqueuePage(result, customData, this.#currentAuth);
+		})
+		.on(PAGE_EVENT, (error, pageURL, customData) =>
 		{
-			maybeCallback(thisObj.handlers.page)(error, pageUrl, customData);
-			
-			// Only the first page should supply an error to "site" handler
-			if (thisObj.sitePagesChecked.length() <= 1)
+			this.emit(PAGE_EVENT, error, pageURL, customData);
+
+			// Only the first page should supply an error to SITE_EVENT
+			if (this.#sitePagesChecked.length <= 1)
 			{
-				thisObj.currentPageError = error;
+				this.#currentPageError = error;
 			}
-		},
-		end: function()
+		})
+		.on(END_EVENT, () =>
 		{
-			maybeCallback(thisObj.handlers.site)(thisObj.currentPageError, thisObj.currentSiteUrl, thisObj.currentCustomData);
-			
+			this.emit(SITE_EVENT, this.#currentPageError, this.#currentSiteURL, this.#currentCustomData);
+
 			// Auto-starts next site, if any
-			// If not, fires "end"
-			thisObj.currentDone();
-		}
-	});
-}
+			// Emits REQUEST_QUEUE_END_EVENT, if not
+			this.#currentDone();
+		});
+	}
 
 
 
-SiteChecker.prototype.clearCache = function()
-{
-	// Does not clear `sitePagesChecked` because it would mess up any current scans
-	return this.htmlUrlChecker.clearCache();
-};
+	clearCache()
+	{
+		// Does not clear `sitePagesChecked` because it would mess up any current scans
+		this.#htmlUrlChecker.clearCache();
+		return this;
+	}
 
 
 
-SiteChecker.prototype.dequeue = function(id)
-{
-	return this.siteUrlQueue.dequeue(id);
-};
+	dequeue(id)
+	{
+		const success = this.#siteUrlQueue.dequeue(id);
 
+		this.emit(QUEUE_EVENT);
 
+		return success;
+	}
 
-SiteChecker.prototype.enqueue = function(firstPageUrl, customData)
-{
-	return this.siteUrlQueue.enqueue(
+
+
+	enqueue(firstPageURL, customData)
 	{
-		url: firstPageUrl,
-		data: { customData:customData }
-	});
-};
+		const transitive = transitiveAuth(firstPageURL);
 
+		const success = this.#siteUrlQueue.enqueue(transitive.url, { auth:transitive.auth, customData });
 
+		this.emit(QUEUE_EVENT);
 
-SiteChecker.prototype.numActiveLinks = function()
-{
-	return this.htmlUrlChecker.numActiveLinks();
-};
+		return success;
+	}
 
 
 
-SiteChecker.prototype.numQueuedLinks = function()
-{
-	return this.htmlUrlChecker.numQueuedLinks();
-};
+	/**
+	 * Enqueue a URL to be crawled.
+	 * @param {URL} url
+	 * @param {*} customData
+	 * @param {object} auth
+	 */
+	#enqueuePage(url, customData, auth)
+	{
+		// Avoid links to self within page
+		this.#sitePagesChecked.set(url, PAGE_WAS_CHECKED);
 
+		this.#htmlUrlChecker.enqueue(url, customData, auth);
+	}
 
 
-SiteChecker.prototype.numPages = function()
-{
-	return this.htmlUrlChecker.numPages();
-};
 
+	/**
+	 * Determine whether a Link should be excluded from checks, and the reason for such.
+	 * @param {Link} link
+	 * @returns {string|undefined}
+	 */
+	#getExcludedReason(link)
+	{
+		if (link.get(IS_INTERNAL) && !this.#isAllowed(link))
+		{
+			return "BLC_ROBOTS";
+		}
+		else
+		{
+			// Not excluded
+		}
+	}
 
 
-SiteChecker.prototype.numSites = function()
-{
-	return this.siteUrlQueue.length();
-};
+
+	has(id)
+	{
+		return this.#siteUrlQueue.has(id);
+	}
 
 
 
-SiteChecker.prototype.pause = function()
-{
-	this.htmlUrlChecker.pause();
-	return this.siteUrlQueue.pause();
-};
+	/**
+	 * Determine whether a Link should be included, conforming to any robots filter.
+	 * @param {Link} link
+	 * @returns {boolean}
+	 */
+	#isAllowed(link)
+	{
+		if (this.#options.honorRobotExclusions)
+		{
+			const rebasedPathname = link.get(REBASED_URL)?.pathname;
+
+			// @todo remove condition when/if `Link::invalidate()` is used in `HtmlChecker`
+			if (rebasedPathname !== null)
+			{
+				return this.#currentRobotsTxt.isAllowed(this.#options.userAgent, rebasedPathname);
+			}
+			else
+			{
+				return true;
+			}
+		}
+		else
+		{
+			return true;
+		}
+	}
 
 
 
-SiteChecker.prototype.resume = function()
-{
-	this.htmlUrlChecker.resume();
-	return this.siteUrlQueue.resume();
-};
+	get isPaused()
+	{
+		return this.#htmlUrlChecker.isPaused;
+	}
 
 
 
-/*SiteChecker.prototype.__getCache = function()
-{
-	return this.htmlUrlChecker.__getCache();
-};*/
+	/**
+	 * Enqueue a page (to be crawled) if it passes filters.
+	 * @param {Link} link
+	 * @param {*} customData
+	 * @param {object} auth
+	 */
+	#maybeEnqueuePage(link, customData, auth)
+	{
+		// Skip specific links that were excluded from checks
+		if (link.get(WAS_EXCLUDED) && PAGE_EXCLUSIONS.includes(link.get(EXCLUDED_REASON)))
+		{
+			// do nothing
+		}
+		else
+		{
+			const tagGroup = this.#options.tags.recursive[this.#options.filterLevel][link.get(HTML_TAG_NAME)] ?? {};
+			const attrSupported = link.get(HTML_ATTR_NAME) in tagGroup;
+			const rebasedURL = link.get(REBASED_URL);
+			const redirectedURL = link.get(REDIRECTED_URL);
+
+			if (
+			   	!attrSupported ||
+			   	link.get(IS_BROKEN) ||
+			   	!link.get(IS_INTERNAL) ||
+			   	this.#sitePagesChecked.has(rebasedURL) ||
+			   	!this.#isAllowed(link)
+			   )
+			{
+				// do nothing
+			}
+			else if (redirectedURL !== null)
+			{
+				// Because only the final redirected page needs to be [recursively] checked,
+				// all redirects are stored as pages that have been checked
+				link.get(HTTP_RESPONSE).redirects.forEach(({url}) => this.#sitePagesChecked.set(url, PAGE_WAS_CHECKED));
 
+				if (!this.#sitePagesChecked.has(redirectedURL))
+				{
+					this.#enqueuePage(redirectedURL, customData, auth);
+				}
+			}
+			else if (this.#options.includePage(rebasedURL))
+			{
+				this.#enqueuePage(rebasedURL, customData, auth);
+			}
+		}
+	}
 
 
-//::: PRIVATE FUNCTIONS
 
+	get numActiveLinks()
+	{
+		return this.#htmlUrlChecker.numActiveLinks;
+	}
 
 
-function enqueuePage(instance, url, customData)
-{
-	// Avoid links to self within page
-	instance.sitePagesChecked.set(url, true);
-	
-	instance.htmlUrlChecker.enqueue(url, customData);
-}
 
+	get numQueuedLinks()
+	{
+		return this.#htmlUrlChecker.numQueuedLinks;
+	}
 
 
-function isAllowed(instance, link)
-{
-	if (instance.options.honorRobotExclusions===true /*&& instance.currentRobotsTxt!=null*/)
+
+	get numPages()
 	{
-		// TODO :: remove condition when/if `linkObj.invalidate()` is used in `HtmlChecker`
-		if (link.url.resolved !== null)
-		{
-			return instance.currentRobotsTxt.isAllowed(instance.options.userAgent, link.url.parsed.pathname);
-		}
+		return this.#htmlUrlChecker.numPages;
 	}
-	
-	return true;
-}
 
 
 
-function maybeCheckLink(instance, link)
-{
-	if (link.internal===true && isAllowed(instance, link)===false)
+	get numSites()
 	{
-		return "BLC_ROBOTS";
+		return this.#siteUrlQueue.length;
 	}
-}
 
 
 
-function maybeEnqueuePage(instance, link, customData)
-{
-	var attrSupported,i,redirects,tagGroup;
-	
-	// Skip specific links that were excluded from checks
-	if (link.excluded === true)
+	/**
+	 * Override/mutate some options for extended behavior.
+	 * @param {object} options
+	 * @returns {object}
+	 */
+	#overrideOptions(options)
 	{
-		switch (link.excludedReason)
+		const {includeLink} = options;
+
+		options.includeLink = link =>
 		{
-			case "BLC_KEYWORD":
-			case "BLC_ROBOTS":  // TODO :: catches rel=nofollow links but will also catch meta/header excluded links -- fine?
-			case "BLC_SCHEME":
+			const excludedReason = this.#getExcludedReason(link);
+
+			if (excludedReason === undefined)
 			{
-				return false;
+				return includeLink(link);
 			}
-		}
-	}
-	
-	tagGroup = instance.options.tags.recursive[instance.options.filterLevel][link.html.tagName];
-	
-	if (tagGroup != null)
-	{
-		attrSupported = tagGroup[link.html.attrName];
+			else
+			{
+				// Undocumented return value type
+				return excludedReason;
+			}
+		};
+
+		return options;
 	}
-	
-	if (
-	   	(attrSupported !== true) || 
-	   	(link.broken === true) || 
-	   	(link.internal !== true) || 
-	   	(instance.sitePagesChecked.get(link.url.resolved) === true) || 
-	   	(isAllowed(instance, link) === false)
-	   )
+
+
+
+	pause()
 	{
-		return false;
+		this.#htmlUrlChecker.pause();
+		this.#siteUrlQueue.pause();
+		return this;
 	}
-	
-	if (link.url.redirected!=null && instance.sitePagesChecked.get(link.url.redirected)===true)
+
+
+
+	#reset()
 	{
-		redirects = link.http.response.redirects;
-		
-		for (i=0; i
 		{
-			maybeCallback(thisObj.handlers.end)();
-		}
-	});
-}
+			const result = await checkLink(link, auth, this.#cache, options);
 
+			this.emit(LINK_EVENT, result, customData);
 
+			// Auto-starts next queue item, if any
+			// Emits REQUEST_QUEUE_END_EVENT, if not
+			done();
+		})
+		.on(REQUEST_QUEUE_END_EVENT, () => this.emit(END_EVENT));
+	}
 
-UrlChecker.prototype.clearCache = function()
-{
-	return this.cache.clear();
-};
 
 
+	clearCache()
+	{
+		this.#cache.clear();
+		return this;
+	}
+
 
-UrlChecker.prototype.dequeue = function(id)
-{
-	return this.linkQueue.dequeue(id);
-};
 
+	dequeue(id)
+	{
+		const success = this.#linkQueue.dequeue(id);
 
+		this.emit(QUEUE_EVENT);
 
-UrlChecker.prototype.enqueue = function(url, baseUrl, customData)
-{
-	// Undocumented internal use: enqueue(linkObj)
-	if (isString(url)===false && url.broken_link_checker===true)
+		return success;
+	}
+
+
+
+	// `auth` is undocumented and for internal use only
+	enqueue(url, customData, auth={})
 	{
-		return this.linkQueue.enqueue(
+		let link;
+
+		// Undocumented internal use: `enqueue(Link)`
+		if (url instanceof Link)
+		{
+			link = url;
+		}
+		// Documented use: `enqueue(URL)`
+		else if (isURL.lenient(url))
+		{
+			link = new Link().resolve(url);
+		}
+		else
 		{
-			url: url.url.parsed,
-			data: { customData:customData, linkObj:url }
-		});
+			throw new TypeError("Invalid URL");
+		}
+
+		const id = this.#linkQueue.enqueue(link.get(REBASED_URL), { auth, customData, link });
+
+		this.emit(QUEUE_EVENT);
+
+		return id;
 	}
-	// Documented use: enqueue(url, baseUrl)
-	// or erroneous and let linkQueue sort it out
-	else
+
+
+
+	has(id)
 	{
-		return this.linkQueue.enqueue(
-		{
-			url: urlobj.resolve(baseUrl || "", urlobj.parse(url) ),  // URL must be absolute
-			data: { orgUrl:url, baseUrl:baseUrl, customData:customData }
-		});
+		return this.#linkQueue.has(id);
 	}
-};
 
 
 
-UrlChecker.prototype.numActiveLinks = function()
-{
-	return this.linkQueue.numActive();
-};
+	get isPaused()
+	{
+		return this.#linkQueue.isPaused;
+	}
 
 
 
-UrlChecker.prototype.numQueuedLinks = function()
-{
-	return this.linkQueue.numQueued();
-};
+	get numActiveLinks()
+	{
+		return this.#linkQueue.numActive;
+	}
 
 
 
-UrlChecker.prototype.pause = function()
-{
-	return this.linkQueue.pause();
-};
+	get numQueuedLinks()
+	{
+		return this.#linkQueue.numQueued;
+	}
 
 
 
-UrlChecker.prototype.resume = function()
-{
-	return this.linkQueue.resume();
-};
+	pause()
+	{
+		this.#linkQueue.pause();
+		return this;
+	}
 
 
 
-UrlChecker.prototype.__getCache = function()
-{
-	return this.cache;
-};
+	resume()
+	{
+		this.#linkQueue.resume();
+		return this;
+	}
 
 
 
-module.exports = UrlChecker;
+	get __cache()
+	{
+		return this.#cache;
+	}
+}
diff --git a/package.json b/package.json
index 302f53eb..9c0d20f8 100644
--- a/package.json
+++ b/package.json
@@ -1,61 +1,99 @@
 {
   "name": "broken-link-checker",
-  "description": "Find broken links, missing images, etc in your HTML.",
-  "version": "0.7.8",
+  "description": "Find broken links, missing images, etc within your HTML.",
+  "version": "0.8.0",
   "license": "MIT",
-  "author": "Steven Vachon  (https://www.svachon.com/)",
-  "repository": "stevenvachon/broken-link-checker",
-  "main": "lib",
+  "author": "Steven Vachon  (https://svachon.com)",
+  "repository": "github:stevenvachon/broken-link-checker",
+  "main": "lib-cjs",
   "bin": {
     "blc": "bin/blc",
     "broken-link-checker": "bin/blc"
   },
   "dependencies": {
-    "bhttp": "^1.2.1",
-    "calmcard": "~0.1.1",
-    "chalk": "^1.1.3",
-    "char-spinner": "^1.0.1",
-    "condense-whitespace": "^1.0.0",
+    "auto-tunnel": "github:stevenvachon/auto-tunnel",
+    "chalk": "^2.4.2",
+    "condense-whitespace": "^2.0.0",
+    "deep-freeze-node": "^1.1.3",
     "default-user-agent": "^1.0.0",
-    "errno": "~0.1.4",
-    "extend": "^3.0.0",
-    "humanize-duration": "^3.9.1",
-    "http-equiv-refresh": "^1.0.0",
-    "is-stream": "^1.0.1",
+    "errno": "~0.1.7",
+    "gauge": "^2.7.4",
+    "got": "^9.6.0",
+    "http-equiv-refresh": "^2.0.1",
+    "humanize-duration": "^3.20.1",
+    "is-stream": "^2.0.0",
     "is-string": "^1.0.4",
-    "limited-request-queue": "^2.0.0",
-    "link-types": "^1.1.0",
-    "maybe-callback": "^2.1.0",
-    "nopter": "~0.3.0",
-    "parse5": "^3.0.2",
-    "robot-directives": "~0.3.0",
-    "robots-txt-guard": "~0.1.0",
-    "robots-txt-parse": "~0.0.4",
-    "urlcache": "~0.7.0",
-    "urlobj": "0.0.11"
+    "isurl": "^4.0.1",
+    "keyscan": "^1.7.0",
+    "limited-request-queue": "^5.1.0",
+    "link-types": "^3.0.0",
+    "list-to-array": "^1.1.0",
+    "lodash": "^4.17.15",
+    "longest": "^2.0.1",
+    "matcher": "^2.0.0",
+    "node-notifier": "^6.0.0",
+    "optionator": "~0.8.2",
+    "parse-srcset": "^1.0.2",
+    "parse5": "^5.1.0",
+    "parse5-parser-stream": "^5.1.0",
+    "robot-directives": "github:stevenvachon/robot-directives",
+    "robots-txt-guard": "~0.2.1",
+    "robots-txt-parse": "^1.0.1",
+    "strip-ansi": "^5.2.0",
+    "supports-semigraphics": "^1.0.1",
+    "url-relation": "github:stevenvachon/url-relation",
+    "urlcache": "github:stevenvachon/urlcache",
+    "walk-parse5": "^2.0.0"
   },
   "devDependencies": {
-    "chai": "^3.5.0",
-    "chai-as-promised": "^6.0.0",
-    "chai-like": "~0.2.10",
+    "@babel/cli": "^7.6.2",
+    "@babel/core": "^7.6.2",
+    "@babel/plugin-proposal-class-properties": "^7.5.5",
+    "@babel/plugin-proposal-nullish-coalescing-operator": "^7.4.4",
+    "@babel/plugin-proposal-numeric-separator": "^7.2.0",
+    "@babel/plugin-proposal-optional-catch-binding": "^7.2.0",
+    "@babel/plugin-proposal-optional-chaining": "^7.6.0",
+    "@babel/plugin-proposal-private-methods": "^7.6.0",
+    "@babel/preset-env": "^7.6.2",
+    "@babel/register": "^7.6.2",
+    "babel-eslint": "^10.0.3",
+    "babel-plugin-add-module-exports": "^1.0.2",
+    "basic-auth": "^2.0.1",
+    "basic-auth-header": "^1.0.1",
+    "chai": "^4.2.0",
+    "chai-as-promised": "^7.1.1",
+    "chai-subset": "github:stevenvachon/chai-subset",
     "chai-things": "~0.2.0",
-    "es6-promise": "^4.1.0",
-    "mocha": "^3.0.2",
-    "object.assign": "^4.0.4",
-    "slashes": "^1.0.5",
-    "st": "^1.2.0"
+    "coveralls": "^3.0.6",
+    "escape-string-regexp": "^2.0.0",
+    "eslint": "^6.4.0",
+    "eslint-plugin-import": "^2.18.2",
+    "eslint-plugin-jsdoc": "^15.9.3",
+    "eslint-plugin-sort-destructure-keys": "^1.3.3",
+    "eslint-plugin-you-dont-need-lodash-underscore": "^6.7.0",
+    "mocha": "^6.2.0",
+    "nock": "^11.3.5",
+    "nyc": "^14.1.1",
+    "void-elements": "^3.1.0"
   },
   "engines": {
-    "node": ">= 0.10"
+    "node": ">= 12"
   },
   "scripts": {
-    "test": "mocha test/ --reporter spec --check-leaks --bail",
-    "test-watch": "mocha test/ --reporter spec --check-leaks --bail -w"
+    "build": "babel lib/ --config-file=./scripts/babel.config.js --out-dir=lib-cjs/ --source-maps",
+    "ci": "npm run lint && npm test && nyc report --reporter=text-lcov | coveralls",
+    "lint": "npm run lint:cjs ; npm run lint:esm",
+    "lint:cjs": "eslint --config=scripts/eslintrc.cjs.js scripts/",
+    "lint:esm": "eslint --config=scripts/eslintrc.esm.js lib/ test/",
+    "posttest": "nyc report --reporter=text-summary --reporter=html",
+    "prepublishOnly": "npm run lint && npm test && npm run build",
+    "test": "nyc --exclude=scripts/ --silent mocha test/ --bail --check-leaks --require=scripts/register-babel",
+    "test:watch": "npm test -- --watch --watch-extensions=js,json"
   },
   "files": [
     "bin",
     "lib",
-    "license"
+    "lib-cjs"
   ],
   "keywords": [
     "404",
@@ -63,6 +101,7 @@
     "hyperlink",
     "links",
     "seo",
-    "url"
+    "url",
+    "whatwg"
   ]
 }
diff --git a/scripts/babel.config.js b/scripts/babel.config.js
new file mode 100644
index 00000000..45630d64
--- /dev/null
+++ b/scripts/babel.config.js
@@ -0,0 +1,28 @@
+"use strict";
+const {engines: {node:nodeVersion}} = require("../package.json");
+
+
+
+module.exports =
+{
+	plugins:
+	[
+		"@babel/proposal-class-properties",
+		"@babel/proposal-nullish-coalescing-operator",
+		"@babel/proposal-numeric-separator",
+		"@babel/proposal-optional-catch-binding",
+		"@babel/proposal-optional-chaining",
+		"@babel/proposal-private-methods",
+		"add-module-exports"
+	],
+	presets:
+	[
+		["@babel/preset-env",
+		{
+			targets:
+			{
+				browsers: `node ${nodeVersion}`
+			}
+		}]
+	]
+};
diff --git a/scripts/eslintrc.base.js b/scripts/eslintrc.base.js
new file mode 100644
index 00000000..b9114203
--- /dev/null
+++ b/scripts/eslintrc.base.js
@@ -0,0 +1,126 @@
+/* eslint-disable quote-props */
+"use strict";
+
+// eslint:recommended overrides
+const overriddenRules =
+{
+	"no-mixed-spaces-and-tabs": [2, "smart-tabs"],
+	"require-atomic-updates": 1  // usually not a problem
+};
+
+module.exports =
+{
+	env:
+	{
+		es6: true
+	},
+	extends:
+	[
+		"eslint:recommended",
+		"plugin:you-dont-need-lodash-underscore/all"
+	],
+	parserOptions:
+	{
+		ecmaVersion: 2019
+	},
+	plugins:
+	[
+		"jsdoc",
+		"sort-destructure-keys",
+		"you-dont-need-lodash-underscore"
+	],
+	root: true,
+	rules:
+	{
+		...overriddenRules,
+
+		"arrow-parens": [2, "as-needed"],
+		"brace-style": [2, "allman"],
+		"camelcase": 2,
+		"comma-dangle": 2,
+		"comma-style": 2,
+		"curly": 2,
+		"dot-notation": [2, {allowKeywords:true}],
+		"eol-last": 2,
+		"eqeqeq": [2, "always", {null:"ignore"}],
+		"func-call-spacing": [2, "never"/*, {allowNewlines:true}*/],
+		//"indent": [2, "tab", {MemberExpression:0}],
+		"jsdoc/check-alignment": 2,
+		"jsdoc/check-param-names": 2,
+		"jsdoc/check-syntax": 2,
+		"jsdoc/check-tag-names": 2,
+		"jsdoc/check-types": 2,
+		//"jsdoc/no-undefined-types": 2,
+		//"jsdoc/require-jsdoc": 2,
+		"jsdoc/require-param": 2,
+		"jsdoc/require-param-name": 2,
+		"jsdoc/require-param-type": 2,
+		"jsdoc/require-returns": 2,
+		"jsdoc/require-returns-check": 2,
+		"jsdoc/require-returns-type": 2,
+		"jsdoc/valid-types": 2,
+		"keyword-spacing": 2,
+		"new-parens": 2,
+		"no-array-constructor": 2,
+		"no-caller": 2,
+		"no-console": 2,
+		"no-debugger": 2,
+		"no-eval": 2,
+		"no-extra-boolean-cast": 2,
+		"no-floating-decimal": 2,
+		"no-implied-eval": 2,
+		"no-label-var": 2,
+		"no-labels": [2, {allowLoop:true}],
+		"no-lone-blocks": 2,
+		"no-loop-func": 2,
+		"no-multi-str": 2,
+		"no-native-reassign": 2,
+		"no-nested-ternary": 2,
+		"no-new-func": 2,
+		"no-new-object": 2,
+		"no-new-wrappers": 2,
+		"no-octal-escape": 2,
+		"no-process-exit": 2,
+		"no-proto": 2,
+		"no-restricted-globals": [2, {message:"Use setTimeout", name:"setInterval"}],
+		"no-restricted-properties": [2, {message:"Use setTimeout", object:"window", property:"setInterval"}],
+		"no-sequences": 2,
+		"no-shadow-restricted-names": 2,
+		"no-trailing-spaces": 2,
+		"no-undef": 2,
+		"no-undef-init": 2,
+		"no-unused-vars": [2, {args:"after-used"}],
+		"no-useless-concat": 2,
+		"no-var": 2,
+		"no-with": 2,
+		"object-shorthand": 2,
+		"one-var": [2, {initialized:"never", uninitialized:"consecutive"}],
+		"prefer-arrow-callback": 2,
+		"prefer-const": 2,
+		"prefer-destructuring":
+		[
+			2,
+			{
+				AssignmentExpression: {array:false, object:false},
+				VariableDeclarator: {array:false, object:true}
+			}/*,
+			{enforceForRenamedProperties:true}*/
+		],
+		"prefer-object-spread": 2,
+		"prefer-rest-params": 2,
+		"prefer-spread": 2,
+		"prefer-template": 2,
+		"quote-props": [2, "as-needed"],
+		"quotes": [2, "double", {allowTemplateLiterals:true, avoidEscape:true}],
+		"radix": 2,
+		"semi": [2, "always", {omitLastInOneLineBlock:true}],
+		"semi-spacing": [2, {after:true, before:false}],
+		"sort-destructure-keys/sort-destructure-keys": [2, {caseSensitive:false}],
+		"sort-keys": [2, "asc", {caseSensitive:false, natural:true}],
+		"sort-vars": [2, {ignoreCase:true}],
+		"space-before-blocks": 2,
+		"space-before-function-paren": [2, {anonymous:"never", asyncArrow:"always", named:"never"}],
+		"space-unary-ops": [2, {nonwords:false, words:true}],
+		"yoda": 2
+	}
+};
diff --git a/scripts/eslintrc.cjs.js b/scripts/eslintrc.cjs.js
new file mode 100644
index 00000000..c3ff15b6
--- /dev/null
+++ b/scripts/eslintrc.cjs.js
@@ -0,0 +1,19 @@
+/* eslint-disable quote-props */
+"use strict";
+const base = require("./eslintrc.base");
+const {merge} = require("lodash");
+
+
+
+module.exports = merge(base,
+{
+	env:
+	{
+		node: true
+	},
+	rules:
+	{
+		"no-new-require": 2,
+		"strict": 2
+	}
+});
diff --git a/scripts/eslintrc.esm.js b/scripts/eslintrc.esm.js
new file mode 100644
index 00000000..a36ed46e
--- /dev/null
+++ b/scripts/eslintrc.esm.js
@@ -0,0 +1,41 @@
+/* eslint-disable quote-props */
+"use strict";
+const base = require("./eslintrc.base");
+const {mergeWith} = require("lodash");
+
+
+
+// This is annoying
+const customizer = (objValue, srcValue) =>
+{
+	if (Array.isArray(objValue))
+	{
+		return [...objValue, ...srcValue];
+	}
+};
+
+
+
+module.exports = mergeWith(base,
+{
+	env:
+	{
+		node: true
+	},
+	parser: "babel-eslint",
+	parserOptions:
+	{
+		sourceType: "module"
+	},
+	plugins:
+	[
+		"import"
+	],
+	rules:
+	{
+		"import/extensions": [2, "ignorePackages", {js:"never"}],
+		"import/first": 2,
+		"import/no-duplicates": 2,
+		"sort-imports": [2, {ignoreCase:true, ignoreDeclarationSort:true}]
+	}
+}, customizer);
diff --git a/scripts/generate-html-json.js b/scripts/generate-html-json.js
new file mode 100644
index 00000000..3b5899ff
--- /dev/null
+++ b/scripts/generate-html-json.js
@@ -0,0 +1,121 @@
+"use strict";
+const {normalize:normalizePath} = require("path");
+const {promises: {writeFile}} = require("fs");
+
+
+
+// XHTML self-closing tags exist to check that they're removed via the HTML parser
+const htmls =
+{
+	"<* itemtype>":        `microdata`,
+	"":            `link`,
+	"":            `link`,
+	"":    `params`,
+	"":       `params`,
+	"":   `params`,
+	"":     `params`,
+	"":        `params`,
+	"":        ``,
+	"":        ``,
+	"