This repository has been archived by the owner on Apr 6, 2023. It is now read-only.

ci: crawl docs site for new deployments to track broken links #7473

Merged
merged 4 commits on Sep 13, 2022
32 changes: 32 additions & 0 deletions .github/workflows/docs-e2e.yml
@@ -0,0 +1,32 @@
name: docs-e2e

on:
  workflow_dispatch:
    inputs:
      url:
        required: false
        description: The URL to run the test suite against.
        type: string
  deployment_status:

jobs:
  crawl-docs:
    environment:
      name: ${{ github.event.deployment.environment || 'Production' }}
      url: ${{ github.event.inputs.url || github.event.deployment.payload.web_url || github.event.deployment_status.target_url }}
    if: github.event.deployment_status.state == 'success' || github.event_name == 'workflow_dispatch'
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v3
      - uses: actions/setup-node@v3
        with:
          node-version: ${{ matrix.node }}
          cache: "yarn"

      - name: Install dependencies
        run: yarn --immutable

      - run: node ./scripts/crawl.mjs
        env:
          BASE_URL: ${{ github.event.inputs.url || github.event.deployment.payload.web_url || github.event.deployment_status.target_url }}
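Because the workflow also listens for workflow_dispatch with an optional `url` input, a crawl against an arbitrary deployment can be started by hand. A minimal sketch using the GitHub CLI, assuming `gh` is authenticated against this repository; the target URL below is only a placeholder:

    # trigger the docs-e2e workflow manually against a chosen deployment
    gh workflow run docs-e2e.yml -f url=https://deploy-preview.example.com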
3 changes: 3 additions & 0 deletions package.json
@@ -47,12 +47,15 @@
"unbuild": "^0.8.11"
},
"devDependencies": {
"@actions/core": "^1.9.1",
"@nuxtjs/eslint-config-typescript": "^11.0.0",
"@types/crawler": "^1.2.2",
"@types/node": "^16.11.58",
"@types/rimraf": "^3",
"@unocss/reset": "^0.45.21",
"case-police": "^0.5.10",
"changelogen": "^0.3.0",
"crawler": "^1.3.0",
"eslint": "^8.23.1",
"eslint-plugin-jsdoc": "^39.3.6",
"execa": "^6.1.0",
109 changes: 109 additions & 0 deletions scripts/crawl.mjs
@@ -0,0 +1,109 @@
import Crawler from 'crawler'
import consola from 'consola'
import { parseURL, withoutTrailingSlash } from 'ufo'
import chalk from 'chalk'
import * as actions from '@actions/core'
import { isCI } from 'std-env'

const logger = consola.withTag('crawler')

const baseURL = withoutTrailingSlash(process.env.BASE_URL || 'https://v3.nuxtjs.org')
const startingURL = baseURL + '/'

const excludedExtensions = ['svg', 'png', 'jpg', 'sketch', 'ico', 'gif']
const urlsToOmit = ['http://localhost:3000']

// TODO: remove when migrating to Nuxt 3/Docus
const errorsToIgnore = [
  '/guide/directory-structure/nuxt.config',
  '/guide/directory-structure',
  '/guide/directory-structure/app.config',
  '/api/configuration/nuxt.config',
  '/guide/deploy',
  '/guide/features/app-config'
]

// GLOBALS
const urls = new Set([startingURL])
const erroredUrls = new Set()

/**
 * @param {string} path Path to check
 * @param {string | undefined} referrer The referring page
 */
function queue (path, referrer) {
  if (urlsToOmit.some(url => path.startsWith(url))) { return }

  const { pathname, origin } = new URL(path, referrer)

  // Don't crawl the same page more than once
  const url = `${origin}${pathname}`
  if (!url || urls.has(url) || !crawler) { return }

  // Don't try to visit linked assets (e.g. SVGs)
  const extension = url.split('.').pop()
  if (extension && excludedExtensions.includes(extension)) { return }

  // Don't crawl external URLs
  if (origin !== baseURL) { return }

  urls.add(url)

  crawler.queue(url)
}

const crawler = new Crawler({
  maxConnections: 100,
  callback (error, res, done) {
    const { $ } = res
    const { uri } = res.options
    // @ts-ignore
    const { statusCode } = res.request.response

    if (error || ![200, 301, 302].includes(statusCode) || !$) {
      if (errorsToIgnore.includes(parseURL(uri).pathname)) {
        const message = chalk.gray(`${chalk.bold('βœ—')} ${uri} (${statusCode}) (ignored)`)
        logger.log(message)
        return done()
      }
      const message = chalk.red(`${chalk.bold('βœ—')} ${uri} (${statusCode})`)
      if (isCI) { actions.error(message) }
      logger.log(message)
      erroredUrls.add(uri)
      return done()
    }

    if (!$) {
      const message = `Could not parse HTML for ${uri}`
      logger.error(message)
      if (isCI) { actions.warning(message) }
      return done()
    }

    $('a:not([href*=mailto])').each((_, el) => 'attribs' in el && queue(el.attribs.href, uri))

    logger.success(chalk.green(uri))
    logger.debug(uri, `[${crawler.queueSize} / ${urls.size}]`)

    if (!isCI && crawler.queueSize === 1) {
      logger.log('')
      logger.info(`Checked \`${urls.size}\` pages.`)
      // Tasks to run at the end.
      if (erroredUrls.size) {
        const message = `${chalk.bold(erroredUrls.size)} errors found on ${chalk.bold(baseURL)}.`
        const error = new Error(`\n\n${message}\n`)
        error.message = message
        error.stack = ''
        throw error
      }
    }

    done()
  }
})

logger.log('')
logger.info(`Checking \`${baseURL}\`.`)
logger.info(`Ignoring file extensions: \`${excludedExtensions.join(', ')}.\`\n`)

crawler.queue(startingURL)
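Since the script reads BASE_URL from the environment and falls back to https://v3.nuxtjs.org, the same check can presumably be run locally against a preview deployment before relying on CI. A rough sketch, assuming dependencies are installed with Yarn as in the workflow; the preview URL is only a placeholder:

    # install dependencies, then crawl a chosen deployment from a local checkout
    yarn --immutable
    BASE_URL=https://deploy-preview.example.com node ./scripts/crawl.mjs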