fix: improved robots.txt exclude matching
harlan-zw committed Sep 29, 2024
1 parent 8934952 commit da74f6b
Showing 4 changed files with 107 additions and 46 deletions.
113 changes: 68 additions & 45 deletions packages/core/src/discovery/robotsTxt.ts
@@ -10,20 +10,6 @@ export interface RobotsTxtParsed {
groups: RobotsGroupResolved[]
}

function isValidRegex(s: string | RegExp) {
if (typeof s === 'string') {
// make sure it's valid regex
try {
// eslint-disable-next-line no-new
new RegExp(s)
return true
}
catch (e) {
return false
}
}
return true
}
/**
* Fetches the robots.txt file.
* @param site
@@ -46,40 +32,77 @@ export async function fetchRobotsTxt(site: string): Promise<false | string> {
return robotsTxt.response.data as string
}

export function mergeRobotsTxtConfig(config: ResolvedUserConfig, { groups, sitemaps }: RobotsTxtParsed): void {
  const normalisedGroups = groups
    .filter(group => group.userAgent.includes('*'))
    .map((group) => {
      for (const k of ['disallow', 'allow']) {
        // @ts-expect-error untyped
        group[k] = (group[k] as string[])
          // skip any disallows that are root level
          .filter(path => path !== '/' && path)
          .map((path) => {
            // convert robots.txt paths to regex paths
            if (path.includes('*'))
              path = path.replace(/\*/g, '.*')
            else
              path = `${path}.*`
            return path
          })
      }
      return group
    })

  // for diallow we add it to the exclude list
  config.scanner.exclude = [...new Set([
    ...(config.scanner.exclude || []),
    ...normalisedGroups.flatMap(group => group.disallow),
  ])].filter(isValidRegex)
  config.scanner.include = config.scanner.include || []
  const robotsAllows = normalisedGroups.flatMap(group => group.allow).filter(a => a.length)
  if (!config.scanner.include.length && robotsAllows.length) {
    config.scanner.include = [...new Set([
      '/*',
      ...normalisedGroups.flatMap(group => group.allow),
    ])].filter(isValidRegex)
  }

interface RobotsTxtRule { pattern: string, allow: boolean }

function matches(pattern: string, path: string): boolean {
const pathLength = path.length
const patternLength = pattern.length
const matchingLengths: number[] = Array.from({ length: pathLength + 1 }).fill(0)
let numMatchingLengths = 1

let p = 0
while (p < patternLength) {
if (pattern[p] === '$' && p + 1 === patternLength) {
return matchingLengths[numMatchingLengths - 1] === pathLength
}

if (pattern[p] === '*') {
numMatchingLengths = pathLength - matchingLengths[0] + 1
for (let i = 1; i < numMatchingLengths; i++) {
matchingLengths[i] = matchingLengths[i - 1] + 1
}
}
else {
let numMatches = 0
for (let i = 0; i < numMatchingLengths; i++) {
const matchLength = matchingLengths[i]
if (matchLength < pathLength && path[matchLength] === pattern[p]) {
matchingLengths[numMatches++] = matchLength + 1
}
}
if (numMatches === 0) {
return false
}
numMatchingLengths = numMatches
}
p++
}

return true
}
export function matchPathToRule(path: string, _rules: RobotsTxtRule[]): RobotsTxtRule | null {
let matchedRule: RobotsTxtRule | null = null

const rules = _rules.filter(Boolean) // filter out empty line such as Disallow:
const rulesLength = rules.length
let i = 0
while (i < rulesLength) {
const rule = rules[i]
if (!matches(rule.pattern, path)) {
i++
continue
}

if (!matchedRule || rule.pattern.length > matchedRule.pattern.length) {
matchedRule = rule
}
else if (
rule.pattern.length === matchedRule.pattern.length
&& rule.allow
&& !matchedRule.allow
) {
matchedRule = rule
}
i++
}

return matchedRule
}

export function mergeRobotsTxtConfig(config: ResolvedUserConfig, { groups, sitemaps }: RobotsTxtParsed): void {
config.scanner._robotsTxtRules = groups.filter((group) => {
return group.userAgent.includes('*') || group.userAgent.includes(String(config.lighthouseOptions?.emulatedUserAgent))
}).map(group => group._rules)
if (config.scanner.sitemap !== false && sitemaps.length) {
// allow overriding the robots.txt sitemaps with your own
if (!Array.isArray(config.scanner.sitemap) || !config.scanner.sitemap.length)
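The new matcher replaces the old regex conversion with proper robots.txt semantics: every pattern is a prefix match, `*` is a wildcard and a trailing `$` anchors the end of the path, while matchPathToRule applies longest-match precedence with allow winning ties. A small sketch of the expected behaviour (the rules below are hypothetical; the import mirrors the `../discovery` import used by the worker):

import { matchPathToRule } from './discovery'

// Rules as the parser would emit them for "Disallow: /admin" and "Allow: /admin/login$"
const rules = [
  { pattern: '/admin', allow: false },
  { pattern: '/admin/login$', allow: true },
]

matchPathToRule('/admin/settings', rules) // -> { pattern: '/admin', allow: false } (prefix match)
matchPathToRule('/admin/login', rules)    // -> { pattern: '/admin/login$', allow: true } (longer pattern wins)
matchPathToRule('/blog', rules)           // -> null (no rule matches, route is scanned normally)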
10 changes: 10 additions & 0 deletions packages/core/src/puppeteer/worker.ts
@@ -12,6 +12,7 @@ import fs from 'node:fs'
import { join } from 'node:path'
import chalk from 'chalk'
import { get, sortBy, uniqBy } from 'lodash-es'
import { matchPathToRule } from '../discovery'
import { useLogger } from '../logger'
import { useUnlighthouse } from '../unlighthouse'
import { createTaskReportFromRoute, formatBytes, ReportArtifacts } from '../util'
@@ -94,6 +95,15 @@ export async function createUnlighthouseWorker(tasks: Record<UnlighthouseTask, T
if (ignoredRoutes.has(id))
return

// do robots.txt test
if (resolvedConfig.scanner.robotsTxt) {
const rule = matchPathToRule(path, resolvedConfig.scanner._robotsTxtRules)
if (rule && !rule.allow) {
logger.info(`Skipping route based on robots.txt rule \`${rule.pattern}\``, { path })
return
}
}

if (resolvedConfig.scanner.include || resolvedConfig.scanner.exclude) {
const filter = createFilter(resolvedConfig.scanner)
if (!filter(path)) {
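With this gate in place, disallowed routes are skipped before the include/exclude filter even runs. A sketch of how a user could opt out or combine it with manual filtering (the config file name and exact shape are assumed, not taken from this commit):

// unlighthouse.config.ts (hypothetical)
export default {
  scanner: {
    // false skips fetching robots.txt and the rule check above entirely
    robotsTxt: false,
    // the include/exclude filter still applies afterwards
    exclude: ['/private/.*'],
  },
}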
5 changes: 5 additions & 0 deletions packages/core/src/types.ts
@@ -465,6 +465,11 @@ export interface ResolvedUserConfig {
* @default 'mobile'
*/
device: 'mobile' | 'desktop' | false
/**
* Resolved robots.txt groups.
* @internal
*/
_robotsTxtRules?: any
}
/**
* Changes the default behaviour of lighthouse.
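The field is left loose as `any`, but mergeRobotsTxtConfig above assigns groups.map(group => group._rules), so a stricter sketch of what it holds (hypothetical names, not part of this commit) would be:

interface RobotsTxtRule { pattern: string, allow: boolean }

interface ResolvedScannerRobotsState {
  /** one rule list per matching user-agent group, as assigned by mergeRobotsTxtConfig */
  _robotsTxtRules?: RobotsTxtRule[][]
}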
25 changes: 24 additions & 1 deletion packages/core/src/util/robotsTxtParser.ts
@@ -4,6 +4,9 @@ export interface RobotsGroupResolved {
allow: string[]
userAgent: string[]
host?: string
// runtime optimization
_indexable: boolean
_rules: { pattern: string, allow: boolean }[]
}

/**
Expand Down Expand Up @@ -78,7 +81,27 @@ export function parseRobotsTxt(s: string) {
...currentGroup,
})
return {
groups,
groups: groups.map(normalizeGroup),
sitemaps,
}
}

function asArray(v: any) {
return typeof v === 'undefined' ? [] : (Array.isArray(v) ? v : [v])
}

function normalizeGroup(group: RobotsGroupResolved): RobotsGroupResolved {
const disallow = asArray(group.disallow) // we can have empty disallow
const allow = asArray(group.allow).filter(rule => Boolean(rule))
return <RobotsGroupResolved> {
...group,
userAgent: group.userAgent ? asArray(group.userAgent) : ['*'],
disallow,
allow,
_indexable: !disallow.some((rule: string) => rule === '/'),
_rules: [
...disallow.filter(Boolean).map(r => ({ pattern: r, allow: false })),
...allow.map(r => ({ pattern: r, allow: true })),
],
}
}
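Putting the parser changes together, here is a sketch of the normalized output for a small robots.txt (the input and expected values are illustrative, inferred from normalizeGroup above):

const parsed = parseRobotsTxt(`
User-agent: *
Disallow: /admin
Allow: /admin/login
`)
// parsed.groups[0] is normalized roughly to:
// {
//   userAgent: ['*'],
//   disallow: ['/admin'],
//   allow: ['/admin/login'],
//   _indexable: true, // no "Disallow: /" rule
//   _rules: [
//     { pattern: '/admin', allow: false },
//     { pattern: '/admin/login', allow: true },
//   ],
// }
// These _rules are what mergeRobotsTxtConfig hands to the worker's matchPathToRule check.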
