Skip to content

Commit

Permalink
fix: better checks for implicit html routes
Browse files Browse the repository at this point in the history
Fixes #231
Fixes #225
  • Loading branch information
harlan-zw committed Sep 29, 2024
1 parent da74f6b commit 24e134d
Show file tree
Hide file tree
Showing 4 changed files with 44 additions and 18 deletions.
17 changes: 3 additions & 14 deletions packages/core/src/puppeteer/tasks/html.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@ import { withoutTrailingSlash } from 'ufo'
import { useLogger } from '../../logger'
import { normaliseRoute } from '../../router'
import { useUnlighthouse } from '../../unlighthouse'
import { fetchUrlRaw, ReportArtifacts, trimSlashes } from '../../util'
import { fetchUrlRaw, ReportArtifacts } from '../../util'
import { isImplicitOrExplicitHtml } from '../../util/filter'
import { setupPage } from '../util'

export const extractHtmlPayload: (page: Page, route: string) => Promise<{ success: boolean, redirected?: false | string, message?: string, payload?: string }> = async (page, route) => {
Expand Down Expand Up @@ -150,21 +151,9 @@ export const inspectHtmlTask: PuppeteerTask = async (props) => {
$('a').each(function () {
const href = $(this).attr('href')
// href must be provided and not be javascript
if (!href || href.includes('javascript:') || href.includes('mailto:') || href === '#')
if (!href || href.includes('javascript:') || href.includes('mailto:') || href === '#' || !isImplicitOrExplicitHtml(href))
return

// if the URL doesn't end with a slash we may be dealing with a file
if (!href.endsWith('/')) {
// need to check for a dot, meaning a file
const parts = href.split('.')
// 1 part means there is no extension, or no dot in the url
if (parts.length > 1) {
// presumably the last part will be the extension
const extension = trimSlashes(parts[parts.length - 1]).replace('.', '')
if (extension !== 'html')
return
}
}
if ((href.startsWith('/') && !href.startsWith('//')) || href.includes(resolvedConfig.site))
internalLinks.push(href)
else
Expand Down
6 changes: 2 additions & 4 deletions packages/core/src/puppeteer/worker.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ import { matchPathToRule } from '../discovery'
import { useLogger } from '../logger'
import { useUnlighthouse } from '../unlighthouse'
import { createTaskReportFromRoute, formatBytes, ReportArtifacts } from '../util'
import { createFilter } from '../util/filter'
import { createFilter, isImplicitOrExplicitHtml } from '../util/filter'
import {
launchPuppeteerCluster,
} from './cluster'
Expand Down Expand Up @@ -116,9 +116,7 @@ export async function createUnlighthouseWorker(tasks: Record<UnlighthouseTask, T
}
}

const lastPathSegment = path.split('/').pop() || path
const extension = (lastPathSegment.includes('.') ? lastPathSegment.split('.').pop() : 'html') || 'html'
if (!extension.includes('html')) {
if (isImplicitOrExplicitHtml(path)) {
logger.debug('Skipping non-HTML file from scanning', { path })
return
}
Expand Down
22 changes: 22 additions & 0 deletions packages/core/src/util/filter.ts
Original file line number Diff line number Diff line change
Expand Up @@ -38,3 +38,25 @@ export function createFilter(options: CreateFilterOptions = {}): (path: string)
return include.length === 0
}
}

/**
 * File extensions (lowercase, leading dot included) that are expected to be
 * served with a HTML mime type.
 */
const HTML_EXPLICIT_EXTENSIONS: readonly string[] = [
  // html
  '.html',
  '.htm',
  // php
  '.php',
  // asp
  '.asp',
  '.aspx',
]
/** Matches a trailing `.ext` consisting only of alphanumerics, e.g. `.pdf`. */
const FILE_MATCH_REGEX = /\.[0-9a-z]+$/i
/** Matches a query string and/or fragment, from the first `?` or `#` to the end. */
const QUERY_OR_HASH_REGEX = /[?#][\s\S]*$/
/** Matches the leading `scheme://host` or `//host` of an absolute or protocol-relative URL. */
const URL_ORIGIN_REGEX = /^(?:[a-z][a-z0-9+.-]*:)?\/\/[^/]*/i

/**
 * Decides whether a path or link should be treated as a HTML page.
 *
 * A path counts as HTML when it is empty, ends with a slash, has no file
 * extension on its last segment (implicit HTML), or when that extension is
 * one of `HTML_EXPLICIT_EXTENSIONS` (explicit HTML).
 *
 * @param path - A route path (`/about`), a relative link, or an absolute /
 * protocol-relative URL (`https://example.com/about`, `//example.com`).
 * @returns `true` for implicit or explicit HTML, `false` when the last path
 * segment carries any other file extension.
 */
export function isImplicitOrExplicitHtml(path: string): boolean {
  const pathname = path
    // the query string and fragment may contain dots and slashes that are not part of the file name
    .replace(QUERY_OR_HASH_REGEX, '')
    // drop the origin so a host such as `example.com` is not mistaken for a `.com` file
    .replace(URL_ORIGIN_REGEX, '')
  // an empty path or a trailing slash is assumed to resolve to an index HTML page
  if (pathname === '' || pathname.endsWith('/'))
    return true // implicit
  const lastPathSegment = pathname.slice(pathname.lastIndexOf('/') + 1)
  const extension = lastPathSegment.match(FILE_MATCH_REGEX)?.[0]
  // the regex is case-insensitive, so normalise before comparing against the lowercase list
  return !extension || HTML_EXPLICIT_EXTENSIONS.includes(extension.toLowerCase())
}
17 changes: 17 additions & 0 deletions packages/core/test/filters.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
import { describe, expect, it } from 'vitest'
import { isImplicitOrExplicitHtml } from '../src/util/filter'

describe('filters', () => {
  it('misc file paths', () => {
    // paths expected to be classified as HTML (implicit or explicit)
    const expectedHtml = [
      '',
      '/',
      '/some.foo/test',
      '/some/file.pdf/',
      '/dist/assets/chunk[213.4.931294]',
    ]
    for (const candidate of expectedHtml)
      expect(isImplicitOrExplicitHtml(candidate)).toBe(true)

    // paths whose last segment carries a non-HTML file extension
    const expectedFiles = [
      '/foo/bar.fr9f9',
      '/some/file.pdf',
      '/dist/assets/chunk[213.4.931294].css',
    ]
    for (const candidate of expectedFiles)
      expect(isImplicitOrExplicitHtml(candidate)).toBe(false)
  })
})

0 comments on commit 24e134d

Please sign in to comment.