Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: crawl URLs in <meta> tags #9900

Merged
merged 13 commits into from
May 17, 2023
5 changes: 5 additions & 0 deletions .changeset/thirty-garlics-tan.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
'@sveltejs/kit': minor
---

feat: crawl URLs in `<meta>` tags
LorisSigrist marked this conversation as resolved.
Show resolved Hide resolved
120 changes: 82 additions & 38 deletions packages/kit/src/core/postbuild/crawl.js
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,20 @@ const ATTRIBUTE_NAME = /[^\t\n\f />"'=]/;

const WHITESPACE = /[\s\n\r]/;

/**
 * Keys of `<meta>` tags whose `content` holds a URL that should be crawled.
 * A tag's `name` and `property` attributes are both checked against this set.
 */
const CRAWLABLE_META_NAME_ATTRS = new Set([
	'og:url',
	// every Open Graph media kind has the same three URL-carrying keys
	...['image', 'video', 'audio'].flatMap((kind) => [
		`og:${kind}`,
		`og:${kind}:url`,
		`og:${kind}:secure_url`
	]),
	'twitter:image'
]);

/**
* @param {string} html
* @param {string} base
Expand Down Expand Up @@ -80,6 +94,8 @@ export function crawl(html, base) {
}

const tag = html.slice(start, i).toUpperCase();
/** @type {Map<string,string>} */
const attributes = new Map();

if (tag === 'SCRIPT' || tag === 'STYLE') {
while (i < html.length) {
Expand Down Expand Up @@ -159,44 +175,7 @@ export function crawl(html, base) {
}

value = decode(value);

if (name === 'href') {
if (tag === 'BASE') {
base = resolve(base, value);
} else {
href = resolve(base, value);
}
} else if (name === 'id') {
ids.push(value);
} else if (name === 'name') {
if (tag === 'A') ids.push(value);
} else if (name === 'rel') {
rel = value;
} else if (name === 'src') {
if (value) hrefs.push(resolve(base, value));
} else if (name === 'srcset') {
const candidates = [];
let insideURL = true;
value = value.trim();
for (let i = 0; i < value.length; i++) {
if (
value[i] === ',' &&
(!insideURL || (insideURL && WHITESPACE.test(value[i + 1])))
) {
candidates.push(value.slice(0, i));
value = value.substring(i + 1).trim();
i = 0;
insideURL = true;
} else if (WHITESPACE.test(value[i])) {
insideURL = false;
}
}
candidates.push(value);
for (const candidate of candidates) {
const src = candidate.split(WHITESPACE)[0];
if (src) hrefs.push(resolve(base, src));
}
}
attributes.set(name, value);
} else {
i -= 1;
}
Expand All @@ -205,6 +184,71 @@ export function crawl(html, base) {
i += 1;
}

const href_attr = attributes.get('href');
const id_attr = attributes.get('id');
const name_attr = attributes.get('name');
const property_attr = attributes.get('property');
const rel_attr = attributes.get('rel');
const src_attr = attributes.get('src');
const srcset_attr = attributes.get('srcset');
const content_attr = attributes.get('content');

if (href_attr) {
if (tag === 'BASE') base = resolve(base, href_attr);
else href = resolve(base, href_attr);
}
if (id_attr) {
ids.push(id_attr);
}
if (name_attr && tag === 'A') {
ids.push(name_attr);
}
if (rel_attr) {
rel = rel_attr;
}
if (src_attr) {
hrefs.push(resolve(base, src_attr));
}
if (srcset_attr) {
let value = srcset_attr;
const candidates = [];
let insideURL = true;
value = value.trim();
for (let i = 0; i < value.length; i++) {
if (value[i] === ',' && (!insideURL || (insideURL && WHITESPACE.test(value[i + 1])))) {
candidates.push(value.slice(0, i));
value = value.substring(i + 1).trim();
i = 0;
insideURL = true;
} else if (WHITESPACE.test(value[i])) {
insideURL = false;
}
}
candidates.push(value);
for (const candidate of candidates) {
const src = candidate.split(WHITESPACE)[0];
if (src) hrefs.push(resolve(base, src));
}
}

// Crawl the URL carried by `<meta>` tags such as `og:image` or `twitter:image`.
if (tag === 'META' && content_attr) {
	// Surrounding whitespace is never part of a URL, so it is dropped. The casing of the
	// content must be preserved: URL paths are case-sensitive, and lowercasing
	// `/Images/Cover.PNG` would make the crawler request a different resource.
	const meta_url = content_attr.trim();

	if (meta_url) {
		// `name` is checked before `property`; a tag that matches through both
		// attributes is pushed once per match.
		for (const key of [name_attr, property_attr]) {
			// Only the key is normalised before the lookup, so that
			// `name=" OG:Image "` is recognised the same way as `property="og:image"`.
			if (key && CRAWLABLE_META_NAME_ATTRS.has(key.trim().toLowerCase())) {
				hrefs.push(resolve(base, meta_url));
			}
		}
	}
}

if (href && !/\bexternal\b/i.test(rel)) {
hrefs.push(resolve(base, href));
}
Expand Down
14 changes: 14 additions & 0 deletions packages/kit/src/core/postbuild/fixtures/meta/input.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
<!DOCTYPE html>
<html>
<head>
<meta name="viewport" content="width=device-width, initial-scale=1" />
<meta name="description" content="This is a description" />

<!--Only these should get crawled-->
<meta content="https://external.com" name="twitter:image" />
<meta name="og:image" content="/og-image.jpg" />
<meta property="og:audio" content="https://example.com/audio.mp3" />
<meta content="/video.mp4" property="og:video"/>
</head>
<body></body>
</html>
4 changes: 4 additions & 0 deletions packages/kit/src/core/postbuild/fixtures/meta/output.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{
"hrefs": ["https://external.com", "/og-image.jpg", "https://example.com/audio.mp3", "/video.mp4"],
"ids": []
}