Skip to content

Commit

Permalink
Merge pull request #235 from lxcid/lxcid/unescape-script-text
Browse files Browse the repository at this point in the history
fix: JSON parsing fails when Youtube escape '&' to '\x26' (#234)
  • Loading branch information
jshemas authored Jul 22, 2024
2 parents 3e5e609 + b3b546b commit c6bb642
Show file tree
Hide file tree
Showing 4 changed files with 162 additions and 1 deletion.
4 changes: 3 additions & 1 deletion lib/extract.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ import { load } from 'cheerio';
import fallback from './fallback';
import fields from './fields';
import mediaSetup from './media';
import { unescapeScriptText } from './utils';

import type { OgObjectInteral, OpenGraphScraperOptions } from './types';

Expand Down Expand Up @@ -94,8 +95,9 @@ export default function extractMetaTags(body: string, options: OpenGraphScraperO
$('script').each((index, script) => {
if (script.attribs.type && script.attribs.type === 'application/ld+json') {
if (!ogObject.jsonLD) ogObject.jsonLD = [];
const scriptText = $(script).text();
let scriptText = $(script).text();
if (scriptText) {
scriptText = unescapeScriptText(scriptText);
ogObject.jsonLD.push(JSON.parse(scriptText));

Check warning on line 101 in lib/extract.ts

View workflow job for this annotation

GitHub Actions / buildAndTest (18)

Unsafe argument of type `any` assigned to a parameter of type `object`

Check warning on line 101 in lib/extract.ts

View workflow job for this annotation

GitHub Actions / buildAndTest (20)

Unsafe argument of type `any` assigned to a parameter of type `object`

Check warning on line 101 in lib/extract.ts

View workflow job for this annotation

GitHub Actions / buildAndTest (22)

Unsafe argument of type `any` assigned to a parameter of type `object`
}
}
Expand Down
36 changes: 36 additions & 0 deletions lib/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -146,3 +146,39 @@ export function isCustomMetaTagsValid(customMetaTags: CustomMetaTags[]): boolean

return result;
}

/**
 * Unescape script text.
 *
 * Certain websites (e.g. YouTube) emit JavaScript-style `\xHH` escape
 * sequences inside script tags. `\xHH` is not a valid JSON escape, so
 * `JSON.parse()` throws on it. This function rewrites every `\xHH` sequence
 * into a form that `JSON.parse()` accepts and leaves all other escapes alone.
 *
 * Escape sequences JSON already understands (left untouched):
 *
 * https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Regular_expressions/Character_escape#uhhhh
 *
 * ```js
 * JSON.parse('"\\u2611"'); // '☑'
 * ```
 *
 * Escape sequences JSON rejects (rewritten by this function):
 *
 * https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Regular_expressions/Character_escape#xhh
 *
 * ```js
 * JSON.parse('"\\x26"'); // throws SyntaxError
 * JSON.parse(unescapeScriptText('"\\x26"')); // '&'
 * ```
 *
 * Characters that may not appear raw inside a JSON string are re-escaped
 * instead of decoded: `\x22` becomes `\"`, `\x5c` becomes `\\`, and control
 * characters (`\x00`-`\x1f`) become `\u00HH`.
 *
 * @param {string} scriptText - the text of the script tag
 * @returns {string} unescaped script text
 */
export function unescapeScriptText(scriptText: string): string {
  // https://stackoverflow.com/a/34056693
  // Every backslash escape is matched (not only `\xHH`) so that an escaped
  // backslash consumes its partner character. Without that, the second
  // backslash of `\\x26` (a literal backslash followed by "x26") would be
  // mistaken for the start of a `\x26` escape and the JSON would be corrupted.
  return scriptText.replace(/\\(?:x([0-9a-f]{2})|[\s\S])/ig, (match: string, pair: string | undefined) => {
    // Not a `\xHH` escape: keep it exactly as written.
    if (pair === undefined) {
      return match;
    }
    const charCode = parseInt(pair, 16);
    // A raw double quote would terminate the surrounding JSON string.
    if (charCode === 0x22) {
      return '\\"';
    }
    // A raw backslash would start a new (probably invalid) escape sequence.
    if (charCode === 0x5c) {
      return '\\\\';
    }
    // JSON forbids unescaped control characters inside strings.
    if (charCode < 0x20) {
      return `\\u00${pair}`;
    }
    return String.fromCharCode(charCode);
  });
}
107 changes: 107 additions & 0 deletions tests/integration/video.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,113 @@ describe('video', function () {
});
});

// Integration test for a YouTube watch page whose script text uses `\xHH`
// escape sequences (see the test title). It performs a live request to
// youtube.com and checks that scraping succeeds and that the Open Graph,
// Twitter card, app-link and JSON-LD fields are all returned.
it('Test Youtube Video with bad escape sequence - Should Return correct Open Graph Info', function () {
return ogs({ url: 'https://www.youtube.com/watch?v=nFbKMg4E3JM' }).then(function ({ error, result, response }) {
console.log('error:', error);
console.log('result:', result);
expect(error).to.be.eql(false);
expect(result.alAndroidAppName).to.be.eql('YouTube');
expect(result.alAndroidPackage).to.be.eql('com.google.android.youtube');
expect(result.alAndroidUrl).to.be.eql('vnd.youtube://www.youtube.com/watch?v=nFbKMg4E3JM&feature=applinks');
expect(result.alIosAppName).to.be.eql('YouTube');
expect(result.alIosAppStoreId).to.be.eql('544007664');
expect(result.alIosUrl).to.be.eql('vnd.youtube://www.youtube.com/watch?v=nFbKMg4E3JM&feature=applinks');
// Either scheme is accepted for the web URL.
expect(result.alWebUrl).to.be.oneOf(['https://www.youtube.com/watch?v=nFbKMg4E3JM&feature=applinks', 'http://www.youtube.com/watch?v=nFbKMg4E3JM&feature=applinks']);
expect(result.ogSiteName).to.be.eql('YouTube');
expect(result.ogUrl).to.be.eql('https://www.youtube.com/watch?v=nFbKMg4E3JM');
expect(result.ogTitle).to.be.eql('Force Class 10 in One Shot (Full Chapter) | ICSE 10 Physics Chapter 1 - Abhishek Sir |Vedantu 9 & 10');
// The description is only required to be a non-empty string; its text is not pinned.
expect(result.ogDescription).to.be.an('string').and.to.not.be.empty;
expect(result.ogType).to.be.eql('video.other');
// Several locales are accepted.
expect(result.ogLocale).to.be.oneOf(['en', 'en-US', 'nl-NL']);
expect(result.twitterCard).to.be.eql('player');
expect(result.twitterSite).to.be.eql('@youtube');
expect(result.twitterTitle).to.be.eql('Force Class 10 in One Shot (Full Chapter) | ICSE 10 Physics Chapter 1 - Abhishek Sir |Vedantu 9 & 10');
expect(result.twitterDescription).to.be.an('string').and.to.not.be.empty;
expect(result.twitterAppNameiPhone).to.be.eql('YouTube');
expect(result.twitterAppIdiPhone).to.be.eql('544007664');
expect(result.twitterAppNameiPad).to.be.eql('YouTube');
expect(result.twitterAppIdiPad).to.be.eql('544007664');
expect(result.twitterUrl).to.be.eql('https://www.youtube.com/watch?v=nFbKMg4E3JM');
expect(result.ogDate).to.be.eql('2021-06-11T09:14:37-07:00');
expect(result.twitterAppUrliPhone).to.be.eql('vnd.youtube://www.youtube.com/watch?v=nFbKMg4E3JM&feature=applinks');
expect(result.twitterAppUrliPad).to.be.eql('vnd.youtube://www.youtube.com/watch?v=nFbKMg4E3JM&feature=applinks');
expect(result.twitterAppNameGooglePlay).to.be.eql('YouTube');
expect(result.twitterAppIdGooglePlay).to.be.eql('com.google.android.youtube');
expect(result.twitterAppUrlGooglePlay).to.be.eql('https://www.youtube.com/watch?v=nFbKMg4E3JM');
expect(result.ogImage).to.be.eql([{
url: 'https://i.ytimg.com/vi/nFbKMg4E3JM/maxresdefault.jpg',
width: '1280',
height: '720',
type: 'jpg',
}]);
expect(result.ogVideo).to.be.eql([{
url: 'https://www.youtube.com/embed/nFbKMg4E3JM',
width: '1280',
height: '720',
type: 'text/html',
}]);
expect(result.twitterImage).to.be.eql([{
url: 'https://i.ytimg.com/vi/nFbKMg4E3JM/maxresdefault.jpg',
}]);
expect(result.twitterPlayer).to.be.eql([{
url: 'https://www.youtube.com/embed/nFbKMg4E3JM',
width: '1280',
height: '720',
}]);
expect(result.ogVideoTag).to.be.eql('vedantu');
expect(result.ogVideoSecureURL).to.be.eql('https://www.youtube.com/embed/nFbKMg4E3JM');
expect(result.requestUrl).to.be.eql('https://www.youtube.com/watch?v=nFbKMg4E3JM');
expect(result.charset).to.be.eql('UTF-8');
expect(result.success).to.be.eql(true);
expect(result.fbAppId).to.be.eql('87741124305');
// JSON-LD must have been collected: a non-empty array is required here.
expect(result.jsonLD).to.be.an('array').and.to.not.be.empty;
// Fills in a placeholder so the key check below passes when ogDate is missing.
// NOTE(review): ogDate is already asserted to an exact value earlier in this
// test, so this fallback looks unreachable here — confirm which is intended.
if (result.ogDate === undefined) result.ogDate = 'hack because sometimes this does not come back for some reason';
// The result must contain exactly this set of keys.
expect(result).to.have.all.keys(
'favicon',
'fbAppId',
'jsonLD',
'alAndroidAppName',
'alAndroidPackage',
'alAndroidUrl',
'alIosAppName',
'alIosAppStoreId',
'alIosUrl',
'alWebUrl',
'ogDate',
'ogDescription',
'ogImage',
'ogLocale',
'ogSiteName',
'ogTitle',
'ogType',
'ogUrl',
'ogVideo',
'ogVideoTag',
'ogVideoSecureURL',
'requestUrl',
'success',
'charset',
'twitterAppIdGooglePlay',
'twitterAppIdiPad',
'twitterAppIdiPhone',
'twitterAppNameGooglePlay',
'twitterAppNameiPad',
'twitterAppNameiPhone',
'twitterAppUrlGooglePlay',
'twitterAppUrliPad',
'twitterAppUrliPhone',
'twitterCard',
'twitterDescription',
'twitterImage',
'twitterPlayer',
'twitterSite',
'twitterTitle',
'twitterUrl',
);
expect(response).to.be.an('Response');
});
});

it('Test Twitch.tv Video - Should Return correct Open Graph Info', function () {
return ogs({ url: 'https://jshemas.github.io/openGraphScraperPages/twitch.html' }).then(function ({ error, result, response }) {
console.log('error:', error);
Expand Down
16 changes: 16 additions & 0 deletions tests/unit/utils.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import {
isThisANonHTMLUrl,
optionSetup,
removeNestedUndefinedValues,
unescapeScriptText,
validateAndFormatURL,
} from '../../lib/utils';

Expand Down Expand Up @@ -305,4 +306,19 @@ describe('utils', function () {
expect(response).to.eql(false);
});
});

describe('unescapeScriptText', function () {
  // Pins down the motivation: JSON accepts \uHHHH escapes but rejects \xHH.
  it('is needed because `JSON.parse()` is not able to parse string with \\xHH', function () {
    const unicodeEscaped = '"\\u2611"';
    const hexEscaped = '"\\x26"';

    expect(JSON.parse(unicodeEscaped)).to.eql('☑');

    const parseHexEscaped = () => {
      JSON.parse(hexEscaped);
    };
    expect(parseHexEscaped).to.throw(SyntaxError);
  });

  // Each pair is [escaped input, expected output]; a double quote stays
  // escaped so the surrounding JSON string remains well formed.
  it('should unescape script text', function () {
    const expectations: [string, string][] = [
      ['"\\x27"', '"\'"'],
      ['"\\x26"', '"&"'],
      ['"\\x22"', '"\\""'],
    ];

    for (const [escaped, unescaped] of expectations) {
      expect(unescapeScriptText(escaped)).to.eql(unescaped);
    }
  });
});
});

0 comments on commit c6bb642

Please sign in to comment.