Skip to content

Commit

Permalink
[metascraper-iframe]: Add oembed markup detection (#240)
Browse files Browse the repository at this point in the history
* refactor: use fs.promises

* fix: linter

* fix: missing require

* feat: oembed markup detection
  • Loading branch information
Kikobeats authored Nov 19, 2019
1 parent d92cdff commit b6fcc0b
Show file tree
Hide file tree
Showing 5 changed files with 143 additions and 23 deletions.
5 changes: 1 addition & 4 deletions packages/metascraper-amazon/test/index.js
Original file line number Diff line number Diff line change
@@ -1,13 +1,10 @@
'use strict'

const { readFile } = require('fs').promises
const snapshot = require('snap-shot')
const { promisify } = require('util')
const { resolve } = require('path')
const { omit } = require('lodash')
const should = require('should')
const fs = require('fs')

const readFile = promisify(fs.readFile)

const metascraper = require('metascraper')([
require('..')(),
Expand Down
6 changes: 5 additions & 1 deletion packages/metascraper-iframe/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,13 @@
],
"dependencies": {
"@metascraper/helpers": "^5.8.7",
"oembed-parser": "~1.3.6"
"got": "~9.6.0",
"lodash": "~4.17.15",
"oembed-parser": "~1.3.6",
"p-reflect": "~2.1.0"
},
"devDependencies": {
"cheerio": "latest",
"mocha": "latest",
"should": "latest",
"standard": "latest"
Expand Down
35 changes: 26 additions & 9 deletions packages/metascraper-iframe/src/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,20 +2,37 @@

const { extract, hasProvider } = require('oembed-parser')
const { memoizeOne } = require('@metascraper/helpers')
const { forEach, get } = require('lodash')
const pReflect = require('p-reflect')
const got = require('got')

const iframe = async ({ url, meta, htmlDom, ...opts }) => {
try {
const oembed = await extract(url, opts)
return oembed.html
} catch (err) {
return null
}
/**
 * Read the oEmbed JSON endpoint advertised by the page markup.
 *
 * The lookup is wrapped with `memoizeOne` from `@metascraper/helpers`.
 *
 * @param {Function} htmlDom - selector function for the loaded document
 *   (the tests in this package pass a cheerio instance).
 * @returns {*} whatever `.attr('href')` yields for the
 *   `link[type="application/json+oembed"]` selection; callers treat a falsy
 *   result as "no oEmbed link present".
 */
const jsonOembed = memoizeOne(function findJsonOembedHref (htmlDom) {
  return htmlDom('link[type="application/json+oembed"]').attr('href')
})

/**
 * Resolve the embed markup by calling oembed-parser's `extract` on the URL.
 *
 * `meta` and `htmlDom` are destructured only so they are left out of `opts`;
 * the remaining options are forwarded to `extract` untouched.
 *
 * @param {object} input
 * @param {string} input.url - URL handed to `extract`.
 * @returns {Promise<*>} the `html` field of the settled value, or `null` when
 *   the call rejected or the value has no `html`.
 */
const fromProvider = async ({ url, meta, htmlDom, ...opts }) =>
  // pReflect turns a rejection into a settled result, so a failed lookup
  // ends up as the `null` default instead of throwing.
  pReflect(extract(url, opts)).then(settled => get(settled.value, 'html', null))

/**
 * Resolve the embed markup through the oEmbed endpoint the page advertises in
 * a `<link type="application/json+oembed">` tag.
 *
 * `meta` and `htmlDom` are destructured so they stay out of `opts`; every
 * remaining option is appended to the endpoint as a query parameter.
 *
 * @param {object} input
 * @param {string} input.url - page URL; used as the base when the advertised
 *   href is relative.
 * @param {Function} input.htmlDom - document handle passed to `jsonOembed`.
 * @returns {Promise<*>} `body.html` of the JSON response, or `null` when the
 *   page has no oEmbed link, the href cannot be parsed, or the request fails.
 */
const fromHTML = async ({ url, meta, htmlDom, ...opts }) => {
  const oembedUrl = jsonOembed(htmlDom)
  if (!oembedUrl) return null

  let oembedUrlObj
  try {
    // Absolute hrefs are parsed on their own so an unusable page URL can
    // never break a lookup that does not need it.
    oembedUrlObj = new URL(oembedUrl)
  } catch (absoluteErr) {
    try {
      // Relative (or protocol-relative) href: resolve it against the page.
      oembedUrlObj = new URL(oembedUrl, url)
    } catch (relativeErr) {
      // Unparseable href: behave as if the page advertised no endpoint
      // rather than letting the TypeError reject the whole rule.
      return null
    }
  }

  forEach(opts, (value, key) => oembedUrlObj.searchParams.append(key, value))

  // pReflect turns a failed request into a settled result, so network or
  // parse errors fall through to the `null` default below.
  const { value } = await pReflect(got(oembedUrlObj.toString(), { json: true }))
  return get(value, 'body.html', null)
}

const isValidUrl = memoizeOne(({ url }) => hasProvider(url))
/**
 * Decide whether an embed lookup is worth attempting for this page.
 *
 * @param {object} input
 * @param {string} input.url - page URL, checked with oembed-parser's `hasProvider`.
 * @param {Function} input.htmlDom - document handle passed to `jsonOembed`.
 * @returns {*} `true` when the markup advertises an oEmbed JSON link,
 *   otherwise whatever `hasProvider(url)` returns.
 */
const isValidUrl = memoizeOne(function hasOembedSource ({ url, htmlDom }) {
  // A link found in the markup short-circuits the provider lookup.
  if (jsonOembed(htmlDom)) return true
  return hasProvider(url)
})

// Factory exported by the package: builds the rules object, attaches
// `isValidUrl` as its `test` property and returns it.
module.exports = () => {
// NOTE(review): two `const rules` declarations follow. This looks like the
// removed and the added side of the diff rendered together; only one of
// them can exist in the actual file — confirm against the repository.
const rules = { iframe }
// `iframe` is given as a list of resolvers: markup-based (`fromHTML`) first,
// then provider-based (`fromProvider`).
const rules = { iframe: [fromHTML, fromProvider] }
rules.test = isValidUrl
return rules
}
Expand Down
74 changes: 74 additions & 0 deletions packages/metascraper-iframe/test/fixtures/genially.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
<!DOCTYPE html>
<html lang=en>

<head>
<meta charset=utf-8>
<meta name=viewport content="width=device-width,initial-scale=1,shrink-to-fit=no,minimal-ui">
<meta name=theme-color content=#000000>
<meta name=apple-mobile-web-app-capable content=yes>
<meta name=mobile-web-app-capable content=yes>
<meta http-equiv=X-UA-Compatible content="IE=edge">
<meta name=google content=notranslate>
<meta http-equiv=Content-Type content="text/html; charset=utf-8">
<meta name=application-name content=Genial.ly>
<meta property=og:title content="Magic - 15 by Chema on Genial.ly">
<meta property=og:site_name content=Genial.ly>
<meta property=og:description content="Magic - 15 by Chema on Genial.ly">
<meta property=og:image
content=https://genially-pdf.s3.eu-west-1.amazonaws.com/589ba31e1999314dd40e69e7/pdf/aaca5e17-b03d-4d1b-90a1-e7b343ac162e.png?7fbdcaba-6ebd-4c6c-a0b1-0f93c06d3eae>
<meta property=og:image:secure_url
content=https://genially-pdf.s3.eu-west-1.amazonaws.com/589ba31e1999314dd40e69e7/pdf/aaca5e17-b03d-4d1b-90a1-e7b343ac162e.png?7fbdcaba-6ebd-4c6c-a0b1-0f93c06d3eae>
<meta property=og:image:width content=1200>
<meta property=og:image:height content=675>
<meta property=og:url content=https://view.genial.ly/5dc53cfa759d2a0f4c7db5f4> <meta property=og:type content=website>
<meta property=fb:app_id content=1494586530820799>
<meta name=twitter:card content=player>
<meta name=twitter:site content=@genially_es>
<meta name=twitter:domain content=http://genial.ly/> <meta name=twitter:title
content="Magic - 15 by Chema on Genial.ly">
<meta name=twitter:description content="Magic - 15 by Chema on Genial.ly">
<meta name=twitter:image
content=https://genially-pdf.s3.eu-west-1.amazonaws.com/589ba31e1999314dd40e69e7/pdf/aaca5e17-b03d-4d1b-90a1-e7b343ac162e.png?7fbdcaba-6ebd-4c6c-a0b1-0f93c06d3eae>
<meta name=twitter:player content=https://view.genial.ly/5dc53cfa759d2a0f4c7db5f4> <meta name=twitter:player:width
content=1200>
<meta name=twitter:player:height content=675>
<meta name=description content="Magic - 15 by Chema on Genial.ly">
<meta name=p:domain_verify content=eaaac27aee644a70f919ccfe01fcd3d6>
<base target=_blank>
<link rel=alternate type=application/json+oembed
href="https://www.genial.ly/services/oembed?url=https://view.genial.ly/5dc53cfa759d2a0f4c7db5f4"
title="Magic - 15 by Chema on Genial.ly">
<link rel=icon href=/favicon.ico type=image/x-icon> <link rel="shortcut icon" href=/favicon.ico type=image/x-icon>
<meta name=robots content=noindex>
<title>Magic - 15</title>
<link href=https://d3usyxos00s4ty.cloudfront.net/view/static/css/main.1441b84a.css rel=stylesheet>
<link href=https://d3usyxos00s4ty.cloudfront.net/view/static/css/view.0.0.43.css rel=stylesheet>
<script>
! function (e, t, a, r, n) {
e.dataLayer = e.dataLayer || [], e.dataLayer.push({
"gtm.start": (new Date).getTime(),
event: "gtm.js"
});
var s = t.getElementsByTagName("script")[0],
g = t.createElement("script");
g.async = !0, g.src = "https://www.googletagmanager.com/gtm.js?id=GTM-K3DDDL4", s.parentNode.insertBefore(g, s)
}(window, document)
</script>
</head>

<body>
<noscript>You need to enable JavaScript to run this app.</noscript>
<div class=container-wrapper-genially style="position: relative; min-height: 400px; width: 100%;"><img
src=https://genially.blob.core.windows.net/genially/version3.0/loader.gif class=loader-genially
style="position: absolute; top: 0; right: 0; bottom: 0; left: 0; margin-top: auto; margin-right: auto; margin-bottom: auto; margin-left: auto; z-index: 1; width: 80px;">
<div id=5dc53cfa759d2a0f4c7db5f4 class=genially-embed
style="margin: 0px auto; position: relative; height: auto; width: 100%;"></div>
</div>
<script type=text/javascript src=https://d3usyxos00s4ty.cloudfront.net/view/static/js/dist/view.0.0.39.min.js>
</script> <script type=text/javascript async
src=https://d3usyxos00s4ty.cloudfront.net/view/static/js/main.b99921b7.js> </script> <noscript>
<iframe src="https://www.googletagmanager.com/ns.html?id=GTM-K3DDDL4" height=0 width=0 style=display:none;visibility:hidden></iframe>
</noscript>
</body>

</html>
46 changes: 37 additions & 9 deletions packages/metascraper-iframe/test/index.js
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
'use strict'

const { readFile } = require('fs').promises
const { resolve } = require('path')
const should = require('should')
const cheerio = require('cheerio')

const createMetascraperIframe = require('..')
const createMetascraper = require('metascraper')

const { isValidUrl } = createMetascraperIframe

const urls = [
const commonProviders = [
'https://www.youtube.com/watch?v=Gu8X7vM3Avw',
'https://youtu.be/Gu8X7vM3Avw',
'https://www.youtube.com/watch?v=-TWztwbOpog&list=PL5aqr5w5fRe4nO30px44D5sBukIUw1UwX',
Expand All @@ -19,20 +22,45 @@ const urls = [

describe('metascraper-iframe', () => {
describe('.isValidUrl', () => {
urls.forEach(url => {
it(url, () => {
should(isValidUrl({ url })).be.true()
describe('from common providers', () => {
commonProviders.forEach(url => {
it(url, () => {
const htmlDom = cheerio.load('')
const isValid = isValidUrl({ url, htmlDom })
should(isValid).be.true()
})
})
})

it('from markup', async () => {
const html = await readFile(resolve(__dirname, 'fixtures/genially.html'))
const url = 'https://view.genial.ly/5dc53cfa759d2a0f4c7db5f4'

const htmlDom = cheerio.load(html)
const isValid = isValidUrl({ url, htmlDom })
should(isValid).be.true()
})
})

describe('iframe', () => {
urls.forEach(url => {
it(url, async () => {
const metascraper = createMetascraper([createMetascraperIframe()])
const meta = await metascraper({ url, escape: false })
should(meta.iframe).be.not.null()
describe('from common providers', () => {
commonProviders.forEach(url => {
it(url, async () => {
const metascraper = createMetascraper([createMetascraperIframe()])
const meta = await metascraper({ url, escape: false })
should(meta.iframe).be.not.null()
})
})
})

it('from markup', async () => {
const html = await readFile(resolve(__dirname, 'fixtures/genially.html'))
const url = 'https://view.genial.ly/5dc53cfa759d2a0f4c7db5f4'

const metascraper = createMetascraper([createMetascraperIframe()])
const meta = await metascraper({ url, html, escape: false })

should(meta.iframe).be.not.null()
})
})
})

0 comments on commit b6fcc0b

Please sign in to comment.