Skip to content

Commit

Permalink
build: handle www and non-www for providers (#245)
Browse files Browse the repository at this point in the history
  • Loading branch information
Kikobeats authored Dec 13, 2019
1 parent 87e40fe commit c37c0bf
Show file tree
Hide file tree
Showing 4 changed files with 75 additions and 38 deletions.
23 changes: 23 additions & 0 deletions packages/metascraper-iframe/src/from-html.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
'use strict'

const { memoizeOne } = require('@metascraper/helpers')
const { forEach, get } = require('lodash')
const pReflect = require('p-reflect')
const got = require('got')

// Read the JSON oEmbed discovery URL from the markup: the `href` attribute of
// the `link[type="application/json+oembed"]` element, queried through the
// `$` function passed in. The lookup is wrapped in `memoizeOne`.
const jsonOembed = memoizeOne($ => {
  const oembedLink = $('link[type="application/json+oembed"]')
  return oembedLink.attr('href')
})

// Resolve the iframe markup through oEmbed discovery.
//
// Reads the oEmbed endpoint advertised in the page markup (`jsonOembed`),
// appends every extra option received as a query parameter, requests the
// endpoint as JSON and returns the `html` field of the response body.
//
// Params (single object):
//   url     - URL of the page; used as the base for a relative oEmbed href
//   htmlDom - DOM query function handed to `jsonOembed`
//   ...opts - any other property is forwarded as an oEmbed query parameter
//
// Returns the embed HTML, or `null` when the page advertises no endpoint,
// the endpoint is not a usable URL, or the request fails.
const fromHTML = async ({ url, meta, htmlDom, ...opts }) => {
  const oembedUrl = jsonOembed(htmlDom)
  if (!oembedUrl) return null

  // `new URL` throws on relative or malformed input; a bad href must not
  // reject the whole rule, so parsing failures are turned into `null`
  const parseUrl = (...args) => {
    try {
      return new URL(...args)
    } catch (err) {
      return null
    }
  }

  // try the href as an absolute URL first (so an unusable page URL cannot
  // break it), then fall back to resolving it relative to the page URL
  const oembedUrlObj = parseUrl(oembedUrl) || parseUrl(oembedUrl, url)
  if (!oembedUrlObj) return null

  forEach(opts, (value, key) => oembedUrlObj.searchParams.append(key, value))

  // pReflect never rejects: on a failed request `value` is undefined and
  // the lodash `get` default (`null`) is returned
  const { value } = await pReflect(got(oembedUrlObj.toString(), { json: true }))
  return get(value, 'body.html', null)
}

// Precondition for the rule: true only when `jsonOembed` finds a discovery
// URL in the markup.
fromHTML.test = $ => Boolean(jsonOembed($))

module.exports = fromHTML
38 changes: 38 additions & 0 deletions packages/metascraper-iframe/src/from-provider.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
'use strict'

const { extract, hasProvider } = require('oembed-parser')
const { memoizeOne } = require('@metascraper/helpers')
const pReflect = require('p-reflect')
const { get } = require('lodash')

// Resolve the iframe markup through the oEmbed providers list.
//
// The page URL is first mapped to the variation returned by
// `findProviderUrl`, then handed to `extract` together with every extra
// option received. Returns the `html` field of the extracted data, or `null`
// when it is missing or the extraction fails (pReflect never rejects).
const fromProvider = async ({ url, meta, htmlDom, ...opts }) => {
  const providerUrl = findProviderUrl(url)
  const outcome = await pReflect(extract(providerUrl, opts))
  return get(outcome.value, 'html', null)
}

// Find a variation of `url` that the oEmbed providers list recognises.
//
// The providers list is not always up to date with scheme or www vs non-www,
// so the scheme and a leading `www.` are removed and every combination of
// http/https and www/non-www is checked with `hasProvider`.
//
// Returns the first variation accepted by `hasProvider`, or `undefined` when
// none is accepted or `url` is not a string. Wrapped in `memoizeOne`.
const findProviderUrl = memoizeOne(url => {
  // nothing to normalise; calling `.replace` on a non-string would throw
  if (typeof url !== 'string') return undefined

  // strip `http://`, `https://` or a protocol-relative `//`, plus a leading
  // `www.` in all three cases, ignoring letter case
  const baseUrl = url.replace(/^(?:https?:)?\/\/(?:www\.)?/i, '')
  const testUrls = [
    `http://${baseUrl}`,
    `https://${baseUrl}`,
    `http://www.${baseUrl}`,
    `https://www.${baseUrl}`
  ]

  // wrapped in an arrow so `hasProvider` only ever receives the URL
  return testUrls.find(testUrl => hasProvider(testUrl))
})

// Precondition for the rule: true only when `findProviderUrl` returns a
// variation of the URL.
fromProvider.test = url => Boolean(findProviderUrl(url))

module.exports = fromProvider
35 changes: 8 additions & 27 deletions packages/metascraper-iframe/src/index.js
Original file line number Diff line number Diff line change
@@ -1,40 +1,21 @@
'use strict'

const { extract, hasProvider } = require('oembed-parser')
const { memoizeOne } = require('@metascraper/helpers')
const { forEach, get } = require('lodash')
const pReflect = require('p-reflect')
const got = require('got')

// Read the JSON oEmbed discovery URL from the markup: the `href` attribute of
// the `link[type="application/json+oembed"]` element, queried through the
// `$` function passed in. The lookup is wrapped in `memoizeOne`.
const jsonOembed = memoizeOne($ =>
  $('link[type="application/json+oembed"]').attr('href')
)
const fromProvider = require('./from-provider')
const fromHTML = require('./from-html')

// Resolve the iframe markup through the oEmbed providers list: the page URL
// and every extra option received are handed to `extract`. Returns the `html`
// field of the extracted data, or `null` when it is missing or the
// extraction fails (pReflect never rejects).
const fromProvider = async ({ url, meta, htmlDom, ...opts }) => {
  const extraction = extract(url, opts)
  const outcome = await pReflect(extraction)
  return get(outcome.value, 'html', null)
}

// Resolve the iframe markup through oEmbed discovery.
//
// Reads the oEmbed endpoint advertised in the page markup (`jsonOembed`),
// appends every extra option received as a query parameter, requests the
// endpoint as JSON and returns the `html` field of the response body.
//
// Params (single object):
//   url     - URL of the page; used as the base for a relative oEmbed href
//   htmlDom - DOM query function handed to `jsonOembed`
//   ...opts - any other property is forwarded as an oEmbed query parameter
//
// Returns the embed HTML, or `null` when the page advertises no endpoint,
// the endpoint is not a usable URL, or the request fails.
const fromHTML = async ({ url, meta, htmlDom, ...opts }) => {
  const oembedUrl = jsonOembed(htmlDom)
  if (!oembedUrl) return null

  // `new URL` throws on relative or malformed input; a bad href must not
  // reject the whole rule, so parsing failures are turned into `null`
  const parseUrl = (...args) => {
    try {
      return new URL(...args)
    } catch (err) {
      return null
    }
  }

  // try the href as an absolute URL first (so an unusable page URL cannot
  // break it), then fall back to resolving it relative to the page URL
  const oembedUrlObj = parseUrl(oembedUrl) || parseUrl(oembedUrl, url)
  if (!oembedUrlObj) return null

  forEach(opts, (value, key) => oembedUrlObj.searchParams.append(key, value))

  // pReflect never rejects: on a failed request `value` is undefined and
  // the lodash `get` default (`null`) is returned
  const { value } = await pReflect(got(oembedUrlObj.toString(), { json: true }))
  return get(value, 'body.html', null)
}
// Bind each rule's `.test` helper to the rule function it is attached to.
const htmlTest = fromHTML.test.bind(fromHTML)
const providerTest = fromProvider.test.bind(fromProvider)

// NOTE(review): the lines below appear to interleave a removed definition
// (`isValidUrl`) with its replacement (`test`), as in a diff rendered without
// +/- markers — confirm against the repository which lines are current.
// Both variants take `{ url, htmlDom }` and are truthy when either the markup
// check or the provider check on the URL succeeds.
const isValidUrl = memoizeOne(
({ url, htmlDom: $ }) => !!jsonOembed($) || hasProvider(url)
const test = memoizeOne(
({ url, htmlDom: $ }) => htmlTest($) || providerTest(url)
)

// Factory returning the rules object: `iframe` maps to the two resolvers
// [fromHTML, fromProvider] and `test` holds the precondition function.
module.exports = () => {
const rules = { iframe: [fromHTML, fromProvider] }
// NOTE(review): two consecutive `rules.test` assignments — this looks like
// the old (`isValidUrl`) and new (`test`) lines of a diff shown together; as
// written the second assignment wins. Confirm which one the file contains.
rules.test = isValidUrl
rules.test = test
return rules
}

// The precondition function is also exposed as a property of the factory.
// NOTE(review): same old/new pairing as in the factory body — presumably
// only one of these two exports is current; verify against the repository.
module.exports.isValidUrl = isValidUrl
module.exports.test = test
17 changes: 6 additions & 11 deletions packages/metascraper-iframe/test/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ const cheerio = require('cheerio')
const createMetascraperIframe = require('..')
const createMetascraper = require('metascraper')

const { isValidUrl } = createMetascraperIframe
const { test } = createMetascraperIframe

const commonProviders = [
'https://www.youtube.com/watch?v=Gu8X7vM3Avw',
Expand All @@ -21,27 +21,24 @@ const commonProviders = [
]

describe('metascraper-iframe', () => {
describe('.isValidUrl', () => {
describe('.test', () => {
describe('from common providers', () => {
commonProviders.forEach(url => {
it(url, () => {
const htmlDom = cheerio.load('')
const isValid = isValidUrl({ url, htmlDom })
const isValid = test({ url, htmlDom })
should(isValid).be.true()
})
})
})

it('from markup', async () => {
const html = await readFile(resolve(__dirname, 'fixtures/genially.html'))
const url = 'https://view.genial.ly/5dc53cfa759d2a0f4c7db5f4'

const htmlDom = cheerio.load(html)
const isValid = isValidUrl({ url, htmlDom })
const isValid = test({ url, htmlDom })
should(isValid).be.true()
})
})

describe('iframe', () => {
describe('from common providers', () => {
commonProviders.forEach(url => {
Expand All @@ -52,14 +49,12 @@ describe('metascraper-iframe', () => {
})
})
})

it('from markup', async () => {
const html = await readFile(resolve(__dirname, 'fixtures/genially.html'))
const url = 'https://view.genial.ly/5dc53cfa759d2a0f4c7db5f4'

const metascraper = createMetascraper([createMetascraperIframe()])
const rules = [createMetascraperIframe()]
const metascraper = createMetascraper(rules)
const meta = await metascraper({ url, html, escape: false })

should(meta.iframe).be.not.null()
})
})
Expand Down

0 comments on commit c37c0bf

Please sign in to comment.