From 70fb3d75adabe82ce70bc0b0b1dca977a7d41107 Mon Sep 17 00:00:00 2001
From: Josiah Campbell <9521010+jocmp@users.noreply.github.com>
Date: Fri, 6 Dec 2024 20:48:34 -0600
Subject: [PATCH 1/3] Add custom parser - 1pezeshk.com

---
 dist/mercury.js                               | 28 ++++++-
 dist/mercury.js.map                           |  2 +-
 dist/mercury.web.js                           |  2 +-
 dist/mercury.web.js.map                       |  2 +-
 fixtures/www.1pezeshk.com/1733537734038.html  |  3 +
 package.json                                  |  1 -
 src/cleaners/lead-image-url.js                | 11 +--
 src/extractors/custom/index.js                |  1 +
 .../custom/www.1pezeshk.com/index.js          | 30 +++++++
 .../custom/www.1pezeshk.com/index.test.js     | 81 +++++++++++++++++++
 10 files changed, 149 insertions(+), 12 deletions(-)
 create mode 100644 fixtures/www.1pezeshk.com/1733537734038.html
 create mode 100644 src/extractors/custom/www.1pezeshk.com/index.js
 create mode 100644 src/extractors/custom/www.1pezeshk.com/index.test.js

diff --git a/dist/mercury.js b/dist/mercury.js
index cf9dd2df..16a1babf 100644
--- a/dist/mercury.js
+++ b/dist/mercury.js
@@ -6176,6 +6176,31 @@ var WwwVersantsComExtractor = {
   }
 };
 
+var Www1pezeshkComExtractor = {
+  domain: 'www.1pezeshk.com',
+  title: {
+    selectors: [['meta[name="og:title"]', 'value'], 'h1.post-title']
+  },
+  author: {
+    selectors: [['meta[name="author"]', 'value']]
+  },
+  date_published: {
+    selectors: [['meta[property="article:published_time"]', 'content']]
+  },
+  lead_image_url: {
+    selectors: [['meta[property="og:image"]', 'content']]
+  },
+  content: {
+    selectors: [// enter content selectors
+    ],
+    transforms: {},
+    // Is there anything that is in the result that shouldn't be?
+    // The clean selectors will remove anything that matches from
+    // the result
+    clean: []
+  }
+};
+
 
 
 var CustomExtractors = /*#__PURE__*/Object.freeze({
@@ -6322,7 +6347,8 @@ var CustomExtractors = /*#__PURE__*/Object.freeze({
   PostlightComExtractor: PostlightComExtractor,
   WwwInvestmentexecutiveComExtractor: WwwInvestmentexecutiveComExtractor,
   WwwCbcCaExtractor: WwwCbcCaExtractor,
-  WwwVersantsComExtractor: WwwVersantsComExtractor
+  WwwVersantsComExtractor: WwwVersantsComExtractor,
+  Www1pezeshkComExtractor: Www1pezeshkComExtractor
 });
 
 var Extractors = _Object$keys(CustomExtractors).reduce(function (acc, key) {
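Note for review: in Mercury's custom-extractor format, each entry in a
`selectors` array is either a bare CSS selector (the matched element's text
is used) or a `[selector, attribute]` pair (the named attribute's value is
used). A minimal illustrative fragment (the selectors here are made up, not
taken from 1pezeshk.com):

var ExampleExtractor = {
  domain: 'example.com',
  title: {
    selectors: [
      'h1.headline', // bare selector: use the element's text content
      ['meta[name="og:title"]', 'value'], // pair: use the attribute's value
    ],
  },
};

The `['meta[name="og:title"]', 'value']` form matches the bundle's
normalizeMetaTags step, which rewrites `content` to `value` and `property`
to `name` on meta tags before extraction. Since `content.selectors` above is
still the generator-template placeholder, content extraction should fall
back to the generic extractor unless `fallback: false` is passed to
Mercury.parse.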
-{"version":3,"file":"mercury.js","sources":["../src/utils/text/normalize-spaces.js","../src/utils/text/extract-from-url.js","../src/utils/text/constants.js","../src/utils/text/page-num-from-url.js","../src/utils/text/remove-anchor.js","../src/utils/text/article-base-url.js","../src/utils/text/has-sentence-end.js","../src/utils/text/excerpt-content.js","../src/utils/text/get-encoding.js","../src/resource/utils/constants.js","../src/resource/utils/fetch-resource.js","../src/resource/utils/dom/normalize-meta-tags.js","../src/utils/dom/constants.js","../src/utils/dom/strip-unlikely-candidates.js","../src/utils/dom/brs-to-ps.js","../src/utils/dom/paragraphize.js","../src/utils/dom/convert-to-paragraphs.js","../src/utils/dom/convert-node-to.js","../src/utils/dom/clean-images.js","../src/utils/dom/mark-to-keep.js","../src/utils/dom/strip-junk-tags.js","../src/utils/dom/clean-h-ones.js","../src/utils/dom/clean-attributes.js","../src/utils/dom/remove-empty.js","../src/extractors/generic/content/scoring/constants.js","../src/extractors/generic/content/scoring/get-weight.js","../src/extractors/generic/content/scoring/get-score.js","../src/extractors/generic/content/scoring/score-commas.js","../src/extractors/generic/content/scoring/score-length.js","../src/extractors/generic/content/scoring/score-paragraph.js","../src/extractors/generic/content/scoring/set-score.js","../src/extractors/generic/content/scoring/add-score.js","../src/extractors/generic/content/scoring/add-to-parent.js","../src/extractors/generic/content/scoring/get-or-init-score.js","../src/extractors/generic/content/scoring/score-node.js","../src/extractors/generic/content/scoring/score-content.js","../src/extractors/generic/content/scoring/merge-siblings.js","../src/extractors/generic/content/scoring/find-top-candidate.js","../src/extractors/generic/content/scoring/index.js","../src/utils/dom/clean-tags.js","../src/utils/dom/clean-headers.js","../src/utils/dom/rewrite-top-level.js","../src/utils/dom/make-links-absolute.js","../src/utils/dom/link-density.js","../src/utils/dom/extract-from-meta.js","../src/utils/dom/extract-from-selectors.js","../src/utils/dom/strip-tags.js","../src/utils/dom/within-comment.js","../src/utils/dom/node-is-sufficient.js","../src/utils/dom/is-wordpress.js","../src/utils/dom/get-attrs.js","../src/utils/dom/set-attr.js","../src/utils/dom/set-attrs.js","../src/utils/dom/index.js","../src/resource/utils/dom/constants.js","../src/resource/utils/dom/convert-lazy-loaded-images.js","../src/resource/utils/dom/clean.js","../src/resource/index.js","../src/utils/range.js","../src/utils/validate-url.js","../src/utils/merge-supported-domains.js","../src/extractors/add-extractor.js","../src/extractors/custom/blogspot.com/index.js","../src/extractors/custom/nymag.com/index.js","../src/extractors/custom/wikipedia.org/index.js","../src/extractors/custom/twitter.com/index.js","../src/extractors/custom/www.nytimes.com/index.js","../src/extractors/custom/www.theatlantic.com/index.js","../src/extractors/custom/www.newyorker.com/index.js","../src/extractors/custom/www.wired.com/index.js","../src/extractors/custom/www.msn.com/index.js","../src/extractors/custom/www.yahoo.com/index.js","../src/extractors/custom/www.buzzfeed.com/index.js","../src/extractors/custom/fandom.wikia.com/index.js","../src/extractors/custom/www.littlethings.com/index.js","../src/extractors/custom/www.politico.com/index.js","../src/extractors/custom/deadspin.com/index.js","../src/extractors/custom/www.broadwayworld.com/index.js","../src/extractors/custom/www.
apartmenttherapy.com/index.js","../src/extractors/custom/medium.com/index.js","../src/extractors/custom/www.tmz.com/index.js","../src/extractors/custom/www.washingtonpost.com/index.js","../src/extractors/custom/www.huffingtonpost.com/index.js","../src/extractors/custom/newrepublic.com/index.js","../src/extractors/custom/money.cnn.com/index.js","../src/extractors/custom/www.theverge.com/index.js","../src/extractors/custom/www.cnn.com/index.js","../src/extractors/custom/www.aol.com/index.js","../src/extractors/custom/www.youtube.com/index.js","../src/extractors/custom/www.theguardian.com/index.js","../src/extractors/custom/www.sbnation.com/index.js","../src/extractors/custom/www.bloomberg.com/index.js","../src/extractors/custom/www.bustle.com/index.js","../src/extractors/custom/www.npr.org/index.js","../src/extractors/custom/www.recode.net/index.js","../src/extractors/custom/qz.com/index.js","../src/extractors/custom/www.dmagazine.com/index.js","../src/extractors/custom/www.reuters.com/index.js","../src/extractors/custom/mashable.com/index.js","../src/extractors/custom/www.chicagotribune.com/index.js","../src/extractors/custom/www.vox.com/index.js","../src/extractors/custom/news.nationalgeographic.com/index.js","../src/extractors/custom/www.nationalgeographic.com/index.js","../src/extractors/custom/www.latimes.com/index.js","../src/extractors/custom/pagesix.com/index.js","../src/extractors/custom/thefederalistpapers.org/index.js","../src/extractors/custom/www.cbssports.com/index.js","../src/extractors/custom/www.msnbc.com/index.js","../src/extractors/custom/www.thepoliticalinsider.com/index.js","../src/extractors/custom/www.mentalfloss.com/index.js","../src/extractors/custom/abcnews.go.com/index.js","../src/extractors/custom/www.nydailynews.com/index.js","../src/extractors/custom/www.cnbc.com/index.js","../src/extractors/custom/www.popsugar.com/index.js","../src/extractors/custom/observer.com/index.js","../src/extractors/custom/people.com/index.js","../src/extractors/custom/www.usmagazine.com/index.js","../src/extractors/custom/www.rollingstone.com/index.js","../src/extractors/custom/247sports.com/index.js","../src/extractors/custom/uproxx.com/index.js","../src/extractors/custom/www.eonline.com/index.js","../src/extractors/custom/www.miamiherald.com/index.js","../src/extractors/custom/www.refinery29.com/index.js","../src/extractors/custom/www.macrumors.com/index.js","../src/extractors/custom/www.androidcentral.com/index.js","../src/extractors/custom/www.si.com/index.js","../src/extractors/custom/www.rawstory.com/index.js","../src/extractors/custom/www.cnet.com/index.js","../src/extractors/custom/www.today.com/index.js","../src/extractors/custom/www.al.com/index.js","../src/extractors/custom/www.thepennyhoarder.com/index.js","../src/extractors/custom/www.westernjournalism.com/index.js","../src/extractors/custom/www.americanow.com/index.js","../src/extractors/custom/sciencefly.com/index.js","../src/extractors/custom/hellogiggles.com/index.js","../src/extractors/custom/thoughtcatalog.com/index.js","../src/extractors/custom/www.inquisitr.com/index.js","../src/extractors/custom/www.nbcnews.com/index.js","../src/extractors/custom/fortune.com/index.js","../src/extractors/custom/www.linkedin.com/index.js","../src/extractors/custom/obamawhitehouse.archives.gov/index.js","../src/extractors/custom/www.opposingviews.com/index.js","../src/extractors/custom/www.prospectmagazine.co.uk/index.js","../src/extractors/custom/forward.com/index.js","../src/extractors/custom/www.qdaily.com/index.js","../src/extract
ors/custom/gothamist.com/index.js","../src/extractors/custom/www.fool.com/index.js","../src/extractors/custom/www.slate.com/index.js","../src/extractors/custom/ici.radio-canada.ca/index.js","../src/extractors/custom/www.fortinet.com/index.js","../src/extractors/custom/www.fastcompany.com/index.js","../src/extractors/custom/blisterreview.com/index.js","../src/extractors/custom/news.mynavi.jp/index.js","../src/extractors/custom/clinicaltrials.gov/index.js","../src/extractors/custom/github.com/index.js","../src/extractors/custom/www.reddit.com/index.js","../src/extractors/custom/otrs.com/index.js","../src/extractors/custom/www.ossnews.jp/index.js","../src/extractors/custom/buzzap.jp/index.js","../src/extractors/custom/www.asahi.com/index.js","../src/extractors/custom/www.sanwa.co.jp/index.js","../src/extractors/custom/www.elecom.co.jp/index.js","../src/extractors/custom/scan.netsecurity.ne.jp/index.js","../src/extractors/custom/jvndb.jvn.jp/index.js","../src/extractors/custom/genius.com/index.js","../src/extractors/custom/www.jnsa.org/index.js","../src/extractors/custom/phpspot.org/index.js","../src/extractors/custom/www.infoq.com/index.js","../src/extractors/custom/www.moongift.jp/index.js","../src/extractors/custom/www.itmedia.co.jp/index.js","../src/extractors/custom/www.publickey1.jp/index.js","../src/extractors/custom/takagi-hiromitsu.jp/index.js","../src/extractors/custom/bookwalker.jp/index.js","../src/extractors/custom/www.yomiuri.co.jp/index.js","../src/extractors/custom/japan.cnet.com/index.js","../src/extractors/custom/deadline.com/index.js","../src/extractors/custom/www.gizmodo.jp/index.js","../src/extractors/custom/getnews.jp/index.js","../src/extractors/custom/www.lifehacker.jp/index.js","../src/extractors/custom/sect.iij.ad.jp/index.js","../src/extractors/custom/www.oreilly.co.jp/index.js","../src/extractors/custom/www.ipa.go.jp/index.js","../src/extractors/custom/weekly.ascii.jp/index.js","../src/extractors/custom/techlog.iij.ad.jp/index.js","../src/extractors/custom/wired.jp/index.js","../src/extractors/custom/japan.zdnet.com/index.js","../src/extractors/custom/www.rbbtoday.com/index.js","../src/extractors/custom/www.lemonde.fr/index.js","../src/extractors/custom/www.phoronix.com/index.js","../src/extractors/custom/pitchfork.com/index.js","../src/extractors/custom/biorxiv.org/index.js","../src/extractors/custom/epaper.zeit.de/index.js","../src/extractors/custom/www.ladbible.com/index.js","../src/extractors/custom/timesofindia.indiatimes.com/index.js","../src/extractors/custom/ma.ttias.be/index.js","../src/extractors/custom/pastebin.com/index.js","../src/extractors/custom/www.abendblatt.de/index.js","../src/extractors/custom/www.gruene.de/index.js","../src/extractors/custom/www.engadget.com/index.js","../src/extractors/custom/arstechnica.com/index.js","../src/extractors/custom/www.ndtv.com/index.js","../src/extractors/custom/www.spektrum.de/index.js","../src/extractors/custom/postlight.com/index.js","../src/extractors/custom/www.investmentexecutive.com/index.js","../src/extractors/custom/www.cbc.ca/index.js","../src/extractors/custom/www.versants.com/index.js","../src/extractors/all.js","../src/cleaners/constants.js","../src/cleaners/author.js","../src/cleaners/lead-image-url.js","../src/cleaners/dek.js","../src/cleaners/date-published.js","../src/cleaners/content.js","../src/cleaners/title.js","../src/cleaners/resolve-split-title.js","../src/cleaners/index.js","../src/extractors/generic/content/extract-best-node.js","../src/extractors/generic/content/extractor.js","../src/extr
actors/generic/title/constants.js","../src/extractors/generic/title/extractor.js","../src/extractors/generic/author/constants.js","../src/extractors/generic/author/extractor.js","../src/extractors/generic/date-published/constants.js","../src/extractors/generic/date-published/extractor.js","../src/extractors/generic/dek/extractor.js","../src/extractors/generic/lead-image-url/constants.js","../src/extractors/generic/lead-image-url/score-image.js","../src/extractors/generic/lead-image-url/extractor.js","../src/extractors/generic/next-page-url/scoring/utils/score-similarity.js","../src/extractors/generic/next-page-url/scoring/utils/score-link-text.js","../src/extractors/generic/next-page-url/scoring/utils/score-page-in-link.js","../src/extractors/generic/next-page-url/scoring/constants.js","../src/extractors/generic/next-page-url/scoring/utils/score-extraneous-links.js","../src/extractors/generic/next-page-url/scoring/utils/score-by-parents.js","../src/extractors/generic/next-page-url/scoring/utils/score-prev-link.js","../src/extractors/generic/next-page-url/scoring/utils/should-score.js","../src/extractors/generic/next-page-url/scoring/utils/score-base-url.js","../src/extractors/generic/next-page-url/scoring/utils/score-next-link-text.js","../src/extractors/generic/next-page-url/scoring/utils/score-cap-links.js","../src/extractors/generic/next-page-url/scoring/score-links.js","../src/extractors/generic/next-page-url/extractor.js","../src/extractors/generic/url/constants.js","../src/extractors/generic/url/extractor.js","../src/extractors/generic/excerpt/constants.js","../src/extractors/generic/excerpt/extractor.js","../src/extractors/generic/word-count/extractor.js","../src/extractors/generic/index.js","../src/extractors/detect-by-html.js","../src/extractors/get-extractor.js","../src/extractors/root-extractor.js","../src/extractors/collect-all-pages.js","../src/mercury.js"],"sourcesContent":["const NORMALIZE_RE = /\\s{2,}(?![^<>]*<\\/(pre|code|textarea)>)/g;\n\nexport default function normalizeSpaces(text) {\n return text.replace(NORMALIZE_RE, ' ').trim();\n}\n","// Given a node type to search for, and a list of regular expressions,\n// look to see if this extraction can be found in the URL. Expects\n// that each expression in r_list will return group(1) as the proper\n// string to be cleaned.\n// Only used for date_published currently.\nexport default function extractFromUrl(url, regexList) {\n const matchRe = regexList.find(re => re.test(url));\n if (matchRe) {\n return matchRe.exec(url)[1];\n }\n\n return null;\n}\n","// An expression that looks to try to find the page digit within a URL, if\n// it exists.\n// Matches:\n// page=1\n// pg=1\n// p=1\n// paging=12\n// pag=7\n// pagination/1\n// paging/88\n// pa/83\n// p/11\n//\n// Does not match:\n// pg=102\n// page:2\nexport const PAGE_IN_HREF_RE = new RegExp(\n '(page|paging|(p(a|g|ag)?(e|enum|ewanted|ing|ination)))?(=|/)([0-9]{1,3})',\n 'i'\n);\n\nexport const HAS_ALPHA_RE = /[a-z]/i;\n\nexport const IS_ALPHA_RE = /^[a-z]+$/i;\nexport const IS_DIGIT_RE = /^[0-9]+$/i;\n\nexport const ENCODING_RE = /charset=([\\w-]+)\\b/;\nexport const DEFAULT_ENCODING = 'utf-8';\n","import { PAGE_IN_HREF_RE } from './constants';\n\nexport default function pageNumFromUrl(url) {\n const matches = url.match(PAGE_IN_HREF_RE);\n if (!matches) return null;\n\n const pageNum = parseInt(matches[6], 10);\n\n // Return pageNum < 100, otherwise\n // return null\n return pageNum < 100 ? 
pageNum : null;\n}\n","export default function removeAnchor(url) {\n return url.split('#')[0].replace(/\\/$/, '');\n}\n","import URL from 'url';\n\nimport {\n HAS_ALPHA_RE,\n IS_ALPHA_RE,\n IS_DIGIT_RE,\n PAGE_IN_HREF_RE,\n} from './constants';\n\nfunction isGoodSegment(segment, index, firstSegmentHasLetters) {\n let goodSegment = true;\n\n // If this is purely a number, and it's the first or second\n // url_segment, it's probably a page number. Remove it.\n if (index < 2 && IS_DIGIT_RE.test(segment) && segment.length < 3) {\n goodSegment = true;\n }\n\n // If this is the first url_segment and it's just \"index\",\n // remove it\n if (index === 0 && segment.toLowerCase() === 'index') {\n goodSegment = false;\n }\n\n // If our first or second url_segment is smaller than 3 characters,\n // and the first url_segment had no alphas, remove it.\n if (index < 2 && segment.length < 3 && !firstSegmentHasLetters) {\n goodSegment = false;\n }\n\n return goodSegment;\n}\n\n// Take a URL, and return the article base of said URL. That is, no\n// pagination data exists in it. Useful for comparing to other links\n// that might have pagination data within them.\nexport default function articleBaseUrl(url, parsed) {\n const parsedUrl = parsed || URL.parse(url);\n const { protocol, host, path } = parsedUrl;\n\n let firstSegmentHasLetters = false;\n const cleanedSegments = path\n .split('/')\n .reverse()\n .reduce((acc, rawSegment, index) => {\n let segment = rawSegment;\n\n // Split off and save anything that looks like a file type.\n if (segment.includes('.')) {\n const [possibleSegment, fileExt] = segment.split('.');\n if (IS_ALPHA_RE.test(fileExt)) {\n segment = possibleSegment;\n }\n }\n\n // If our first or second segment has anything looking like a page\n // number, remove it.\n if (PAGE_IN_HREF_RE.test(segment) && index < 2) {\n segment = segment.replace(PAGE_IN_HREF_RE, '');\n }\n\n // If we're on the first segment, check to see if we have any\n // characters in it. The first segment is actually the last bit of\n // the URL, and this will be helpful to determine if we're on a URL\n // segment that looks like \"/2/\" for example.\n if (index === 0) {\n firstSegmentHasLetters = HAS_ALPHA_RE.test(segment);\n }\n\n // If it's not marked for deletion, push it to cleaned_segments.\n if (isGoodSegment(segment, index, firstSegmentHasLetters)) {\n acc.push(segment);\n }\n\n return acc;\n }, []);\n\n return `${protocol}//${host}${cleanedSegments.reverse().join('/')}`;\n}\n","// Given a string, return True if it appears to have an ending sentence\n// within it, false otherwise.\nconst SENTENCE_END_RE = new RegExp('.( |$)');\nexport default function hasSentenceEnd(text) {\n return SENTENCE_END_RE.test(text);\n}\n","export default function excerptContent(content, words = 10) {\n return content\n .trim()\n .split(/\\s+/)\n .slice(0, words)\n .join(' ');\n}\n","import iconv from 'iconv-lite';\nimport { DEFAULT_ENCODING, ENCODING_RE } from './constants';\n\n// check a string for encoding; this is\n// used in our fetchResource function to\n// ensure correctly encoded responses\nexport default function getEncoding(str) {\n let encoding = DEFAULT_ENCODING;\n const matches = ENCODING_RE.exec(str);\n if (matches !== null) {\n [, str] = matches;\n }\n if (iconv.encodingExists(str)) {\n encoding = str;\n }\n return encoding;\n}\n","import cheerio from 'cheerio';\n\n// Browser does not like us setting user agent\nexport const REQUEST_HEADERS = cheerio.browser\n ? 
{}\n : {\n 'User-Agent':\n 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36',\n };\n\n// The number of milliseconds to attempt to fetch a resource before timing out.\nexport const FETCH_TIMEOUT = 10000;\n\n// Content types that we do not extract content from\nconst BAD_CONTENT_TYPES = [\n 'audio/mpeg',\n 'image/gif',\n 'image/jpeg',\n 'image/jpg',\n];\n\nexport const BAD_CONTENT_TYPES_RE = new RegExp(\n `^(${BAD_CONTENT_TYPES.join('|')})$`,\n 'i'\n);\n\n// Use this setting as the maximum size an article can be\n// for us to attempt parsing. Defaults to 5 MB.\nexport const MAX_CONTENT_LENGTH = 5242880;\n\n// Turn the global proxy on or off\n// Proxying is not currently enabled in Python source\n// so not implementing logic in port.\nexport const PROXY_DOMAINS = false;\nexport const REQUESTS_PROXIES = {\n http: 'http://38.98.105.139:33333',\n https: 'http://38.98.105.139:33333',\n};\n\nexport const DOMAINS_TO_PROXY = ['nih.gov', 'gutenberg.org'];\n","import URL from 'url';\nimport request from 'postman-request';\n\nimport {\n REQUEST_HEADERS,\n FETCH_TIMEOUT,\n BAD_CONTENT_TYPES_RE,\n MAX_CONTENT_LENGTH,\n} from './constants';\n\nfunction get(options) {\n return new Promise((resolve, reject) => {\n request(options, (err, response, body) => {\n if (err) {\n reject(err);\n } else {\n resolve({ body, response });\n }\n });\n });\n}\n\n// Evaluate a response to ensure it's something we should be keeping.\n// This does not validate in the sense of a response being 200 or not.\n// Validation here means that we haven't found reason to bail from\n// further processing of this url.\n\nexport function validateResponse(response, parseNon200 = false) {\n // Check if we got a valid status code\n // This isn't great, but I'm requiring a statusMessage to be set\n // before short circuiting b/c nock doesn't set it in tests\n // statusMessage only not set in nock response, in which case\n // I check statusCode, which is currently only 200 for OK responses\n // in tests\n if (\n (response.statusMessage && response.statusMessage !== 'OK') ||\n response.statusCode !== 200\n ) {\n if (!response.statusCode) {\n throw new Error(\n `Unable to fetch content. Original exception was ${response.error}`\n );\n } else if (!parseNon200) {\n throw new Error(\n `Resource returned a response status code of ${\n response.statusCode\n } and resource was instructed to reject non-200 status codes.`\n );\n }\n }\n\n const {\n 'content-type': contentType,\n 'content-length': contentLength,\n } = response.headers;\n\n // Check that the content is not in BAD_CONTENT_TYPES\n if (BAD_CONTENT_TYPES_RE.test(contentType)) {\n throw new Error(\n `Content-type for this resource was ${contentType} and is not allowed.`\n );\n }\n\n // Check that the content length is below maximum\n if (contentLength > MAX_CONTENT_LENGTH) {\n throw new Error(\n `Content for this resource was too large. Maximum content length is ${MAX_CONTENT_LENGTH}.`\n );\n }\n\n return true;\n}\n\n// Grabs the last two pieces of the URL and joins them back together\n// This is to get the 'livejournal.com' from 'erotictrains.livejournal.com'\nexport function baseDomain({ host }) {\n return host\n .split('.')\n .slice(-2)\n .join('.');\n}\n\n// Set our response attribute to the result of fetching our URL.\n// TODO: This should gracefully handle timeouts and raise the\n// proper exceptions on the many failure cases of HTTP.\n// TODO: Ensure we are not fetching something enormous. 
Always return\n// unicode content for HTML, with charset conversion.\n\nexport default async function fetchResource(url, parsedUrl, headers = {}) {\n parsedUrl = parsedUrl || URL.parse(encodeURI(url));\n const options = {\n url: parsedUrl.href,\n headers: { ...REQUEST_HEADERS, ...headers },\n timeout: FETCH_TIMEOUT,\n // Accept cookies\n jar: true,\n // Set to null so the response returns as binary and body as buffer\n // https://github.com/request/request#requestoptions-callback\n encoding: null,\n // Accept and decode gzip\n gzip: true,\n // Follow any non-GET redirects\n followAllRedirects: true,\n ...(typeof window !== 'undefined'\n ? {}\n : {\n // Follow GET redirects; this option is for Node only\n followRedirect: true,\n }),\n };\n\n const { response, body } = await get(options);\n\n try {\n validateResponse(response);\n return {\n body,\n response,\n };\n } catch (e) {\n return {\n error: true,\n message: e.message,\n };\n }\n}\n","function convertMetaProp($, from, to) {\n $(`meta[${from}]`).each((_, node) => {\n const $node = $(node);\n\n const value = $node.attr(from);\n $node.attr(to, value);\n $node.removeAttr(from);\n });\n\n return $;\n}\n\n// For ease of use in extracting from meta tags,\n// replace the \"content\" attribute on meta tags with the\n// \"value\" attribute.\n//\n// In addition, normalize 'property' attributes to 'name' for ease of\n// querying later. See, e.g., og or twitter meta tags.\n\nexport default function normalizeMetaTags($) {\n $ = convertMetaProp($, 'content', 'value');\n $ = convertMetaProp($, 'property', 'name');\n return $;\n}\n","// Spacer images to be removed\nexport const SPACER_RE = new RegExp('transparent|spacer|blank', 'i');\n\n// The class we will use to mark elements we want to keep\n// but would normally remove\nexport const KEEP_CLASS = 'mercury-parser-keep';\n\nexport const KEEP_SELECTORS = [\n 'iframe[src^=\"https://www.youtube.com\"]',\n 'iframe[src^=\"https://www.youtube-nocookie.com\"]',\n 'iframe[src^=\"http://www.youtube.com\"]',\n 'iframe[src^=\"https://player.vimeo\"]',\n 'iframe[src^=\"http://player.vimeo\"]',\n 'iframe[src^=\"https://www.redditmedia.com\"]',\n];\n\n// A list of tags to strip from the output if we encounter them.\nexport const STRIP_OUTPUT_TAGS = [\n 'title',\n 'script',\n 'noscript',\n 'link',\n 'style',\n 'hr',\n 'embed',\n 'iframe',\n 'object',\n];\n\n// cleanAttributes\nexport const REMOVE_ATTRS = ['style', 'align'];\nexport const REMOVE_ATTR_SELECTORS = REMOVE_ATTRS.map(\n selector => `[${selector}]`\n);\nexport const REMOVE_ATTR_LIST = REMOVE_ATTRS.join(',');\nexport const WHITELIST_ATTRS = [\n 'src',\n 'srcset',\n 'sizes',\n 'type',\n 'href',\n 'class',\n 'id',\n 'alt',\n 'xlink:href',\n 'width',\n 'height',\n];\n\nexport const WHITELIST_ATTRS_RE = new RegExp(\n `^(${WHITELIST_ATTRS.join('|')})$`,\n 'i'\n);\n\n// removeEmpty\nexport const REMOVE_EMPTY_TAGS = ['p'];\nexport const REMOVE_EMPTY_SELECTORS = REMOVE_EMPTY_TAGS.map(\n tag => `${tag}:empty`\n).join(',');\n\n// cleanTags\nexport const CLEAN_CONDITIONALLY_TAGS = [\n 'ul',\n 'ol',\n 'table',\n 'div',\n 'button',\n 'form',\n].join(',');\n\n// cleanHeaders\nconst HEADER_TAGS = ['h2', 'h3', 'h4', 'h5', 'h6'];\nexport const HEADER_TAG_LIST = HEADER_TAGS.join(',');\n\n// // CONTENT FETCHING CONSTANTS ////\n\n// A list of strings that can be considered unlikely candidates when\n// extracting content from a resource. 
These strings are joined together\n// and then tested for existence using re:test, so may contain simple,\n// non-pipe style regular expression queries if necessary.\nexport const UNLIKELY_CANDIDATES_BLACKLIST = [\n 'ad-break',\n 'adbox',\n 'advert',\n 'addthis',\n 'agegate',\n 'aux',\n 'blogger-labels',\n 'combx',\n 'comment',\n 'conversation',\n 'disqus',\n 'entry-unrelated',\n 'extra',\n 'foot',\n // 'form', // This is too generic, has too many false positives\n 'header',\n 'hidden',\n 'loader',\n 'login', // Note: This can hit 'blogindex'.\n 'menu',\n 'meta',\n 'nav',\n 'outbrain',\n 'pager',\n 'pagination',\n 'predicta', // readwriteweb inline ad box\n 'presence_control_external', // lifehacker.com container full of false positives\n 'popup',\n 'printfriendly',\n 'related',\n 'remove',\n 'remark',\n 'rss',\n 'share',\n 'shoutbox',\n 'sidebar',\n 'sociable',\n 'sponsor',\n 'taboola',\n 'tools',\n];\n\n// A list of strings that can be considered LIKELY candidates when\n// extracting content from a resource. Essentially, the inverse of the\n// blacklist above - if something matches both blacklist and whitelist,\n// it is kept. This is useful, for example, if something has a className\n// of \"rss-content entry-content\". It matched 'rss', so it would normally\n// be removed, however, it's also the entry content, so it should be left\n// alone.\n//\n// These strings are joined together and then tested for existence using\n// re:test, so may contain simple, non-pipe style regular expression queries\n// if necessary.\nexport const UNLIKELY_CANDIDATES_WHITELIST = [\n 'and',\n 'article',\n 'body',\n 'blogindex',\n 'column',\n 'content',\n 'entry-content-asset',\n 'format', // misuse of form\n 'hfeed',\n 'hentry',\n 'hatom',\n 'main',\n 'page',\n 'posts',\n 'shadow',\n];\n\n// A list of tags which, if found inside, should cause a
to NOT\n// be turned into a paragraph tag. Shallow div tags without these elements\n// should be turned into tags.\nexport const DIV_TO_P_BLOCK_TAGS = [\n 'a',\n 'blockquote',\n 'dl',\n 'div',\n 'img',\n 'p',\n 'pre',\n 'table',\n].join(',');\n\n// A list of tags that should be ignored when trying to find the top candidate\n// for a document.\nexport const NON_TOP_CANDIDATE_TAGS = [\n 'br',\n 'b',\n 'i',\n 'label',\n 'hr',\n 'area',\n 'base',\n 'basefont',\n 'input',\n 'img',\n 'link',\n 'meta',\n];\n\nexport const NON_TOP_CANDIDATE_TAGS_RE = new RegExp(\n `^(${NON_TOP_CANDIDATE_TAGS.join('|')})$`,\n 'i'\n);\n\n// A list of selectors that specify, very clearly, either hNews or other\n// very content-specific style content, like Blogger templates.\n// More examples here: http://microformats.org/wiki/blog-post-formats\nexport const HNEWS_CONTENT_SELECTORS = [\n ['.hentry', '.entry-content'],\n ['entry', '.entry-content'],\n ['.entry', '.entry_content'],\n ['.post', '.postbody'],\n ['.post', '.post_body'],\n ['.post', '.post-body'],\n];\n\nexport const PHOTO_HINTS = ['figure', 'photo', 'image', 'caption'];\nexport const PHOTO_HINTS_RE = new RegExp(PHOTO_HINTS.join('|'), 'i');\n\n// A list of strings that denote a positive scoring for this content as being\n// an article container. Checked against className and id.\n//\n// TODO: Perhaps have these scale based on their odds of being quality?\nexport const POSITIVE_SCORE_HINTS = [\n 'article',\n 'articlecontent',\n 'instapaper_body',\n 'blog',\n 'body',\n 'content',\n 'entry-content-asset',\n 'entry',\n 'hentry',\n 'main',\n 'Normal',\n 'page',\n 'pagination',\n 'permalink',\n 'post',\n 'story',\n 'text',\n '[-_]copy', // usatoday\n '\\\\Bcopy',\n];\n\n// The above list, joined into a matching regular expression\nexport const POSITIVE_SCORE_RE = new RegExp(\n POSITIVE_SCORE_HINTS.join('|'),\n 'i'\n);\n\n// Readability publisher-specific guidelines\nexport const READABILITY_ASSET = new RegExp('entry-content-asset', 'i');\n\n// A list of strings that denote a negative scoring for this content as being\n// an article container. Checked against className and id.\n//\n// TODO: Perhaps have these scale based on their odds of being quality?\nexport const NEGATIVE_SCORE_HINTS = [\n 'adbox',\n 'advert',\n 'author',\n 'bio',\n 'bookmark',\n 'bottom',\n 'byline',\n 'clear',\n 'com-',\n 'combx',\n 'comment',\n 'comment\\\\B',\n 'contact',\n 'copy',\n 'credit',\n 'crumb',\n 'date',\n 'deck',\n 'excerpt',\n 'featured', // tnr.com has a featured_content which throws us off\n 'foot',\n 'footer',\n 'footnote',\n 'graf',\n 'head',\n 'info',\n 'infotext', // newscientist.com copyright\n 'instapaper_ignore',\n 'jump',\n 'linebreak',\n 'link',\n 'masthead',\n 'media',\n 'meta',\n 'modal',\n 'outbrain', // slate.com junk\n 'promo',\n 'pr_', // autoblog - press release\n 'related',\n 'respond',\n 'roundcontent', // lifehacker restricted content warning\n 'scroll',\n 'secondary',\n 'share',\n 'shopping',\n 'shoutbox',\n 'side',\n 'sidebar',\n 'sponsor',\n 'stamp',\n 'sub',\n 'summary',\n 'tags',\n 'tools',\n 'widget',\n];\n// The above list, joined into a matching regular expression\nexport const NEGATIVE_SCORE_RE = new RegExp(\n NEGATIVE_SCORE_HINTS.join('|'),\n 'i'\n);\n\n// XPath to try to determine if a page is wordpress. Not always successful.\nexport const IS_WP_SELECTOR = 'meta[name=generator][value^=WordPress]';\n\n// Match a digit. 
Pretty clear.\nexport const DIGIT_RE = new RegExp('[0-9]');\n\n// A list of words that, if found in link text or URLs, likely mean that\n// this link is not a next page link.\nexport const EXTRANEOUS_LINK_HINTS = [\n 'print',\n 'archive',\n 'comment',\n 'discuss',\n 'e-mail',\n 'email',\n 'share',\n 'reply',\n 'all',\n 'login',\n 'sign',\n 'single',\n 'adx',\n 'entry-unrelated',\n];\nexport const EXTRANEOUS_LINK_HINTS_RE = new RegExp(\n EXTRANEOUS_LINK_HINTS.join('|'),\n 'i'\n);\n\n// Match any phrase that looks like it could be page, or paging, or pagination\nexport const PAGE_RE = new RegExp('pag(e|ing|inat)', 'i');\n\n// Match any link text/classname/id that looks like it could mean the next\n// page. Things like: next, continue, >, >>, » but not >|, »| as those can\n// mean last page.\n// export const NEXT_LINK_TEXT_RE = new RegExp('(next|weiter|continue|>([^\\|]|$)|»([^\\|]|$))', 'i');\nexport const NEXT_LINK_TEXT_RE = /(next|weiter|continue|>([^|]|$)|»([^|]|$))/i;\n\n// Match any link text/classname/id that looks like it is an end link: things\n// like \"first\", \"last\", \"end\", etc.\nexport const CAP_LINK_TEXT_RE = new RegExp('(first|last|end)', 'i');\n\n// Match any link text/classname/id that looks like it means the previous\n// page.\nexport const PREV_LINK_TEXT_RE = new RegExp('(prev|earl|old|new|<|«)', 'i');\n\n// Match 2 or more consecutive\s*$/g;function Te(e,t){return M(e,"table")&&M(11!==t.nodeType?t:t.firstChild,"tr")&&E(e).children("tbody")[0]||e}function Ce(e){return e.type=(null!==e.getAttribute("type"))+"/"+e.type,e}function De(e){return"true/"===(e.type||"").slice(0,5)?e.type=e.type.slice(5):e.removeAttribute("type"),e}function Oe(e,t){var r,n,a,i,o,s;if(1===t.nodeType){if(V.hasData(e)&&(s=V.get(e).events))for(a in V.remove(t,"handle events"),s)for(r=0,n=s[a].length;r ([^|]|$)|»([^|]|$))","i"),Tu=new RegExp("(first|last|end)","i"),Cu=new RegExp("(prev|earl|old|new|<|«)","i");function Du(e){var t=e.links,k=e.articleUrl,E=e.baseUrl,M=e.parsedUrl,S=e.$,r=e.previousUrls,T=void 0===r?[]:r;M=M||Sn.parse(k);var C=new RegExp("^".concat(E),"i"),D=0 ' + func(text) + ' fred, barney, & pebbles").append(E.parseHTML(e)).find(n):e)}).always(r&&function(e,t){o.each(function(){r.apply(this,i||[e.responseText,t,e])})}),this},E.expr.pseudos.animated=function(t){return E.grep(E.timers,function(e){return t===e.elem}).length},E.offset={setOffset:function(e,t,r){var n,a,i,o,s,c,u=E.css(e,"position"),l=E(e),f={};"static"===u&&(e.style.position="relative"),s=l.offset(),i=E.css(e,"top"),c=E.css(e,"left"),a=("absolute"===u||"fixed"===u)&&-1<(i+c).indexOf("auto")?(o=(n=l.position()).top,n.left):(o=parseFloat(i)||0,parseFloat(c)||0),y(t)&&(t=t.call(e,r,E.extend({},s))),null!=t.top&&(f.top=t.top-s.top+o),null!=t.left&&(f.left=t.left-s.left+a),"using"in t?t.using.call(e,f):l.css(f)}},E.fn.extend({offset:function(t){if(arguments.length)return void 0===t?this:this.each(function(e){E.offset.setOffset(this,t,e)});var e,r,n=this[0];return n?n.getClientRects().length?(e=n.getBoundingClientRect(),r=n.ownerDocument.defaultView,{top:e.top+r.pageYOffset,left:e.left+r.pageXOffset}):{top:0,left:0}:void 0},position:function(){if(this[0]){var 
e,t,r,n=this[0],a={top:0,left:0};if("fixed"===E.css(n,"position"))t=n.getBoundingClientRect();else{for(t=this.offset(),r=n.ownerDocument,e=n.offsetParent||r.documentElement;e&&(e===r.body||e===r.documentElement)&&"static"===E.css(e,"position");)e=e.parentNode;e&&e!==n&&1===e.nodeType&&((a=E(e).offset()).top+=E.css(e,"borderTopWidth",!0),a.left+=E.css(e,"borderLeftWidth",!0))}return{top:t.top-a.top-E.css(n,"marginTop",!0),left:t.left-a.left-E.css(n,"marginLeft",!0)}}},offsetParent:function(){return this.map(function(){for(var e=this.offsetParent;e&&"static"===E.css(e,"position");)e=e.offsetParent;return e||re})}}),E.each({scrollLeft:"pageXOffset",scrollTop:"pageYOffset"},function(t,a){var i="pageYOffset"===a;E.fn[t]=function(e){return B(this,function(e,t,r){var n;if(_(e)?n=e:9===e.nodeType&&(n=e.defaultView),void 0===r)return n?n[a]:e[t];n?n.scrollTo(i?n.pageXOffset:r,i?r:n.pageYOffset):e[t]=r},t,e,arguments.length)}}),E.each(["top","left"],function(e,r){E.cssHooks[r]=We(v.pixelPosition,function(e,t){if(t)return t=Ye(e,r),je.test(t)?E(e).position()[r]+"px":t})}),E.each({Height:"height",Width:"width"},function(o,s){E.each({padding:"inner"+o,content:s,"":"outer"+o},function(n,i){E.fn[i]=function(e,t){var r=arguments.length&&(n||"boolean"!=typeof e),a=n||(!0===e||!0===t?"margin":"border");return B(this,function(e,t,r){var n;return _(e)?0===i.indexOf("outer")?e["inner"+o]:e.document.documentElement["client"+o]:9===e.nodeType?(n=e.documentElement,Math.max(e.body["scroll"+o],n["scroll"+o],e.body["offset"+o],n["offset"+o],n["client"+o])):void 0===r?E.css(e,t,a):E.style(e,t,r,a)},s,r?e:void 0,r)}})}),E.each(["ajaxStart","ajaxStop","ajaxComplete","ajaxError","ajaxSuccess","ajaxSend"],function(e,t){E.fn[t]=function(e){return this.on(t,e)}}),E.fn.extend({bind:function(e,t,r){return this.on(e,null,t,r)},unbind:function(e,t){return this.off(e,null,t)},delegate:function(e,t,r,n){return this.on(t,e,r,n)},undelegate:function(e,t,r){return 1===arguments.length?this.off(e,"**"):this.off(t,e||"**",r)},hover:function(e,t){return this.mouseenter(e).mouseleave(t||e)}}),E.each("blur focus focusin focusout resize scroll click dblclick mousedown mouseup mousemove mouseover mouseout mouseenter mouseleave change select submit keydown keypress keyup contextmenu".split(" "),function(e,r){E.fn[r]=function(e,t){return 0r.valueOf():r.valueOf()e.score?r:e},{score:-100});return 50<=u.score?u.href:null}},Nu=["og:url"];function Pu(e){return{url:e,domain:(t=e,Sn.parse(t).hostname)};var t}var ju={extract:function(e){var t=e.$,r=e.url,n=e.metaCache,a=t("link[rel=canonical]");if(0!==a.length){var i=a.attr("href");if(i)return Pu(i)}var o=Xo(t,Nu,n);return Pu(o||r)}},Ru={ellipse:"…",chars:[" ","-"],max:140,truncate:!0};var zu=function(e,t,r){if("string"!=typeof e||0===e.length)return"";if(0===t)return"";for(var n in r=r||{},Ru)null!==r[n]&&void 0!==r[n]||(r[n]=Ru[n]);return r.max=t||r.max,function(e,t,r,n,a){if(e.lengthPage ").concat(l,"
").concat(h.content)}),r=h.next_page_url,e.next=3;break;case 16:return p=Bu.word_count({content:"","
"],col:[2,"
"],tr:[2,"","
"],td:[3,"
"],_default:[0,"",""]});function C(e,t){var r=void 0!==e.getElementsByTagName?e.getElementsByTagName(t||"*"):void 0!==e.querySelectorAll?e.querySelectorAll(t||"*"):[];return void 0===t||t&&c(e,t)?k.merge([e],r):r}function ke(e,t){for(var r=0,n=e.length;r"," ").append(k.parseHTML(e)).find(n):e)}).always(r&&function(e,t){o.each(function(){r.apply(this,i||[e.responseText,t,e])})}),this},k.expr.pseudos.animated=function(t){return k.grep(k.timers,function(e){return t===e.elem}).length},k.offset={setOffset:function(e,t,r){var n,a,i,o,s=k.css(e,"position"),c=k(e),u={};"static"===s&&(e.style.position="relative"),i=c.offset(),n=k.css(e,"top"),o=k.css(e,"left"),s=("absolute"===s||"fixed"===s)&&-1<(n+o).indexOf("auto")?(a=(s=c.position()).top,s.left):(a=parseFloat(n)||0,parseFloat(o)||0),null!=(t=y(t)?t.call(e,r,k.extend({},i)):t).top&&(u.top=t.top-i.top+a),null!=t.left&&(u.left=t.left-i.left+s),"using"in t?t.using.call(e,u):c.css(u)}},k.fn.extend({offset:function(t){if(arguments.length)return void 0===t?this:this.each(function(e){k.offset.setOffset(this,t,e)});var e,r=this[0];return r?r.getClientRects().length?(e=r.getBoundingClientRect(),r=r.ownerDocument.defaultView,{top:e.top+r.pageYOffset,left:e.left+r.pageXOffset}):{top:0,left:0}:void 0},position:function(){if(this[0]){var e,t,r,n=this[0],a={top:0,left:0};if("fixed"===k.css(n,"position"))t=n.getBoundingClientRect();else{for(t=this.offset(),r=n.ownerDocument,e=n.offsetParent||r.documentElement;e&&(e===r.body||e===r.documentElement)&&"static"===k.css(e,"position");)e=e.parentNode;e&&e!==n&&1===e.nodeType&&((a=k(e).offset()).top+=k.css(e,"borderTopWidth",!0),a.left+=k.css(e,"borderLeftWidth",!0))}return{top:t.top-a.top-k.css(n,"marginTop",!0),left:t.left-a.left-k.css(n,"marginLeft",!0)}}},offsetParent:function(){return this.map(function(){for(var e=this.offsetParent;e&&"static"===k.css(e,"position");)e=e.offsetParent;return e||A})}}),k.each({scrollLeft:"pageXOffset",scrollTop:"pageYOffset"},function(t,a){var i="pageYOffset"===a;k.fn[t]=function(e){return f(this,function(e,t,r){var n;if(m(e)?n=e:9===e.nodeType&&(n=e.defaultView),void 0===r)return n?n[a]:e[t];n?n.scrollTo(i?n.pageXOffset:r,i?r:n.pageYOffset):e[t]=r},t,e,arguments.length)}}),k.each(["top","left"],function(e,r){k.cssHooks[r]=Xe(g.pixelPosition,function(e,t){if(t)return t=Je(e,r),Ve.test(t)?k(e).position()[r]+"px":t})}),k.each({Height:"height",Width:"width"},function(o,s){k.each({padding:"inner"+o,content:s,"":"outer"+o},function(n,i){k.fn[i]=function(e,t){var r=arguments.length&&(n||"boolean"!=typeof e),a=n||(!0===e||!0===t?"margin":"border");return f(this,function(e,t,r){var n;return m(e)?0===i.indexOf("outer")?e["inner"+o]:e.document.documentElement["client"+o]:9===e.nodeType?(n=e.documentElement,Math.max(e.body["scroll"+o],n["scroll"+o],e.body["offset"+o],n["offset"+o],n["client"+o])):void 0===r?k.css(e,t,a):k.style(e,t,r,a)},s,r?e:void 0,r)}})}),k.each(["ajaxStart","ajaxStop","ajaxComplete","ajaxError","ajaxSuccess","ajaxSend"],function(e,t){k.fn[t]=function(e){return this.on(t,e)}}),k.fn.extend({bind:function(e,t,r){return this.on(e,null,t,r)},unbind:function(e,t){return this.off(e,null,t)},delegate:function(e,t,r,n){return this.on(t,e,r,n)},undelegate:function(e,t,r){return 1===arguments.length?this.off(e,"**"):this.off(t,e||"**",r)},hover:function(e,t){return this.mouseenter(e).mouseleave(t||e)}}),k.each("blur focus focusin focusout resize scroll click dblclick mousedown mouseup mousemove mouseover mouseout mouseenter mouseleave change select submit keydown keypress keyup 
contextmenu".split(" "),function(e,r){k.fn[r]=function(e,t){return 0Page ").concat(u,"
").concat(f.content)}),r=f.next_page_url,e.next=3;break;case 16:return f=Gc.word_count({content:"\",\"
\"],col:[2,\"
\"],tr:[2,\"\",\"
\"],td:[3,\"
\"],_default:[0,\"\",\"\"]};function ve(e,t){var n;return n=\"undefined\"!=typeof e.getElementsByTagName?e.getElementsByTagName(t||\"*\"):\"undefined\"!=typeof e.querySelectorAll?e.querySelectorAll(t||\"*\"):[],void 0===t||t&&A(e,t)?S.merge([e],n):n}function ye(e,t){for(var n=0,r=e.length;n\",\" \").append(S.parseHTML(e)).find(r):e)}).always(n&&function(e,t){a.each(function(){n.apply(this,o||[e.responseText,t,e])})}),this},S.expr.pseudos.animated=function(t){return S.grep(S.timers,function(e){return t===e.elem}).length},S.offset={setOffset:function(e,t,n){var r,i,o,a,s,u,l=S.css(e,\"position\"),c=S(e),f={};\"static\"===l&&(e.style.position=\"relative\"),s=c.offset(),o=S.css(e,\"top\"),u=S.css(e,\"left\"),(\"absolute\"===l||\"fixed\"===l)&&-1<(o+u).indexOf(\"auto\")?(a=(r=c.position()).top,i=r.left):(a=parseFloat(o)||0,i=parseFloat(u)||0),m(t)&&(t=t.call(e,n,S.extend({},s))),null!=t.top&&(f.top=t.top-s.top+a),null!=t.left&&(f.left=t.left-s.left+i),\"using\"in t?t.using.call(e,f):c.css(f)}},S.fn.extend({offset:function(t){if(arguments.length)return void 0===t?this:this.each(function(e){S.offset.setOffset(this,t,e)});var e,n,r=this[0];return r?r.getClientRects().length?(e=r.getBoundingClientRect(),n=r.ownerDocument.defaultView,{top:e.top+n.pageYOffset,left:e.left+n.pageXOffset}):{top:0,left:0}:void 0},position:function(){if(this[0]){var e,t,n,r=this[0],i={top:0,left:0};if(\"fixed\"===S.css(r,\"position\"))t=r.getBoundingClientRect();else{t=this.offset(),n=r.ownerDocument,e=r.offsetParent||n.documentElement;while(e&&(e===n.body||e===n.documentElement)&&\"static\"===S.css(e,\"position\"))e=e.parentNode;e&&e!==r&&1===e.nodeType&&((i=S(e).offset()).top+=S.css(e,\"borderTopWidth\",!0),i.left+=S.css(e,\"borderLeftWidth\",!0))}return{top:t.top-i.top-S.css(r,\"marginTop\",!0),left:t.left-i.left-S.css(r,\"marginLeft\",!0)}}},offsetParent:function(){return this.map(function(){var e=this.offsetParent;while(e&&\"static\"===S.css(e,\"position\"))e=e.offsetParent;return e||re})}}),S.each({scrollLeft:\"pageXOffset\",scrollTop:\"pageYOffset\"},function(t,i){var o=\"pageYOffset\"===i;S.fn[t]=function(e){return $(this,function(e,t,n){var r;if(x(e)?r=e:9===e.nodeType&&(r=e.defaultView),void 0===n)return r?r[i]:e[t];r?r.scrollTo(o?r.pageXOffset:n,o?n:r.pageYOffset):e[t]=n},t,e,arguments.length)}}),S.each([\"top\",\"left\"],function(e,n){S.cssHooks[n]=Fe(y.pixelPosition,function(e,t){if(t)return t=We(e,n),Pe.test(t)?S(e).position()[n]+\"px\":t})}),S.each({Height:\"height\",Width:\"width\"},function(a,s){S.each({padding:\"inner\"+a,content:s,\"\":\"outer\"+a},function(r,o){S.fn[o]=function(e,t){var n=arguments.length&&(r||\"boolean\"!=typeof e),i=r||(!0===e||!0===t?\"margin\":\"border\");return $(this,function(e,t,n){var r;return x(e)?0===o.indexOf(\"outer\")?e[\"inner\"+a]:e.document.documentElement[\"client\"+a]:9===e.nodeType?(r=e.documentElement,Math.max(e.body[\"scroll\"+a],r[\"scroll\"+a],e.body[\"offset\"+a],r[\"offset\"+a],r[\"client\"+a])):void 0===n?S.css(e,t,i):S.style(e,t,n,i)},s,n?e:void 0,n)}})}),S.each([\"ajaxStart\",\"ajaxStop\",\"ajaxComplete\",\"ajaxError\",\"ajaxSuccess\",\"ajaxSend\"],function(e,t){S.fn[t]=function(e){return this.on(t,e)}}),S.fn.extend({bind:function(e,t,n){return this.on(e,null,t,n)},unbind:function(e,t){return this.off(e,null,t)},delegate:function(e,t,n,r){return this.on(t,e,n,r)},undelegate:function(e,t,n){return 1===arguments.length?this.off(e,\"**\"):this.off(t,e||\"**\",n)},hover:function(e,t){return this.mouseenter(e).mouseleave(t||e)}}),S.each(\"blur focus focusin 
focusout resize scroll click dblclick mousedown mouseup mousemove mouseover mouseout mouseenter mouseleave change select submit keydown keypress keyup contextmenu\".split(\" \"),function(e,n){S.fn[n]=function(e,t){return 0
tags\nexport const BR_TAGS_RE = new RegExp('(
]*>[ \\n\\r\\t]*){2,}', 'i');\n\n// Match 1 BR tag.\nexport const BR_TAG_RE = new RegExp('
]*>', 'i');\n\n// A list of all of the block level tags known in HTML5 and below. Taken from\n// http://bit.ly/qneNIT\nexport const BLOCK_LEVEL_TAGS = [\n 'article',\n 'aside',\n 'blockquote',\n 'body',\n 'br',\n 'button',\n 'canvas',\n 'caption',\n 'col',\n 'colgroup',\n 'dd',\n 'div',\n 'dl',\n 'dt',\n 'embed',\n 'fieldset',\n 'figcaption',\n 'figure',\n 'footer',\n 'form',\n 'h1',\n 'h2',\n 'h3',\n 'h4',\n 'h5',\n 'h6',\n 'header',\n 'hgroup',\n 'hr',\n 'li',\n 'map',\n 'object',\n 'ol',\n 'output',\n 'p',\n 'pre',\n 'progress',\n 'section',\n 'table',\n 'tbody',\n 'textarea',\n 'tfoot',\n 'th',\n 'thead',\n 'tr',\n 'ul',\n 'video',\n];\nexport const BLOCK_LEVEL_TAGS_RE = new RegExp(\n `^(${BLOCK_LEVEL_TAGS.join('|')})$`,\n 'i'\n);\n\n// The removal is implemented as a blacklist and whitelist, this test finds\n// blacklisted elements that aren't whitelisted. We do this all in one\n// expression-both because it's only one pass, and because this skips the\n// serialization for whitelisted nodes.\nconst candidatesBlacklist = UNLIKELY_CANDIDATES_BLACKLIST.join('|');\nexport const CANDIDATES_BLACKLIST = new RegExp(candidatesBlacklist, 'i');\n\nconst candidatesWhitelist = UNLIKELY_CANDIDATES_WHITELIST.join('|');\nexport const CANDIDATES_WHITELIST = new RegExp(candidatesWhitelist, 'i');\n\nexport const UNLIKELY_RE = new RegExp(\n `!(${candidatesWhitelist})|(${candidatesBlacklist})`,\n 'i'\n);\n\nexport const PARAGRAPH_SCORE_TAGS = new RegExp('^(p|li|span|pre)$', 'i');\nexport const CHILD_CONTENT_TAGS = new RegExp('^(td|blockquote|ol|ul|dl)$', 'i');\nexport const BAD_TAGS = new RegExp('^(address|form)$', 'i');\n\nexport const HTML_OR_BODY_RE = new RegExp('^(html|body)$', 'i');\n","import { paragraphize } from './index';\n\n// ## NOTES:\n// Another good candidate for refactoring/optimizing.\n// Very imperative code, I don't love it. - AP\n\n// Given cheerio object, convert consecutive
tags into\n// tags instead.\n//\n// :param $: A cheerio object\n\nexport default function brsToPs($) {\n let collapsing = false;\n $('br').each((index, element) => {\n const $element = $(element);\n const nextElement = $element.next().get(0);\n\n if (nextElement && nextElement.tagName.toLowerCase() === 'br') {\n collapsing = true;\n $element.remove();\n } else if (collapsing) {\n collapsing = false;\n paragraphize(element, $, true);\n }\n });\n\n return $;\n}\n","import { BLOCK_LEVEL_TAGS_RE } from './constants';\n\n// Given a node, turn it into a P if it is not already a P, and\n// make sure it conforms to the constraints of a P tag (I.E. does\n// not contain any other block tags.)\n//\n// If the node is a
, it treats the following inline siblings\n// as if they were its children.\n//\n// :param node: The node to paragraphize; this is a raw node\n// :param $: The cheerio object to handle dom manipulation\n// :param br: Whether or not the passed node is a br\n\nexport default function paragraphize(node, $, br = false) {\n const $node = $(node);\n\n if (br) {\n let sibling = node.nextSibling;\n const p = $('');\n\n // while the next node is text or not a block level element\n // append it to a new p node\n while (\n sibling &&\n !(sibling.tagName && BLOCK_LEVEL_TAGS_RE.test(sibling.tagName))\n ) {\n const { nextSibling } = sibling;\n $(sibling).appendTo(p);\n sibling = nextSibling;\n }\n\n $node.replaceWith(p);\n $node.remove();\n return $;\n }\n\n return $;\n}\n","import { brsToPs, convertNodeTo } from 'utils/dom';\n\nimport { DIV_TO_P_BLOCK_TAGS } from './constants';\n\nfunction convertDivs($) {\n $('div').each((index, div) => {\n const $div = $(div);\n const convertible = $div.children(DIV_TO_P_BLOCK_TAGS).length === 0;\n\n if (convertible) {\n convertNodeTo($div, $, 'p');\n }\n });\n\n return $;\n}\n\nfunction convertSpans($) {\n $('span').each((index, span) => {\n const $span = $(span);\n const convertible = $span.parents('p, div, li, figcaption').length === 0;\n if (convertible) {\n convertNodeTo($span, $, 'p');\n }\n });\n\n return $;\n}\n\n// Loop through the provided doc, and convert any p-like elements to\n// actual paragraph tags.\n//\n// Things fitting this criteria:\n// * Multiple consecutive
tags.\n// * tags without block level elements inside of them\n// * tags who are not children of or tags.\n//\n// :param $: A cheerio object to search\n// :return cheerio object with new p elements\n// (By-reference mutation, though. Returned just for convenience.)\n\nexport default function convertToParagraphs($) {\n $ = brsToPs($);\n $ = convertDivs($);\n $ = convertSpans($);\n\n return $;\n}\n","import { getAttrs } from 'utils/dom';\n\nexport default function convertNodeTo($node, $, tag = 'p') {\n const node = $node.get(0);\n if (!node) {\n return $;\n }\n const attrs = getAttrs(node) || {};\n\n const attribString = Reflect.ownKeys(attrs)\n .map(key => `${key}=${attrs[key]}`)\n .join(' ');\n let html;\n\n if ($.browser) {\n // In the browser, the contents of noscript tags aren't rendered, therefore\n // transforms on the noscript tag (commonly used for lazy-loading) don't work\n // as expected. This test case handles that\n html =\n node.tagName.toLowerCase() === 'noscript' ? $node.text() : $node.html();\n } else {\n html = $node.contents();\n }\n $node.replaceWith(`<${tag} ${attribString}>${html}${tag}>`);\n return $;\n}\n","import { SPACER_RE } from './constants';\n\nfunction cleanForHeight($img, $) {\n const height = parseInt($img.attr('height'), 10);\n const width = parseInt($img.attr('width'), 10) || 20;\n\n // Remove images that explicitly have very small heights or\n // widths, because they are most likely shims or icons,\n // which aren't very useful for reading.\n if ((height || 20) < 10 || width < 10) {\n $img.remove();\n } else if (height) {\n // Don't ever specify a height on images, so that we can\n // scale with respect to width without screwing up the\n // aspect ratio.\n $img.removeAttr('height');\n }\n\n return $;\n}\n\n// Cleans out images where the source string matches transparent/spacer/etc\n// TODO This seems very aggressive - AP\nfunction removeSpacers($img, $) {\n if (SPACER_RE.test($img.attr('src'))) {\n $img.remove();\n }\n\n return $;\n}\n\nexport default function cleanImages($article, $) {\n $article.find('img').each((index, img) => {\n const $img = $(img);\n\n cleanForHeight($img, $);\n removeSpacers($img, $);\n });\n\n return $;\n}\n","var _Array$isArray = require(\"../core-js/array/is-array\");\n\nfunction _arrayWithoutHoles(arr) {\n if (_Array$isArray(arr)) {\n for (var i = 0, arr2 = new Array(arr.length); i < arr.length; i++) {\n arr2[i] = arr[i];\n }\n\n return arr2;\n }\n}\n\nmodule.exports = _arrayWithoutHoles;","'use strict';\nvar $defineProperty = require('./_object-dp');\nvar createDesc = require('./_property-desc');\n\nmodule.exports = function (object, index, value) {\n if (index in object) $defineProperty.f(object, index, createDesc(0, value));\n else object[index] = value;\n};\n","'use strict';\nvar ctx = require('./_ctx');\nvar $export = require('./_export');\nvar toObject = require('./_to-object');\nvar call = require('./_iter-call');\nvar isArrayIter = require('./_is-array-iter');\nvar toLength = require('./_to-length');\nvar createProperty = require('./_create-property');\nvar getIterFn = require('./core.get-iterator-method');\n\n$export($export.S + $export.F * !require('./_iter-detect')(function (iter) { Array.from(iter); }), 'Array', {\n // 22.1.2.1 Array.from(arrayLike, mapfn = undefined, thisArg = undefined)\n from: function from(arrayLike /* , mapfn = undefined, thisArg = undefined */) {\n var O = toObject(arrayLike);\n var C = typeof this == 'function' ? this : Array;\n var aLen = arguments.length;\n var mapfn = aLen > 1 ? 
arguments[1] : undefined;\n var mapping = mapfn !== undefined;\n var index = 0;\n var iterFn = getIterFn(O);\n var length, result, step, iterator;\n if (mapping) mapfn = ctx(mapfn, aLen > 2 ? arguments[2] : undefined, 2);\n // if object isn't iterable or it's array with default iterator - use simple case\n if (iterFn != undefined && !(C == Array && isArrayIter(iterFn))) {\n for (iterator = iterFn.call(O), result = new C(); !(step = iterator.next()).done; index++) {\n createProperty(result, index, mapping ? call(iterator, mapfn, [step.value, index], true) : step.value);\n }\n } else {\n length = toLength(O.length);\n for (result = new C(length); length > index; index++) {\n createProperty(result, index, mapping ? mapfn(O[index], index) : O[index]);\n }\n }\n result.length = index;\n return result;\n }\n});\n","require('../../modules/es6.string.iterator');\nrequire('../../modules/es6.array.from');\nmodule.exports = require('../../modules/_core').Array.from;\n","var classof = require('./_classof');\nvar ITERATOR = require('./_wks')('iterator');\nvar Iterators = require('./_iterators');\nmodule.exports = require('./_core').isIterable = function (it) {\n var O = Object(it);\n return O[ITERATOR] !== undefined\n || '@@iterator' in O\n // eslint-disable-next-line no-prototype-builtins\n || Iterators.hasOwnProperty(classof(O));\n};\n","var _Array$from = require(\"../core-js/array/from\");\n\nvar _isIterable = require(\"../core-js/is-iterable\");\n\nfunction _iterableToArray(iter) {\n if (_isIterable(Object(iter)) || Object.prototype.toString.call(iter) === \"[object Arguments]\") return _Array$from(iter);\n}\n\nmodule.exports = _iterableToArray;","function _nonIterableSpread() {\n throw new TypeError(\"Invalid attempt to spread non-iterable instance\");\n}\n\nmodule.exports = _nonIterableSpread;","var arrayWithoutHoles = require(\"./arrayWithoutHoles\");\n\nvar iterableToArray = require(\"./iterableToArray\");\n\nvar nonIterableSpread = require(\"./nonIterableSpread\");\n\nfunction _toConsumableArray(arr) {\n return arrayWithoutHoles(arr) || iterableToArray(arr) || nonIterableSpread();\n}\n\nmodule.exports = _toConsumableArray;","import { getAttrs, setAttrs } from 'utils/dom';\n\nimport { WHITELIST_ATTRS_RE, KEEP_CLASS } from './constants';\n\nfunction removeAllButWhitelist($article, $) {\n $article.find('*').each((index, node) => {\n const attrs = getAttrs(node);\n\n setAttrs(\n node,\n Reflect.ownKeys(attrs).reduce((acc, attr) => {\n if (WHITELIST_ATTRS_RE.test(attr)) {\n return { ...acc, [attr]: attrs[attr] };\n }\n\n return acc;\n }, {})\n );\n });\n\n // Remove the mercury-parser-keep class from result\n $(`.${KEEP_CLASS}`, $article).removeClass(KEEP_CLASS);\n\n return $article;\n}\n\n// Remove attributes like style or align\nexport default function cleanAttributes($article, $) {\n // Grabbing the parent because at this point\n // $article will be wrapped in a div which will\n // have a score set on it.\n return removeAllButWhitelist(\n $article.parent().length ? $article.parent() : $article,\n $\n );\n}\n","export default function setAttrs(node, attrs) {\n if (node.attribs) {\n node.attribs = attrs;\n } else if (node.attributes) {\n while (node.attributes.length > 0) {\n node.removeAttribute(node.attributes[0].name);\n }\n\n Reflect.ownKeys(attrs).forEach(key => {\n node.setAttribute(key, attrs[key]);\n });\n }\n\n return node;\n}\n","// // CONTENT FETCHING CONSTANTS ////\n\n// A list of strings that can be considered unlikely candidates when\n// extracting content from a resource. 
These strings are joined together\n// and then tested for existence using re:test, so may contain simple,\n// non-pipe style regular expression queries if necessary.\nexport const UNLIKELY_CANDIDATES_BLACKLIST = [\n 'ad-break',\n 'adbox',\n 'advert',\n 'addthis',\n 'agegate',\n 'aux',\n 'blogger-labels',\n 'combx',\n 'comment',\n 'conversation',\n 'disqus',\n 'entry-unrelated',\n 'extra',\n 'foot',\n 'form',\n 'header',\n 'hidden',\n 'loader',\n 'login', // Note: This can hit 'blogindex'.\n 'menu',\n 'meta',\n 'nav',\n 'pager',\n 'pagination',\n 'predicta', // readwriteweb inline ad box\n 'presence_control_external', // lifehacker.com container full of false positives\n 'popup',\n 'printfriendly',\n 'related',\n 'remove',\n 'remark',\n 'rss',\n 'share',\n 'shoutbox',\n 'sidebar',\n 'sociable',\n 'sponsor',\n 'tools',\n];\n\n// A list of strings that can be considered LIKELY candidates when\n// extracting content from a resource. Essentially, the inverse of the\n// blacklist above - if something matches both blacklist and whitelist,\n// it is kept. This is useful, for example, if something has a className\n// of \"rss-content entry-content\". It matched 'rss', so it would normally\n// be removed, however, it's also the entry content, so it should be left\n// alone.\n//\n// These strings are joined together and then tested for existence using\n// re:test, so may contain simple, non-pipe style regular expression queries\n// if necessary.\nexport const UNLIKELY_CANDIDATES_WHITELIST = [\n 'and',\n 'article',\n 'body',\n 'blogindex',\n 'column',\n 'content',\n 'entry-content-asset',\n 'format', // misuse of form\n 'hfeed',\n 'hentry',\n 'hatom',\n 'main',\n 'page',\n 'posts',\n 'shadow',\n];\n\n// A list of tags which, if found inside, should cause a to NOT\n// be turned into a paragraph tag. Shallow div tags without these elements\n// should be turned into tags.\nexport const DIV_TO_P_BLOCK_TAGS = [\n 'a',\n 'blockquote',\n 'dl',\n 'div',\n 'img',\n 'p',\n 'pre',\n 'table',\n].join(',');\n\n// A list of tags that should be ignored when trying to find the top candidate\n// for a document.\nexport const NON_TOP_CANDIDATE_TAGS = [\n 'br',\n 'b',\n 'i',\n 'label',\n 'hr',\n 'area',\n 'base',\n 'basefont',\n 'input',\n 'img',\n 'link',\n 'meta',\n];\n\nexport const NON_TOP_CANDIDATE_TAGS_RE = new RegExp(\n `^(${NON_TOP_CANDIDATE_TAGS.join('|')})$`,\n 'i'\n);\n\n// A list of selectors that specify, very clearly, either hNews or other\n// very content-specific style content, like Blogger templates.\n// More examples here: http://microformats.org/wiki/blog-post-formats\nexport const HNEWS_CONTENT_SELECTORS = [\n ['.hentry', '.entry-content'],\n ['entry', '.entry-content'],\n ['.entry', '.entry_content'],\n ['.post', '.postbody'],\n ['.post', '.post_body'],\n ['.post', '.post-body'],\n];\n\nexport const PHOTO_HINTS = ['figure', 'photo', 'image', 'caption'];\nexport const PHOTO_HINTS_RE = new RegExp(PHOTO_HINTS.join('|'), 'i');\n\n// A list of strings that denote a positive scoring for this content as being\n// an article container. 
Checked against className and id.\n//\n// TODO: Perhaps have these scale based on their odds of being quality?\nexport const POSITIVE_SCORE_HINTS = [\n 'article',\n 'articlecontent',\n 'instapaper_body',\n 'blog',\n 'body',\n 'content',\n 'entry-content-asset',\n 'entry',\n 'hentry',\n 'main',\n 'Normal',\n 'page',\n 'pagination',\n 'permalink',\n 'post',\n 'story',\n 'text',\n '[-_]copy', // usatoday\n '\\\\Bcopy',\n];\n\n// The above list, joined into a matching regular expression\nexport const POSITIVE_SCORE_RE = new RegExp(\n POSITIVE_SCORE_HINTS.join('|'),\n 'i'\n);\n\n// Readability publisher-specific guidelines\nexport const READABILITY_ASSET = new RegExp('entry-content-asset', 'i');\n\n// A list of strings that denote a negative scoring for this content as being\n// an article container. Checked against className and id.\n//\n// TODO: Perhaps have these scale based on their odds of being quality?\nexport const NEGATIVE_SCORE_HINTS = [\n 'adbox',\n 'advert',\n 'author',\n 'bio',\n 'bookmark',\n 'bottom',\n 'byline',\n 'clear',\n 'com-',\n 'combx',\n 'comment',\n 'comment\\\\B',\n 'contact',\n 'copy',\n 'credit',\n 'crumb',\n 'date',\n 'deck',\n 'excerpt',\n 'featured', // tnr.com has a featured_content which throws us off\n 'foot',\n 'footer',\n 'footnote',\n 'graf',\n 'head',\n 'info',\n 'infotext', // newscientist.com copyright\n 'instapaper_ignore',\n 'jump',\n 'linebreak',\n 'link',\n 'masthead',\n 'media',\n 'meta',\n 'modal',\n 'outbrain', // slate.com junk\n 'promo',\n 'pr_', // autoblog - press release\n 'related',\n 'respond',\n 'roundcontent', // lifehacker restricted content warning\n 'scroll',\n 'secondary',\n 'share',\n 'shopping',\n 'shoutbox',\n 'side',\n 'sidebar',\n 'sponsor',\n 'stamp',\n 'sub',\n 'summary',\n 'tags',\n 'tools',\n 'widget',\n];\n// The above list, joined into a matching regular expression\nexport const NEGATIVE_SCORE_RE = new RegExp(\n NEGATIVE_SCORE_HINTS.join('|'),\n 'i'\n);\n\n// Match a digit. Pretty clear.\nexport const DIGIT_RE = new RegExp('[0-9]');\n\n// Match 2 or more consecutive
tags\nexport const BR_TAGS_RE = new RegExp('(
]*>[ \\n\\r\\t]*){2,}', 'i');\n\n// Match 1 BR tag.\nexport const BR_TAG_RE = new RegExp('
]*>', 'i');\n\n// A list of all of the block level tags known in HTML5 and below. Taken from\n// http://bit.ly/qneNIT\nexport const BLOCK_LEVEL_TAGS = [\n 'article',\n 'aside',\n 'blockquote',\n 'body',\n 'br',\n 'button',\n 'canvas',\n 'caption',\n 'col',\n 'colgroup',\n 'dd',\n 'div',\n 'dl',\n 'dt',\n 'embed',\n 'fieldset',\n 'figcaption',\n 'figure',\n 'footer',\n 'form',\n 'h1',\n 'h2',\n 'h3',\n 'h4',\n 'h5',\n 'h6',\n 'header',\n 'hgroup',\n 'hr',\n 'li',\n 'map',\n 'object',\n 'ol',\n 'output',\n 'p',\n 'pre',\n 'progress',\n 'section',\n 'table',\n 'tbody',\n 'textarea',\n 'tfoot',\n 'th',\n 'thead',\n 'tr',\n 'ul',\n 'video',\n];\nexport const BLOCK_LEVEL_TAGS_RE = new RegExp(\n `^(${BLOCK_LEVEL_TAGS.join('|')})$`,\n 'i'\n);\n\n// The removal is implemented as a blacklist and whitelist, this test finds\n// blacklisted elements that aren't whitelisted. We do this all in one\n// expression-both because it's only one pass, and because this skips the\n// serialization for whitelisted nodes.\nconst candidatesBlacklist = UNLIKELY_CANDIDATES_BLACKLIST.join('|');\nexport const CANDIDATES_BLACKLIST = new RegExp(candidatesBlacklist, 'i');\n\nconst candidatesWhitelist = UNLIKELY_CANDIDATES_WHITELIST.join('|');\nexport const CANDIDATES_WHITELIST = new RegExp(candidatesWhitelist, 'i');\n\nexport const UNLIKELY_RE = new RegExp(\n `!(${candidatesWhitelist})|(${candidatesBlacklist})`,\n 'i'\n);\n\nexport const PARAGRAPH_SCORE_TAGS = new RegExp('^(p|li|span|pre)$', 'i');\nexport const CHILD_CONTENT_TAGS = new RegExp('^(td|blockquote|ol|ul|dl)$', 'i');\nexport const BAD_TAGS = new RegExp('^(address|form)$', 'i');\n\nexport const HTML_OR_BODY_RE = new RegExp('^(html|body)$', 'i');\n","import {\n NEGATIVE_SCORE_RE,\n POSITIVE_SCORE_RE,\n PHOTO_HINTS_RE,\n READABILITY_ASSET,\n} from './constants';\n\n// Get the score of a node based on its className and id.\nexport default function getWeight(node) {\n const classes = node.attr('class');\n const id = node.attr('id');\n let score = 0;\n\n if (id) {\n // if id exists, try to score on both positive and negative\n if (POSITIVE_SCORE_RE.test(id)) {\n score += 25;\n }\n if (NEGATIVE_SCORE_RE.test(id)) {\n score -= 25;\n }\n }\n\n if (classes) {\n if (score === 0) {\n // if classes exist and id did not contribute to score\n // try to score on both positive and negative\n if (POSITIVE_SCORE_RE.test(classes)) {\n score += 25;\n }\n if (NEGATIVE_SCORE_RE.test(classes)) {\n score -= 25;\n }\n }\n\n // even if score has been set by id, add score for\n // possible photo matches\n // \"try to keep photos if we can\"\n if (PHOTO_HINTS_RE.test(classes)) {\n score += 10;\n }\n\n // add 25 if class matches entry-content-asset,\n // a class apparently instructed for use in the\n // Readability publisher guidelines\n // https://www.readability.com/developers/guidelines\n if (READABILITY_ASSET.test(classes)) {\n score += 25;\n }\n }\n\n return score;\n}\n","var $parseFloat = require('./_global').parseFloat;\nvar $trim = require('./_string-trim').trim;\n\nmodule.exports = 1 / $parseFloat(require('./_string-ws') + '-0') !== -Infinity ? function parseFloat(str) {\n var string = $trim(String(str), 3);\n var result = $parseFloat(string);\n return result === 0 && string.charAt(0) == '-' ? 
-0 : result;\n} : $parseFloat;\n","var $export = require('./_export');\nvar $parseFloat = require('./_parse-float');\n// 18.2.4 parseFloat(string)\n$export($export.G + $export.F * (parseFloat != $parseFloat), { parseFloat: $parseFloat });\n","require('../modules/es6.parse-float');\nmodule.exports = require('../modules/_core').parseFloat;\n","// returns the score of a node based on\n// the node's score attribute\n// returns null if no score set\nexport default function getScore($node) {\n return parseFloat($node.attr('score')) || null;\n}\n","// return 1 for every comma in text\nexport default function scoreCommas(text) {\n return (text.match(/,/g) || []).length;\n}\n","const idkRe = new RegExp('^(p|pre)$', 'i');\n\nexport default function scoreLength(textLength, tagName = 'p') {\n const chunks = textLength / 50;\n\n if (chunks > 0) {\n let lengthBonus;\n\n // No idea why p or pre are being tamped down here\n // but just following the source for now\n // Not even sure why tagName is included here,\n // since this is only being called from the context\n // of scoreParagraph\n if (idkRe.test(tagName)) {\n lengthBonus = chunks - 2;\n } else {\n lengthBonus = chunks - 1.25;\n }\n\n return Math.min(Math.max(lengthBonus, 0), 3);\n }\n\n return 0;\n}\n","import { scoreCommas, scoreLength } from './index';\n\n// Score a paragraph using various methods. Things like number of\n// commas, etc. Higher is better.\nexport default function scoreParagraph(node) {\n let score = 1;\n const text = node.text().trim();\n const textLength = text.length;\n\n // If this paragraph is less than 25 characters, don't count it.\n if (textLength < 25) {\n return 0;\n }\n\n // Add points for any commas within this paragraph\n score += scoreCommas(text);\n\n // For every 50 characters in this paragraph, add another point. Up\n // to 3 points.\n score += scoreLength(textLength);\n\n // Articles can end with short paragraphs when people are being clever\n // but they can also end with short paragraphs setting up lists of junk\n // that we strip. This negative tweaks junk setup paragraphs just below\n // the cutoff threshold.\n if (text.slice(-1) === ':') {\n score -= 1;\n }\n\n return score;\n}\n","export default function setScore($node, $, score) {\n $node.attr('score', score);\n return $node;\n}\n","import { getOrInitScore, setScore } from './index';\n\nexport default function addScore($node, $, amount) {\n try {\n const score = getOrInitScore($node, $) + amount;\n setScore($node, $, score);\n } catch (e) {\n // Ignoring; error occurs in scoreNode\n }\n\n return $node;\n}\n","import { getScore, scoreNode, getWeight, addToParent } from './index';\n\n// gets and returns the score if it exists\n// if not, initializes a score based on\n// the node's tag type\nexport default function getOrInitScore($node, $, weightNodes = true) {\n let score = getScore($node);\n\n if (score) {\n return score;\n }\n\n score = scoreNode($node);\n\n if (weightNodes) {\n score += getWeight($node);\n }\n\n addToParent($node, $, score);\n\n return score;\n}\n","import { addScore } from './index';\n\n// Adds 1/4 of a child's score to its parent\nexport default function addToParent(node, $, score) {\n const parent = node.parent();\n if (parent) {\n addScore(parent, $, score * 0.25);\n }\n\n return node;\n}\n","import { scoreParagraph } from './index';\nimport {\n PARAGRAPH_SCORE_TAGS,\n CHILD_CONTENT_TAGS,\n BAD_TAGS,\n} from './constants';\n\n// Score an individual node. 
Has some smarts for paragraphs, otherwise\n// just scores based on tag.\nexport default function scoreNode($node) {\n const { tagName } = $node.get(0);\n\n // TODO: Consider ordering by most likely.\n // E.g., if divs are a more common tag on a page,\n // Could save doing that regex test on every node – AP\n if (PARAGRAPH_SCORE_TAGS.test(tagName)) {\n return scoreParagraph($node);\n }\n if (tagName.toLowerCase() === 'div') {\n return 5;\n }\n if (CHILD_CONTENT_TAGS.test(tagName)) {\n return 3;\n }\n if (BAD_TAGS.test(tagName)) {\n return -3;\n }\n if (tagName.toLowerCase() === 'th') {\n return -5;\n }\n\n return 0;\n}\n","import { convertNodeTo } from 'utils/dom';\n\nimport { HNEWS_CONTENT_SELECTORS } from './constants';\nimport { scoreNode, setScore, getOrInitScore, addScore } from './index';\n\nfunction convertSpans($node, $) {\n if ($node.get(0)) {\n const { tagName } = $node.get(0);\n\n if (tagName === 'span') {\n // convert spans to divs\n convertNodeTo($node, $, 'div');\n }\n }\n}\n\nfunction addScoreTo($node, $, score) {\n if ($node) {\n convertSpans($node, $);\n addScore($node, $, score);\n }\n}\n\nfunction scorePs($, weightNodes) {\n $('p, pre')\n .not('[score]')\n .each((index, node) => {\n // The raw score for this paragraph, before we add any parent/child\n // scores.\n let $node = $(node);\n $node = setScore($node, $, getOrInitScore($node, $, weightNodes));\n\n const $parent = $node.parent();\n const rawScore = scoreNode($node);\n\n addScoreTo($parent, $, rawScore, weightNodes);\n if ($parent) {\n // Add half of the individual content score to the\n // grandparent\n addScoreTo($parent.parent(), $, rawScore / 2, weightNodes);\n }\n });\n\n return $;\n}\n\n// score content. Parents get the full value of their children's\n// content score, grandparents half\nexport default function scoreContent($, weightNodes = true) {\n // First, look for special hNews based selectors and give them a big\n // boost, if they exist\n HNEWS_CONTENT_SELECTORS.forEach(([parentSelector, childSelector]) => {\n $(`${parentSelector} ${childSelector}`).each((index, node) => {\n addScore($(node).parent(parentSelector), $, 80);\n });\n });\n\n // Doubling this again\n // Previous solution caused a bug\n // in which parents weren't retaining\n // scores. This is not ideal, and\n // should be fixed.\n scorePs($, weightNodes);\n scorePs($, weightNodes);\n\n return $;\n}\n","import { textLength, linkDensity } from 'utils/dom';\nimport { hasSentenceEnd } from 'utils/text';\n\nimport { NON_TOP_CANDIDATE_TAGS_RE } from './constants';\nimport { getScore } from './index';\n\n// Now that we have a top_candidate, look through the siblings of\n// it to see if any of them are decently scored. If they are, they\n// may be split parts of the content (Like two divs, a preamble and\n// a body.) 
Example:\n// http://articles.latimes.com/2009/oct/14/business/fi-bigtvs14\nexport default function mergeSiblings($candidate, topScore, $) {\n if (!$candidate.parent().length) {\n return $candidate;\n }\n\n const siblingScoreThreshold = Math.max(10, topScore * 0.25);\n const wrappingDiv = $('');\n\n $candidate\n .parent()\n .children()\n .each((index, sibling) => {\n const $sibling = $(sibling);\n // Ignore tags like BR, HR, etc\n if (NON_TOP_CANDIDATE_TAGS_RE.test(sibling.tagName)) {\n return null;\n }\n\n const siblingScore = getScore($sibling);\n if (siblingScore) {\n if ($sibling.get(0) === $candidate.get(0)) {\n wrappingDiv.append($sibling);\n } else {\n let contentBonus = 0;\n const density = linkDensity($sibling);\n\n // If sibling has a very low link density,\n // give it a small bonus\n if (density < 0.05) {\n contentBonus += 20;\n }\n\n // If sibling has a high link density,\n // give it a penalty\n if (density >= 0.5) {\n contentBonus -= 20;\n }\n\n // If sibling node has the same class as\n // candidate, give it a bonus\n if ($sibling.attr('class') === $candidate.attr('class')) {\n contentBonus += topScore * 0.2;\n }\n\n const newScore = siblingScore + contentBonus;\n\n if (newScore >= siblingScoreThreshold) {\n return wrappingDiv.append($sibling);\n }\n if (sibling.tagName === 'p') {\n const siblingContent = $sibling.text();\n const siblingContentLength = textLength(siblingContent);\n\n if (siblingContentLength > 80 && density < 0.25) {\n return wrappingDiv.append($sibling);\n }\n if (\n siblingContentLength <= 80 &&\n density === 0 &&\n hasSentenceEnd(siblingContent)\n ) {\n return wrappingDiv.append($sibling);\n }\n }\n }\n }\n\n return null;\n });\n\n if (\n wrappingDiv.children().length === 1 &&\n wrappingDiv\n .children()\n .first()\n .get(0) === $candidate.get(0)\n ) {\n return $candidate;\n }\n\n return wrappingDiv;\n}\n","import {\n getScore,\n setScore,\n getOrInitScore,\n scoreCommas,\n} from 'extractors/generic/content/scoring';\n\nimport { CLEAN_CONDITIONALLY_TAGS, KEEP_CLASS } from './constants';\nimport { normalizeSpaces } from '../text';\nimport { linkDensity } from './index';\n\nfunction removeUnlessContent($node, $, weight) {\n // Explicitly save entry-content-asset tags, which are\n // noted as valuable in the Publisher guidelines. For now\n // this works everywhere. We may want to consider making\n // this less of a sure-thing later.\n if ($node.hasClass('entry-content-asset')) {\n return;\n }\n\n const content = normalizeSpaces($node.text());\n\n if (scoreCommas(content) < 10) {\n const pCount = $('p', $node).length;\n const inputCount = $('input', $node).length;\n\n // Looks like a form, too many inputs.\n if (inputCount > pCount / 3) {\n $node.remove();\n return;\n }\n\n const contentLength = content.length;\n const imgCount = $('img', $node).length;\n\n // Content is too short, and there are no images, so\n // this is probably junk content.\n if (contentLength < 25 && imgCount === 0) {\n $node.remove();\n return;\n }\n\n const density = linkDensity($node);\n\n // Too high of link density, is probably a menu or\n // something similar.\n // console.log(weight, density, contentLength)\n if (weight < 25 && density > 0.2 && contentLength > 75) {\n $node.remove();\n return;\n }\n\n // Too high of a link density, despite the score being\n // high.\n if (weight >= 25 && density > 0.5) {\n // Don't remove the node if it's a list and the\n // previous sibling starts with a colon though. 
That\n // means it's probably content.\n const tagName = $node.get(0).tagName.toLowerCase();\n const nodeIsList = tagName === 'ol' || tagName === 'ul';\n if (nodeIsList) {\n const previousNode = $node.prev();\n if (\n previousNode &&\n normalizeSpaces(previousNode.text()).slice(-1) === ':'\n ) {\n return;\n }\n }\n\n $node.remove();\n return;\n }\n\n const scriptCount = $('script', $node).length;\n\n // Too many script tags, not enough content.\n if (scriptCount > 0 && contentLength < 150) {\n $node.remove();\n }\n }\n}\n\n// Given an article, clean it of some superfluous content specified by\n// tags. Things like forms, ads, etc.\n//\n// Tags is an array of tag name's to search through. (like div, form,\n// etc)\n//\n// Return this same doc.\nexport default function cleanTags($article, $) {\n $(CLEAN_CONDITIONALLY_TAGS, $article).each((index, node) => {\n const $node = $(node);\n // If marked to keep, skip it\n if ($node.hasClass(KEEP_CLASS) || $node.find(`.${KEEP_CLASS}`).length > 0)\n return;\n\n let weight = getScore($node);\n if (!weight) {\n weight = getOrInitScore($node, $);\n setScore($node, $, weight);\n }\n\n // drop node if its weight is < 0\n if (weight < 0) {\n $node.remove();\n } else {\n // deteremine if node seems like content\n removeUnlessContent($node, $, weight);\n }\n });\n\n return $;\n}\n","var isObject = require('./_is-object');\nmodule.exports = function (it, TYPE) {\n if (!isObject(it) || it._t !== TYPE) throw TypeError('Incompatible receiver, ' + TYPE + ' required!');\n return it;\n};\n","// 0 -> Array#forEach\n// 1 -> Array#map\n// 2 -> Array#filter\n// 3 -> Array#some\n// 4 -> Array#every\n// 5 -> Array#find\n// 6 -> Array#findIndex\nvar ctx = require('./_ctx');\nvar IObject = require('./_iobject');\nvar toObject = require('./_to-object');\nvar toLength = require('./_to-length');\nvar asc = require('./_array-species-create');\nmodule.exports = function (TYPE, $create) {\n var IS_MAP = TYPE == 1;\n var IS_FILTER = TYPE == 2;\n var IS_SOME = TYPE == 3;\n var IS_EVERY = TYPE == 4;\n var IS_FIND_INDEX = TYPE == 6;\n var NO_HOLES = TYPE == 5 || IS_FIND_INDEX;\n var create = $create || asc;\n return function ($this, callbackfn, that) {\n var O = toObject($this);\n var self = IObject(O);\n var f = ctx(callbackfn, that, 3);\n var length = toLength(self.length);\n var index = 0;\n var result = IS_MAP ? create($this, length) : IS_FILTER ? create($this, 0) : undefined;\n var val, res;\n for (;length > index; index++) if (NO_HOLES || index in self) {\n val = self[index];\n res = f(val, index, O);\n if (TYPE) {\n if (IS_MAP) result[index] = res; // map\n else if (res) switch (TYPE) {\n case 3: return true; // some\n case 5: return val; // find\n case 6: return index; // findIndex\n case 2: result.push(val); // filter\n } else if (IS_EVERY) return false; // every\n }\n }\n return IS_FIND_INDEX ? -1 : IS_SOME || IS_EVERY ? 
IS_EVERY : result;\n };\n};\n","'use strict';\nvar global = require('./_global');\nvar $export = require('./_export');\nvar meta = require('./_meta');\nvar fails = require('./_fails');\nvar hide = require('./_hide');\nvar redefineAll = require('./_redefine-all');\nvar forOf = require('./_for-of');\nvar anInstance = require('./_an-instance');\nvar isObject = require('./_is-object');\nvar setToStringTag = require('./_set-to-string-tag');\nvar dP = require('./_object-dp').f;\nvar each = require('./_array-methods')(0);\nvar DESCRIPTORS = require('./_descriptors');\n\nmodule.exports = function (NAME, wrapper, methods, common, IS_MAP, IS_WEAK) {\n var Base = global[NAME];\n var C = Base;\n var ADDER = IS_MAP ? 'set' : 'add';\n var proto = C && C.prototype;\n var O = {};\n if (!DESCRIPTORS || typeof C != 'function' || !(IS_WEAK || proto.forEach && !fails(function () {\n new C().entries().next();\n }))) {\n // create collection constructor\n C = common.getConstructor(wrapper, NAME, IS_MAP, ADDER);\n redefineAll(C.prototype, methods);\n meta.NEED = true;\n } else {\n C = wrapper(function (target, iterable) {\n anInstance(target, C, NAME, '_c');\n target._c = new Base();\n if (iterable != undefined) forOf(iterable, IS_MAP, target[ADDER], target);\n });\n each('add,clear,delete,forEach,get,has,set,keys,values,entries,toJSON'.split(','), function (KEY) {\n var IS_ADDER = KEY == 'add' || KEY == 'set';\n if (KEY in proto && !(IS_WEAK && KEY == 'clear')) hide(C.prototype, KEY, function (a, b) {\n anInstance(this, C, KEY);\n if (!IS_ADDER && IS_WEAK && !isObject(a)) return KEY == 'get' ? undefined : false;\n var result = this._c[KEY](a === 0 ? 0 : a, b);\n return IS_ADDER ? this : result;\n });\n });\n IS_WEAK || dP(C.prototype, 'size', {\n get: function () {\n return this._c.size;\n }\n });\n }\n\n setToStringTag(C, NAME);\n\n O[NAME] = C;\n $export($export.G + $export.W + $export.F, O);\n\n if (!IS_WEAK) common.setStrong(C, NAME, IS_MAP);\n\n return C;\n};\n","// https://github.com/DavidBruant/Map-Set.prototype.toJSON\nvar classof = require('./_classof');\nvar from = require('./_array-from-iterable');\nmodule.exports = function (NAME) {\n return function toJSON() {\n if (classof(this) != NAME) throw TypeError(NAME + \"#toJSON isn't generic\");\n return from(this);\n };\n};\n","'use strict';\nvar dP = require('./_object-dp').f;\nvar create = require('./_object-create');\nvar redefineAll = require('./_redefine-all');\nvar ctx = require('./_ctx');\nvar anInstance = require('./_an-instance');\nvar forOf = require('./_for-of');\nvar $iterDefine = require('./_iter-define');\nvar step = require('./_iter-step');\nvar setSpecies = require('./_set-species');\nvar DESCRIPTORS = require('./_descriptors');\nvar fastKey = require('./_meta').fastKey;\nvar validate = require('./_validate-collection');\nvar SIZE = DESCRIPTORS ? 
'_s' : 'size';\n\nvar getEntry = function (that, key) {\n // fast case\n var index = fastKey(key);\n var entry;\n if (index !== 'F') return that._i[index];\n // frozen object case\n for (entry = that._f; entry; entry = entry.n) {\n if (entry.k == key) return entry;\n }\n};\n\nmodule.exports = {\n getConstructor: function (wrapper, NAME, IS_MAP, ADDER) {\n var C = wrapper(function (that, iterable) {\n anInstance(that, C, NAME, '_i');\n that._t = NAME; // collection type\n that._i = create(null); // index\n that._f = undefined; // first entry\n that._l = undefined; // last entry\n that[SIZE] = 0; // size\n if (iterable != undefined) forOf(iterable, IS_MAP, that[ADDER], that);\n });\n redefineAll(C.prototype, {\n // 23.1.3.1 Map.prototype.clear()\n // 23.2.3.2 Set.prototype.clear()\n clear: function clear() {\n for (var that = validate(this, NAME), data = that._i, entry = that._f; entry; entry = entry.n) {\n entry.r = true;\n if (entry.p) entry.p = entry.p.n = undefined;\n delete data[entry.i];\n }\n that._f = that._l = undefined;\n that[SIZE] = 0;\n },\n // 23.1.3.3 Map.prototype.delete(key)\n // 23.2.3.4 Set.prototype.delete(value)\n 'delete': function (key) {\n var that = validate(this, NAME);\n var entry = getEntry(that, key);\n if (entry) {\n var next = entry.n;\n var prev = entry.p;\n delete that._i[entry.i];\n entry.r = true;\n if (prev) prev.n = next;\n if (next) next.p = prev;\n if (that._f == entry) that._f = next;\n if (that._l == entry) that._l = prev;\n that[SIZE]--;\n } return !!entry;\n },\n // 23.2.3.6 Set.prototype.forEach(callbackfn, thisArg = undefined)\n // 23.1.3.5 Map.prototype.forEach(callbackfn, thisArg = undefined)\n forEach: function forEach(callbackfn /* , that = undefined */) {\n validate(this, NAME);\n var f = ctx(callbackfn, arguments.length > 1 ? arguments[1] : undefined, 3);\n var entry;\n while (entry = entry ? entry.n : this._f) {\n f(entry.v, entry.k, this);\n // revert to the last existing entry\n while (entry && entry.r) entry = entry.p;\n }\n },\n // 23.1.3.7 Map.prototype.has(key)\n // 23.2.3.7 Set.prototype.has(value)\n has: function has(key) {\n return !!getEntry(validate(this, NAME), key);\n }\n });\n if (DESCRIPTORS) dP(C.prototype, 'size', {\n get: function () {\n return validate(this, NAME)[SIZE];\n }\n });\n return C;\n },\n def: function (that, key, value) {\n var entry = getEntry(that, key);\n var prev, index;\n // change existing entry\n if (entry) {\n entry.v = value;\n // create new entry\n } else {\n that._l = entry = {\n i: index = fastKey(key, true), // <- index\n k: key, // <- key\n v: value, // <- value\n p: prev = that._l, // <- previous entry\n n: undefined, // <- next entry\n r: false // <- removed\n };\n if (!that._f) that._f = entry;\n if (prev) prev.n = entry;\n that[SIZE]++;\n // add to index\n if (index !== 'F') that._i[index] = entry;\n } return that;\n },\n getEntry: getEntry,\n setStrong: function (C, NAME, IS_MAP) {\n // add .keys, .values, .entries, [@@iterator]\n // 23.1.3.4, 23.1.3.8, 23.1.3.11, 23.1.3.12, 23.2.3.5, 23.2.3.8, 23.2.3.10, 23.2.3.11\n $iterDefine(C, NAME, function (iterated, kind) {\n this._t = validate(iterated, NAME); // target\n this._k = kind; // kind\n this._l = undefined; // previous\n }, function () {\n var that = this;\n var kind = that._k;\n var entry = that._l;\n // revert to the last existing entry\n while (entry && entry.r) entry = entry.p;\n // get next entry\n if (!that._t || !(that._l = entry = entry ? 
entry.n : that._t._f)) {\n // or finish the iteration\n that._t = undefined;\n return step(1);\n }\n // return step by kind\n if (kind == 'keys') return step(0, entry.k);\n if (kind == 'values') return step(0, entry.v);\n return step(0, [entry.k, entry.v]);\n }, IS_MAP ? 'entries' : 'values', !IS_MAP, true);\n\n // add [@@species], 23.1.2.2, 23.2.2.2\n setSpecies(NAME);\n }\n};\n","var isObject = require('./_is-object');\nvar isArray = require('./_is-array');\nvar SPECIES = require('./_wks')('species');\n\nmodule.exports = function (original) {\n var C;\n if (isArray(original)) {\n C = original.constructor;\n // cross-realm fallback\n if (typeof C == 'function' && (C === Array || isArray(C.prototype))) C = undefined;\n if (isObject(C)) {\n C = C[SPECIES];\n if (C === null) C = undefined;\n }\n } return C === undefined ? Array : C;\n};\n","// 9.4.2.3 ArraySpeciesCreate(originalArray, length)\nvar speciesConstructor = require('./_array-species-constructor');\n\nmodule.exports = function (original, length) {\n return new (speciesConstructor(original))(length);\n};\n","'use strict';\nvar strong = require('./_collection-strong');\nvar validate = require('./_validate-collection');\nvar SET = 'Set';\n\n// 23.2 Set Objects\nmodule.exports = require('./_collection')(SET, function (get) {\n return function Set() { return get(this, arguments.length > 0 ? arguments[0] : undefined); };\n}, {\n // 23.2.3.1 Set.prototype.add(value)\n add: function add(value) {\n return strong.def(validate(this, SET), value = value === 0 ? 0 : value, value);\n }\n}, strong);\n","// https://github.com/DavidBruant/Map-Set.prototype.toJSON\nvar $export = require('./_export');\n\n$export($export.P + $export.R, 'Set', { toJSON: require('./_collection-to-json')('Set') });\n","var forOf = require('./_for-of');\n\nmodule.exports = function (iter, ITERATOR) {\n var result = [];\n forOf(iter, false, result.push, result, ITERATOR);\n return result;\n};\n","'use strict';\n// https://tc39.github.io/proposal-setmap-offrom/\nvar $export = require('./_export');\n\nmodule.exports = function (COLLECTION) {\n $export($export.S, COLLECTION, { of: function of() {\n var length = arguments.length;\n var A = new Array(length);\n while (length--) A[length] = arguments[length];\n return new this(A);\n } });\n};\n","// https://tc39.github.io/proposal-setmap-offrom/#sec-set.of\nrequire('./_set-collection-of')('Set');\n","'use strict';\n// https://tc39.github.io/proposal-setmap-offrom/\nvar $export = require('./_export');\nvar aFunction = require('./_a-function');\nvar ctx = require('./_ctx');\nvar forOf = require('./_for-of');\n\nmodule.exports = function (COLLECTION) {\n $export($export.S, COLLECTION, { from: function from(source /* , mapFn, thisArg */) {\n var mapFn = arguments[1];\n var mapping, A, n, cb;\n aFunction(this);\n mapping = mapFn !== undefined;\n if (mapping) aFunction(mapFn);\n if (source == undefined) return new this();\n A = [];\n if (mapping) {\n n = 0;\n cb = ctx(mapFn, arguments[2], 2);\n forOf(source, false, function (nextItem) {\n A.push(cb(nextItem, n++));\n });\n } else {\n forOf(source, false, A.push, A);\n }\n return new this(A);\n } });\n};\n","// 
https://tc39.github.io/proposal-setmap-offrom/#sec-set.from\nrequire('./_set-collection-from')('Set');\n","require('../modules/es6.object.to-string');\nrequire('../modules/es6.string.iterator');\nrequire('../modules/web.dom.iterable');\nrequire('../modules/es6.set');\nrequire('../modules/es7.set.to-json');\nrequire('../modules/es7.set.of');\nrequire('../modules/es7.set.from');\nmodule.exports = require('../modules/_core').Set;\n","import URL from 'url';\n\nimport { getAttrs, setAttr } from 'utils/dom';\n\nfunction absolutize($, rootUrl, attr) {\n const baseUrl = $('base').attr('href');\n\n $(`[${attr}]`).each((_, node) => {\n const attrs = getAttrs(node);\n const url = attrs[attr];\n if (!url) return;\n const absoluteUrl = URL.resolve(baseUrl || rootUrl, url);\n\n setAttr(node, attr, absoluteUrl);\n });\n}\n\nfunction absolutizeSet($, rootUrl, $content) {\n $('[srcset]', $content).each((_, node) => {\n const attrs = getAttrs(node);\n const urlSet = attrs.srcset;\n\n if (urlSet) {\n // a comma should be considered part of the candidate URL unless preceded by a descriptor\n // descriptors can only contain positive numbers followed immediately by either 'w' or 'x'\n // space characters inside the URL should be encoded (%20 or +)\n const candidates = urlSet.match(\n /(?:\\s*)(\\S+(?:\\s*[\\d.]+[wx])?)(?:\\s*,\\s*)?/g\n );\n if (!candidates) return;\n const absoluteCandidates = candidates.map(candidate => {\n // a candidate URL cannot start or end with a comma\n // descriptors are separated from the URLs by unescaped whitespace\n const parts = candidate\n .trim()\n .replace(/,$/, '')\n .split(/\\s+/);\n parts[0] = URL.resolve(rootUrl, parts[0]);\n return parts.join(' ');\n });\n const absoluteUrlSet = [...new Set(absoluteCandidates)].join(', ');\n setAttr(node, 'srcset', absoluteUrlSet);\n }\n });\n}\n\nexport default function makeLinksAbsolute($content, $, url) {\n ['href', 'src'].forEach(attr => absolutize($, url, attr));\n absolutizeSet($, url, $content);\n\n return $content;\n}\n","export function textLength(text) {\n return text.trim().replace(/\\s+/g, ' ').length;\n}\n\n// Determines what percentage of the text\n// in a node is link text\n// Takes a node, returns a float\nexport function linkDensity($node) {\n const totalTextLength = textLength($node.text());\n\n const linkText = $node.find('a').text();\n const linkLength = textLength(linkText);\n\n if (totalTextLength > 0) {\n return linkLength / totalTextLength;\n }\n if (totalTextLength === 0 && linkLength > 0) {\n return 1;\n }\n\n return 0;\n}\n","require('../../modules/es6.string.iterator');\nrequire('../../modules/web.dom.iterable');\nmodule.exports = require('../../modules/_wks-ext').f('iterator');\n","require('./_wks-define')('asyncIterator');\n","require('./_wks-define')('observable');\n","require('../../modules/es6.symbol');\nrequire('../../modules/es6.object.to-string');\nrequire('../../modules/es7.symbol.async-iterator');\nrequire('../../modules/es7.symbol.observable');\nmodule.exports = require('../../modules/_core').Symbol;\n","var _Symbol$iterator = require(\"../core-js/symbol/iterator\");\n\nvar _Symbol = require(\"../core-js/symbol\");\n\nfunction _typeof2(obj) { if (typeof _Symbol === \"function\" && typeof _Symbol$iterator === \"symbol\") { _typeof2 = function _typeof2(obj) { return typeof obj; }; } else { _typeof2 = function _typeof2(obj) { return obj && typeof _Symbol === \"function\" && obj.constructor === _Symbol && obj !== _Symbol.prototype ? 
\"symbol\" : typeof obj; }; } return _typeof2(obj); }\n\nfunction _typeof(obj) {\n if (typeof _Symbol === \"function\" && _typeof2(_Symbol$iterator) === \"symbol\") {\n module.exports = _typeof = function _typeof(obj) {\n return _typeof2(obj);\n };\n } else {\n module.exports = _typeof = function _typeof(obj) {\n return obj && typeof _Symbol === \"function\" && obj.constructor === _Symbol && obj !== _Symbol.prototype ? \"symbol\" : _typeof2(obj);\n };\n }\n\n return _typeof(obj);\n}\n\nmodule.exports = _typeof;","import { stripTags } from 'utils/dom';\n\n// Given a node type to search for, and a list of meta tag names to\n// search for, find a meta tag associated.\nexport default function extractFromMeta(\n $,\n metaNames,\n cachedNames,\n cleanTags = true\n) {\n const foundNames = metaNames.filter(name => cachedNames.indexOf(name) !== -1);\n\n // eslint-disable-next-line no-restricted-syntax\n for (const name of foundNames) {\n const type = 'name';\n const value = 'value';\n\n const nodes = $(`meta[${type}=\"${name}\"]`);\n\n // Get the unique value of every matching node, in case there\n // are two meta tags with the same name and value.\n // Remove empty values.\n const values = nodes\n .map((index, node) => $(node).attr(value))\n .toArray()\n .filter(text => text !== '');\n\n // If we have more than one value for the same name, we have a\n // conflict and can't trust any of them. Skip this name. If we have\n // zero, that means our meta tags had no values. Skip this name\n // also.\n if (values.length === 1) {\n let metaValue;\n // Meta values that contain HTML should be stripped, as they\n // weren't subject to cleaning previously.\n if (cleanTags) {\n metaValue = stripTags(values[0], $);\n } else {\n [metaValue] = values;\n }\n\n return metaValue;\n }\n }\n\n // If nothing is found, return null\n return null;\n}\n","import { withinComment } from 'utils/dom';\n\nfunction isGoodNode($node, maxChildren) {\n // If it has a number of children, it's more likely a container\n // element. Skip it.\n if ($node.children().length > maxChildren) {\n return false;\n }\n // If it looks to be within a comment, skip it.\n if (withinComment($node)) {\n return false;\n }\n\n return true;\n}\n\n// Given a a list of selectors find content that may\n// be extractable from the document. This is for flat\n// meta-information, like author, title, date published, etc.\nexport default function extractFromSelectors(\n $,\n selectors,\n maxChildren = 1,\n textOnly = true\n) {\n // eslint-disable-next-line no-restricted-syntax\n for (const selector of selectors) {\n const nodes = $(selector);\n\n // If we didn't get exactly one of this selector, this may be\n // a list of articles or comments. 
Skip it.\n if (nodes.length === 1) {\n const $node = $(nodes[0]);\n\n if (isGoodNode($node, maxChildren)) {\n let content;\n if (textOnly) {\n content = $node.text();\n } else {\n content = $node.html();\n }\n\n if (content) {\n return content;\n }\n }\n }\n }\n\n return null;\n}\n","import { getAttrs } from 'utils/dom';\n\nexport default function withinComment($node) {\n const parents = $node.parents().toArray();\n const commentParent = parents.find(parent => {\n const attrs = getAttrs(parent);\n const { class: nodeClass, id } = attrs;\n const classAndId = `${nodeClass} ${id}`;\n return classAndId.includes('comment');\n });\n\n return commentParent !== undefined;\n}\n","// strips all tags from a string of text\nexport default function stripTags(text, $) {\n // Wrapping text in html element prevents errors when text\n // has no html\n const cleanText = $(`${text}`).text();\n return cleanText === '' ? text : cleanText;\n}\n","// Given a node, determine if it's article-like enough to return\n// param: node (a cheerio node)\n// return: boolean\n\nexport default function nodeIsSufficient($node) {\n return $node.text().trim().length >= 100;\n}\n","export default function getAttrs(node) {\n const { attribs, attributes } = node;\n\n if (!attribs && attributes) {\n const attrs = Reflect.ownKeys(attributes).reduce((acc, index) => {\n const attr = attributes[index];\n\n if (!attr.name || !attr.value) return acc;\n\n acc[attr.name] = attr.value;\n return acc;\n }, {});\n return attrs;\n }\n\n return attribs;\n}\n","export default function setAttr(node, attr, val) {\n if (node.attribs) {\n node.attribs[attr] = val;\n } else if (node.attributes) {\n node.setAttribute(attr, val);\n }\n\n return node;\n}\n","export const IS_LINK = new RegExp('https?://', 'i');\nconst IMAGE_RE = '.(png|gif|jpe?g)';\nexport const IS_IMAGE = new RegExp(`${IMAGE_RE}`, 'i');\nexport const IS_SRCSET = new RegExp(\n `${IMAGE_RE}(\\\\?\\\\S+)?(\\\\s*[\\\\d.]+[wx])`,\n 'i'\n);\n\nexport const TAGS_TO_REMOVE = ['script', 'style', 'form'].join(',');\n","import { getAttrs } from 'utils/dom';\n\nimport { IS_LINK, IS_IMAGE, IS_SRCSET } from './constants';\n\n// Convert all instances of images with potentially\n// lazy loaded images into normal images.\n// Many sites will have img tags with no source, or an image tag with a src\n// attribute that a is a placeholer. We need to be able to properly fill in\n// the src attribute so the images are no longer lazy loaded.\nexport default function convertLazyLoadedImages($) {\n const extractSrcFromJSON = str => {\n try {\n const { src } = JSON.parse(str);\n if (typeof src === 'string') return src;\n } catch (_) {\n return false;\n }\n\n return false;\n };\n\n $('img').each((_, img) => {\n const attrs = getAttrs(img);\n\n Reflect.ownKeys(attrs).forEach(attr => {\n const value = attrs[attr];\n\n if (attr !== 'srcset' && IS_LINK.test(value) && IS_SRCSET.test(value)) {\n $(img).attr('srcset', value);\n } else if (\n attr !== 'src' &&\n attr !== 'srcset' &&\n IS_LINK.test(value) &&\n IS_IMAGE.test(value)\n ) {\n // Is the value a JSON object? 
If so, we should attempt to extract the image src from the data.\n const existingSrc = extractSrcFromJSON(value);\n if (existingSrc) {\n $(img).attr('src', existingSrc);\n } else {\n $(img).attr('src', value);\n }\n }\n });\n });\n\n return $;\n}\n","import { TAGS_TO_REMOVE } from './constants';\n\nfunction isComment(index, node) {\n return node.type === 'comment';\n}\n\nfunction cleanComments($) {\n $.root()\n .find('*')\n .contents()\n .filter(isComment)\n .remove();\n\n return $;\n}\n\nexport default function clean($) {\n $(TAGS_TO_REMOVE).remove();\n\n $ = cleanComments($);\n return $;\n}\n","import cheerio from 'cheerio';\nimport iconv from 'iconv-lite';\n\nimport { getEncoding } from 'utils/text';\nimport { fetchResource } from './utils';\nimport { normalizeMetaTags, convertLazyLoadedImages, clean } from './utils/dom';\n\nconst Resource = {\n // Create a Resource.\n //\n // :param url: The URL for the document we should retrieve.\n // :param response: If set, use as the response rather than\n // attempting to fetch it ourselves. Expects a\n // string.\n // :param headers: Custom headers to be included in the request\n async create(url, preparedResponse, parsedUrl, headers = {}) {\n let result;\n\n if (preparedResponse) {\n const validResponse = {\n statusMessage: 'OK',\n statusCode: 200,\n headers: {\n 'content-type': 'text/html',\n 'content-length': 500,\n },\n };\n\n result = {\n body: preparedResponse,\n response: validResponse,\n alreadyDecoded: true,\n };\n } else {\n result = await fetchResource(url, parsedUrl, headers);\n }\n\n if (result.error) {\n result.failed = true;\n return result;\n }\n\n return this.generateDoc(result);\n },\n\n generateDoc({ body: content, response, alreadyDecoded = false }) {\n const { 'content-type': contentType = '' } = response.headers;\n\n // TODO: Implement is_text function from\n // https://github.com/ReadabilityHoldings/readability/blob/8dc89613241d04741ebd42fa9fa7df1b1d746303/readability/utils/text.py#L57\n if (!contentType.includes('html') && !contentType.includes('text')) {\n throw new Error('Content does not appear to be text.');\n }\n\n let $ = this.encodeDoc({ content, contentType, alreadyDecoded });\n\n if ($.root().children().length === 0) {\n throw new Error('No children, likely a bad parse.');\n }\n\n $ = normalizeMetaTags($);\n $ = convertLazyLoadedImages($);\n $ = clean($);\n\n return $;\n },\n\n encodeDoc({ content, contentType, alreadyDecoded = false }) {\n if (alreadyDecoded) {\n return cheerio.load(content);\n }\n\n const encoding = getEncoding(contentType);\n let decodedContent = iconv.decode(content, encoding);\n let $ = cheerio.load(decodedContent);\n // after first cheerio.load, check to see if encoding matches\n const contentTypeSelector = cheerio.browser\n ? 
'meta[http-equiv=content-type]'\n : 'meta[http-equiv=content-type i]';\n const metaContentType =\n $(contentTypeSelector).attr('content') ||\n $('meta[charset]').attr('charset');\n const properEncoding = getEncoding(metaContentType);\n\n // if encodings in the header/body dont match, use the one in the body\n if (metaContentType && properEncoding !== encoding) {\n decodedContent = iconv.decode(content, properEncoding);\n $ = cheerio.load(decodedContent);\n }\n\n return $;\n },\n};\n\nexport default Resource;\n","export default function* range(start = 1, end = 1) {\n while (start <= end) {\n yield (start += 1);\n }\n}\n","'use strict';\n// 19.1.2.1 Object.assign(target, source, ...)\nvar getKeys = require('./_object-keys');\nvar gOPS = require('./_object-gops');\nvar pIE = require('./_object-pie');\nvar toObject = require('./_to-object');\nvar IObject = require('./_iobject');\nvar $assign = Object.assign;\n\n// should work with symbols and should have deterministic property order (V8 bug)\nmodule.exports = !$assign || require('./_fails')(function () {\n var A = {};\n var B = {};\n // eslint-disable-next-line no-undef\n var S = Symbol();\n var K = 'abcdefghijklmnopqrst';\n A[S] = 7;\n K.split('').forEach(function (k) { B[k] = k; });\n return $assign({}, A)[S] != 7 || Object.keys($assign({}, B)).join('') != K;\n}) ? function assign(target, source) { // eslint-disable-line no-unused-vars\n var T = toObject(target);\n var aLen = arguments.length;\n var index = 1;\n var getSymbols = gOPS.f;\n var isEnum = pIE.f;\n while (aLen > index) {\n var S = IObject(arguments[index++]);\n var keys = getSymbols ? getKeys(S).concat(getSymbols(S)) : getKeys(S);\n var length = keys.length;\n var j = 0;\n var key;\n while (length > j) if (isEnum.call(S, key = keys[j++])) T[key] = S[key];\n } return T;\n} : $assign;\n","// 19.1.3.1 Object.assign(target, source)\nvar $export = require('./_export');\n\n$export($export.S + $export.F, 'Object', { assign: require('./_object-assign') });\n","require('../../modules/es6.object.assign');\nmodule.exports = require('../../modules/_core').Object.assign;\n","const merge = (extractor, domains) =>\n domains.reduce((acc, domain) => {\n acc[domain] = extractor;\n return acc;\n }, {});\n\nexport default function mergeSupportedDomains(extractor) {\n return extractor.supportedDomains\n ? merge(extractor, [extractor.domain, ...extractor.supportedDomains])\n : merge(extractor, [extractor.domain]);\n}\n","import mergeSupportedDomains from '../utils/merge-supported-domains';\n\nexport const apiExtractors = {};\n\nexport default function addExtractor(extractor) {\n if (!extractor || !extractor.domain) {\n return {\n error: true,\n message: 'Unable to add custom extractor. Invalid parameters.',\n };\n }\n\n Object.assign(apiExtractors, mergeSupportedDomains(extractor));\n\n return apiExtractors;\n}\n","export const BloggerExtractor = {\n domain: 'blogspot.com',\n content: {\n // Blogger is insane and does not load its content\n // initially in the page, but it's all there\n // in noscript\n selectors: ['.post-content noscript'],\n\n // Selectors to remove from the extracted content\n clean: [],\n\n // Convert the noscript tag to a div\n transforms: {\n noscript: 'div',\n },\n },\n\n author: {\n selectors: ['.post-author-name'],\n },\n\n title: {\n selectors: ['.post h2.title'],\n },\n\n date_published: {\n selectors: ['span.publishdate'],\n },\n};\n","export const NYMagExtractor = {\n domain: 'nymag.com',\n content: {\n // Order by most likely. 
Extractor will stop on first occurrence\n selectors: ['div.article-content', 'section.body', 'article.article'],\n\n // Selectors to remove from the extracted content\n clean: ['.ad', '.single-related-story'],\n\n // Object of tranformations to make on matched elements\n // Each key is the selector, each value is the tag to\n // transform to.\n // If a function is given, it should return a string\n // to convert to or nothing (in which case it will not perform\n // the transformation.\n transforms: {\n // Convert h1s to h2s\n h1: 'h2',\n\n // Convert lazy-loaded noscript images to figures\n noscript: ($node, $) => {\n const $children = $.browser ? $($node.text()) : $node.children();\n if (\n $children.length === 1 &&\n $children.get(0) !== undefined &&\n $children.get(0).tagName.toLowerCase() === 'img'\n ) {\n return 'figure';\n }\n\n return null;\n },\n },\n },\n\n title: {\n selectors: ['h1.lede-feature-title', 'h1.headline-primary', 'h1'],\n },\n\n author: {\n selectors: ['.by-authors', '.lede-feature-author'],\n },\n\n dek: {\n selectors: ['.lede-feature-teaser'],\n },\n\n date_published: {\n selectors: [\n ['time.article-timestamp[datetime]', 'datetime'],\n 'time.article-timestamp',\n ],\n },\n};\n","// Rename CustomExtractor\n// to fit your publication\n// (e.g., NYTimesExtractor)\nexport const ApartmentTherapyExtractor = {\n domain: 'www.apartmenttherapy.com',\n title: {\n selectors: ['h1.headline'],\n },\n\n author: {\n selectors: ['.PostByline__name'],\n },\n\n content: {\n selectors: ['div.post__content'],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? E.g., unusual lazy loaded images\n transforms: {\n 'div[data-render-react-id=\"images/LazyPicture\"]': ($node, $) => {\n const data = JSON.parse($node.attr('data-props'));\n const { src } = data.sources[0];\n const $img = $('').attr('src', src);\n $node.replaceWith($img);\n },\n },\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [],\n },\n\n date_published: {\n selectors: [['.PostByline__timestamp[datetime]', 'datetime']],\n },\n\n lead_image_url: {\n selectors: [['meta[name=\"og:image\"]', 'value']],\n },\n\n dek: {\n selectors: [],\n },\n\n next_page_url: {\n selectors: [\n // enter selectors\n ],\n },\n\n excerpt: {\n selectors: [\n // enter selectors\n ],\n },\n};\n","export const MediumExtractor = {\n domain: 'medium.com',\n\n title: {\n selectors: ['h1', ['meta[name=\"og:title\"]', 'value']],\n },\n\n author: {\n selectors: [['meta[name=\"author\"]', 'value']],\n },\n\n content: {\n selectors: ['article'],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? 
E.g., unusual lazy loaded images\n transforms: {\n // Allow drop cap character.\n 'section span:first-of-type': $node => {\n const $text = $node.html();\n if ($text.length === 1 && /^[a-zA-Z()]+$/.test($text)) {\n $node.replaceWith($text);\n }\n },\n // Re-write lazy-loaded youtube videos\n iframe: $node => {\n const ytRe = /https:\\/\\/i.embed.ly\\/.+url=https:\\/\\/i\\.ytimg\\.com\\/vi\\/(\\w+)\\//;\n const thumb = decodeURIComponent($node.attr('data-thumbnail'));\n const $parent = $node.parents('figure');\n\n if (ytRe.test(thumb)) {\n const [_, youtubeId] = thumb.match(ytRe); // eslint-disable-line\n $node.attr('src', `https://www.youtube.com/embed/${youtubeId}`);\n const $caption = $parent.find('figcaption');\n $parent.empty().append([$node, $caption]);\n return;\n }\n\n // If we can't draw the YouTube preview, remove the figure.\n $parent.remove();\n },\n\n // rewrite figures to pull out image and caption, remove rest\n figure: $node => {\n // ignore if figure has an iframe\n if ($node.find('iframe').length > 0) return;\n\n const $img = $node.find('img').slice(-1)[0];\n const $caption = $node.find('figcaption');\n\n $node.empty().append([$img, $caption]);\n },\n\n // Remove any smaller images that did not get caught by the generic image\n // cleaner (author photo 48px, leading sentence images 79px, etc.).\n img: $node => {\n const width = parseInt($node.attr('width'), 10);\n if (width < 100) $node.remove();\n },\n },\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: ['span a', 'svg'],\n },\n\n date_published: {\n selectors: [['meta[name=\"article:published_time\"]', 'value']],\n },\n\n lead_image_url: {\n selectors: [['meta[name=\"og:image\"]', 'value']],\n },\n\n dek: null,\n\n next_page_url: {\n selectors: [\n // enter selectors\n ],\n },\n\n excerpt: {\n selectors: [\n // enter selectors\n ],\n },\n};\n","export const WwwMsnbcComExtractor = {\n domain: 'www.msnbc.com',\n\n title: {\n selectors: ['h1', 'h1.is-title-pane'],\n },\n\n author: {\n selectors: ['.byline-name', '.author'],\n },\n\n date_published: {\n selectors: [\n ['meta[itemprop=\"datePublished\"]', 'value'],\n ['meta[name=\"DC.date.issued\"]', 'value'],\n ],\n },\n\n dek: {\n selectors: [['meta[name=\"description\"]', 'value']],\n },\n\n lead_image_url: {\n selectors: [['meta[name=\"og:image\"]', 'value']],\n },\n\n content: {\n selectors: ['.article-body__content', '.pane-node-body'],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? 
E.g., unusual lazy loaded images\n transforms: {\n '.pane-node-body': ($node, $) => {\n const [\n selector,\n attr,\n ] = WwwMsnbcComExtractor.lead_image_url.selectors[0];\n const src = $(selector).attr(attr);\n if (src) {\n $node.prepend(``);\n }\n },\n },\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [],\n },\n};\n","export const GeniusComExtractor = {\n domain: 'genius.com',\n\n title: {\n selectors: ['h1'],\n },\n\n author: {\n selectors: ['h2 a'],\n },\n\n date_published: {\n selectors: [\n [\n 'meta[itemprop=page_data]',\n 'value',\n res => {\n const json = JSON.parse(res);\n return json.song.release_date;\n },\n ],\n ],\n },\n\n dek: {\n selectors: [\n // enter selectors\n ],\n },\n\n lead_image_url: {\n selectors: [\n [\n 'meta[itemprop=page_data]',\n 'value',\n res => {\n const json = JSON.parse(res);\n return json.song.album.cover_art_url;\n },\n ],\n ],\n },\n\n content: {\n selectors: ['.lyrics'],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? E.g., unusual lazy loaded images\n transforms: {},\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [],\n },\n};\n","import URL from 'url';\n\nexport const WiredJpExtractor = {\n domain: 'wired.jp',\n\n title: {\n selectors: ['h1[data-testid=\"ContentHeaderHed\"]', 'h1.post-title'],\n },\n\n author: {\n selectors: [\n ['meta[name=\"article:author\"]', 'value'],\n 'p[itemprop=\"author\"]',\n ],\n },\n\n date_published: {\n selectors: [\n ['meta[name=\"article:published_time\"]', 'value'],\n ['time', 'datetime'],\n ],\n },\n\n dek: {\n selectors: ['div[class^=\"ContentHeaderDek\"]', '.post-intro'],\n },\n\n lead_image_url: {\n selectors: [['meta[name=\"og:image\"]', 'value']],\n },\n\n content: {\n selectors: [\n 'div[data-attribute-verso-pattern=\"article-body\"]',\n 'article.article-detail',\n ],\n\n transforms: {\n 'img[data-original]': $node => {\n const dataOriginal = $node.attr('data-original');\n const src = $node.attr('src');\n const url = URL.resolve(src, dataOriginal);\n $node.attr('src', url);\n },\n },\n\n clean: ['.post-category', 'time', 'h1.post-title', '.social-area-syncer'],\n },\n};\n","/* eslint-disable no-nested-ternary */\n/* eslint-disable no-unused-expressions */\nexport const WwwAbendblattDeExtractor = {\n domain: 'www.abendblatt.de',\n\n title: {\n selectors: ['h2.article__header__headline'],\n },\n\n author: {\n selectors: ['span.author-info__name-text'],\n },\n\n date_published: {\n selectors: [\n ['time.teaser-stream-time', 'datetime'],\n ['time.article__header__date', 'datetime'],\n ],\n },\n\n dek: {\n selectors: [['meta[name=\"description\"]', 'value']],\n },\n\n lead_image_url: {\n selectors: [['meta[name=\"og:image\"]', 'value']],\n },\n\n content: {\n selectors: ['div.article__body'],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? E.g., unusual lazy loaded images\n transforms: {\n p: $node => {\n if (!$node.hasClass('obfuscated')) return null;\n let o = '';\n let n = 0;\n for (let i = $node.text(); n < i.length; n += 1) {\n const r = i.charCodeAt(n);\n r === 177\n ? (o += '%')\n : r === 178\n ? (o += '!')\n : r === 180\n ? (o += ';')\n : r === 181\n ? (o += '=')\n : r === 32\n ? (o += ' ')\n : r === 10\n ? 
(o += '\\n')\n : r > 33 && (o += String.fromCharCode(r - 1));\n }\n\n $node.html(o);\n $node.removeClass('obfuscated');\n $node.addClass('deobfuscated');\n return null;\n },\n div: $node => {\n if (!$node.hasClass('obfuscated')) return null;\n let o = '';\n let n = 0;\n for (let i = $node.text(); n < i.length; n += 1) {\n const r = i.charCodeAt(n);\n r === 177\n ? (o += '%')\n : r === 178\n ? (o += '!')\n : r === 180\n ? (o += ';')\n : r === 181\n ? (o += '=')\n : r === 32\n ? (o += ' ')\n : r === 10\n ? (o += '\\n')\n : r > 33 && (o += String.fromCharCode(r - 1));\n }\n\n $node.html(o);\n $node.removeClass('obfuscated');\n $node.addClass('deobfuscated');\n return null;\n },\n },\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [],\n },\n};\n","export const WikipediaExtractor = {\n domain: 'wikipedia.org',\n content: {\n selectors: ['#mw-content-text'],\n\n defaultCleaner: false,\n\n // transform top infobox to an image with caption\n transforms: {\n '.infobox img': $node => {\n const $parent = $node.parents('.infobox');\n // Only prepend the first image in .infobox\n if ($parent.children('img').length === 0) {\n $parent.prepend($node);\n }\n },\n '.infobox caption': 'figcaption',\n '.infobox': 'figure',\n },\n\n // Selectors to remove from the extracted content\n clean: [\n '.mw-editsection',\n 'figure tr, figure td, figure tbody',\n '#toc',\n '.navbox',\n ],\n },\n\n author: 'Wikipedia Contributors',\n\n title: {\n selectors: ['h2.title'],\n },\n\n date_published: {\n selectors: ['#footer-info-lastmod'],\n },\n};\n","export const TwitterExtractor = {\n domain: 'twitter.com',\n\n content: {\n transforms: {\n // We're transforming essentially the whole page here.\n // Twitter doesn't have nice selectors, so our initial\n // selector grabs the whole page, then we're re-writing\n // it to fit our needs before we clean it up.\n '.permalink[role=main]': ($node, $) => {\n const tweets = $node.find('.tweet');\n const $tweetContainer = $('');\n $tweetContainer.append(tweets);\n $node.replaceWith($tweetContainer);\n },\n\n // Twitter wraps @ with s, which\n // renders as a strikethrough\n s: 'span',\n },\n\n selectors: ['.permalink[role=main]'],\n\n defaultCleaner: false,\n\n clean: ['.stream-item-footer', 'button', '.tweet-details-fixer'],\n },\n\n author: {\n selectors: ['.tweet.permalink-tweet .username'],\n },\n\n date_published: {\n selectors: [['.permalink-tweet ._timestamp[data-time-ms]', 'data-time-ms']],\n },\n};\n","export const NYTimesExtractor = {\n domain: 'www.nytimes.com',\n\n title: {\n selectors: [\n 'h1[data-testid=\"headline\"]',\n 'h1.g-headline',\n 'h1[itemprop=\"headline\"]',\n 'h1.headline',\n 'h1 .balancedHeadline',\n ],\n },\n\n author: {\n selectors: [\n ['meta[name=\"author\"]', 'value'],\n '.g-byline',\n '.byline',\n ['meta[name=\"byl\"]', 'value'],\n ],\n },\n\n content: {\n selectors: ['div.g-blocks', 'section[name=\"articleBody\"]', 'article#story'],\n\n transforms: {\n 'img.g-lazy': $node => {\n let src = $node.attr('src');\n const width = 640;\n\n src = src.replace('{{size}}', width);\n $node.attr('src', src);\n },\n },\n\n clean: [\n '.ad',\n 'header#story-header',\n '.story-body-1 .lede.video',\n '.visually-hidden',\n '#newsletter-promo',\n '.promo',\n '.comments-button',\n '.hidden',\n '.comments',\n '.supplemental',\n '.nocontent',\n '.story-footer-links',\n ],\n },\n\n date_published: {\n selectors: [\n ['meta[name=\"article:published_time\"]', 'value'],\n 
['meta[name=\"article:published\"]', 'value'],\n ],\n },\n\n lead_image_url: {\n selectors: [['meta[name=\"og:image\"]', 'value']],\n },\n\n dek: null,\n\n next_page_url: null,\n\n excerpt: null,\n};\n","// Rename CustomExtractor\n// to fit your publication\nexport const TheAtlanticExtractor = {\n domain: 'www.theatlantic.com',\n title: {\n selectors: ['h1', '.c-article-header__hed'],\n },\n\n author: {\n selectors: [['meta[name=\"author\"]', 'value'], '.c-byline__author'],\n },\n\n content: {\n selectors: ['article', '.article-body'],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? E.g., unusual lazy loaded images\n transforms: [],\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n '.partner-box',\n '.callout',\n '.c-article-writer__image',\n '.c-article-writer__content',\n '.c-letters-cta__text',\n '.c-footer__logo',\n '.c-recirculation-link',\n '.twitter-tweet',\n ],\n },\n\n dek: {\n selectors: [['meta[name=\"description\"]', 'value']],\n },\n\n date_published: {\n selectors: [['time[itemprop=\"datePublished\"]', 'datetime']],\n },\n\n lead_image_url: {\n selectors: [['meta[name=\"og:image\"]', 'value']],\n },\n\n next_page_url: null,\n\n excerpt: null,\n};\n","// Rename CustomExtractor\n// to fit your publication\n// (e.g., NYTimesExtractor)\nexport const NewYorkerExtractor = {\n domain: 'www.newyorker.com',\n title: {\n selectors: [\n 'h1[class^=\"content-header\"]',\n 'h1[class^=\"ArticleHeader__hed\"]',\n 'h1[class*=\"ContentHeaderHed\"]',\n ['meta[name=\"og:title\"]', 'value'],\n ],\n },\n\n author: {\n selectors: [\n 'article header div[class^=\"BylinesWrapper\"]',\n ['meta[name=\"article:author\"]', 'value'],\n 'div[class^=\"ArticleContributors\"] a[rel=\"author\"]',\n 'article header div[class*=\"Byline__multipleContributors\"]',\n ],\n },\n\n content: {\n selectors: [\n '.article__body',\n 'article.article.main-content',\n 'main[class^=\"Layout__content\"]',\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? 
E.g., unusual lazy loaded images\n transforms: {\n '.caption__text': 'figcaption',\n '.caption__credit': 'figcaption',\n },\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: ['footer[class^=\"ArticleFooter__footer\"]', 'aside'],\n },\n\n date_published: {\n selectors: [\n ['meta[name=\"article:published_time\"]', 'value'],\n 'time.content-header__publish-date',\n ['meta[name=\"pubdate\"]', 'value'],\n ],\n timezone: 'America/New_York',\n },\n\n lead_image_url: {\n selectors: [['meta[name=\"og:image\"]', 'value']],\n },\n\n dek: {\n selectors: [\n 'div[class^=\"ContentHeaderDek\"]',\n 'div.content-header__dek',\n 'h2[class^=\"ArticleHeader__dek\"]',\n ],\n },\n\n next_page_url: null,\n\n excerpt: null,\n};\n","// Rename CustomExtractor\n// to fit your publication\n// (e.g., NYTimesExtractor)\nexport const WiredExtractor = {\n domain: 'www.wired.com',\n title: {\n selectors: [\n 'h1[data-testId=\"ContentHeaderHed\"]',\n // enter title selectors\n ],\n },\n\n author: {\n selectors: [\n ['meta[name=\"article:author\"]', 'value'],\n 'a[rel=\"author\"]',\n // enter author selectors\n ],\n },\n\n content: {\n selectors: [\n 'article.article.main-content',\n 'article.content',\n // enter content selectors\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? E.g., unusual lazy loaded images\n transforms: [],\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: ['.visually-hidden', 'figcaption img.photo', '.alert-message'],\n },\n\n date_published: {\n selectors: [['meta[name=\"article:published_time\"]', 'value']],\n },\n\n lead_image_url: {\n selectors: [['meta[name=\"og:image\"]', 'value']],\n },\n\n dek: {\n selectors: [],\n },\n\n next_page_url: null,\n\n excerpt: null,\n};\n","// Rename CustomExtractor\n// to fit your publication\n// (e.g., NYTimesExtractor)\nexport const MSNExtractor = {\n domain: 'www.msn.com',\n title: {\n selectors: [\n 'h1',\n // enter title selectors\n ],\n },\n\n author: {\n selectors: [\n 'span.authorname-txt',\n // enter author selectors\n ],\n },\n\n content: {\n selectors: [\n 'div.richtext',\n // enter content selectors\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? E.g., unusual lazy loaded images\n transforms: [],\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: ['span.caption'],\n },\n\n date_published: {\n selectors: ['span.time'],\n },\n\n lead_image_url: {\n selectors: [],\n },\n\n dek: {\n selectors: [],\n },\n\n next_page_url: null,\n\n excerpt: null,\n};\n","// Rename CustomExtractor\n// to fit your publication\n// (e.g., NYTimesExtractor)\nexport const YahooExtractor = {\n domain: 'www.yahoo.com',\n title: {\n selectors: [\n 'header.canvas-header',\n // enter title selectors\n ],\n },\n\n author: {\n selectors: [\n 'span.provider-name',\n // enter author selectors\n ],\n },\n\n content: {\n selectors: [\n // enter content selectors\n '.content-canvas',\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? 
E.g., unusual lazy loaded images\n transforms: [],\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: ['.figure-caption'],\n },\n\n date_published: {\n selectors: [['time.date[datetime]', 'datetime']],\n },\n\n lead_image_url: {\n selectors: [['meta[name=\"og:image\"]', 'value']],\n },\n\n dek: {\n selectors: [\n // enter dek selectors\n ],\n },\n\n next_page_url: null,\n\n excerpt: null,\n};\n","// Rename CustomExtractor\n// to fit your publication\n// (e.g., NYTimesExtractor)\nexport const BuzzfeedExtractor = {\n domain: 'www.buzzfeed.com',\n\n supportedDomains: ['www.buzzfeednews.com'],\n\n title: {\n selectors: [\n 'h1.embed-headline-title',\n // enter title selectors\n ],\n },\n\n author: {\n selectors: [\n 'a[data-action=\"user/username\"]',\n 'byline__author',\n ['meta[name=\"author\"]', 'value'],\n // enter author selectors\n ],\n },\n\n content: {\n selectors: [\n ['div[class^=\"featureimage_featureImageWrapper\"]', '.js-subbuzz-wrapper'],\n ['.js-subbuzz-wrapper'],\n ],\n\n defaultCleaner: false,\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? E.g., unusual lazy loaded images\n transforms: {\n h2: 'b',\n\n 'div.longform_custom_header_media': $node => {\n if ($node.has('img') && $node.has('.longform_header_image_source')) {\n return 'figure';\n }\n\n return null;\n },\n\n 'figure.longform_custom_header_media .longform_header_image_source':\n 'figcaption',\n },\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n '.instapaper_ignore',\n '.suplist_list_hide .buzz_superlist_item .buzz_superlist_number_inline',\n '.share-box',\n '.print',\n '.js-inline-share-bar',\n '.js-ad-placement',\n ],\n },\n\n date_published: {\n selectors: [['time[datetime]', 'datetime']],\n },\n\n lead_image_url: {\n selectors: [['meta[name=\"og:image\"]', 'value']],\n },\n\n dek: {\n selectors: ['.embed-headline-description'],\n },\n\n next_page_url: null,\n\n excerpt: null,\n};\n","// Rename CustomExtractor\n// to fit your publication\n// (e.g., NYTimesExtractor)\nexport const WikiaExtractor = {\n domain: 'fandom.wikia.com',\n title: {\n selectors: [\n 'h1.entry-title',\n // enter title selectors\n ],\n },\n\n author: {\n selectors: [\n '.author vcard',\n '.fn',\n // enter author selectors\n ],\n },\n\n content: {\n selectors: [\n '.grid-content',\n '.entry-content',\n // enter content selectors\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? 
E.g., unusual lazy loaded images\n transforms: [],\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [],\n },\n\n date_published: {\n selectors: [['meta[name=\"article:published_time\"]', 'value']],\n },\n\n lead_image_url: {\n selectors: [['meta[name=\"og:image\"]', 'value']],\n },\n\n dek: {\n selectors: [],\n },\n\n next_page_url: null,\n\n excerpt: null,\n};\n","// Rename CustomExtractor\n// to fit your publication\n// (e.g., NYTimesExtractor)\nexport const LittleThingsExtractor = {\n domain: 'www.littlethings.com',\n title: {\n selectors: [\n 'h1[class*=\"PostHeader\"]',\n 'h1.post-title',\n // enter title selectors\n ],\n },\n\n author: {\n selectors: [\n 'div[class^=\"PostHeader__ScAuthorNameSection\"]',\n ['meta[name=\"author\"]', 'value'],\n // enter author selectors\n ],\n },\n\n content: {\n selectors: [\n // enter content selectors\n 'section[class*=\"PostMainArticle\"]',\n '.mainContentIntro',\n '.content-wrapper',\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? E.g., unusual lazy loaded images\n transforms: [],\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [],\n },\n\n lead_image_url: {\n selectors: [['meta[name=\"og:image\"]', 'value']],\n },\n\n next_page_url: null,\n\n excerpt: null,\n};\n","export const PoliticoExtractor = {\n domain: 'www.politico.com',\n title: {\n selectors: [['meta[name=\"og:title\"]', 'value']],\n },\n\n author: {\n selectors: [\n ['div[itemprop=\"author\"] meta[itemprop=\"name\"]', 'value'],\n '.story-meta__authors .vcard',\n '.story-main-content .byline .vcard',\n ],\n },\n\n content: {\n selectors: [['.story-text'], '.story-main-content', '.story-core'],\n\n transforms: [],\n\n clean: ['figcaption', '.story-meta', '.ad'],\n },\n\n date_published: {\n selectors: [\n ['time[itemprop=\"datePublished\"]', 'datetime'],\n ['.story-meta__details time[datetime]', 'datetime'],\n ['.story-main-content .timestamp time[datetime]', 'datetime'],\n ],\n timezone: 'America/New_York',\n },\n\n lead_image_url: {\n selectors: [['meta[name=\"og:image\"]', 'value']],\n },\n\n dek: {\n selectors: [['meta[name=\"og:description\"]', 'value']],\n },\n};\n","export const DeadspinExtractor = {\n domain: 'deadspin.com',\n\n supportedDomains: [\n 'jezebel.com',\n 'lifehacker.com',\n 'kotaku.com',\n 'gizmodo.com',\n 'jalopnik.com',\n 'kinja.com',\n 'avclub.com',\n 'clickhole.com',\n 'splinternews.com',\n 'theonion.com',\n 'theroot.com',\n 'thetakeout.com',\n 'theinventory.com',\n ],\n\n title: {\n selectors: ['header h1', 'h1.headline'],\n },\n\n author: {\n selectors: ['a[data-ga*=\"Author\"]', '.author'],\n },\n\n content: {\n selectors: ['.js_post-content', '.post-content', '.entry-content'],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? 
E.g., unusual lazy loaded images\n transforms: {\n 'iframe.lazyload[data-recommend-id^=\"youtube://\"]': $node => {\n const youtubeId = $node.attr('id').split('youtube-')[1];\n $node.attr('src', `https://www.youtube.com/embed/${youtubeId}`);\n },\n },\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: ['.magnifier', '.lightbox'],\n },\n\n date_published: {\n selectors: [\n ['meta[name=\"article:published_time\"]', 'value'],\n ['time.updated[datetime]', 'datetime'],\n ],\n },\n\n lead_image_url: {\n selectors: [['meta[name=\"og:image\"]', 'value']],\n },\n\n dek: {\n selectors: [\n // enter selectors\n ],\n },\n\n next_page_url: {\n selectors: [\n // enter selectors\n ],\n },\n\n excerpt: {\n selectors: [\n // enter selectors\n ],\n },\n};\n","// Rename CustomExtractor\n// to fit your publication\n// (e.g., NYTimesExtractor)\nexport const BroadwayWorldExtractor = {\n domain: 'www.broadwayworld.com',\n title: {\n selectors: ['h1[itemprop=headline]', 'h1.article-title'],\n },\n\n author: {\n selectors: ['span[itemprop=author]'],\n },\n\n content: {\n selectors: ['div[itemprop=articlebody]'],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? E.g., unusual lazy loaded images\n transforms: {},\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [],\n },\n\n date_published: {\n selectors: [['meta[itemprop=datePublished]', 'value']],\n },\n\n lead_image_url: {\n selectors: [['meta[name=\"og:image\"]', 'value']],\n },\n\n dek: {\n selectors: [],\n },\n\n next_page_url: {\n selectors: [\n // enter selectors\n ],\n },\n\n excerpt: {\n selectors: [\n // enter selectors\n ],\n },\n};\n","export const WwwTmzComExtractor = {\n domain: 'www.tmz.com',\n\n title: {\n selectors: ['.post-title-breadcrumb', 'h1', '.headline'],\n },\n\n author: 'TMZ STAFF',\n\n date_published: {\n selectors: ['.article__published-at', '.article-posted-date'],\n\n timezone: 'America/Los_Angeles',\n },\n\n dek: {\n selectors: [\n // enter selectors\n ],\n },\n\n lead_image_url: {\n selectors: [['meta[name=\"og:image\"]', 'value']],\n },\n\n content: {\n selectors: ['.article__blocks', '.article-content', '.all-post-body'],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? E.g., unusual lazy loaded images\n transforms: {},\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: ['.lightbox-link'],\n },\n};\n","export const WwwWashingtonpostComExtractor = {\n domain: 'www.washingtonpost.com',\n\n title: {\n selectors: ['h1', '#topper-headline-wrapper'],\n },\n\n author: {\n selectors: ['.pb-author-name'],\n },\n\n date_published: {\n selectors: [['.author-timestamp[itemprop=\"datePublished\"]', 'content']],\n },\n\n dek: {\n selectors: [],\n },\n\n lead_image_url: {\n selectors: [['meta[name=\"og:image\"]', 'value']],\n },\n\n content: {\n selectors: ['.article-body'],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? 
E.g., unusual lazy loaded images\n transforms: {\n 'div.inline-content': $node => {\n if ($node.has('img,iframe,video').length > 0) {\n return 'figure';\n }\n\n $node.remove();\n return null;\n },\n '.pb-caption': 'figcaption',\n },\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: ['.interstitial-link', '.newsletter-inline-unit'],\n },\n};\n","export const WwwHuffingtonpostComExtractor = {\n domain: 'www.huffingtonpost.com',\n\n title: {\n selectors: ['h1.headline__title'],\n },\n\n author: {\n selectors: ['span.author-card__details__name'],\n },\n\n date_published: {\n selectors: [\n ['meta[name=\"article:modified_time\"]', 'value'],\n ['meta[name=\"article:published_time\"]', 'value'],\n ],\n },\n\n dek: {\n selectors: ['h2.headline__subtitle'],\n },\n\n lead_image_url: {\n selectors: [['meta[name=\"og:image\"]', 'value']],\n },\n\n content: {\n selectors: ['div.entry__body'],\n\n defaultCleaner: false,\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? E.g., unusual lazy loaded images\n transforms: {},\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n '.pull-quote',\n '.tag-cloud',\n '.embed-asset',\n '.below-entry',\n '.entry-corrections',\n '#suggested-story',\n ],\n },\n};\n","export const NewrepublicComExtractor = {\n domain: 'newrepublic.com',\n\n title: {\n selectors: ['h1.article-headline'],\n },\n\n author: {\n selectors: ['span.AuthorList'],\n },\n\n date_published: {\n selectors: [['meta[name=\"article:published_time\"]', 'value']],\n\n timezone: 'America/New_York',\n },\n\n dek: {\n selectors: ['h2.article-subhead'],\n },\n\n lead_image_url: {\n selectors: [['meta[name=\"og:image\"]', 'value']],\n },\n\n content: {\n selectors: [['div.article-body']],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? E.g., unusual lazy loaded images\n transforms: {},\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: ['aside'],\n },\n};\n","export const MoneyCnnComExtractor = {\n domain: 'money.cnn.com',\n\n title: {\n selectors: ['.article-title'],\n },\n\n author: {\n selectors: [['meta[name=\"author\"]', 'value'], '.byline a'],\n },\n\n date_published: {\n selectors: [['meta[name=\"date\"]', 'value']],\n\n timezone: 'GMT',\n },\n\n dek: {\n selectors: ['#storytext h2'],\n },\n\n lead_image_url: {\n selectors: [['meta[name=\"og:image\"]', 'value']],\n },\n\n content: {\n selectors: ['#storytext'],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? 
E.g., unusual lazy loaded images\n transforms: {},\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: ['.inStoryHeading'],\n },\n};\n","export const WwwThevergeComExtractor = {\n domain: 'www.theverge.com',\n\n supportedDomains: ['www.polygon.com'],\n\n title: {\n selectors: ['h1'],\n },\n\n author: {\n selectors: [['meta[name=\"author\"]', 'value']],\n },\n\n date_published: {\n selectors: [['meta[name=\"article:published_time\"]', 'value']],\n },\n\n dek: {\n selectors: ['.p-dek'],\n },\n\n lead_image_url: {\n selectors: [['meta[name=\"og:image\"]', 'value']],\n },\n\n content: {\n selectors: [\n // feature template multi-match\n ['.c-entry-hero .e-image', '.c-entry-intro', '.c-entry-content'],\n // regular post multi-match\n ['.e-image--hero', '.c-entry-content'],\n // feature template fallback\n '.l-wrapper .l-feature',\n // regular post fallback\n 'div.c-entry-content',\n ],\n\n // Transform lazy-loaded images\n transforms: {\n noscript: $node => {\n const $children = $node.children();\n if ($children.length === 1 && $children.get(0).tagName === 'img') {\n return 'span';\n }\n\n return null;\n },\n },\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n '.aside',\n 'img.c-dynamic-image', // images come from noscript transform\n ],\n },\n};\n","export const WwwCnnComExtractor = {\n domain: 'www.cnn.com',\n\n title: {\n selectors: ['h1.pg-headline', 'h1'],\n },\n\n author: {\n selectors: [['meta[name=\"author\"]', 'value']],\n },\n\n date_published: {\n selectors: [['meta[name=\"article:published_time\"]', 'value']],\n },\n\n lead_image_url: {\n selectors: [['meta[name=\"og:image\"]', 'value']],\n },\n\n content: {\n selectors: [\n // a more specific selector to grab the lead image and the body\n ['.media__video--thumbnail', '.zn-body-text'],\n // a fallback for the above\n '.zn-body-text',\n 'div[itemprop=\"articleBody\"]',\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? E.g., unusual lazy loaded images\n transforms: {\n '.zn-body__paragraph, .el__leafmedia--sourced-paragraph': $node => {\n const $text = $node.html();\n if ($text) {\n return 'p';\n }\n\n return null;\n },\n\n // this transform cleans the short, all-link sections linking\n // to related content but not marked as such in any way.\n '.zn-body__paragraph': $node => {\n if ($node.has('a')) {\n if (\n $node.text().trim() ===\n $node\n .find('a')\n .text()\n .trim()\n ) {\n $node.remove();\n }\n }\n },\n\n '.media__video--thumbnail': 'figure',\n },\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [],\n },\n};\n","export const WwwAolComExtractor = {\n domain: 'www.aol.com',\n\n title: {\n selectors: ['h1.p-article__title'],\n },\n\n author: {\n selectors: [['meta[name=\"author\"]', 'value']],\n },\n\n date_published: {\n selectors: ['.p-article__byline__date'],\n\n timezone: 'America/New_York',\n },\n\n dek: {\n selectors: [\n // enter selectors\n ],\n },\n\n lead_image_url: {\n selectors: [['meta[name=\"og:image\"]', 'value']],\n },\n\n content: {\n selectors: ['.article-content'],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? 
E.g., unusual lazy loaded images\n transforms: {},\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [],\n },\n};\n","export const WwwYoutubeComExtractor = {\n domain: 'www.youtube.com',\n\n title: {\n selectors: [\n ['meta[name=\"title\"]', 'value'],\n '.watch-title',\n 'h1.watch-title-container',\n ],\n },\n\n author: {\n selectors: [['link[itemprop=\"name\"]', 'content'], '.yt-user-info'],\n },\n\n date_published: {\n selectors: [['meta[itemProp=\"datePublished\"]', 'value']],\n\n timezone: 'GMT',\n },\n\n dek: {\n selectors: [\n // enter selectors\n ],\n },\n\n lead_image_url: {\n selectors: [['meta[name=\"og:image\"]', 'value']],\n },\n\n content: {\n defaultCleaner: false,\n\n selectors: [\n '#player-container-outer',\n 'ytd-expandable-video-description-body-renderer #description',\n ['#player-api', '#description'],\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? E.g., unusual lazy loaded images\n transforms: {\n '#player-api': ($node, $) => {\n const videoId = $('meta[itemProp=\"videoId\"]').attr('value');\n $node.html(`\n <iframe src=\"https://www.youtube.com/embed/${videoId}\" frameborder=\"0\" allowfullscreen></iframe>`);\n },\n '#player-container-outer': ($node, $) => {\n const videoId = $('meta[itemProp=\"videoId\"]').attr('value');\n const description = $('meta[itemProp=\"description\"]').attr('value');\n $node.html(`\n \n
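For reviewers unfamiliar with the extractor shape repeated throughout these bundled sources, here is a minimal sketch of how the `transforms` and `clean` options behave. It is illustrative only: `example.com`, every selector in it, and the article URL are invented, and it assumes the `Mercury.addExtractor`/`Mercury.parse` entry points and the upstream `@postlight/mercury-parser` package name (this fork's name may differ).

const Mercury = require('@postlight/mercury-parser');

// Hypothetical custom parser, shaped like the extractors above.
const ExampleComExtractor = {
  domain: 'example.com', // invented domain
  title: {
    // A bare selector reads the matched node's text; a
    // [selector, attribute] pair reads the named attribute instead.
    selectors: ['h1.headline', ['meta[name="og:title"]', 'value']],
  },
  content: {
    selectors: ['div.article-body'], // invented content root
    transforms: {
      // String form: rename every match to the given tag,
      // as in '.infobox': 'figure' above.
      '.photo-credit': 'figcaption',
      // Function form: receives a cheerio-wrapped node to mutate
      // in place, e.g. to repair a lazy-loaded image.
      'img.lazy': $node => {
        $node.attr('src', $node.attr('data-src'));
      },
    },
    // Anything matching these selectors is removed from the result.
    clean: ['.ad', '.newsletter-promo'],
  },
};

Mercury.addExtractor(ExampleComExtractor);
Mercury.parse('https://example.com/an-article').then(result => {
  console.log(result.title);
  console.log(result.content);
});

The string-versus-function duality covers every transform in these sources; anything more involved, such as the YouTube iframe rewrite, uses the function form.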