From 56a19bf9341819049d0771f7707debc3409e38cd Mon Sep 17 00:00:00 2001 From: Michael Ashley Date: Mon, 9 May 2022 08:58:26 -0700 Subject: [PATCH] fix: updating generate-parser dist (#499) --- dist/generate-custom-parser.js | 1716 ++++++++++++++++++++++++++------ 1 file changed, 1438 insertions(+), 278 deletions(-) diff --git a/dist/generate-custom-parser.js b/dist/generate-custom-parser.js index 44f11744e..1b8967cf9 100644 --- a/dist/generate-custom-parser.js +++ b/dist/generate-custom-parser.js @@ -23,10 +23,11 @@ var regenerator = _interopDefault(require('@babel/runtime-corejs2/regenerator')) var objectWithoutProperties = _interopDefault(require('@babel/runtime-corejs2/helpers/objectWithoutProperties')); var asyncToGenerator = _interopDefault(require('@babel/runtime-corejs2/helpers/asyncToGenerator')); var cheerio = _interopDefault(require('cheerio')); +var turndown = _interopDefault(require('turndown')); var promise = _interopDefault(require('@babel/runtime-corejs2/core-js/promise')); -var request = _interopDefault(require('request')); +var postmanRequest = _interopDefault(require('postman-request')); +var assign = _interopDefault(require('@babel/runtime-corejs2/core-js/object/assign')); var keys = _interopDefault(require('@babel/runtime-corejs2/core-js/object/keys')); -var turndown = _interopDefault(require('turndown')); var stringDirection = _interopDefault(require('string-direction')); var validUrl = _interopDefault(require('valid-url')); var momentTimezone = _interopDefault(require('moment-timezone')); @@ -70,11 +71,12 @@ function stripJunkTags(article, $) { // Scoring -function absolutize($, rootUrl, attr, $content) { +function absolutize($, rootUrl, attr) { var baseUrl = $('base').attr('href'); - $("[".concat(attr, "]"), $content).each(function (_, node) { + $("[".concat(attr, "]")).each(function (_, node) { var attrs = getAttrs(node); var url = attrs[attr]; + if (!url) return; var absoluteUrl = URL.resolve(baseUrl || rootUrl, url); setAttr(node, attr, absoluteUrl); }); @@ -90,6 +92,7 @@ function absolutizeSet($, rootUrl, $content) { // descriptors can only contain positive numbers followed immediately by either 'w' or 'x' // space characters inside the URL should be encoded (%20 or +) var candidates = urlSet.match(/(?:\s*)(\S+(?:\s*[\d.]+[wx])?)(?:\s*,\s*)?/g); + if (!candidates) return; var absoluteCandidates = candidates.map(function (candidate) { // a candidate URL cannot start or end with a comma // descriptors are separated from the URLs by unescaped whitespace @@ -107,7 +110,7 @@ function absolutizeSet($, rootUrl, $content) { function makeLinksAbsolute$$1($content, $, url) { ['href', 'src'].forEach(function (attr) { - return absolutize($, url, attr, $content); + return absolutize($, url, attr); }); absolutizeSet($, url, $content); return $content; @@ -163,6 +166,8 @@ var URL$1 = _interopDefault$1(URL); var cheerio$1 = _interopDefault$1(cheerio); +var TurndownService = _interopDefault$1(turndown); + var iconv = _interopDefault$1(iconvLite); var _parseInt$1 = _interopDefault$1(_parseInt); @@ -171,7 +176,7 @@ var _slicedToArray$1 = _interopDefault$1(_slicedToArray); var _Promise = _interopDefault$1(promise); -var request$1 = _interopDefault$1(request); +var request = _interopDefault$1(postmanRequest); var _Reflect$ownKeys$1 = _interopDefault$1(_Reflect$ownKeys); @@ -187,9 +192,9 @@ var _typeof$1 = _interopDefault$1(_typeof); var _getIterator$1 = _interopDefault$1(_getIterator); -var _Object$keys = _interopDefault$1(keys); +var _Object$assign = _interopDefault$1(assign); -var TurndownService = _interopDefault$1(turndown); +var _Object$keys = _interopDefault$1(keys); var stringDirection$1 = _interopDefault$1(stringDirection); @@ -209,7 +214,7 @@ var ellipsize$1 = _interopDefault$1(ellipsize); var _Array$isArray = _interopDefault$1(isArray); -var NORMALIZE_RE$1 = /\s{2,}/g; +var NORMALIZE_RE$1 = /\s{2,}(?![^<>]*<\/(pre|code|textarea)>)/g; function normalizeSpaces$1(text) { return text.replace(NORMALIZE_RE$1, ' ').trim(); @@ -372,55 +377,6 @@ function getEncoding$1(str) { return encoding; } -var _marked = -/*#__PURE__*/ -_regeneratorRuntime.mark(range); - -function range() { - var start, - end, - _args = arguments; - return _regeneratorRuntime.wrap(function range$(_context) { - while (1) { - switch (_context.prev = _context.next) { - case 0: - start = _args.length > 0 && _args[0] !== undefined ? _args[0] : 1; - end = _args.length > 1 && _args[1] !== undefined ? _args[1] : 1; - - case 2: - if (!(start <= end)) { - _context.next = 7; - break; - } - - _context.next = 5; - return start += 1; - - case 5: - _context.next = 2; - break; - - case 7: - case "end": - return _context.stop(); - } - } - }, _marked, this); -} // extremely simple url validation as a first step - - -function validateUrl(_ref) { - var hostname = _ref.hostname; // If this isn't a valid url, return an error message - - return !!hostname; -} - -var Errors = { - badUrl: { - error: true, - messages: 'The url parameter passed does not look like a valid URL. Please check your data and try again.' - } -}; var REQUEST_HEADERS = cheerio$1.browser ? {} : { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36' }; // The number of milliseconds to attempt to fetch a resource before timing out. @@ -435,7 +391,7 @@ var MAX_CONTENT_LENGTH = 5242880; // Turn the global proxy on or off function get(options) { return new _Promise(function (resolve, reject) { - request$1(options, function (err, response, body) { + request(options, function (err, response, body) { if (err) { reject(err); } else { @@ -447,13 +403,13 @@ function get(options) { }); }); } // Evaluate a response to ensure it's something we should be keeping. -// This does not validate in the sense of a response being 200 level or -// not. Validation here means that we haven't found reason to bail from +// This does not validate in the sense of a response being 200 or not. +// Validation here means that we haven't found reason to bail from // further processing of this url. function validateResponse(response) { - var parseNon2xx = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : false; // Check if we got a valid status code + var parseNon200 = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : false; // Check if we got a valid status code // This isn't great, but I'm requiring a statusMessage to be set // before short circuiting b/c nock doesn't set it in tests // statusMessage only not set in nock response, in which case @@ -463,8 +419,8 @@ function validateResponse(response) { if (response.statusMessage && response.statusMessage !== 'OK' || response.statusCode !== 200) { if (!response.statusCode) { throw new Error("Unable to fetch content. Original exception was ".concat(response.error)); - } else if (!parseNon2xx) { - throw new Error("Resource returned a response status code of ".concat(response.statusCode, " and resource was instructed to reject non-2xx level status codes.")); + } else if (!parseNon200) { + throw new Error("Resource returned a response status code of ".concat(response.statusCode, " and resource was instructed to reject non-200 status codes.")); } } @@ -497,16 +453,22 @@ function _fetchResource() { _fetchResource = _asyncToGenerator( /*#__PURE__*/ _regeneratorRuntime.mark(function _callee(url, parsedUrl) { - var options, _ref2, response, body; + var headers, + options, + _ref2, + response, + body, + _args = arguments; return _regeneratorRuntime.wrap(function _callee$(_context) { while (1) { switch (_context.prev = _context.next) { case 0: + headers = _args.length > 2 && _args[2] !== undefined ? _args[2] : {}; parsedUrl = parsedUrl || URL$1.parse(encodeURI(url)); options = _objectSpread({ url: parsedUrl.href, - headers: _objectSpread({}, REQUEST_HEADERS), + headers: _objectSpread({}, REQUEST_HEADERS, headers), timeout: FETCH_TIMEOUT, // Accept cookies jar: true, @@ -521,31 +483,34 @@ function _fetchResource() { // Follow GET redirects; this option is for Node only followRedirect: true }); - _context.next = 4; + _context.next = 5; return get(options); - case 4: + case 5: _ref2 = _context.sent; response = _ref2.response; body = _ref2.body; - _context.prev = 7; + _context.prev = 8; validateResponse(response); return _context.abrupt("return", { body: body, response: response }); - case 12: - _context.prev = 12; - _context.t0 = _context["catch"](7); - return _context.abrupt("return", Errors.badUrl); + case 13: + _context.prev = 13; + _context.t0 = _context["catch"](8); + return _context.abrupt("return", { + error: true, + message: _context.t0.message + }); - case 15: + case 16: case "end": return _context.stop(); } } - }, _callee, this, [[7, 12]]); + }, _callee, this, [[8, 13]]); })); return _fetchResource.apply(this, arguments); } @@ -577,11 +542,11 @@ var SPACER_RE$1 = new RegExp('transparent|spacer|blank', 'i'); // The class we w // but would normally remove var KEEP_CLASS$1 = 'mercury-parser-keep'; -var KEEP_SELECTORS$1 = ['iframe[src^="https://www.youtube.com"]', 'iframe[src^="https://www.youtube-nocookie.com"]', 'iframe[src^="http://www.youtube.com"]', 'iframe[src^="https://player.vimeo"]', 'iframe[src^="http://player.vimeo"]']; // A list of tags to strip from the output if we encounter them. +var KEEP_SELECTORS$1 = ['iframe[src^="https://www.youtube.com"]', 'iframe[src^="https://www.youtube-nocookie.com"]', 'iframe[src^="http://www.youtube.com"]', 'iframe[src^="https://player.vimeo"]', 'iframe[src^="http://player.vimeo"]', 'iframe[src^="https://www.redditmedia.com"]']; // A list of tags to strip from the output if we encounter them. var STRIP_OUTPUT_TAGS$1 = ['title', 'script', 'noscript', 'link', 'style', 'hr', 'embed', 'iframe', 'object']; // cleanAttributes -var WHITELIST_ATTRS$1 = ['src', 'srcset', 'href', 'class', 'id', 'alt', 'xlink:href', 'width', 'height']; +var WHITELIST_ATTRS$1 = ['src', 'srcset', 'sizes', 'type', 'href', 'class', 'id', 'alt', 'xlink:href', 'width', 'height']; var WHITELIST_ATTRS_RE$1 = new RegExp("^(".concat(WHITELIST_ATTRS$1.join('|'), ")$"), 'i'); // removeEmpty var CLEAN_CONDITIONALLY_TAGS$1 = ['ul', 'ol', 'table', 'div', 'button', 'form'].join(','); // cleanHeaders @@ -1453,11 +1418,12 @@ function rewriteTopLevel$$1(article, $) { return $; } -function absolutize$1($, rootUrl, attr, $content) { +function absolutize$1($, rootUrl, attr) { var baseUrl = $('base').attr('href'); - $("[".concat(attr, "]"), $content).each(function (_, node) { + $("[".concat(attr, "]")).each(function (_, node) { var attrs = getAttrs$1(node); var url = attrs[attr]; + if (!url) return; var absoluteUrl = URL$1.resolve(baseUrl || rootUrl, url); setAttr$1(node, attr, absoluteUrl); }); @@ -1473,6 +1439,7 @@ function absolutizeSet$1($, rootUrl, $content) { // descriptors can only contain positive numbers followed immediately by either 'w' or 'x' // space characters inside the URL should be encoded (%20 or +) var candidates = urlSet.match(/(?:\s*)(\S+(?:\s*[\d.]+[wx])?)(?:\s*,\s*)?/g); + if (!candidates) return; var absoluteCandidates = candidates.map(function (candidate) { // a candidate URL cannot start or end with a comma // descriptors are separated from the URLs by unescaped whitespace @@ -1490,7 +1457,7 @@ function absolutizeSet$1($, rootUrl, $content) { function makeLinksAbsolute$$1($content, $, url) { ['href', 'src'].forEach(function (attr) { - return absolutize$1($, url, attr, $content); + return absolutize$1($, url, attr); }); absolutizeSet$1($, url, $content); return $content; @@ -1735,7 +1702,9 @@ function setAttrs$1(node, attrs) { var IS_LINK = new RegExp('https?://', 'i'); -var IS_IMAGE = new RegExp('.(png|gif|jpe?g)', 'i'); +var IMAGE_RE = '.(png|gif|jpe?g)'; +var IS_IMAGE = new RegExp("".concat(IMAGE_RE), 'i'); +var IS_SRCSET = new RegExp("".concat(IMAGE_RE, "(\\?\\S+)?(\\s*[\\d.]+[wx])"), 'i'); var TAGS_TO_REMOVE = ['script', 'style', 'form'].join(','); // lazy loaded images into normal images. // Many sites will have img tags with no source, or an image tag with a src // attribute that a is a placeholer. We need to be able to properly fill in @@ -1748,7 +1717,9 @@ function convertLazyLoadedImages($) { _Reflect$ownKeys$1(attrs).forEach(function (attr) { var value = attrs[attr]; - if (attr !== 'src' && IS_LINK.test(value) && IS_IMAGE.test(value)) { + if (attr !== 'srcset' && IS_LINK.test(value) && IS_SRCSET.test(value)) { + $(img).attr('srcset', value); + } else if (attr !== 'src' && attr !== 'srcset' && IS_LINK.test(value) && IS_IMAGE.test(value)) { $(img).attr('src', value); } }); @@ -1778,17 +1749,23 @@ var Resource = { // :param response: If set, use as the response rather than // attempting to fetch it ourselves. Expects a // string. + // :param headers: Custom headers to be included in the request create: function () { var _create = _asyncToGenerator( /*#__PURE__*/ _regeneratorRuntime.mark(function _callee(url, preparedResponse, parsedUrl) { - var result, validResponse; + var headers, + result, + validResponse, + _args = arguments; return _regeneratorRuntime.wrap(function _callee$(_context) { while (1) { switch (_context.prev = _context.next) { case 0: + headers = _args.length > 3 && _args[3] !== undefined ? _args[3] : {}; + if (!preparedResponse) { - _context.next = 5; + _context.next = 6; break; } @@ -1804,29 +1781,29 @@ var Resource = { body: preparedResponse, response: validResponse }; - _context.next = 8; + _context.next = 9; break; - case 5: - _context.next = 7; - return fetchResource(url, parsedUrl); + case 6: + _context.next = 8; + return fetchResource(url, parsedUrl, headers); - case 7: + case 8: result = _context.sent; - case 8: + case 9: if (!result.error) { - _context.next = 11; + _context.next = 12; break; } result.failed = true; return _context.abrupt("return", result); - case 11: + case 12: return _context.abrupt("return", this.generateDoc(result)); - case 12: + case 13: case "end": return _context.stop(); } @@ -1843,7 +1820,8 @@ var Resource = { generateDoc: function generateDoc(_ref) { var content = _ref.body, response = _ref.response; - var contentType = response.headers['content-type']; // TODO: Implement is_text function from + var _response$headers$con = response.headers['content-type'], + contentType = _response$headers$con === void 0 ? '' : _response$headers$con; // TODO: Implement is_text function from // https://github.com/ReadabilityHoldings/readability/blob/8dc89613241d04741ebd42fa9fa7df1b1d746303/readability/utils/text.py#L57 if (!contentType.includes('html') && !contentType.includes('text')) { @@ -1871,7 +1849,8 @@ var Resource = { var decodedContent = iconv.decode(content, encoding); var $ = cheerio$1.load(decodedContent); // after first cheerio.load, check to see if encoding matches - var metaContentType = $('meta[http-equiv=content-type i]').attr('content') || $('meta[charset]').attr('charset'); + var contentTypeSelector = cheerio$1.browser ? 'meta[http-equiv=content-type]' : 'meta[http-equiv=content-type i]'; + var metaContentType = $(contentTypeSelector).attr('content') || $('meta[charset]').attr('charset'); var properEncoding = getEncoding$1(metaContentType); // if encodings in the header/body dont match, use the one in the body if (metaContentType && properEncoding !== encoding) { @@ -1883,6 +1862,49 @@ var Resource = { } }; +var _marked = +/*#__PURE__*/ +_regeneratorRuntime.mark(range); + +function range() { + var start, + end, + _args = arguments; + return _regeneratorRuntime.wrap(function range$(_context) { + while (1) { + switch (_context.prev = _context.next) { + case 0: + start = _args.length > 0 && _args[0] !== undefined ? _args[0] : 1; + end = _args.length > 1 && _args[1] !== undefined ? _args[1] : 1; + + case 2: + if (!(start <= end)) { + _context.next = 7; + break; + } + + _context.next = 5; + return start += 1; + + case 5: + _context.next = 2; + break; + + case 7: + case "end": + return _context.stop(); + } + } + }, _marked, this); +} // extremely simple url validation as a first step + + +function validateUrl(_ref) { + var hostname = _ref.hostname; // If this isn't a valid url, return an error message + + return !!hostname; +} + var merge = function merge(extractor, domains) { return domains.reduce(function (acc, domain) { acc[domain] = extractor; @@ -1894,6 +1916,21 @@ function mergeSupportedDomains(extractor) { return extractor.supportedDomains ? merge(extractor, [extractor.domain].concat(_toConsumableArray$1(extractor.supportedDomains))) : merge(extractor, [extractor.domain]); } +var apiExtractors = {}; + +function addExtractor(extractor) { + if (!extractor || !extractor.domain) { + return { + error: true, + message: 'Unable to add custom extractor. Invalid parameters.' + }; + } + + _Object$assign(apiExtractors, mergeSupportedDomains(extractor)); + + return apiExtractors; +} + var BloggerExtractor = { domain: 'blogspot.com', content: { @@ -2051,25 +2088,30 @@ var NYTimesExtractor = { var TheAtlanticExtractor = { domain: 'www.theatlantic.com', title: { - selectors: ['h1.hed'] + selectors: ['h1', '.c-article-header__hed'] }, author: { - selectors: ['article#article .article-cover-extra .metadata .byline a'] + selectors: [['meta[name="author"]', 'value'], '.c-byline__author'] }, content: { - selectors: [['.article-cover figure.lead-img', '.article-body'], '.article-body'], + selectors: ['article', '.article-body'], // Is there anything in the content you selected that needs transformed // before it's consumable content? E.g., unusual lazy loaded images transforms: [], // Is there anything that is in the result that shouldn't be? // The clean selectors will remove anything that matches from // the result - clean: ['.partner-box', '.callout'] + clean: ['.partner-box', '.callout', '.c-article-writer__image', '.c-article-writer__content', '.c-letters-cta__text', '.c-footer__logo', '.c-recirculation-link', '.twitter-tweet'] + }, + dek: { + selectors: [['meta[name="description"]', 'value']] }, date_published: { - selectors: [['time[itemProp="datePublished"]', 'datetime']] + selectors: [['time[itemprop="datePublished"]', 'datetime']] + }, + lead_image_url: { + selectors: [['img[itemprop="url"]', 'src']] }, - lead_image_url: null, next_page_url: null, excerpt: null }; // Rename CustomExtractor @@ -2079,30 +2121,31 @@ var TheAtlanticExtractor = { var NewYorkerExtractor = { domain: 'www.newyorker.com', title: { - selectors: ['h1.title'] + selectors: ['h1[class^="ArticleHeader__hed"]', ['meta[name="og:title"]', 'value']] }, author: { - selectors: ['.contributors'] + selectors: ['div[class^="ArticleContributors"] a[rel="author"]', 'article header div[class*="Byline__multipleContributors"]'] }, content: { - selectors: ['div#articleBody', 'div.articleBody'], + selectors: ['main[class^="Layout__content"]'], // Is there anything in the content you selected that needs transformed // before it's consumable content? E.g., unusual lazy loaded images transforms: [], // Is there anything that is in the result that shouldn't be? // The clean selectors will remove anything that matches from // the result - clean: [] + clean: ['footer[class^="ArticleFooter__footer"]'] }, date_published: { - selectors: [['meta[name="article:published_time"]', 'value'], ['time[itemProp="datePublished"]', 'content']], + selectors: [['meta[name="pubdate"]', 'value']], + format: 'YYYYMMDD', timezone: 'America/New_York' }, lead_image_url: { selectors: [['meta[name="og:image"]', 'value']] }, dek: { - selectors: ['.dek', 'h2.dek'] + selectors: ['h2[class^="ArticleHeader__dek"]'] }, next_page_url: null, excerpt: null @@ -2351,7 +2394,7 @@ var PoliticoExtractor = { }; var DeadspinExtractor = { domain: 'deadspin.com', - supportedDomains: ['jezebel.com', 'lifehacker.com', 'kotaku.com', 'gizmodo.com', 'jalopnik.com', 'kinja.com'], + supportedDomains: ['jezebel.com', 'lifehacker.com', 'kotaku.com', 'gizmodo.com', 'jalopnik.com', 'kinja.com', 'avclub.com', 'clickhole.com', 'splinternews.com', 'theonion.com', 'theroot.com', 'thetakeout.com', 'theinventory.com'], title: { selectors: ['h1.headline'] }, @@ -2479,15 +2522,14 @@ var ApartmentTherapyExtractor = { }; var MediumExtractor = { domain: 'medium.com', - supportedDomains: ['trackchanges.postlight.com'], title: { - selectors: ['h1'] + selectors: ['h1', ['meta[name="og:title"]', 'value']] }, author: { selectors: [['meta[name="author"]', 'value']] }, content: { - selectors: [['.section-content'], '.section-content', 'article > div > section'], + selectors: ['article'], // Is there anything in the content you selected that needs transformed // before it's consumable content? E.g., unusual lazy loaded images transforms: { @@ -2495,6 +2537,7 @@ var MediumExtractor = { iframe: function iframe($node) { var ytRe = /https:\/\/i.embed.ly\/.+url=https:\/\/i\.ytimg\.com\/vi\/(\w+)\//; var thumb = decodeURIComponent($node.attr('data-thumbnail')); + var $parent = $node.parents('figure'); if (ytRe.test(thumb)) { var _thumb$match = thumb.match(ytRe), @@ -2504,10 +2547,13 @@ var MediumExtractor = { $node.attr('src', "https://www.youtube.com/embed/".concat(youtubeId)); - var $parent = $node.parents('figure'); var $caption = $parent.find('figcaption'); $parent.empty().append([$node, $caption]); - } + return; + } // If we can't draw the YouTube preview, remove the figure. + + + $parent.remove(); }, // rewrite figures to pull out image and caption, remove rest figure: function figure($node) { @@ -2516,23 +2562,27 @@ var MediumExtractor = { var $img = $node.find('img').slice(-1)[0]; var $caption = $node.find('figcaption'); $node.empty().append([$img, $caption]); + }, + // Remove any smaller images that did not get caught by the generic image + // cleaner (author photo 48px, leading sentence images 79px, etc.). + img: function img($node) { + var width = _parseInt$1($node.attr('width'), 10); + + if (width < 100) $node.remove(); } }, // Is there anything that is in the result that shouldn't be? // The clean selectors will remove anything that matches from // the result - clean: [] + clean: ['span', 'svg'] }, date_published: { - selectors: [['time[datetime]', 'datetime']] + selectors: [['meta[name="article:published_time"]', 'value']] }, lead_image_url: { selectors: [['meta[name="og:image"]', 'value']] }, - dek: { - selectors: [// enter selectors - ] - }, + dek: null, next_page_url: { selectors: [// enter selectors ] @@ -3543,7 +3593,7 @@ var WwwNydailynewsComExtractor = { var WwwCnbcComExtractor = { domain: 'www.cnbc.com', title: { - selectors: ['h1.title'] + selectors: ['h1.title', 'h1.ArticleHeader-headline'] }, author: { selectors: [['meta[name="author"]', 'value']] @@ -3555,7 +3605,7 @@ var WwwCnbcComExtractor = { selectors: [['meta[name="og:image"]', 'value']] }, content: { - selectors: ['div#article_body.content', 'div.story'], + selectors: ['div#article_body.content', 'div.story', 'div.ArticleBody-articleBody'], // Is there anything in the content you selected that needs transformed // before it's consumable content? E.g., unusual lazy loaded images transforms: {}, @@ -4749,80 +4799,1086 @@ var WwwFastcompanyComExtractor = { selectors: ['.post__article'] } }; +var BlisterreviewComExtractor = { + domain: 'blisterreview.com', + title: { + selectors: [['meta[name="og:title"]', 'value'], 'h1.entry-title'] + }, + author: { + selectors: ['span.author-name'] + }, + date_published: { + selectors: [['meta[name="article:published_time"]', 'value'], ['time.entry-date', 'datetime'], ['meta[itemprop="datePublished"]', 'content']] + }, + dek: { + selectors: [// enter selectors + ] + }, + lead_image_url: { + selectors: [['meta[name="og:image"]', 'value'], ['meta[property="og:image"]', 'content'], ['meta[itemprop="image"]', 'content'], ['meta[name="twitter:image"]', 'content'], ['img.attachment-large', 'src']] + }, + content: { + selectors: [['.elementor-section-wrap', '.elementor-text-editor > p, .elementor-text-editor > ul > li, .attachment-large, .wp-caption-text']], + // Is there anything in the content you selected that needs transformed + // before it's consumable content? E.g., unusual lazy loaded images + transforms: { + figcaption: 'p' + }, + // Is there anything that is in the result that shouldn't be? + // The clean selectors will remove anything that matches from + // the result + clean: ['.comments-area'] + } +}; +var NewsMynaviJpExtractor = { + domain: 'news.mynavi.jp', + title: { + selectors: [['meta[name="og:title"]', 'value']] + }, + author: { + selectors: ['main div.article-author a.article-author__name'] + }, + date_published: { + selectors: [['meta[name="article:published_time"]', 'value']] + }, + dek: { + selectors: [['meta[name="og:description"]', 'value']] + }, + lead_image_url: { + selectors: [['meta[name="og:image"]', 'value']] + }, + content: { + selectors: ['main article div'], + // Is there anything in the content you selected that needs transformed + // before it's consumable content? E.g., unusual lazy loaded images + transforms: { + img: function img($node) { + var src = $node.attr('data-original'); -var CustomExtractors = -/*#__PURE__*/ -_Object$freeze({ - BloggerExtractor: BloggerExtractor, - NYMagExtractor: NYMagExtractor, - WikipediaExtractor: WikipediaExtractor, - TwitterExtractor: TwitterExtractor, - NYTimesExtractor: NYTimesExtractor, - TheAtlanticExtractor: TheAtlanticExtractor, - NewYorkerExtractor: NewYorkerExtractor, - WiredExtractor: WiredExtractor, - MSNExtractor: MSNExtractor, - YahooExtractor: YahooExtractor, - BuzzfeedExtractor: BuzzfeedExtractor, - WikiaExtractor: WikiaExtractor, - LittleThingsExtractor: LittleThingsExtractor, - PoliticoExtractor: PoliticoExtractor, - DeadspinExtractor: DeadspinExtractor, - BroadwayWorldExtractor: BroadwayWorldExtractor, - ApartmentTherapyExtractor: ApartmentTherapyExtractor, - MediumExtractor: MediumExtractor, - WwwTmzComExtractor: WwwTmzComExtractor, - WwwWashingtonpostComExtractor: WwwWashingtonpostComExtractor, - WwwHuffingtonpostComExtractor: WwwHuffingtonpostComExtractor, - NewrepublicComExtractor: NewrepublicComExtractor, - MoneyCnnComExtractor: MoneyCnnComExtractor, - WwwThevergeComExtractor: WwwThevergeComExtractor, - WwwCnnComExtractor: WwwCnnComExtractor, - WwwAolComExtractor: WwwAolComExtractor, - WwwYoutubeComExtractor: WwwYoutubeComExtractor, - WwwTheguardianComExtractor: WwwTheguardianComExtractor, - WwwSbnationComExtractor: WwwSbnationComExtractor, - WwwBloombergComExtractor: WwwBloombergComExtractor, - WwwBustleComExtractor: WwwBustleComExtractor, - WwwNprOrgExtractor: WwwNprOrgExtractor, - WwwRecodeNetExtractor: WwwRecodeNetExtractor, - QzComExtractor: QzComExtractor, - WwwDmagazineComExtractor: WwwDmagazineComExtractor, - WwwReutersComExtractor: WwwReutersComExtractor, - MashableComExtractor: MashableComExtractor, - WwwChicagotribuneComExtractor: WwwChicagotribuneComExtractor, - WwwVoxComExtractor: WwwVoxComExtractor, - NewsNationalgeographicComExtractor: NewsNationalgeographicComExtractor, - WwwNationalgeographicComExtractor: WwwNationalgeographicComExtractor, - WwwLatimesComExtractor: WwwLatimesComExtractor, - PagesixComExtractor: PagesixComExtractor, - ThefederalistpapersOrgExtractor: ThefederalistpapersOrgExtractor, - WwwCbssportsComExtractor: WwwCbssportsComExtractor, - WwwMsnbcComExtractor: WwwMsnbcComExtractor, - WwwThepoliticalinsiderComExtractor: WwwThepoliticalinsiderComExtractor, - WwwMentalflossComExtractor: WwwMentalflossComExtractor, - AbcnewsGoComExtractor: AbcnewsGoComExtractor, - WwwNydailynewsComExtractor: WwwNydailynewsComExtractor, - WwwCnbcComExtractor: WwwCnbcComExtractor, - WwwPopsugarComExtractor: WwwPopsugarComExtractor, - ObserverComExtractor: ObserverComExtractor, - PeopleComExtractor: PeopleComExtractor, - WwwUsmagazineComExtractor: WwwUsmagazineComExtractor, - WwwRollingstoneComExtractor: WwwRollingstoneComExtractor, - twofortysevensportsComExtractor: twofortysevensportsComExtractor, - UproxxComExtractor: UproxxComExtractor, - WwwEonlineComExtractor: WwwEonlineComExtractor, - WwwMiamiheraldComExtractor: WwwMiamiheraldComExtractor, - WwwRefinery29ComExtractor: WwwRefinery29ComExtractor, - WwwMacrumorsComExtractor: WwwMacrumorsComExtractor, - WwwAndroidcentralComExtractor: WwwAndroidcentralComExtractor, - WwwSiComExtractor: WwwSiComExtractor, - WwwRawstoryComExtractor: WwwRawstoryComExtractor, - WwwCnetComExtractor: WwwCnetComExtractor, - WwwCinemablendComExtractor: WwwCinemablendComExtractor, - WwwTodayComExtractor: WwwTodayComExtractor, - WwwHowtogeekComExtractor: WwwHowtogeekComExtractor, - WwwAlComExtractor: WwwAlComExtractor, + if (src !== '') { + $node.attr('src', src); + } + } + }, + // Is there anything that is in the result that shouldn't be? + // The clean selectors will remove anything that matches from + // the result + clean: [] + } +}; +var ClinicaltrialsGovExtractor = { + domain: 'clinicaltrials.gov', + title: { + selectors: ['h1.tr-solo_record'] + }, + author: { + selectors: ['div#sponsor.tr-info-text'] + }, + date_published: { + // selectors: ['span.term[data-term="Last Update Posted"]'], + selectors: ['div:has(> span.term[data-term="Last Update Posted"])'] + }, + content: { + selectors: ['div#tab-body'], + // Is there anything in the content you selected that needs transformed + // before it's consumable content? E.g., unusual lazy loaded images + transforms: {}, + // Is there anything that is in the result that shouldn't be? + // The clean selectors will remove anything that matches from + // the result + clean: ['.usa-alert> img'] + } +}; +var GithubComExtractor = { + domain: 'github.com', + title: { + selectors: [['meta[name="og:title"]', 'value']] + }, + author: { + selectors: [// enter author selectors + ] + }, + date_published: { + selectors: [['span[itemprop="dateModified"] relative-time', 'datetime']] + }, + dek: { + selectors: ['span[itemprop="about"]'] + }, + lead_image_url: { + selectors: [['meta[name="og:image"]', 'value']] + }, + content: { + selectors: [['#readme article']], + // Is there anything in the content you selected that needs transformed + // before it's consumable content? E.g., unusual lazy loaded images + transforms: {}, + // Is there anything that is in the result that shouldn't be? + // The clean selectors will remove anything that matches from + // the result + clean: [] + } +}; +var WwwRedditComExtractor = { + domain: 'www.reddit.com', + title: { + selectors: ['div[data-test-id="post-content"] h2'] + }, + author: { + selectors: ['div[data-test-id="post-content"] a[href*="user/"]'] + }, + date_published: { + selectors: ['div[data-test-id="post-content"] a[data-click-id="timestamp"]'] + }, + lead_image_url: { + selectors: [['meta[name="og:image"]', 'value']] + }, + content: { + selectors: [['div[data-test-id="post-content"] p'], // text post + ['div[data-test-id="post-content"] a[target="_blank"]:not([data-click-id="timestamp"])', // external link + 'div[data-test-id="post-content"] div[data-click-id="media"]'], // external link with media preview (YouTube, imgur album, etc...) + ['div[data-test-id="post-content"] div[data-click-id="media"]'], // Embedded media (Reddit video) + ['div[data-test-id="post-content"] a[target="_blank"]:not([data-click-id="timestamp"])'], // external link + 'div[data-test-id="post-content"]'], + // Is there anything in the content you selected that needs transformed + // before it's consumable content? E.g., unusual lazy loaded images + transforms: { + 'div[role="img"]': function divRoleImg($node) { + // External link image preview + var $img = $node.find('img'); + var bgImg = $node.css('background-image'); + + if ($img.length === 1 && bgImg) { + $img.attr('src', bgImg.match(/\((.*?)\)/)[1].replace(/('|")/g, '')); + return $img; + } + + return $node; + } + }, + // Is there anything that is in the result that shouldn't be? + // The clean selectors will remove anything that matches from + // the result + clean: ['.icon'] + } +}; +var OtrsComExtractor = { + domain: 'otrs.com', + title: { + selectors: ['#main article h1'] + }, + author: { + selectors: ['div.dateplusauthor a'] + }, + date_published: { + selectors: [['meta[name="article:published_time"]', 'value']] + }, + dek: { + selectors: [['meta[name="og:description"]', 'value']] + }, + lead_image_url: { + selectors: [['meta[name="og:image"]', 'value']] + }, + content: { + selectors: ['#main article'], + defaultCleaner: false, + transforms: {}, + clean: ['div.dateplusauthor', 'div.gr-12.push-6.footershare', '#atftbx', 'div.category-modul'] + } +}; +var WwwOssnewsJpExtractor = { + domain: 'www.ossnews.jp', + title: { + selectors: ['#alpha-block h1.hxnewstitle'] + }, + author: null, + date_published: { + selectors: ['p.fs12'], + format: 'YYYY年MM月DD日 HH:mm', + timezone: 'Asia/Tokyo' + }, + dek: null, + lead_image_url: { + selectors: [['meta[name="og:image"]', 'value']] + }, + content: { + selectors: ['#alpha-block .section:has(h1.hxnewstitle)'], + defaultCleaner: false, + transforms: {}, + clean: [] + } +}; +var BuzzapJpExtractor = { + domain: 'buzzap.jp', + title: { + selectors: ['h1.entry-title'] + }, + author: null, + date_published: { + selectors: [['time.entry-date', 'datetime']] + }, + dek: null, + lead_image_url: { + selectors: [['meta[name="og:image"]', 'value']] + }, + content: { + selectors: ['div.ctiframe'], + defaultCleaner: false, + transforms: {}, + clean: [] + } +}; +var WwwAsahiComExtractor = { + domain: 'www.asahi.com', + title: { + selectors: ['.ArticleTitle h1'] + }, + author: { + selectors: [['meta[name="article:author"]', 'value']] + }, + date_published: { + selectors: [['meta[name="pubdate"]', 'value']] + }, + dek: null, + excerpt: { + selectors: [['meta[name="og:description"]', 'value']] + }, + lead_image_url: { + selectors: [['meta[name="og:image"]', 'value']] + }, + content: { + selectors: ['#MainInner div.ArticleBody'], + defaultCleaner: false, + transforms: {}, + clean: ['div.AdMod', 'div.LoginSelectArea'] + } +}; +var WwwSanwaCoJpExtractor = { + domain: 'www.sanwa.co.jp', + title: { + selectors: ['#newsContent h1'] + }, + author: null, + date_published: { + selectors: ['p.date'], + format: 'YYYY.MM.DD', + timezone: 'Asia/Tokyo' + }, + dek: { + selectors: [['meta[name="og:description"]', 'value']] + }, + lead_image_url: { + selectors: [['meta[name="og:image"]', 'value']] + }, + content: { + selectors: ['#newsContent'], + defaultCleaner: false, + transforms: {}, + clean: ['#smartphone', 'div.sns_box', 'div.contentFoot'] + } +}; +var WwwElecomCoJpExtractor = { + domain: 'www.elecom.co.jp', + title: { + selectors: ['title'] + }, + author: null, + date_published: { + selectors: ['p.section-last'], + format: 'YYYY.MM.DD', + timezone: 'Asia/Tokyo' + }, + dek: null, + lead_image_url: null, + content: { + selectors: ['td.TableMain2'], + defaultCleaner: false, + transforms: { + table: function table($node) { + $node.attr('width', 'auto'); + } + }, + clean: [] + } +}; +var ScanNetsecurityNeJpExtractor = { + domain: 'scan.netsecurity.ne.jp', + title: { + selectors: ['header.arti-header h1.head'] + }, + author: null, + date_published: { + selectors: [['meta[name="article:modified_time"]', 'value']] + }, + dek: { + selectors: ['header.arti-header p.arti-summary'] + }, + lead_image_url: { + selectors: [['meta[name="og:image"]', 'value']] + }, + content: { + selectors: ['div.arti-content.arti-content--thumbnail'], + defaultCleaner: false, + transforms: {}, + clean: ['aside.arti-giga'] + } +}; +var JvndbJvnJpExtractor = { + domain: 'jvndb.jvn.jp', + title: { + selectors: ['title'] + }, + author: null, + date_published: { + selectors: ['div.modifytxt:nth-child(2)'], + format: 'YYYY/MM/DD', + timezone: 'Asia/Tokyo' + }, + dek: null, + lead_image_url: null, + content: { + selectors: ['#news-list'], + defaultCleaner: false, + transforms: {}, + clean: [] + } +}; +var GeniusComExtractor = { + domain: 'genius.com', + title: { + selectors: ['h1'] + }, + author: { + selectors: ['h2 a'] + }, + date_published: { + selectors: [['meta[itemprop=page_data]', 'value', function (res) { + var json = JSON.parse(res); + return json.song.release_date; + }]] + }, + dek: { + selectors: [// enter selectors + ] + }, + lead_image_url: { + selectors: [['meta[itemprop=page_data]', 'value', function (res) { + var json = JSON.parse(res); + return json.song.album.cover_art_url; + }]] + }, + content: { + selectors: ['.lyrics'], + // Is there anything in the content you selected that needs transformed + // before it's consumable content? E.g., unusual lazy loaded images + transforms: {}, + // Is there anything that is in the result that shouldn't be? + // The clean selectors will remove anything that matches from + // the result + clean: [] + } +}; +var WwwJnsaOrgExtractor = { + domain: 'www.jnsa.org', + title: { + selectors: ['#wgtitle h2'] + }, + author: null, + date_published: null, + dek: null, + excerpt: { + selectors: [['meta[name="og:description"]', 'value']] + }, + lead_image_url: { + selectors: [['meta[name="og:image"]', 'value']] + }, + content: { + selectors: ['#main_area'], + transforms: {}, + clean: ['#pankuzu', '#side'] + } +}; +var PhpspotOrgExtractor = { + domain: 'phpspot.org', + title: { + selectors: ['h3.hl'] + }, + author: null, + date_published: { + selectors: ['h4.hl'], + format: 'YYYY年MM月DD日', + timezone: 'Asia/Tokyo' + }, + dek: null, + lead_image_url: null, + content: { + selectors: ['div.entrybody'], + defaultCleaner: false, + transforms: {}, + clean: [] + } +}; +var WwwInfoqComExtractor = { + domain: 'www.infoq.com', + title: { + selectors: ['h1.heading'] + }, + author: { + selectors: ['div.widget.article__authors'] + }, + date_published: { + selectors: ['.article__readTime.date'], + format: 'YYYY年MM月DD日', + timezone: 'Asia/Tokyo' + }, + dek: { + selectors: [['meta[name="og:description"]', 'value']] + }, + lead_image_url: { + selectors: [['meta[name="og:image"]', 'value']] + }, + content: { + selectors: ['div.article__data'], + defaultCleaner: false, + transforms: {}, + clean: [] + } +}; +var WwwMoongiftJpExtractor = { + domain: 'www.moongift.jp', + title: { + selectors: ['h1.title a'] + }, + author: null, + date_published: { + selectors: ['ul.meta li:not(.social):first-of-type'], + timezone: 'Asia/Tokyo' + }, + dek: { + selectors: [['meta[name="og:description"]', 'value']] + }, + lead_image_url: { + selectors: [['meta[name="og:image"]', 'value']] + }, + content: { + selectors: ['#main'], + transforms: {}, + clean: ['ul.mg_service.cf'] + } +}; +var WwwItmediaCoJpExtractor = { + domain: 'www.itmedia.co.jp', + supportedDomains: ['www.atmarkit.co.jp', 'techtarget.itmedia.co.jp', 'nlab.itmedia.co.jp'], + title: { + selectors: ['#cmsTitle h1'] + }, + author: { + selectors: ['#byline'] + }, + date_published: { + selectors: [['meta[name="article:modified_time"]', 'value']] + }, + dek: { + selectors: ['#cmsAbstract h2'] + }, + lead_image_url: { + selectors: [['meta[name="og:image"]', 'value']] + }, + content: { + selectors: ['#cmsBody'], + defaultCleaner: false, + transforms: {}, + clean: ['#snsSharebox'] + } +}; +var WwwPublickey1JpExtractor = { + domain: 'www.publickey1.jp', + title: { + selectors: ['h1'] + }, + author: { + selectors: ['#subcol p:has(img)'] + }, + date_published: { + selectors: ['div.pubdate'], + format: 'YYYY年MM月DD日', + timezone: 'Asia/Tokyo' + }, + dek: null, + lead_image_url: { + selectors: [['meta[name="og:image"]', 'value']] + }, + content: { + selectors: ['#maincol'], + defaultCleaner: false, + transforms: {}, + clean: ['#breadcrumbs', 'div.sbm', 'div.ad_footer'] + } +}; +var TakagihiromitsuJpExtractor = { + domain: 'takagi-hiromitsu.jp', + title: { + selectors: ['h3'] + }, + author: { + selectors: [['meta[name="author"]', 'value']] + }, + date_published: { + selectors: [['meta[http-equiv="Last-Modified"]', 'value']] + }, + dek: null, + lead_image_url: null, + content: { + selectors: ['div.body'], + defaultCleaner: false, + transforms: {}, + clean: [] + } +}; +var BookwalkerJpExtractor = { + domain: 'bookwalker.jp', + title: { + selectors: ['h1.main-heading'] + }, + author: { + selectors: ['div.authors'] + }, + date_published: { + selectors: ['.work-info .work-detail:first-of-type .work-detail-contents:last-of-type'], + timezone: 'Asia/Tokyo' + }, + dek: null, + lead_image_url: { + selectors: [['meta[name="og:image"]', 'value']] + }, + content: { + selectors: [['div.main-info', 'div.main-cover-inner']], + defaultCleaner: false, + transforms: {}, + clean: ['span.label.label--trial', 'dt.info-head.info-head--coin', 'dd.info-contents.info-contents--coin', 'div.info-notice.fn-toggleClass'] + } +}; +var WwwYomiuriCoJpExtractor = { + domain: 'www.yomiuri.co.jp', + title: { + selectors: ['h1.title-article.c-article-title'] + }, + author: null, + date_published: { + selectors: [['meta[name="article:published_time"]', 'value']] + }, + dek: null, + lead_image_url: { + selectors: [['meta[name="og:image"]', 'value']] + }, + content: { + selectors: ['div.p-main-contents'], + transforms: {}, + clean: [] + } +}; +var JapanCnetComExtractor = { + domain: 'japan.cnet.com', + title: { + selectors: ['.leaf-headline-ttl'] + }, + author: { + selectors: ['.writer'] + }, + date_published: { + selectors: ['.date'], + format: 'YYYY年MM月DD日 HH時mm分', + timezone: 'Asia/Tokyo' + }, + dek: null, + lead_image_url: { + selectors: [['meta[name="og:image"]', 'value']] + }, + content: { + selectors: ['div.article_body'], + transforms: {}, + clean: [] + } +}; +var DeadlineComExtractor = { + domain: 'deadline.com', + title: { + selectors: ['h1'] + }, + author: { + selectors: ['section.author h3'] + }, + date_published: { + selectors: [['meta[name="article:published_time"]', 'value']] + }, + dek: null, + lead_image_url: { + selectors: [['meta[name="og:image"]', 'value']] + }, + content: { + selectors: ['div.a-article-grid__main.pmc-a-grid article.pmc-a-grid-item'], + transforms: { + '.embed-twitter': function embedTwitter($node) { + var innerHtml = $node.html(); + $node.replaceWith(innerHtml); + } + }, + clean: [] + } +}; +var WwwGizmodoJpExtractor = { + domain: 'www.gizmodo.jp', + title: { + selectors: ['h1.p-post-title'] + }, + author: { + selectors: ['li.p-post-AssistAuthor'] + }, + date_published: { + selectors: [['li.p-post-AssistTime time', 'datetime']] + }, + dek: null, + lead_image_url: { + selectors: [['meta[name="og:image"]', 'value']] + }, + content: { + selectors: ['article.p-post'], + transforms: { + 'img.p-post-thumbnailImage': function imgPPostThumbnailImage($node) { + var src = $node.attr('src'); + $node.attr('src', src.replace(/^.*=%27/, '').replace(/%27;$/, '')); + } + }, + clean: ['h1.p-post-title', 'ul.p-post-Assist'] + } +}; +var GetnewsJpExtractor = { + domain: 'getnews.jp', + title: { + selectors: ['article h1'] + }, + author: { + selectors: ['span.prof'] + }, + date_published: { + selectors: [['ul.cattag-top time', 'datetime']] + }, + dek: null, + lead_image_url: { + selectors: [['meta[name="og:image"]', 'value']] + }, + content: { + selectors: ['div.post-bodycopy'], + transforms: {}, + clean: [] + } +}; +var WwwLifehackerJpExtractor = { + domain: 'www.lifehacker.jp', + title: { + selectors: ['h1.lh-summary-title'] + }, + author: { + selectors: ['p.lh-entryDetailInner--credit'] + }, + date_published: { + selectors: [['div.lh-entryDetail-header time', 'datetime']] + }, + dek: null, + lead_image_url: { + selectors: [['meta[name="og:image"]', 'value']] + }, + content: { + selectors: ['div.lh-entryDetail-body'], + transforms: { + 'img.lazyload': function imgLazyload($node) { + var src = $node.attr('src'); + $node.attr('src', src.replace(/^.*=%27/, '').replace(/%27;$/, '')); + } + }, + clean: ['p.lh-entryDetailInner--credit'] + } +}; +var SectIijAdJpExtractor = { + domain: 'sect.iij.ad.jp', + title: { + selectors: ['h3'] + }, + author: { + selectors: ['dl.entrydate dd'] + }, + date_published: { + selectors: ['dl.entrydate dd'], + format: 'YYYY年MM月DD日', + timezone: 'Asia/Tokyo' + }, + dek: null, + lead_image_url: { + selectors: [['meta[name="og:image"]', 'value']] + }, + content: { + selectors: ['#article'], + transforms: {}, + clean: ['dl.entrydate'] + } +}; +var WwwOreillyCoJpExtractor = { + domain: 'www.oreilly.co.jp', + title: { + selectors: ['h3'] + }, + author: { + selectors: ['li[itemprop="author"]'] + }, + date_published: { + selectors: [['meta[itemprop="datePublished"]', 'value']], + timezone: 'Asia/Tokyo' + }, + dek: null, + lead_image_url: { + selectors: [['meta[name="og:image"]', 'value']] + }, + content: { + selectors: ['#content'], + defaultCleaner: false, + transforms: {}, + clean: ['.social-tools'] + } +}; +var WwwIpaGoJpExtractor = { + domain: 'www.ipa.go.jp', + title: { + selectors: ['h1'] + }, + author: null, + date_published: { + selectors: ['p.ipar_text_right'], + format: 'YYYY年M月D日', + timezone: 'Asia/Tokyo' + }, + dek: null, + lead_image_url: null, + content: { + selectors: ['#ipar_main'], + defaultCleaner: false, + transforms: {}, + clean: ['p.ipar_text_right'] + } +}; +var WeeklyAsciiJpExtractor = { + domain: 'weekly.ascii.jp', + title: { + selectors: ['h1[itemprop="headline"]'] + }, + author: { + selectors: ['p.author'] + }, + date_published: { + selectors: [['meta[name="odate"]', 'value']] + }, + dek: null, + lead_image_url: { + selectors: [['meta[name="og:image"]', 'value']] + }, + content: { + selectors: ['div.article'], + transforms: {}, + clean: [] + } +}; +var TechlogIijAdJpExtractor = { + domain: 'techlog.iij.ad.jp', + title: { + selectors: ['h1.entry-title'] + }, + author: { + selectors: ['a[rel="author"]'] + }, + date_published: { + selectors: [['time.entry-date', 'datetime']] + }, + dek: null, + lead_image_url: { + selectors: [['meta[name="og:image"]', 'value']] + }, + content: { + selectors: ['div.entry-content'], + defaultCleaner: false, + transforms: {}, + clean: [] + } +}; +var WiredJpExtractor = { + domain: 'wired.jp', + title: { + selectors: ['h1.post-title'] + }, + author: { + selectors: ['p[itemprop="author"]'] + }, + date_published: { + selectors: [['time', 'datetime']] + }, + dek: { + selectors: ['.post-intro'] + }, + lead_image_url: { + selectors: [['meta[name="og:image"]', 'value']] + }, + content: { + selectors: ['article.article-detail'], + transforms: { + 'img[data-original]': function imgDataOriginal($node) { + var dataOriginal = $node.attr('data-original'); + var src = $node.attr('src'); + var url = URL$1.resolve(src, dataOriginal); + $node.attr('src', url); + } + }, + clean: ['.post-category', 'time', 'h1.post-title', '.social-area-syncer'] + } +}; +var JapanZdnetComExtractor = { + domain: 'japan.zdnet.com', + title: { + selectors: ['h1'] + }, + author: { + selectors: [['meta[name="cXenseParse:author"]', 'value']] + }, + date_published: { + selectors: [['meta[name="article:published_time"]', 'value']] + }, + dek: null, + lead_image_url: { + selectors: [['meta[name="og:image"]', 'value']] + }, + content: { + selectors: ['div.article_body'], + transforms: {}, + clean: [] + } +}; +var WwwRbbtodayComExtractor = { + domain: 'www.rbbtoday.com', + title: { + selectors: ['h1'] + }, + author: { + selectors: ['.writer.writer-name'] + }, + date_published: { + selectors: [['header time', 'datetime']] + }, + dek: { + selectors: ['.arti-summary'] + }, + lead_image_url: { + selectors: [['meta[name="og:image"]', 'value']] + }, + content: { + selectors: ['.arti-content'], + transforms: {}, + clean: ['.arti-giga'] + } +}; +var WwwLemondeFrExtractor = { + domain: 'www.lemonde.fr', + title: { + selectors: ['h1.article__title'] + }, + author: { + selectors: ['.author__name'] + }, + date_published: { + selectors: [['meta[name="og:article:published_time"]', 'value']] + }, + dek: { + selectors: ['.article__desc'] + }, + lead_image_url: { + selectors: [['meta[name="og:image"]', 'value']] + }, + content: { + selectors: ['.article__content'], + transforms: {}, + clean: [] + } +}; +var WwwPhoronixComExtractor = { + domain: 'www.phoronix.com', + title: { + selectors: ['article header'] + }, + author: { + selectors: ['.author a:first-child'] + }, + date_published: { + selectors: ['.author'], + // 1 June 2019 at 08:34 PM EDT + format: 'D MMMM YYYY at hh:mm', + timezone: 'America/New_York' + }, + dek: null, + lead_image_url: null, + content: { + selectors: ['.content'], + // Is there anything in the content you selected that needs transformed + // before it's consumable content? E.g., unusual lazy loaded images + transforms: {}, + // Is there anything that is in the result that shouldn't be? + // The clean selectors will remove anything that matches from + // the result + clean: [] + } +}; +var PitchforkComExtractor = { + domain: 'pitchfork.com', + title: { + selectors: ['title'] + }, + author: { + selectors: ['.authors-detail__display-name'] + }, + date_published: { + selectors: [['.pub-date', 'datetime']] + }, + dek: { + selectors: ['.review-detail__abstract'] + }, + lead_image_url: { + selectors: [['.single-album-tombstone__art img', 'src']] + }, + content: { + selectors: ['.review-detail__text'] + }, + extend: { + score: { + selectors: ['.score'] + } + } +}; +var BiorxivOrgExtractor = { + domain: 'biorxiv.org', + title: { + selectors: ['h1#page-title'] + }, + author: { + selectors: ['div.highwire-citation-biorxiv-article-top > div.highwire-cite-authors'] + }, + content: { + selectors: ['div#abstract-1'], + // Is there anything in the content you selected that needs transformed + // before it's consumable content? E.g., unusual lazy loaded images + transforms: {}, + // Is there anything that is in the result that shouldn't be? + // The clean selectors will remove anything that matches from + // the result + clean: [] + } +}; +var EpaperZeitDeExtractor = { + domain: 'epaper.zeit.de', + title: { + selectors: ['p.title'] + }, + author: { + selectors: ['.article__author'] + }, + date_published: null, + excerpt: { + selectors: ['subtitle'] + }, + lead_image_url: null, + content: { + selectors: ['.article'], + // Is there anything in the content you selected that needs transformed + // before it's consumable content? E.g., unusual lazy loaded images + transforms: { + 'p.title': 'h1', + '.article__author': 'p', + byline: 'p', + linkbox: 'p' + }, + // Is there anything that is in the result that shouldn't be? + // The clean selectors will remove anything that matches from + // the result + clean: ['image-credits', 'box[type=citation]'] + } +}; + +var CustomExtractors = +/*#__PURE__*/ +_Object$freeze({ + BloggerExtractor: BloggerExtractor, + NYMagExtractor: NYMagExtractor, + WikipediaExtractor: WikipediaExtractor, + TwitterExtractor: TwitterExtractor, + NYTimesExtractor: NYTimesExtractor, + TheAtlanticExtractor: TheAtlanticExtractor, + NewYorkerExtractor: NewYorkerExtractor, + WiredExtractor: WiredExtractor, + MSNExtractor: MSNExtractor, + YahooExtractor: YahooExtractor, + BuzzfeedExtractor: BuzzfeedExtractor, + WikiaExtractor: WikiaExtractor, + LittleThingsExtractor: LittleThingsExtractor, + PoliticoExtractor: PoliticoExtractor, + DeadspinExtractor: DeadspinExtractor, + BroadwayWorldExtractor: BroadwayWorldExtractor, + ApartmentTherapyExtractor: ApartmentTherapyExtractor, + MediumExtractor: MediumExtractor, + WwwTmzComExtractor: WwwTmzComExtractor, + WwwWashingtonpostComExtractor: WwwWashingtonpostComExtractor, + WwwHuffingtonpostComExtractor: WwwHuffingtonpostComExtractor, + NewrepublicComExtractor: NewrepublicComExtractor, + MoneyCnnComExtractor: MoneyCnnComExtractor, + WwwThevergeComExtractor: WwwThevergeComExtractor, + WwwCnnComExtractor: WwwCnnComExtractor, + WwwAolComExtractor: WwwAolComExtractor, + WwwYoutubeComExtractor: WwwYoutubeComExtractor, + WwwTheguardianComExtractor: WwwTheguardianComExtractor, + WwwSbnationComExtractor: WwwSbnationComExtractor, + WwwBloombergComExtractor: WwwBloombergComExtractor, + WwwBustleComExtractor: WwwBustleComExtractor, + WwwNprOrgExtractor: WwwNprOrgExtractor, + WwwRecodeNetExtractor: WwwRecodeNetExtractor, + QzComExtractor: QzComExtractor, + WwwDmagazineComExtractor: WwwDmagazineComExtractor, + WwwReutersComExtractor: WwwReutersComExtractor, + MashableComExtractor: MashableComExtractor, + WwwChicagotribuneComExtractor: WwwChicagotribuneComExtractor, + WwwVoxComExtractor: WwwVoxComExtractor, + NewsNationalgeographicComExtractor: NewsNationalgeographicComExtractor, + WwwNationalgeographicComExtractor: WwwNationalgeographicComExtractor, + WwwLatimesComExtractor: WwwLatimesComExtractor, + PagesixComExtractor: PagesixComExtractor, + ThefederalistpapersOrgExtractor: ThefederalistpapersOrgExtractor, + WwwCbssportsComExtractor: WwwCbssportsComExtractor, + WwwMsnbcComExtractor: WwwMsnbcComExtractor, + WwwThepoliticalinsiderComExtractor: WwwThepoliticalinsiderComExtractor, + WwwMentalflossComExtractor: WwwMentalflossComExtractor, + AbcnewsGoComExtractor: AbcnewsGoComExtractor, + WwwNydailynewsComExtractor: WwwNydailynewsComExtractor, + WwwCnbcComExtractor: WwwCnbcComExtractor, + WwwPopsugarComExtractor: WwwPopsugarComExtractor, + ObserverComExtractor: ObserverComExtractor, + PeopleComExtractor: PeopleComExtractor, + WwwUsmagazineComExtractor: WwwUsmagazineComExtractor, + WwwRollingstoneComExtractor: WwwRollingstoneComExtractor, + twofortysevensportsComExtractor: twofortysevensportsComExtractor, + UproxxComExtractor: UproxxComExtractor, + WwwEonlineComExtractor: WwwEonlineComExtractor, + WwwMiamiheraldComExtractor: WwwMiamiheraldComExtractor, + WwwRefinery29ComExtractor: WwwRefinery29ComExtractor, + WwwMacrumorsComExtractor: WwwMacrumorsComExtractor, + WwwAndroidcentralComExtractor: WwwAndroidcentralComExtractor, + WwwSiComExtractor: WwwSiComExtractor, + WwwRawstoryComExtractor: WwwRawstoryComExtractor, + WwwCnetComExtractor: WwwCnetComExtractor, + WwwCinemablendComExtractor: WwwCinemablendComExtractor, + WwwTodayComExtractor: WwwTodayComExtractor, + WwwHowtogeekComExtractor: WwwHowtogeekComExtractor, + WwwAlComExtractor: WwwAlComExtractor, WwwThepennyhoarderComExtractor: WwwThepennyhoarderComExtractor, WwwWesternjournalismComExtractor: WwwWesternjournalismComExtractor, FusionNetExtractor: FusionNetExtractor, @@ -4845,7 +5901,48 @@ _Object$freeze({ WwwSlateComExtractor: WwwSlateComExtractor, IciRadioCanadaCaExtractor: IciRadioCanadaCaExtractor, WwwFortinetComExtractor: WwwFortinetComExtractor, - WwwFastcompanyComExtractor: WwwFastcompanyComExtractor + WwwFastcompanyComExtractor: WwwFastcompanyComExtractor, + BlisterreviewComExtractor: BlisterreviewComExtractor, + NewsMynaviJpExtractor: NewsMynaviJpExtractor, + ClinicaltrialsGovExtractor: ClinicaltrialsGovExtractor, + GithubComExtractor: GithubComExtractor, + WwwRedditComExtractor: WwwRedditComExtractor, + OtrsComExtractor: OtrsComExtractor, + WwwOssnewsJpExtractor: WwwOssnewsJpExtractor, + BuzzapJpExtractor: BuzzapJpExtractor, + WwwAsahiComExtractor: WwwAsahiComExtractor, + WwwSanwaCoJpExtractor: WwwSanwaCoJpExtractor, + WwwElecomCoJpExtractor: WwwElecomCoJpExtractor, + ScanNetsecurityNeJpExtractor: ScanNetsecurityNeJpExtractor, + JvndbJvnJpExtractor: JvndbJvnJpExtractor, + GeniusComExtractor: GeniusComExtractor, + WwwJnsaOrgExtractor: WwwJnsaOrgExtractor, + PhpspotOrgExtractor: PhpspotOrgExtractor, + WwwInfoqComExtractor: WwwInfoqComExtractor, + WwwMoongiftJpExtractor: WwwMoongiftJpExtractor, + WwwItmediaCoJpExtractor: WwwItmediaCoJpExtractor, + WwwPublickey1JpExtractor: WwwPublickey1JpExtractor, + TakagihiromitsuJpExtractor: TakagihiromitsuJpExtractor, + BookwalkerJpExtractor: BookwalkerJpExtractor, + WwwYomiuriCoJpExtractor: WwwYomiuriCoJpExtractor, + JapanCnetComExtractor: JapanCnetComExtractor, + DeadlineComExtractor: DeadlineComExtractor, + WwwGizmodoJpExtractor: WwwGizmodoJpExtractor, + GetnewsJpExtractor: GetnewsJpExtractor, + WwwLifehackerJpExtractor: WwwLifehackerJpExtractor, + SectIijAdJpExtractor: SectIijAdJpExtractor, + WwwOreillyCoJpExtractor: WwwOreillyCoJpExtractor, + WwwIpaGoJpExtractor: WwwIpaGoJpExtractor, + WeeklyAsciiJpExtractor: WeeklyAsciiJpExtractor, + TechlogIijAdJpExtractor: TechlogIijAdJpExtractor, + WiredJpExtractor: WiredJpExtractor, + JapanZdnetComExtractor: JapanZdnetComExtractor, + WwwRbbtodayComExtractor: WwwRbbtodayComExtractor, + WwwLemondeFrExtractor: WwwLemondeFrExtractor, + WwwPhoronixComExtractor: WwwPhoronixComExtractor, + PitchforkComExtractor: PitchforkComExtractor, + BiorxivOrgExtractor: BiorxivOrgExtractor, + EpaperZeitDeExtractor: EpaperZeitDeExtractor }); var Extractors = _Object$keys(CustomExtractors).reduce(function (acc, key) { @@ -4863,6 +5960,10 @@ var SEC_DATE_STRING = /^\d{10}$/i; var CLEAN_DATE_STRING_RE = /^\s*published\s*:?\s*(.*)/i; var TIME_MERIDIAN_SPACE_RE = /(.*\d)(am|pm)(.*)/i; var TIME_MERIDIAN_DOTS_RE = /\.m\./i; +var TIME_NOW_STRING = /^\s*(just|right)?\s*now\s*/i; +var timeUnits = ['seconds?', 'minutes?', 'hours?', 'days?', 'weeks?', 'months?', 'years?']; +var allTimeUnits = timeUnits.join('|'); +var TIME_AGO_STRING = new RegExp("(\\d+)\\s+(".concat(allTimeUnits, ")\\s+ago"), 'i'); var months = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']; var allMonths = months.join('|'); var timestamp1 = '[0-9]{1,2}:[0-9]{2,2}( ?[ap].?m.?)?'; @@ -4916,6 +6017,15 @@ function createDate(dateString, timezone, format) { return moment(new Date(dateString)); } + if (TIME_AGO_STRING.test(dateString)) { + var fragments = TIME_AGO_STRING.exec(dateString); + return moment().subtract(fragments[1], fragments[2]); + } + + if (TIME_NOW_STRING.test(dateString)) { + return moment(); + } + return timezone ? moment.tz(dateString, format || parseFormat(dateString), timezone) : moment(dateString, format || parseFormat(dateString)); } // Take a date published string, and hopefully return a date out of // it. Return none if we fail. @@ -6080,9 +7190,7 @@ var GenericExtractor = { }, extract: function extract(options) { var html = options.html, - $ = options.$, - _options$contentType = options.contentType, - contentType = _options$contentType === void 0 ? 'html' : _options$contentType; + $ = options.$; if (html && !$) { var loaded = cheerio$1.load(html); @@ -6116,24 +7224,13 @@ var GenericExtractor = { url = _this$url_and_domain.url, domain = _this$url_and_domain.domain; - var convertedContent; - - if (contentType === 'html') { - convertedContent = content; - } else if (contentType === 'text') { - convertedContent = $.text(cheerio$1.load(content)); - } else if (contentType === 'markdown') { - var turndownService = new TurndownService(); - convertedContent = turndownService.turndown(content); - } - return { title: title, author: author, date_published: date_published || null, dek: dek, lead_image_url: lead_image_url, - content: convertedContent, + content: content, next_page_url: next_page_url, url: url, domain: domain, @@ -6161,7 +7258,7 @@ function getExtractor(url, parsedUrl, $) { var _parsedUrl = parsedUrl, hostname = _parsedUrl.hostname; var baseDomain = hostname.split('.').slice(-2).join('.'); - return Extractors[hostname] || Extractors[baseDomain] || detectByHtml($) || GenericExtractor; + return apiExtractors[hostname] || apiExtractors[baseDomain] || Extractors[hostname] || Extractors[baseDomain] || detectByHtml($) || GenericExtractor; } function cleanBySelectors($content, $, _ref) { @@ -6199,7 +7296,7 @@ function transformElements($content, $, _ref2) { return $content; } -function findMatchingSelector($, selectors, extractHtml) { +function findMatchingSelector($, selectors, extractHtml, allowMultiple) { return selectors.find(function (selector) { if (_Array$isArray(selector)) { if (extractHtml) { @@ -6212,10 +7309,10 @@ function findMatchingSelector($, selectors, extractHtml) { s = _selector[0], attr = _selector[1]; - return $(s).length === 1 && $(s).attr(attr) && $(s).attr(attr).trim() !== ''; + return (allowMultiple || !allowMultiple && $(s).length === 1) && $(s).attr(attr) && $(s).attr(attr).trim() !== ''; } - return $(selector).length === 1 && $(selector).text().trim() !== ''; + return (allowMultiple || !allowMultiple && $(selector).length === 1) && $(selector).text().trim() !== ''; }); } @@ -6224,9 +7321,7 @@ function select(opts) { type = opts.type, extractionOpts = opts.extractionOpts, _opts$extractHtml = opts.extractHtml, - extractHtml = _opts$extractHtml === void 0 ? false : _opts$extractHtml, - _opts$contentType = opts.contentType, - contentType = _opts$contentType === void 0 ? 'html' : _opts$contentType; // Skip if there's not extraction for this type + extractHtml = _opts$extractHtml === void 0 ? false : _opts$extractHtml; // Skip if there's not extraction for this type if (!extractionOpts) return null; // If a string is hardcoded for a type (e.g., Wikipedia // contributors), return the string @@ -6234,25 +7329,30 @@ function select(opts) { if (typeof extractionOpts === 'string') return extractionOpts; var selectors = extractionOpts.selectors, _extractionOpts$defau = extractionOpts.defaultCleaner, - defaultCleaner = _extractionOpts$defau === void 0 ? true : _extractionOpts$defau; - var matchingSelector = findMatchingSelector($, selectors, extractHtml); - if (!matchingSelector) return null; // Declaring result; will contain either - // text or html, which will be cleaned - // by the appropriate cleaner type - // If the selector type requests html as its return type - // transform and clean the element with provided selectors - - var $content; - - if (extractHtml) { - // If matching selector is an array, we're considering this a + defaultCleaner = _extractionOpts$defau === void 0 ? true : _extractionOpts$defau, + allowMultiple = extractionOpts.allowMultiple; + var matchingSelector = findMatchingSelector($, selectors, extractHtml, allowMultiple); + if (!matchingSelector) return null; + + function transformAndClean($node) { + makeLinksAbsolute$$1($node, $, opts.url || ''); + cleanBySelectors($node, $, extractionOpts); + transformElements($node, $, extractionOpts); + return $node; + } + + function selectHtml() { + // If the selector type requests html as its return type + // transform and clean the element with provided selectors + var $content; // If matching selector is an array, we're considering this a // multi-match selection, which allows the parser to choose several // selectors to include in the result. Note that all selectors in the // array must match in order for this selector to trigger + if (_Array$isArray(matchingSelector)) { $content = $(matchingSelector.join(',')); var $wrapper = $('
'); - $content.each(function (index, element) { + $content.each(function (_, element) { $wrapper.append(element); }); $content = $wrapper; @@ -6263,51 +7363,76 @@ function select(opts) { $content.wrap($('
')); $content = $content.parent(); - $content = transformElements($content, $, extractionOpts); - $content = cleanBySelectors($content, $, extractionOpts); - $content = Cleaners[type]($content, _objectSpread({}, opts, { - defaultCleaner: defaultCleaner - })); + $content = transformAndClean($content); - if (contentType === 'html') { - return $.html($content); + if (Cleaners[type]) { + Cleaners[type]($content, _objectSpread({}, opts, { + defaultCleaner: defaultCleaner + })); } - if (contentType === 'text') { - return $.text($content); + if (allowMultiple) { + return $content.children().toArray().map(function (el) { + return $.html($(el)); + }); } - if (contentType === 'markdown') { - var turndownService = new TurndownService(); - return turndownService.turndown($.html($content)); - } + return $.html($content); + } + + if (extractHtml) { + return selectHtml(matchingSelector); } + var $match; var result; // if selector is an array (e.g., ['img', 'src']), // extract the attr if (_Array$isArray(matchingSelector)) { - var _matchingSelector = _slicedToArray$1(matchingSelector, 2), + var _matchingSelector = _slicedToArray$1(matchingSelector, 3), selector = _matchingSelector[0], - attr = _matchingSelector[1]; - - result = $(selector).attr(attr).trim(); + attr = _matchingSelector[1], + transform = _matchingSelector[2]; + + $match = $(selector); + $match = transformAndClean($match); + result = $match.map(function (_, el) { + var item = $(el).attr(attr).trim(); + return transform ? transform(item) : item; + }); } else { - var $node = $(matchingSelector); - $node = cleanBySelectors($node, $, extractionOpts); - $node = transformElements($node, $, extractionOpts); - result = $node.text().trim(); - } // Allow custom extractor to skip default cleaner - // for this type; defaults to true + $match = $(matchingSelector); + $match = transformAndClean($match); + result = $match.map(function (_, el) { + return $(el).text().trim(); + }); + } + result = _Array$isArray(result.toArray()) && allowMultiple ? result.toArray() : result[0]; // Allow custom extractor to skip default cleaner + // for this type; defaults to true - if (defaultCleaner) { + if (defaultCleaner && Cleaners[type]) { return Cleaners[type](result, _objectSpread({}, opts, extractionOpts)); } return result; } +function selectExtendedTypes(extend, opts) { + var results = {}; + + _Reflect$ownKeys$1(extend).forEach(function (t) { + if (!results[t]) { + results[t] = select(_objectSpread({}, opts, { + type: t, + extractionOpts: extend[t] + })); + } + }); + + return results; +} + function extractResult(opts) { var type = opts.type, extractor = opts.extractor, @@ -6333,9 +7458,7 @@ var RootExtractor = { var opts = arguments.length > 1 ? arguments[1] : undefined; var _opts = opts, contentOnly = _opts.contentOnly, - extractedTitle = _opts.extractedTitle, - _opts$contentType2 = _opts.contentType, - contentType = _opts$contentType2 === void 0 ? 'html' : _opts$contentType2; // This is the generic extractor. Run its extract method + extractedTitle = _opts.extractedTitle; // This is the generic extractor. Run its extract method if (extractor.domain === '*') return extractor.extract(opts); opts = _objectSpread({}, opts, { @@ -6346,8 +7469,7 @@ var RootExtractor = { var _content = extractResult(_objectSpread({}, opts, { type: 'content', extractHtml: true, - title: extractedTitle, - contentType: contentType + title: extractedTitle })); return { @@ -6403,7 +7525,13 @@ var RootExtractor = { url = _ref3.url, domain = _ref3.domain; - return { + var extendedResults = {}; + + if (extractor.extend) { + extendedResults = selectExtendedTypes(extractor.extend, opts); + } + + return _objectSpread({ title: title, content: content, author: author, @@ -6416,7 +7544,7 @@ var RootExtractor = { excerpt: excerpt, word_count: word_count, direction: direction - }; + }, extendedResults); } }; @@ -6507,14 +7635,20 @@ var Mercury = { fallback, _opts$contentType, contentType, + _opts$headers, + headers, + extend, + customExtractor, parsedUrl, $, Extractor, metaCache, + extendedTypes, result, _result, title, next_page_url, + turndownService, _args = arguments; return _regeneratorRuntime.wrap(function _callee$(_context) { @@ -6522,7 +7656,7 @@ var Mercury = { switch (_context.prev = _context.next) { case 0: _ref = _args.length > 1 && _args[1] !== undefined ? _args[1] : {}, html = _ref.html, opts = _objectWithoutProperties(_ref, ["html"]); - _opts$fetchAllPages = opts.fetchAllPages, fetchAllPages = _opts$fetchAllPages === void 0 ? true : _opts$fetchAllPages, _opts$fallback = opts.fallback, fallback = _opts$fallback === void 0 ? true : _opts$fallback, _opts$contentType = opts.contentType, contentType = _opts$contentType === void 0 ? 'html' : _opts$contentType; // if no url was passed and this is the browser version, + _opts$fetchAllPages = opts.fetchAllPages, fetchAllPages = _opts$fetchAllPages === void 0 ? true : _opts$fetchAllPages, _opts$fallback = opts.fallback, fallback = _opts$fallback === void 0 ? true : _opts$fallback, _opts$contentType = opts.contentType, contentType = _opts$contentType === void 0 ? 'html' : _opts$contentType, _opts$headers = opts.headers, headers = _opts$headers === void 0 ? {} : _opts$headers, extend = opts.extend, customExtractor = opts.customExtractor; // if no url was passed and this is the browser version, // set url to window.location.href and load the html // from the current page @@ -6539,27 +7673,35 @@ var Mercury = { break; } - return _context.abrupt("return", Errors.badUrl); + return _context.abrupt("return", { + error: true, + message: 'The url parameter passed does not look like a valid URL. Please check your URL and try again.' + }); case 6: _context.next = 8; - return Resource.create(url, html, parsedUrl); + return Resource.create(url, html, parsedUrl, headers); case 8: $ = _context.sent; - Extractor = getExtractor(url, parsedUrl, $); // console.log(`Using extractor for ${Extractor.domain}`); - // If we found an error creating the resource, return that error if (!$.failed) { - _context.next = 12; + _context.next = 11; break; } return _context.abrupt("return", $); - case 12: + case 11: + // Add custom extractor via cli. + if (customExtractor) { + addExtractor(customExtractor); + } + + Extractor = getExtractor(url, parsedUrl, $); // console.log(`Using extractor for ${Extractor.domain}`); // if html still has not been set (i.e., url passed to Mercury.parse), // set html from the response of Resource.create + if (!html) { html = $.html(); } // Cached value of every meta name in our document. @@ -6569,6 +7711,16 @@ var Mercury = { metaCache = $('meta').map(function (_, node) { return $(node).attr('name'); }).toArray(); + extendedTypes = {}; + + if (extend) { + extendedTypes = selectExtendedTypes(extend, { + $: $, + url: url, + html: html + }); + } + result = RootExtractor.extract(Extractor, { url: url, html: html, @@ -6581,11 +7733,11 @@ var Mercury = { _result = result, title = _result.title, next_page_url = _result.next_page_url; // Fetch more pages if next_page_url found if (!(fetchAllPages && next_page_url)) { - _context.next = 22; + _context.next = 25; break; } - _context.next = 19; + _context.next = 22; return collectAllPages({ Extractor: Extractor, next_page_url: next_page_url, @@ -6597,21 +7749,28 @@ var Mercury = { url: url }); - case 19: + case 22: result = _context.sent; - _context.next = 23; + _context.next = 26; break; - case 22: + case 25: result = _objectSpread({}, result, { total_pages: 1, rendered_pages: 1 }); - case 23: - return _context.abrupt("return", result); + case 26: + if (contentType === 'markdown') { + turndownService = new TurndownService(); + result.content = turndownService.turndown(result.content); + } else if (contentType === 'text') { + result.content = $.text($(result.content)); + } - case 24: + return _context.abrupt("return", _objectSpread({}, result, extendedTypes)); + + case 28: case "end": return _context.stop(); } @@ -6630,6 +7789,9 @@ var Mercury = { // to work with, e.g., for custom extractor generator fetchResource: function fetchResource(url) { return Resource.create(url); + }, + addExtractor: function addExtractor$$1(extractor) { + return addExtractor(extractor); } }; var mercury = Mercury; @@ -6786,8 +7948,6 @@ function scaffoldCustomParser(url) { hostname = _URL$parse3.hostname; var newParser = false; - console.log("dir", dir); - console.log("fs.existsSync(dir)", fs.existsSync(dir)); if (!fs.existsSync(dir)) { newParser = true;