Skip to content

Commit

Permalink
release: 2.2.3 (#703)
Browse files Browse the repository at this point in the history
  • Loading branch information
johnholdun authored Oct 24, 2022
1 parent 635fcf6 commit ad8d4aa
Show file tree
Hide file tree
Showing 4 changed files with 64 additions and 11 deletions.
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,12 @@
# Mercury Parser Changelog

### 2.2.3 (Oct 24, 2022)

- [[`635fcf6356`](https://github.com/postlight/parser/commit/635fcf6356)] - **fix**: handle sec & ms timestamps properly (#702) (Austin)
- [[`ab401822aa`](https://github.com/postlight/parser/commit/ab401822aa)] - maintenance update - october 2022 (#696) (Michael Ashley)
- [[`8ca8a5f7e5`](https://github.com/postlight/parser/commit/8ca8a5f7e5)] - **feat**: add postlight.com custom extractor (#695) (Sarah Doire)
- [[`39b9ff55c4`](https://github.com/postlight/parser/commit/39b9ff55c4)] - **release**: 2.2.2 (#689) (John Holdun)

### 2.2.2 (Sept 08, 2022)

##### Commits
Expand Down
64 changes: 55 additions & 9 deletions dist/mercury.js
Original file line number Diff line number Diff line change
Expand Up @@ -1540,6 +1540,19 @@ var TAGS_TO_REMOVE = ['script', 'style', 'form'].join(',');
// the src attribute so the images are no longer lazy loaded.

function convertLazyLoadedImages($) {
// Try to pull an image URL out of a JSON-encoded attribute value.
// Returns the `src` property when `str` parses as JSON and `src` is a string;
// returns false for anything else (invalid JSON, null, missing or non-string src).
var extractSrcFromJSON = function extractSrcFromJSON(str) {
  var parsed;

  try {
    parsed = JSON.parse(str);
  } catch (e) {
    // Not JSON at all.
    return false;
  }

  // JSON.parse can yield null, which has no properties to read.
  if (parsed === null) return false;

  var candidate = parsed.src;
  return typeof candidate === 'string' ? candidate : false;
};

$('img').each(function (_, img) {
var attrs = getAttrs(img);

Expand All @@ -1549,7 +1562,14 @@ function convertLazyLoadedImages($) {
if (attr !== 'srcset' && IS_LINK.test(value) && IS_SRCSET.test(value)) {
$(img).attr('srcset', value);
} else if (attr !== 'src' && attr !== 'srcset' && IS_LINK.test(value) && IS_IMAGE.test(value)) {
$(img).attr('src', value);
// Is the value a JSON object? If so, we should attempt to extract the image src from the data.
var existingSrc = extractSrcFromJSON(value);

if (existingSrc) {
$(img).attr('src', existingSrc);
} else {
$(img).attr('src', value);
}
}
});
});
Expand Down Expand Up @@ -2388,6 +2408,14 @@ var MediumExtractor = {
// Is there anything in the content you selected that needs to be transformed
// before it's consumable content? E.g., unusual lazy-loaded images
transforms: {
// Allow drop cap character.
'section span:first-of-type': function sectionSpanFirstOfType($node) {
var $text = $node.html();

if ($text.length === 1 && /^[a-zA-Z()]+$/.test($text)) {
$node.replaceWith($text);
}
},
// Re-write lazy-loaded youtube videos
iframe: function iframe($node) {
var ytRe = /https:\/\/i.embed.ly\/.+url=https:\/\/i\.ytimg\.com\/vi\/(\w+)\//;
Expand Down Expand Up @@ -2429,7 +2457,7 @@ var MediumExtractor = {
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
clean: ['span', 'svg']
clean: ['span a', 'svg']
},
date_published: {
selectors: [['meta[name="article:published_time"]', 'value']]
Expand Down Expand Up @@ -6411,10 +6439,14 @@ function cleanDatePublished(dateString) {
format = _ref.format;

// If string is in milliseconds or seconds, convert to int and return
if (MS_DATE_STRING.test(dateString) || SEC_DATE_STRING.test(dateString)) {
if (MS_DATE_STRING.test(dateString)) {
return new Date(_parseInt(dateString, 10)).toISOString();
}

if (SEC_DATE_STRING.test(dateString)) {
return new Date(_parseInt(dateString, 10) * 1000).toISOString();
}

var date = createDate(dateString, timezone, format);

if (!date.isValid()) {
Expand Down Expand Up @@ -7546,13 +7578,26 @@ var GenericExcerptExtractor = {
}
};

// Primary word counter: loads the content with cheerio, takes the text of the
// first <div>, passes it through normalizeSpaces, and counts the pieces
// produced by splitting on single whitespace characters.
// Note: splitting an empty string still yields one piece, so the result is
// never less than 1.
var getWordCount = function getWordCount(content) {
  var $ = cheerio.load(content);
  var firstDivText = $('div').first().text();
  var pieces = normalizeSpaces(firstDivText).split(/\s/);
  return pieces.length;
};

// Fallback word counter that works on the raw markup string instead of a
// parsed document: every tag is replaced by a space, whitespace runs are
// collapsed to a single space, and the space-separated tokens are counted.
// Returns 0 when only markup/whitespace remains; splitting an empty string
// would otherwise report one "word".
var getWordCountAlt = function getWordCountAlt(content) {
  var text = content
    .replace(/<[^>]*>/g, ' ')
    .replace(/\s+/g, ' ')
    .trim();

  if (text === '') return 0;

  return text.split(' ').length;
};

var GenericWordCountExtractor = {
extract: function extract(_ref) {
var content = _ref.content;
var $ = cheerio.load(content);
var $content = $('div').first();
var text = normalizeSpaces($content.text());
return text.split(/\s/).length;
var count = getWordCount(content);
if (count === 1) count = getWordCountAlt(content);
return count;
}
};

Expand Down Expand Up @@ -7715,7 +7760,8 @@ function select(opts) {
_extractionOpts$defau = extractionOpts.defaultCleaner,
defaultCleaner = _extractionOpts$defau === void 0 ? true : _extractionOpts$defau,
allowMultiple = extractionOpts.allowMultiple;
var matchingSelector = findMatchingSelector($, selectors, extractHtml, allowMultiple);
var overrideAllowMultiple = type === 'lead_image_url' || allowMultiple;
var matchingSelector = findMatchingSelector($, selectors, extractHtml, overrideAllowMultiple);
if (!matchingSelector) return null;

function transformAndClean($node) {
Expand Down Expand Up @@ -7988,7 +8034,7 @@ function _collectAllPages() {
});
return _context.abrupt("return", _objectSpread({}, result, {
total_pages: pages,
pages_rendered: pages,
rendered_pages: pages,
word_count: word_count
}));

Expand Down
2 changes: 1 addition & 1 deletion dist/mercury.web.js

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "@postlight/parser",
"version": "2.2.2",
"version": "2.2.3",
"description": "Postlight Parser transforms web pages into clean text. Publishers and programmers use it to make the web make sense, and readers use it to read any web article comfortably.",
"author": "Postlight <mercury@postlight.com>",
"homepage": "https://reader.postlight.com",
Expand Down

0 comments on commit ad8d4aa

Please sign in to comment.