Skip to content

Commit

Permalink
feat: Add custom parser for Reddit (#307)
Browse files Browse the repository at this point in the history
  • Loading branch information
toufic-m authored and adampash committed Mar 8, 2019
1 parent 13581cd commit 7844129
Show file tree
Hide file tree
Showing 16 changed files with 491 additions and 1 deletion.
9 changes: 9 additions & 0 deletions fixtures/www.reddit.com/1551705199548.html

Large diffs are not rendered by default.

4 changes: 4 additions & 0 deletions fixtures/www.reddit.com/1552069905710.html

Large diffs are not rendered by default.

8 changes: 8 additions & 0 deletions fixtures/www.reddit.com/1552069933451.html

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions fixtures/www.reddit.com/1552069947100.html

Large diffs are not rendered by default.

5 changes: 5 additions & 0 deletions fixtures/www.reddit.com/1552069958273.html

Large diffs are not rendered by default.

29 changes: 29 additions & 0 deletions fixtures/www.reddit.com/1552069973740.html

Large diffs are not rendered by default.

23 changes: 23 additions & 0 deletions fixtures/www.reddit.com/1552069996237.html

Large diffs are not rendered by default.

7 changes: 7 additions & 0 deletions fixtures/www.reddit.com/1552070031501.html

Large diffs are not rendered by default.

15 changes: 15 additions & 0 deletions src/cleaners/constants.js
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,21 @@ export const SEC_DATE_STRING = /^\d{10}$/i;
export const CLEAN_DATE_STRING_RE = /^\s*published\s*:?\s*(.*)/i;
export const TIME_MERIDIAN_SPACE_RE = /(.*\d)(am|pm)(.*)/i;
export const TIME_MERIDIAN_DOTS_RE = /\.m\./i;
// Matches strings that begin with "now", optionally preceded by "just" or
// "right" (case-insensitive). The pattern is anchored at the start only.
export const TIME_NOW_STRING = /^\s*(just|right)?\s*now\s*/i;

// Unit names accepted in relative dates; each one gets an optional plural "s".
const timeUnits = [
  'second',
  'minute',
  'hour',
  'day',
  'week',
  'month',
  'year',
].map(unit => `${unit}s?`);
const allTimeUnits = timeUnits.join('|');

// Matches "<amount> <unit> ago" anywhere in a string (case-insensitive).
// Capture group 1 is the numeric amount, capture group 2 is the unit.
export const TIME_AGO_STRING = new RegExp(
  ['(\\d+)', `(${allTimeUnits})`, 'ago'].join('\\s+'),
  'i'
);
const months = [
'jan',
'feb',
Expand Down
11 changes: 11 additions & 0 deletions src/cleaners/date-published.js
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ import {
SEC_DATE_STRING,
CLEAN_DATE_STRING_RE,
SPLIT_DATE_STRING,
TIME_AGO_STRING,
TIME_NOW_STRING,
TIME_MERIDIAN_SPACE_RE,
TIME_MERIDIAN_DOTS_RE,
TIME_WITH_OFFSET_RE,
Expand All @@ -28,6 +30,15 @@ export function createDate(dateString, timezone, format) {
return moment(new Date(dateString));
}

if (TIME_AGO_STRING.test(dateString)) {
const fragments = TIME_AGO_STRING.exec(dateString);
return moment().subtract(fragments[1], fragments[2]);
}

if (TIME_NOW_STRING.test(dateString)) {
return moment();
}

return timezone
? moment.tz(dateString, format || parseFormat(dateString), timezone)
: moment(dateString, format || parseFormat(dateString));
Expand Down
64 changes: 64 additions & 0 deletions src/cleaners/date-published.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,70 @@ describe('cleanDatePublished(dateString)', () => {
});
assert.equal(datePublished, '2015-08-03T16:45:00.000Z');
});

it('can handle dates formatted as "[just|right] now"', () => {
  // Only the calendar-day portion (the text before "T") is compared, because
  // the parsed value and the expectation are computed at different instants.
  const dayOf = m => m.format().split('T')[0];

  ['now', 'just now', 'right now'].forEach(input => {
    const parsed = cleanDatePublished(input);
    const actualDay = dayOf(moment(parsed));
    const expectedDay = dayOf(moment());
    assert.equal(actualDay, expectedDay);
  });
});

it('can handle dates formatted as "[amount] [time unit] ago"', () => {
  // Relative dates are approximate, so only the calendar day is checked:
  // "X days ago" will not be accurate down to the exact time, and
  // "X months ago" will not be accurate down to the exact day.
  const dayOf = m => m.format().split('T')[0];

  // Each case: [input string, amount to subtract, unit to subtract]
  const cases = [
    ['1 hour ago', 1, 'hour'],
    ['5 days ago', 5, 'days'],
    ['10 months ago', 10, 'months'],
  ];

  cases.forEach(([input, amount, unit]) => {
    const parsed = cleanDatePublished(input);
    const actualDay = dayOf(moment(parsed));
    const expectedDay = dayOf(moment().subtract(amount, unit));
    assert.equal(actualDay, expectedDay);
  });
});
});

describe('cleanDateString(dateString)', () => {
Expand Down
22 changes: 22 additions & 0 deletions src/extractors/custom/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,28 @@ export const ExampleExtractor = {
This is all you'll need to know to handle most of the fields Mercury parses (titles, authors, date published, etc.). Article content is the exception.
#### Content selectors
If you pass an array as a selector for the content field, it behaves differently from the attribute selectors used for other field types. Here the array is treated as a multi-match selector: the parser includes every selector listed in the array in the result, along with all occurrences of each matching selector.
Note that all selectors in the array must match in order for this selector to trigger.
```javascript
export const ExampleExtractor = {
...

// Attempt to match both the content and image
// before falling back to just the content
content: {
selectors: [
['.parsys.content', '.__image-lead__'],
'.content'
],
},

...
```
### Cleaning content from an article
An article's content can be more complex than the other fields, meaning you sometimes need to do more than just provide the selector(s) in order to return clean content.
Expand Down
3 changes: 2 additions & 1 deletion src/extractors/custom/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -92,4 +92,5 @@ export * from './ici.radio-canada.ca';
export * from './www.fortinet.com';
export * from './www.fastcompany.com';
export * from './blisterreview.com';
export * from './news.mynavi.jp';
export * from './news.mynavi.jp';
export * from './www.reddit.com';
56 changes: 56 additions & 0 deletions src/extractors/custom/www.reddit.com/index.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
/**
 * Custom extractor configuration for Reddit post pages (www.reddit.com).
 *
 * Every field is scoped to the `div[data-test-id="post-content"]` container,
 * except the lead image, which is read from the `og:image` meta tag.
 */
export const WwwRedditComExtractor = {
  domain: 'www.reddit.com',

  title: {
    selectors: ['div[data-test-id="post-content"] h2'],
  },

  author: {
    selectors: ['div[data-test-id="post-content"] a[href*="user/"]'],
  },

  date_published: {
    selectors: [
      'div[data-test-id="post-content"] a[data-click-id="timestamp"]',
    ],
  },

  lead_image_url: {
    // [selector, attribute] pair: read the `value` attribute of the meta tag.
    selectors: [['meta[name="og:image"]', 'value']],
  },

  content: {
    // Candidates are listed from most to least specific; the last entry is
    // the whole post container as a fallback.
    selectors: [
      ['div[data-test-id="post-content"] p'], // text post
      [
        'div[data-test-id="post-content"] a[target="_blank"]:not([data-click-id="timestamp"])', // external link
        'div[data-test-id="post-content"] div[data-click-id="media"]', // embedded media
      ], // external link with media preview (YouTube, imgur album, etc...)
      ['div[data-test-id="post-content"] div[data-click-id="media"]'], // Embedded media (Reddit video)
      [
        'div[data-test-id="post-content"] a[target="_blank"]:not([data-click-id="timestamp"])',
      ], // external link
      'div[data-test-id="post-content"]',
    ],

    // Is there anything in the content you selected that needs to be
    // transformed before it's consumable content? E.g., unusual lazy loaded
    // images
    transforms: {
      /**
       * External link image preview: the picture is set as a CSS
       * `background-image` on the wrapper, so copy its URL onto the single
       * inner <img>.
       *
       * @param $node - the matched `div[role="img"]` wrapper
       * @returns the inner <img> with its `src` set when exactly one <img>
       *   exists and a URL could be extracted; otherwise `$node` unchanged
       */
      'div[role="img"]': $node => {
        const $img = $node.find('img');
        const bgImg = $node.css('background-image');
        // The value may be missing or carry no `(...)` part (e.g. `none`);
        // in that case there is no URL to copy, so keep the node as is
        // instead of throwing on a null match.
        const urlMatch =
          typeof bgImg === 'string' ? bgImg.match(/\((.*?)\)/) : null;
        if ($img.length === 1 && urlMatch) {
          // Strip the optional quotes around the URL inside `url(...)`.
          $img.attr('src', urlMatch[1].replace(/('|")/g, ''));
          return $img;
        }
        return $node;
      },
    },

    // Is there anything that is in the result that shouldn't be?
    // The clean selectors will remove anything that matches from
    // the result
    clean: ['.icon'],
  },
};
Loading

0 comments on commit 7844129

Please sign in to comment.