Skip to content

Commit 7844129

Browse files
toufic-madampash
authored and committed
feat: Add custom parser for Reddit (#307)
1 parent 13581cd commit 7844129

File tree

16 files changed

+491
-1
lines changed

16 files changed

+491
-1
lines changed

fixtures/www.reddit.com/1551705199548.html

Lines changed: 9 additions & 0 deletions
Large diffs are not rendered by default.

fixtures/www.reddit.com/1552069905710.html

Lines changed: 4 additions & 0 deletions
Large diffs are not rendered by default.

fixtures/www.reddit.com/1552069933451.html

Lines changed: 8 additions & 0 deletions
Large diffs are not rendered by default.

fixtures/www.reddit.com/1552069947100.html

Lines changed: 3 additions & 0 deletions
Large diffs are not rendered by default.

fixtures/www.reddit.com/1552069958273.html

Lines changed: 5 additions & 0 deletions
Large diffs are not rendered by default.

fixtures/www.reddit.com/1552069973740.html

Lines changed: 29 additions & 0 deletions
Large diffs are not rendered by default.

fixtures/www.reddit.com/1552069996237.html

Lines changed: 23 additions & 0 deletions
Large diffs are not rendered by default.

fixtures/www.reddit.com/1552070031501.html

Lines changed: 7 additions & 0 deletions
Large diffs are not rendered by default.

src/cleaners/constants.js

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,21 @@ export const SEC_DATE_STRING = /^\d{10}$/i;
2727
export const CLEAN_DATE_STRING_RE = /^\s*published\s*:?\s*(.*)/i;
2828
export const TIME_MERIDIAN_SPACE_RE = /(.*\d)(am|pm)(.*)/i;
2929
export const TIME_MERIDIAN_DOTS_RE = /\.m\./i;
30+
// Matches relative "now" timestamps: "now", "just now", "right now"
// (case-insensitive, leading whitespace allowed). The \b after "now" keeps
// strings that merely start with those letters (e.g. "nowhere", "Nowruz 2019")
// from being treated as the current time.
export const TIME_NOW_STRING = /^\s*(just|right)?\s*now\b\s*/i;

// Regex fragments for the time units accepted in "<amount> <unit> ago";
// the trailing "s?" makes the plural optional.
const timeUnits = [
  'seconds?',
  'minutes?',
  'hours?',
  'days?',
  'weeks?',
  'months?',
  'years?',
];
const allTimeUnits = timeUnits.join('|');

// Matches relative timestamps such as "5 days ago" anywhere in the string.
// Capture group 1 is the numeric amount, group 2 is the time unit as written.
// The trailing \b rejects words that only begin with "ago" (e.g. "agony").
export const TIME_AGO_STRING = new RegExp(
  `(\\d+)\\s+(${allTimeUnits})\\s+ago\\b`,
  'i'
);
3045
const months = [
3146
'jan',
3247
'feb',

src/cleaners/date-published.js

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@ import {
99
SEC_DATE_STRING,
1010
CLEAN_DATE_STRING_RE,
1111
SPLIT_DATE_STRING,
12+
TIME_AGO_STRING,
13+
TIME_NOW_STRING,
1214
TIME_MERIDIAN_SPACE_RE,
1315
TIME_MERIDIAN_DOTS_RE,
1416
TIME_WITH_OFFSET_RE,
@@ -28,6 +30,15 @@ export function createDate(dateString, timezone, format) {
2830
return moment(new Date(dateString));
2931
}
3032

33+
if (TIME_AGO_STRING.test(dateString)) {
34+
const fragments = TIME_AGO_STRING.exec(dateString);
35+
return moment().subtract(fragments[1], fragments[2]);
36+
}
37+
38+
if (TIME_NOW_STRING.test(dateString)) {
39+
return moment();
40+
}
41+
3142
return timezone
3243
? moment.tz(dateString, format || parseFormat(dateString), timezone)
3344
: moment(dateString, format || parseFormat(dateString));

src/cleaners/date-published.test.js

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,70 @@ describe('cleanDatePublished(dateString)', () => {
3434
});
3535
assert.equal(datePublished, '2015-08-03T16:45:00.000Z');
3636
});
37+
38+
it('can handle dates formatted as "[just|right] now"', () => {
  // Only the calendar-day portion (everything before the "T") is compared,
  // because the parsed value and the expected value are taken at slightly
  // different instants.
  const dayOf = value => value.format().split('T')[0];

  ['now', 'just now', 'right now'].forEach(input => {
    const parsedDay = dayOf(moment(cleanDatePublished(input)));
    const expectedDay = dayOf(moment());
    assert.equal(parsedDay, expectedDay);
  });
});
66+
67+
it('can handle dates formatted as "[amount] [time unit] ago"', () => {
  // This generates an approximate date with a margin of error, for example:
  // "X days ago" will not be accurate down to the exact time
  // "X months ago" will not be accurate down to the exact day
  const dayOf = value => value.format().split('T')[0];

  // Each case: [input string, amount to subtract, unit to subtract]
  const cases = [
    ['1 hour ago', 1, 'hour'],
    ['5 days ago', 5, 'days'],
    ['10 months ago', 10, 'months'],
  ];

  cases.forEach(([input, amount, unit]) => {
    const parsedDay = dayOf(moment(cleanDatePublished(input)));
    const expectedDay = dayOf(moment().subtract(amount, unit));
    assert.equal(parsedDay, expectedDay);
  });
});
37101
});
38102

39103
describe('cleanDateString(dateString)', () => {

src/extractors/custom/README.md

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,28 @@ export const ExampleExtractor = {
7373
7474
This is all you'll need to know to handle most of the fields Mercury parses (titles, authors, date published, etc.). Article content is the exception.
7575
76+
#### Content selectors
77+
78+
Content selectors treat arrays differently from the selectors used for other fields. On other fields, an array entry is an attribute selector. For content, an array entry is a multi-match selector: it lets you list several selectors whose matches should all be part of the result, and every occurrence of each matching selector is included.
79+
80+
Note that all selectors in the array must match in order for this selector to trigger.
81+
82+
```javascript
83+
export const ExampleExtractor = {
84+
...
85+
86+
// Attempt to match both the content and image
87+
// before falling back to just the content
88+
content: {
89+
selectors: [
90+
['.parsys.content', '.__image-lead__'],
91+
'.content'
92+
],
93+
},
94+
95+
...
96+
```
97+
7698
### Cleaning content from an article
7799
78100
An article's content can be more complex than the other fields, meaning you sometimes need to do more than just provide the selector(s) in order to return clean content.

src/extractors/custom/index.js

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -92,4 +92,5 @@ export * from './ici.radio-canada.ca';
9292
export * from './www.fortinet.com';
9393
export * from './www.fastcompany.com';
9494
export * from './blisterreview.com';
95-
export * from './news.mynavi.jp';
95+
export * from './news.mynavi.jp';
96+
export * from './www.reddit.com';
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
/**
 * Custom extractor for post pages on www.reddit.com.
 *
 * Every field is scoped to the `div[data-test-id="post-content"]` container.
 * For `content`, an array entry is a multi-match selector: all selectors in
 * the array must match, and every match is included in the result. Entries
 * are listed from most specific to least specific, ending with the whole
 * post container as a fallback.
 */
export const WwwRedditComExtractor = {
  domain: 'www.reddit.com',

  title: {
    selectors: ['div[data-test-id="post-content"] h2'],
  },

  author: {
    selectors: ['div[data-test-id="post-content"] a[href*="user/"]'],
  },

  date_published: {
    selectors: [
      'div[data-test-id="post-content"] a[data-click-id="timestamp"]',
    ],
  },

  lead_image_url: {
    // [selector, attribute] pair
    selectors: [['meta[name="og:image"]', 'value']],
  },

  content: {
    selectors: [
      ['div[data-test-id="post-content"] p'], // text post
      [
        'div[data-test-id="post-content"] a[target="_blank"]:not([data-click-id="timestamp"])', // external link
        'div[data-test-id="post-content"] div[data-click-id="media"]', // embedded media
      ], // external link with media preview (YouTube, imgur album, etc...)
      ['div[data-test-id="post-content"] div[data-click-id="media"]'], // Embedded media (Reddit video)
      [
        'div[data-test-id="post-content"] a[target="_blank"]:not([data-click-id="timestamp"])',
      ], // external link
      'div[data-test-id="post-content"]',
    ],

    // Is there anything in the content you selected that needs transformed
    // before it's consumable content? E.g., unusual lazy loaded images
    transforms: {
      'div[role="img"]': $node => {
        // External link image preview: the picture is set as the wrapper's
        // CSS background-image, so copy that URL onto the inner <img>.
        const $img = $node.find('img');
        const bgImg = $node.css('background-image');
        if ($img.length !== 1 || !bgImg) {
          return $node;
        }

        // Pull the text between the parentheses of `url(...)`. Values with
        // no parentheses (e.g. `none`) yield no match; leave the node
        // untouched in that case instead of throwing on a null match.
        const urlMatch = bgImg.match(/\((.*?)\)/);
        if (!urlMatch) {
          return $node;
        }

        // Strip any quotes wrapping the URL inside `url(...)`.
        $img.attr('src', urlMatch[1].replace(/('|")/g, ''));
        return $img;
      },
    },

    // Is there anything that is in the result that shouldn't be?
    // The clean selectors will remove anything that matches from
    // the result
    clean: ['.icon'],
  },
};

0 commit comments

Comments
 (0)