postlight
diff --git a/‎fixtures/www.reddit.com/1551705199548.html
Lines changed: 9 additions & 0 deletions b/‎fixtures/www.reddit.com/1551705199548.html
Lines changed: 9 additions & 0 deletions
diff --git a/‎fixtures/www.reddit.com/1552069905710.html
Lines changed: 4 additions & 0 deletions b/‎fixtures/www.reddit.com/1552069905710.html
Lines changed: 4 additions & 0 deletions
diff --git a/‎fixtures/www.reddit.com/1552069933451.html
Lines changed: 8 additions & 0 deletions b/‎fixtures/www.reddit.com/1552069933451.html
Lines changed: 8 additions & 0 deletions
diff --git a/‎fixtures/www.reddit.com/1552069947100.html
Lines changed: 3 additions & 0 deletions b/‎fixtures/www.reddit.com/1552069947100.html
Lines changed: 3 additions & 0 deletions
diff --git a/‎fixtures/www.reddit.com/1552069958273.html
Lines changed: 5 additions & 0 deletions b/‎fixtures/www.reddit.com/1552069958273.html
Lines changed: 5 additions & 0 deletions
diff --git a/‎fixtures/www.reddit.com/1552069973740.html
Lines changed: 29 additions & 0 deletions b/‎fixtures/www.reddit.com/1552069973740.html
Lines changed: 29 additions & 0 deletions
diff --git a/‎fixtures/www.reddit.com/1552069996237.html
Lines changed: 23 additions & 0 deletions b/‎fixtures/www.reddit.com/1552069996237.html
Lines changed: 23 additions & 0 deletions
diff --git a/‎fixtures/www.reddit.com/1552070031501.html
Lines changed: 7 additions & 0 deletions b/‎fixtures/www.reddit.com/1552070031501.html
Lines changed: 7 additions & 0 deletions
diff --git a/‎src/cleaners/constants.js
Lines changed: 15 additions & 0 deletions b/‎src/cleaners/constants.js
Lines changed: 15 additions & 0 deletions
diff --git a/‎src/cleaners/date-published.js
Lines changed: 11 additions & 0 deletions b/‎src/cleaners/date-published.js
Lines changed: 11 additions & 0 deletions
diff --git a/‎src/cleaners/date-published.test.js
Lines changed: 64 additions & 0 deletions b/‎src/cleaners/date-published.test.js
Lines changed: 64 additions & 0 deletions
diff --git a/‎src/extractors/custom/README.md
Lines changed: 22 additions & 0 deletions b/‎src/extractors/custom/README.md
Lines changed: 22 additions & 0 deletions
diff --git a/‎src/extractors/custom/index.js
Lines changed: 2 additions & 1 deletion b/‎src/extractors/custom/index.js
Lines changed: 2 additions & 1 deletion
diff --git a/‎src/extractors/custom/www.reddit.com/index.js
Lines changed: 56 additions & 0 deletions b/‎src/extractors/custom/www.reddit.com/index.js
Lines changed: 56 additions & 0 deletions
@@ -27,6 +27,21 @@ export const SEC_DATE_STRING = /^\d{10}$/i;
 export const CLEAN_DATE_STRING_RE = /^\s*published\s*:?\s*(.*)/i;
 export const TIME_MERIDIAN_SPACE_RE = /(.*\d)(am|pm)(.*)/i;
 export const TIME_MERIDIAN_DOTS_RE = /\.m\./i;
+export const TIME_NOW_STRING = /^\s*(just|right)?\s*now\s*/i;
+const timeUnits = [
+  'seconds?',
+  'minutes?',
+  'hours?',
+  'days?',
+  'weeks?',
+  'months?',
+  'years?',
+];
+const allTimeUnits = timeUnits.join('|');
+export const TIME_AGO_STRING = new RegExp(
+  `(\\d+)\\s+(${allTimeUnits})\\s+ago`,
+  'i'
+);
 const months = [
   'jan',
   'feb',
 
@@ -9,6 +9,8 @@ import {
   SEC_DATE_STRING,
   CLEAN_DATE_STRING_RE,
   SPLIT_DATE_STRING,
+  TIME_AGO_STRING,
+  TIME_NOW_STRING,
   TIME_MERIDIAN_SPACE_RE,
   TIME_MERIDIAN_DOTS_RE,
   TIME_WITH_OFFSET_RE,
@@ -28,6 +30,15 @@ export function createDate(dateString, timezone, format) {
     return moment(new Date(dateString));
   }
 
+  if (TIME_AGO_STRING.test(dateString)) {
+    const fragments = TIME_AGO_STRING.exec(dateString);
+    return moment().subtract(fragments[1], fragments[2]);
+  }
+
+  if (TIME_NOW_STRING.test(dateString)) {
+    return moment();
+  }
+
   return timezone
     ? moment.tz(dateString, format || parseFormat(dateString), timezone)
     : moment(dateString, format || parseFormat(dateString));
 
@@ -34,6 +34,70 @@ describe('cleanDatePublished(dateString)', () => {
     });
     assert.equal(datePublished, '2015-08-03T16:45:00.000Z');
   });
+
+  it('can handle dates formatted as "[just|right] now"', () => {
+    const date1 = cleanDatePublished('now');
+    const newDate1 = moment(date1)
+      .format()
+      .split('T')[0];
+    const expectedDate1 = moment()
+      .format()
+      .split('T')[0];
+    assert.equal(newDate1, expectedDate1);
+
+    const date2 = cleanDatePublished('just now');
+    const newDate2 = moment(date2)
+      .format()
+      .split('T')[0];
+    const expectedDate2 = moment()
+      .format()
+      .split('T')[0];
+    assert.equal(newDate2, expectedDate2);
+
+    const date3 = cleanDatePublished('right now');
+    const newDate3 = moment(date3)
+      .format()
+      .split('T')[0];
+    const expectedDate3 = moment()
+      .format()
+      .split('T')[0];
+    assert.equal(newDate3, expectedDate3);
+  });
+
+  it('can handle dates formatted as "[amount] [time unit] ago"', () => {
+    // This generates an approximate date with a margin of error, for example:
+    // "X days ago" will not be accurate down to the exact time
+    // "X months ago" will not be accurate down to the exact day
+    const date1 = cleanDatePublished('1 hour ago');
+    const newDate1 = moment(date1)
+      .format()
+      .split('T')[0];
+    const expectedDate1 = moment()
+      .subtract(1, 'hour')
+      .format()
+      .split('T')[0];
+    assert.equal(newDate1, expectedDate1);
+
+    const date2 = cleanDatePublished('5 days ago');
+    const newDate2 = moment(date2)
+      .format()
+      .split('T')[0];
+    const expectedDate2 = moment()
+      .subtract(5, 'days')
+      .format()
+      .split('T')[0];
+    assert.equal(newDate2, expectedDate2);
+
+    const date3 = cleanDatePublished('10 months ago');
+    const newDate3 = moment(date3)
+      .format()
+      .split('T')[0];
+    const expectedDate3 = moment()
+      .subtract(10, 'months')
+      .format()
+      .split('T')[0];
+    assert.equal(newDate3, expectedDate3);
+  });
 });
 
 describe('cleanDateString(dateString)', () => {
 
@@ -73,6 +73,28 @@ export const ExampleExtractor = {
 
 This is all you'll need to know to handle most of the fields Mercury parses (titles, authors, date published, etc.). Article content is the exception.
 
+#### Content selectors
+
+If you pass an array as a content selector, it behaves differently from the attribute selectors used for other fields. For content, an array is treated as a multi-match selector: it lets the parser combine several selectors, and every occurrence of each matching selector is included in the result.
+
+Note that all selectors in the array must match in order for this selector to trigger.
+
+```javascript
+export const ExampleExtractor = {
+    ...
+
+    // Attempt to match both the content and image
+    // before falling back to just the content
+    content: {
+      selectors: [
+        ['.parsys.content', '.__image-lead__'],
+        '.content'
+      ],
+    },
+
+    ...
+```
+
 ### Cleaning content from an article
 
 An article's content can be more complex than the other fields, meaning you sometimes need to do more than just provide the selector(s) in order to return clean content.
 
@@ -92,4 +92,5 @@ export * from './ici.radio-canada.ca';
 export * from './www.fortinet.com';
 export * from './www.fastcompany.com';
 export * from './blisterreview.com';
-export * from './news.mynavi.jp';
+export * from './news.mynavi.jp';
+export * from './www.reddit.com';
@@ -0,0 +1,56 @@
+export const WwwRedditComExtractor = {
+  domain: 'www.reddit.com',
+
+  title: {
+    selectors: ['div[data-test-id="post-content"] h2'],
+  },
+
+  author: {
+    selectors: ['div[data-test-id="post-content"] a[href*="user/"]'],
+  },
+
+  date_published: {
+    selectors: [
+      'div[data-test-id="post-content"] a[data-click-id="timestamp"]',
+    ],
+  },
+
+  lead_image_url: {
+    selectors: [['meta[name="og:image"]', 'value']],
+  },
+
+  content: {
+    selectors: [
+      ['div[data-test-id="post-content"] p'], // text post
+      [
+        'div[data-test-id="post-content"] a[target="_blank"]:not([data-click-id="timestamp"])', // external link
+        'div[data-test-id="post-content"] div[data-click-id="media"]', // embedded media
+      ], // external link with media preview (YouTube, imgur album, etc...)
+      ['div[data-test-id="post-content"] div[data-click-id="media"]'], // Embedded media (Reddit video)
+      [
+        'div[data-test-id="post-content"] a[target="_blank"]:not([data-click-id="timestamp"])',
+      ], // external link
+      'div[data-test-id="post-content"]',
+    ],
+
+    // Is there anything in the content you selected that needs transformed
+    // before it's consumable content? E.g., unusual lazy loaded images
+    transforms: {
+      'div[role="img"]': $node => {
+        // External link image preview
+        const $img = $node.find('img');
+        const bgImg = $node.css('background-image');
+        if ($img.length === 1 && bgImg) {
+          $img.attr('src', bgImg.match(/\((.*?)\)/)[1].replace(/('|")/g, ''));
+          return $img;
+        }
+        return $node;
+      },
+    },
+
+    // Is there anything that is in the result that shouldn't be?
+    // The clean selectors will remove anything that matches from
+    // the result
+    clean: ['.icon'],
+  },
+};