Skip to content

Commit

Permalink
Bugfix new yorker wired extractors (#604)
Browse files Browse the repository at this point in the history
* www.newyorker.com: add updated fixtures and fix extractors

* www.wired.com: add updated fixtures and fix extractors

Co-authored-by: John Holdun <john@johnholdun.com>
  • Loading branch information
sodiumjoe and johnholdun authored May 9, 2022
1 parent 99062da commit fb44ab0
Show file tree
Hide file tree
Showing 7 changed files with 44 additions and 13 deletions.
6 changes: 6 additions & 0 deletions fixtures/www.newyorker.com/1611473608343.html

Large diffs are not rendered by default.

6 changes: 6 additions & 0 deletions fixtures/www.newyorker.com/1611475571383.html

Large diffs are not rendered by default.

6 changes: 6 additions & 0 deletions fixtures/www.wired.com/1611475755063.html

Large diffs are not rendered by default.

15 changes: 11 additions & 4 deletions src/extractors/custom/www.newyorker.com/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -5,20 +5,25 @@ export const NewYorkerExtractor = {
domain: 'www.newyorker.com',
title: {
selectors: [
'h1[class^="content-header"]',
'h1[class^="ArticleHeader__hed"]',
['meta[name="og:title"]', 'value'],
],
},

author: {
selectors: [
['meta[name="author"]', 'value'],
'div[class^="ArticleContributors"] a[rel="author"]',
'article header div[class*="Byline__multipleContributors"]',
],
},

content: {
selectors: ['main[class^="Layout__content"]'],
selectors: [
'article.article.main-content',
'main[class^="Layout__content"]',
],

// Is there anything in the content you selected that needs to be transformed
// before it's consumable content? E.g., unusual lazy-loaded images
Expand All @@ -31,8 +36,10 @@ export const NewYorkerExtractor = {
},

date_published: {
selectors: [['meta[name="pubdate"]', 'value']],
format: 'YYYYMMDD',
selectors: [
'time.content-header__publish-date',
['meta[name="pubdate"]', 'value'],
],
timezone: 'America/New_York',
},

Expand All @@ -41,7 +48,7 @@ export const NewYorkerExtractor = {
},

dek: {
selectors: ['h2[class^="ArticleHeader__dek"]'],
selectors: ['div.content-header__dek', 'h2[class^="ArticleHeader__dek"]'],
},

next_page_url: null,
Expand Down
8 changes: 4 additions & 4 deletions src/extractors/custom/www.newyorker.com/index.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ describe('NewYorkerExtractor', () => {
url =
'http://www.newyorker.com/tech/elements/hacking-cryptography-and-the-countdown-to-quantum-computing';
const html = fs.readFileSync(
'./fixtures/www.newyorker.com/1557138180688.html'
'./fixtures/www.newyorker.com/1611473608343.html'
);
result = Mercury.parse(url, { html, fallback: false });
});
Expand Down Expand Up @@ -73,7 +73,7 @@ describe('NewYorkerExtractor', () => {
// the article.
assert.equal(
lead_image_url,
'https://media.newyorker.com/photos/59097a5e8b51cf59fc4239f5/16:9/w_1200,h_630,c_limit/Hutchinson-Quantum-Computing.jpg'
'https://media.newyorker.com/photos/59097a5e8b51cf59fc4239f5/16:9/w_1280,c_limit/Hutchinson-Quantum-Computing.jpg'
);
});

Expand All @@ -97,7 +97,7 @@ describe('NewYorkerExtractor', () => {
// the article.
assert.equal(
first13,
'In a laboratory in Shanghai, researchers work on developing a quantum computer—a new'
'Given the recent ubiquity of cyber-scandals—Colin Powell’s stolen e-mails, Simone Biles’s leaked medical'
);
});
});
Expand All @@ -109,7 +109,7 @@ describe('NewYorkerExtractor', () => {
url =
'http://www.newyorker.com/magazine/2016/12/05/lessons-from-my-mother';
const html = fs.readFileSync(
'./fixtures/www.newyorker.com/1557145645680.html'
'./fixtures/www.newyorker.com/1611475571383.html'
);
result = Mercury.parse(url, { html, fallback: false });
});
Expand Down
8 changes: 7 additions & 1 deletion src/extractors/custom/www.wired.com/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -5,20 +5,23 @@ export const WiredExtractor = {
domain: 'www.wired.com',
title: {
selectors: [
'h1.content-header__hed',
'h1.post-title',
// enter title selectors
],
},

author: {
selectors: [
['meta[name="author"]', 'value'],
'a[rel="author"]',
// enter author selectors
],
},

content: {
selectors: [
'article.article.main-content',
'article.content',
// enter content selectors
],
Expand All @@ -34,7 +37,10 @@ export const WiredExtractor = {
},

date_published: {
selectors: [['meta[itemprop="datePublished"]', 'value']],
selectors: [
'time.content-header__publish-date',
['meta[itemprop="datePublished"]', 'value'],
],
},

lead_image_url: {
Expand Down
8 changes: 4 additions & 4 deletions src/extractors/custom/www.wired.com/index.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ describe('WiredExtractor', () => {
url =
'https://www.wired.com/2016/09/ode-rosetta-spacecraft-going-die-comet/';
const html = fs.readFileSync(
'./fixtures/www.wired.com/1475256747028.html'
'./fixtures/www.wired.com/1611475755063.html'
);
result = Mercury.parse(url, { html, fallback: false });
});
Expand Down Expand Up @@ -62,7 +62,7 @@ describe('WiredExtractor', () => {

// Update these values with the expected values from
// the article.
assert.equal(date_published, '2016-09-30T07:00:12.000Z');
assert.equal(date_published.split('T')[0], '2016-09-30');
});

it('returns the lead_image_url', async () => {
Expand All @@ -74,7 +74,7 @@ describe('WiredExtractor', () => {
// the article.
assert.equal(
lead_image_url,
'https://www.wired.com/wp-content/uploads/2016/09/Rosetta_impact-1-1200x630.jpg'
'https://media.wired.com/photos/5926b676af95806129f50602/191:100/w_1280,c_limit/Rosetta_impact-1.jpg'
);
});

Expand All @@ -98,7 +98,7 @@ describe('WiredExtractor', () => {
// the article.
assert.equal(
first13,
'Today, the European Space Agencys Rosetta spacecraft will engage its thrusters for one'
"Today, the European Space Agency's Rosetta spacecraft will engage its thrusters for one"
);
});
});
Expand Down

0 comments on commit fb44ab0

Please sign in to comment.