Skip to content

Commit

Permalink
Feat: detect platforms (#52)
Browse files Browse the repository at this point in the history
Adds detectors that inspect a page's HTML to select the matching custom extractor for a publishing platform. Currently supports Medium and Blogger.
  • Loading branch information
adampash authored Dec 6, 2016
1 parent 64c0fad commit 2fb4764
Show file tree
Hide file tree
Showing 5 changed files with 58 additions and 6 deletions.
15 changes: 15 additions & 0 deletions src/extractors/detect-by-html.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
import {
MediumExtractor,
BloggerExtractor,
} from './custom/';

// Lookup table from a CSS selector to the custom extractor to use when that
// selector matches at least one element in the parsed page. detectByHtml
// walks the keys in declaration order and returns the value of the first hit.
// NOTE(review): both selectors test a `value` attribute on <meta> tags —
// presumably the document's meta tags are normalized before detection runs;
// confirm against the code that builds `$`.
const Detectors = {
'meta[name="al:ios:app_name"][value="Medium"]': MediumExtractor,
'meta[name="generator"][value="blogger"]': BloggerExtractor,
};

/**
 * Picks a custom extractor by probing the parsed page for
 * platform-identifying markup.
 *
 * @param {Function} $ - Selector function for the parsed page; called with a
 *   CSS selector and expected to return a collection exposing `length`
 *   (the tests pass a cheerio instance).
 * @returns {Object|undefined} The extractor registered in `Detectors` under
 *   the first selector that matches at least one element, or `undefined` when
 *   nothing matches or `$` is not callable.
 */
export default function detectByHtml($) {
  // Callers may not have a usable document (no `$` passed, or a non-callable
  // failure object in its place). Report "no match" instead of throwing so
  // they can fall through to their default extractor.
  if (typeof $ !== 'function') {
    return undefined;
  }

  const selector = Object.keys(Detectors).find(s => $(s).length > 0);

  return selector === undefined ? undefined : Detectors[selector];
}
24 changes: 24 additions & 0 deletions src/extractors/detect-by-html.test.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
import assert from 'assert';
import cheerio from 'cheerio';

import detectByHtml from './detect-by-html';

// Specs for detectByHtml: one positive case (Medium's marker meta tag) and
// one negative case (markup with no recognized platform marker).
describe('detectByHtml', () => {
  it('detects a medium post from the html', () => {
    const $ = cheerio.load(
      '<head><meta name="al:ios:app_name" value="Medium" /></head>'
    );
    const extractor = detectByHtml($);

    assert.equal(extractor.domain, 'medium.com');
  });

  it('returns nothing if no match is found', () => {
    const $ = cheerio.load('<div></div>');
    const result = detectByHtml($);

    // Legacy assert.equal compares with ==, so an undefined result
    // satisfies this check as well as null.
    assert.equal(result, null);
  });
});
6 changes: 4 additions & 2 deletions src/extractors/get-extractor.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,13 @@ import URL from 'url';

import Extractors from './all';
import GenericExtractor from './generic';
import detectByHtml from './detect-by-html';

export default function getExtractor(url, parsedUrl) {
export default function getExtractor(url, parsedUrl, $) {
parsedUrl = parsedUrl || URL.parse(url);
const { hostname } = parsedUrl;
const baseDomain = hostname.split('.').slice(-2).join('.');

return Extractors[hostname] || Extractors[baseDomain] || GenericExtractor;
return Extractors[hostname] || Extractors[baseDomain] ||
detectByHtml($) || GenericExtractor;
}
13 changes: 12 additions & 1 deletion src/extractors/get-extractor.test.js
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
import assert from 'assert';
import cheerio from 'cheerio';

import getExtractor from './get-extractor';

describe('getExtractor(url)', () => {
it('returns GenericExtractor if no custom extractor is found', () => {
const extractor = getExtractor('http://example.com');
const extractor = getExtractor('http://example.com', null, cheerio.load('<div />'));

assert.equal(extractor.domain, '*');
});
Expand All @@ -26,4 +27,14 @@ describe('getExtractor(url)', () => {

assert.equal(extractor.domain, 'wikipedia.org');
});

it('returns a custom extractor based on detectors', () => {
const html =
'<head><meta name="al:ios:app_name" value="Medium" /></head>';

const $ = cheerio.load(html);
const extractor = getExtractor('http://foo.com', null, $);

assert.equal(extractor.domain, 'medium.com');
});
});
6 changes: 3 additions & 3 deletions src/mercury.js
Original file line number Diff line number Diff line change
Expand Up @@ -31,11 +31,11 @@ const Mercury = {
return Errors.badUrl;
}

const Extractor = getExtractor(url, parsedUrl);
// console.log(`Using extractor for ${Extractor.domain}`);

const $ = await Resource.create(url, html, parsedUrl);

const Extractor = getExtractor(url, parsedUrl, $);
// console.log(`Using extractor for ${Extractor.domain}`);

// If we found an error creating the resource, return that error
if ($.failed) {
return $;
Expand Down

0 comments on commit 2fb4764

Please sign in to comment.