forked from yangsibai/node-html-readability
-
Notifications
You must be signed in to change notification settings - Fork 0
/
regex-helper.js
62 lines (53 loc) · 2.09 KB
/
regex-helper.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
/**
 * Regular expressions shared by the helpers in this module.
 *
 * Only `unlikelyCandidates`, `okMaybeItsACandidate`, `videos`,
 * `divToPElements`, `negative`, `positive`, `killBreaks` and `title` are
 * referenced by functions in this file; the remaining patterns are kept for
 * completeness.
 */
var regexps = {
    // Keyword lists matched case-insensitively anywhere in the tested string
    // (presumably element class/id strings -- confirm against callers).
    unlikelyCandidates: /combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pager|popup|tweet|twitter/i,
    okMaybeItsACandidate: /and|article|body|column|main|shadow/i,
    positive: /article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i,
    negative: /combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i,
    extraneous: /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single/i,

    // Opening tag of one of the listed HTML elements.
    divToPElements: /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,

    // Two or more consecutive <br> tags, optionally separated by whitespace.
    replaceBrs: /(<br[^>]*>[ \n\r\t]*){2,}/gi,
    // Opening or closing <font> tag.
    replaceFonts: /<(\/?)font[^>]*>/gi,
    // Leading or trailing whitespace.
    trim: /^\s+|\s+$/g,
    // Runs of two or more whitespace characters.
    normalize: /\s{2,}/g,

    // One or more <br> tags, each optionally followed by whitespace and/or
    // literal `&nbsp;` entities. The entity alternative is restored here: the
    // previous ` ?` alternative was redundant with `\s`, so breaks separated
    // by `&nbsp;` were never collapsed.
    killBreaks: /(<br\s*\/?>(\s|&nbsp;?)*){1,}/g,

    // Substrings identifying video hosts. NOTE(review): the bare `56` matches
    // any string containing those two digits -- confirm that is intended.
    videos: /youtube|vimeo|youku|tudou|56|yinyuetai|video\.sina/i,

    skipFootnoteLink: /^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i,

    // "Next page" link text. The last alternative is anchored on the right
    // guillemet (U+00BB); without that character the alternative
    // `([^\|]|$)` matched every string.
    nextLink: /(next|weiter|continue|next_page|>([^\|]|$)|\u00BB([^\|]|$))/i,

    // "Previous page" link text. The last alternative is the left guillemet
    // (U+00AB); it used to be empty, which made the pattern match every string.
    prevLink: /(prev|earl|old|new|<|\u00AB)/i,

    // URL containing one of the listed domain suffixes. Dots are escaped so
    // that e.g. "internet" no longer satisfies `.net`, and `.com.cn` no longer
    // requires a literal backslash.
    indexLink: /http.*(\.com\.cn|\.net|\.com|\.cn)/i,

    // No `g` flag: this pattern is only used for presence checks, and a global
    // regex keeps `lastIndex` state between `.test()`/`.exec()` calls.
    title: /title|head/i
};
/**
 * Tests whether a string matches the `unlikelyCandidates` pattern.
 *
 * @param {string} str - Text to test (presumably a class/id string; confirm
 *     against callers).
 * @returns {boolean} `true` when the pattern matches somewhere in `str`;
 *     `false` otherwise, including for empty, `null` or `undefined` input.
 */
function unlikelyCandidates (str) {
    // Guard against missing input the way the sibling predicates do;
    // without it a null/undefined value throws a TypeError on `.search`.
    if (!str) {
        return false;
    }
    return str.search(regexps.unlikelyCandidates) !== -1;
}
/**
 * Tests whether a string matches the `okMaybeItsACandidate` pattern.
 *
 * @param {string} str - Text to test.
 * @returns {boolean|*} `true`/`false` for a non-empty string; a falsy input
 *     (`''`, `null`, `undefined`, ...) is returned unchanged.
 */
function okMaybeItsACandidate (str) {
    if (!str) {
        // Falsy input is handed back as-is rather than coerced to a boolean.
        return str;
    }
    var matchIndex = str.search(regexps.okMaybeItsACandidate);
    return matchIndex !== -1;
}
/**
 * Tests whether a string matches the `videos` pattern.
 *
 * @param {string} str - Text to test (presumably a URL; confirm against
 *     callers).
 * @returns {boolean|*} `true`/`false` for a non-empty string; a falsy input
 *     is returned unchanged.
 */
function isVideo (str) {
    if (!str) {
        return str;
    }
    var firstHit = str.search(regexps.videos);
    return firstHit !== -1;
}
/**
 * Tests whether a string contains an opening tag matched by the
 * `divToPElements` pattern.
 *
 * @param {string} str - Markup to test.
 * @returns {boolean|*} `true`/`false` for a non-empty string; a falsy input
 *     is returned unchanged.
 */
function divToPElements (str) {
    if (!str) {
        return str;
    }
    var tagIndex = str.search(regexps.divToPElements);
    return tagIndex !== -1;
}
/**
 * Tests whether a string matches the `negative` pattern.
 *
 * @param {string} str - Text to test (presumably a class/id string; confirm
 *     against callers).
 * @returns {boolean} `true` when the pattern matches somewhere in `str`;
 *     `false` otherwise, including for empty, `null` or `undefined` input.
 */
function isNegative (str) {
    // Guard against missing input the way the sibling predicates do;
    // without it a null/undefined value throws a TypeError on `.search`.
    if (!str) {
        return false;
    }
    return str.search(regexps.negative) !== -1;
}
/**
 * Tests whether a string matches the `positive` pattern.
 *
 * @param {string} str - Text to test.
 * @returns {boolean|*} `true`/`false` for a non-empty string; a falsy input
 *     is returned unchanged.
 */
function isPositive (str) {
    if (!str) {
        return str;
    }
    var position = str.search(regexps.positive);
    return position !== -1;
}
/**
 * Replaces every match of the `killBreaks` pattern in `str` with a single
 * `<br />`.
 *
 * @param {string} str - Markup to process. No guard is applied, so a
 *     `null`/`undefined` value throws when `.replace` is accessed.
 * @returns {string} The string with each match replaced.
 */
function replaceBreaks (str) {
    var singleBreak = '<br />';
    var collapsed = str.replace(regexps.killBreaks, singleBreak);
    return collapsed;
}
/**
 * Tests whether a string matches the `title` pattern.
 *
 * @param {string} str - Text to test.
 * @returns {boolean|*} `true`/`false` for a non-empty string; a falsy input
 *     is returned unchanged.
 */
function likeTitle (str) {
    if (!str) {
        return str;
    }
    var hitIndex = str.search(regexps.title);
    return hitIndex !== -1;
}
// Public API of this module: each helper is exported under its own name.
var publicApi = {};
publicApi.unlikelyCandidates = unlikelyCandidates;
publicApi.okMaybeItsACandidate = okMaybeItsACandidate;
publicApi.isVideo = isVideo;
publicApi.divToPElements = divToPElements;
publicApi.isNegative = isNegative;
publicApi.isPositive = isPositive;
publicApi.replaceBreaks = replaceBreaks;
publicApi.likeTitle = likeTitle;

module.exports = publicApi;