From c8127dedb6138fa2b2564b7efb9c9777cd89d510 Mon Sep 17 00:00:00 2001
From: Mike Hordecki Sorry, readability was unable to parse this page for content. If you feel like it should have been able to, please let us know by submitting an issue. It appears this page uses frames. Unfortunately, browser security properties often cause Readability to fail on pages that include frames. You may want to try running readability itself on this source page: " + readability.biggestFrame.src + " Also, please note that Readability does not play very nicely with front pages. Readability is intended to work on articles with a sizable chunk of text that you'd like to read comfortably. If you're using Readability on a landing page (like nytimes.com for example), please click into an article first before using Readability.
]*>[ \n\r\t]*){2,}/gi,
+ replaceFonts: /<(\/?)font[^>]*>/gi,
+ trim: /^\s+|\s+$/g,
+ normalize: /\s{2,}/g,
+ killBreaks: /(
(\s| ?)*){1,}/g,
+ videos: /http:\/\/(www\.)?(youtube|vimeo)\.com/i,
+ skipFootnoteLink: /^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i,
+ nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i, // Match: next, continue, >, >>, » but not >|, »| as those usually mean last.
+ prevLink: /(prev|earl|old|new|<|«)/i
+ },
+
+ /**
+ * Runs readability.
+ *
+ * Workflow:
+ * 1. Prep the document by removing script tags, css, etc.
+ * 2. Build readability's DOM tree.
+ * 3. Grab the article content from the current dom tree.
+ * 4. Replace the current DOM tree with the new one.
+ * 5. Read peacefully.
+ *
+ * @return void
+ **/
+ init: function() {
+ /* Before we do anything, remove all scripts that are not readability. */
+ window.onload = window.onunload = function() {};
+
+ readability.removeScripts(document);
+
+ if(document.body && !readability.bodyCache) {
+ readability.bodyCache = document.body.innerHTML;
+
+ }
+ /* Make sure this document is added to the list of parsed pages first, so we don't double up on the first page */
+ readability.parsedPages[window.location.href.replace(/\/$/, '')] = true;
+
+ /* Pull out any possible next page link first */
+ var nextPageLink = readability.findNextPageLink(document.body);
+
+ readability.prepDocument();
+
+ /* Build readability's DOM tree */
+ var overlay = document.createElement("DIV");
+ var innerDiv = document.createElement("DIV");
+ var articleTools = readability.getArticleTools();
+ var articleTitle = readability.getArticleTitle();
+ var articleContent = readability.grabArticle();
+ var articleFooter = readability.getArticleFooter();
+
+ if(!articleContent) {
+ articleContent = document.createElement("DIV");
+ articleContent.id = "readability-content";
+ articleContent.innerHTML = [
+ "
').replace(readability.regexps.replaceFonts, '<$1span>'); + }, + + /** + * For easier reading, convert this document to have footnotes at the bottom rather than inline links. + * @see http://www.roughtype.com/archives/2010/05/experiments_in.php + * + * @return void + **/ + addFootnotes: function(articleContent) { + var footnotesWrapper = document.getElementById('readability-footnotes'), + articleFootnotes = document.getElementById('readability-footnotes-list'); + + if(!footnotesWrapper) { + footnotesWrapper = document.createElement("DIV"); + footnotesWrapper.id = 'readability-footnotes'; + footnotesWrapper.innerHTML = '
tags, etc.
+ *
+ * @param Element
+ * @return void
+ **/
+ prepArticle: function (articleContent) {
+ readability.cleanStyles(articleContent);
+ readability.killBreaks(articleContent);
+
+ /* Clean out junk from the article content */
+ readability.cleanConditionally(articleContent, "form");
+ readability.clean(articleContent, "object");
+ readability.clean(articleContent, "h1");
+
+ /**
+ * If there is only one h2, they are probably using it
+ * as a header and not a subheader, so remove it since we already have a header.
+ ***/
+ if(articleContent.getElementsByTagName('h2').length === 1) {
+ readability.clean(articleContent, "h2");
+ }
+ readability.clean(articleContent, "iframe");
+
+ readability.cleanHeaders(articleContent);
+
+ /* Do these last as the previous stuff may have removed junk that will affect these */
+ readability.cleanConditionally(articleContent, "table");
+ readability.cleanConditionally(articleContent, "ul");
+ readability.cleanConditionally(articleContent, "div");
+
+ /* Remove extra paragraphs */
+ var articleParagraphs = articleContent.getElementsByTagName('p');
+ for(var i = articleParagraphs.length-1; i >= 0; i-=1) {
+ var imgCount = articleParagraphs[i].getElementsByTagName('img').length;
+ var embedCount = articleParagraphs[i].getElementsByTagName('embed').length;
+ var objectCount = articleParagraphs[i].getElementsByTagName('object').length;
+
+ if(imgCount === 0 && embedCount === 0 && objectCount === 0 && readability.getInnerText(articleParagraphs[i], false) === '') {
+ articleParagraphs[i].parentNode.removeChild(articleParagraphs[i]);
+ }
+ }
+
+ try {
+ articleContent.innerHTML = articleContent.innerHTML.replace(/
]*>\s*
topCandidate.readability.contentScore) { + topCandidate = candidates[c]; } + } + + /** + * If we still have no top candidate, just use the body as a last resort. + * We also have to copy the body node so it is something we can modify. + **/ + if (topCandidate === null || topCandidate.tagName === "BODY") + { + topCandidate = document.createElement("DIV"); + topCandidate.innerHTML = page.innerHTML; + page.innerHTML = ""; + page.appendChild(topCandidate); + readability.initializeNode(topCandidate); + } + + /** + * Now that we have the top candidate, look through its siblings for content that might also be related. + * Things like preambles, content split by ads that we removed, etc. + **/ + var articleContent = document.createElement("DIV"); + if (isPaging) { + articleContent.id = "readability-content"; + } + var siblingScoreThreshold = Math.max(10, topCandidate.readability.contentScore * 0.2); + var siblingNodes = topCandidate.parentNode.childNodes; + + + for(var s=0, sl=siblingNodes.length; s < sl; s+=1) { + var siblingNode = siblingNodes[s]; + var append = false; + + /** + * Fix for odd IE7 Crash where siblingNode does not exist even though this should be a live nodeList. + * Example of error visible here: http://www.esquire.com/features/honesty0707 + **/ + if(!siblingNode) { + continue; + } + + dbg("Looking at sibling node: " + siblingNode + " (" + siblingNode.className + ":" + siblingNode.id + ")" + ((typeof siblingNode.readability !== 'undefined') ? (" with score " + siblingNode.readability.contentScore) : '')); + dbg("Sibling has score " + (siblingNode.readability ? 
siblingNode.readability.contentScore : 'Unknown')); + + if(siblingNode === topCandidate) + { + append = true; + } + + var contentBonus = 0; + /* Give a bonus if sibling nodes and top candidates have the example same classname */ + if(siblingNode.className === topCandidate.className && topCandidate.className !== "") { + contentBonus += topCandidate.readability.contentScore * 0.2; + } + + if(typeof siblingNode.readability !== 'undefined' && (siblingNode.readability.contentScore+contentBonus) >= siblingScoreThreshold) + { + append = true; + } + + if(siblingNode.nodeName === "P") { + var linkDensity = readability.getLinkDensity(siblingNode); + var nodeContent = readability.getInnerText(siblingNode); + var nodeLength = nodeContent.length; + + if(nodeLength > 80 && linkDensity < 0.25) + { + append = true; + } + else if(nodeLength < 80 && linkDensity === 0 && nodeContent.search(/\.( |$)/) !== -1) + { + append = true; + } + } + + if(append) { + dbg("Appending node: " + siblingNode); + + var nodeToAppend = null; + if(siblingNode.nodeName !== "DIV" && siblingNode.nodeName !== "P") { + /* We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. 
*/ + + dbg("Altering siblingNode of " + siblingNode.nodeName + ' to div.'); + nodeToAppend = document.createElement("DIV"); + try { + nodeToAppend.id = siblingNode.id; + nodeToAppend.innerHTML = siblingNode.innerHTML; + } + catch(er) { + dbg("Could not alter siblingNode to div, probably an IE restriction, reverting back to original."); + nodeToAppend = siblingNode; + s-=1; + sl-=1; + } + } else { + nodeToAppend = siblingNode; + s-=1; + sl-=1; + } + + /* To ensure a node does not interfere with readability styles, remove its classnames */ + nodeToAppend.className = ""; + + /* Append sibling and subtract from our list because it removes the node when you append to another node */ + articleContent.appendChild(nodeToAppend); + } + } + + /** + * So we have all of the content that we need. Now we clean it up for presentation. + **/ + readability.prepArticle(articleContent); + + if (readability.curPageNum === 1) { + articleContent.innerHTML = '
§
'; + + document.getElementById("readability-content").appendChild(articlePage); + + if(readability.curPageNum > readability.maxPages) { + var nextPageMarkup = ""; + + articlePage.innerHTML = articlePage.innerHTML + nextPageMarkup; + return; + } + + /** + * Now that we've built the article page DOM element, get the page content + * asynchronously and load the cleaned content into the div we created for it. + **/ + (function(pageUrl, thisPage) { + readability.ajax(pageUrl, { + success: function(r) { + + /* First, check to see if we have a matching ETag in headers - if we do, this is a duplicate page. */ + var eTag = r.getResponseHeader('ETag'); + if(eTag) { + if(eTag in readability.pageETags) { + dbg("Exact duplicate page found via ETag. Aborting."); + articlePage.style.display = 'none'; + return; + } else { + readability.pageETags[eTag] = 1; + } + } + + // TODO: this ends up doubling up page numbers on NYTimes articles. Need to generically parse those away. + var page = document.createElement("DIV"); + + /** + * Do some preprocessing to our HTML to make it ready for appending. + * • Remove any script tags. Swap and reswap newlines with a unicode character because multiline regex doesn't work in javascript. + * • Turn any noscript tags into divs so that we can parse them. This allows us to find any next page links hidden via javascript. + * • Turn all double br's into p's - was handled by prepDocument in the original view. + * Maybe in the future abstract out prepDocument to work for both the original document and AJAX-added pages. + **/ + var responseHtml = r.responseText.replace(/\n/g,'\uffff').replace(/');
+ responseHtml = responseHtml.replace(readability.regexps.replaceFonts, '<$1span>');
+
+ page.innerHTML = responseHtml;
+
+ /**
+ * Reset all flags for the next page, as they will search through it and disable as necessary at the end of grabArticle.
+ **/
+ readability.flags = 0x1 | 0x2 | 0x4;
+
+ var nextPageLink = readability.findNextPageLink(page),
+ content = readability.grabArticle(page);
+
+ if(!content) {
+ dbg("No content found in page to append. Aborting.");
+ return;
+ }
+
+ /**
+ * Anti-duplicate mechanism. Essentially, get the first paragraph of our new page.
+ * Compare it against all of the previous documents we've gotten. If a previous
+ * document contains exactly the innerHTML of this first paragraph, it's probably a duplicate.
+ **/
+ var firstP = content.getElementsByTagName("P").length ? content.getElementsByTagName("P")[0] : null;
+ if(firstP && firstP.innerHTML.length > 100) {
+ for(var i=1; i <= readability.curPageNum; i+=1) {
+ var rPage = document.getElementById('readability-page-' + i);
+ if(rPage && rPage.innerHTML.indexOf(firstP.innerHTML) !== -1) {
+ dbg('Duplicate of page ' + i + ' - skipping.');
+ articlePage.style.display = 'none';
+ readability.parsedPages[pageUrl] = true;
+ return;
+ }
+ }
+ }
+
+ readability.removeScripts(content);
+
+ thisPage.innerHTML = thisPage.innerHTML + content.innerHTML;
+
+ /**
+ * After the page has rendered, post process the content. This delay is necessary because,
+ * in webkit at least, offsetWidth is not set in time to determine image width. We have to
+ * wait a little bit for reflow to finish before we can fix floating images.
+ **/
+ window.setTimeout(
+ function() { readability.postProcessContent(thisPage); },
+ 500
+ );
+
+ if(nextPageLink) {
+ readability.appendNextPage(nextPageLink);
+ }
+ }
+ });
+ }(nextPageLink, articlePage));
+ },
+
+ /**
+ * Get an elements class/id weight. Uses regular expressions to tell if this
+ * element looks good or bad.
+ *
+ * @param Element
+ * @return number (Integer)
+ **/
+ getClassWeight: function (e) {
+ if(!readability.flagIsActive(readability.FLAG_WEIGHT_CLASSES)) {
+ return 0;
+ }
+
+ var weight = 0;
+
+ /* Look for a special classname */
+ if (typeof(e.className) === 'string' && e.className !== '')
+ {
+ if(e.className.search(readability.regexps.negative) !== -1) {
+ weight -= 25; }
+
+ if(e.className.search(readability.regexps.positive) !== -1) {
+ weight += 25; }
+ }
+
+ /* Look for a special ID */
+ if (typeof(e.id) === 'string' && e.id !== '')
+ {
+ if(e.id.search(readability.regexps.negative) !== -1) {
+ weight -= 25; }
+
+ if(e.id.search(readability.regexps.positive) !== -1) {
+ weight += 25; }
+ }
+
+ return weight;
+ },
+
+ nodeIsVisible: function (node) {
+ return (node.offsetWidth !== 0 || node.offsetHeight !== 0) && node.style.display.toLowerCase() !== 'none';
+ },
+
+ /**
+ * Remove extraneous break tags from a node.
+ *
+ * @param Element
+ * @return void
+ **/
+ killBreaks: function (e) {
+ try {
+ e.innerHTML = e.innerHTML.replace(readability.regexps.killBreaks,'
');
+ }
+ catch (eBreaks) {
+ dbg("KillBreaks failed - this is an IE bug. Ignoring.: " + eBreaks);
+ }
+ },
+
+ /**
+ * Clean a node of all elements of type "tag".
+ * (Unless it's a youtube/vimeo video. People love movies.)
+ *
+ * @param Element
+ * @param string tag to clean
+ * @return void
+ **/
+ clean: function (e, tag) {
+ var targetList = e.getElementsByTagName( tag );
+ var isEmbed = (tag === 'object' || tag === 'embed');
+
+ for (var y=targetList.length-1; y >= 0; y-=1) {
+ /* Allow youtube and vimeo videos through as people usually want to see those. */
+ if(isEmbed) {
+ var attributeValues = "";
+ for (var i=0, il=targetList[y].attributes.length; i < il; i+=1) {
+ attributeValues += targetList[y].attributes[i].value + '|';
+ }
+
+ /* First, check the elements attributes to see if any of them contain youtube or vimeo */
+ if (attributeValues.search(readability.regexps.videos) !== -1) {
+ continue;
+ }
+
+ /* Then check the elements inside this element for the same. */
+ if (targetList[y].innerHTML.search(readability.regexps.videos) !== -1) {
+ continue;
+ }
+
+ }
+
+ targetList[y].parentNode.removeChild(targetList[y]);
+ }
+ },
+
+ /**
+ * Clean an element of all tags of type "tag" if they look fishy.
+ * "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc.
+ *
+ * Does nothing unless FLAG_CLEAN_CONDITIONALLY is active. A matching element is removed outright when its
+ * class weight plus its stored content score is negative. Otherwise, when getCharCount(element, ',') is
+ * below 10, it is removed if any test in the if/else chain near the end of the loop fires.
+ *
+ * @param Element e the container whose descendants are examined
+ * @param string tag the tag name of the descendants to examine
+ * @return void
+ **/
+ cleanConditionally: function (e, tag) {
+
+ if(!readability.flagIsActive(readability.FLAG_CLEAN_CONDITIONALLY)) {
+ return;
+ }
+
+ var tagsList = e.getElementsByTagName(tag);
+ var curTagsLength = tagsList.length;
+
+ /**
+ * Gather counts for other typical elements embedded within.
+ * Traverse backwards so we can remove nodes at the same time without affecting the traversal.
+ *
+ * TODO: Consider taking into account original contentScore here.
+ **/
+ for (var i=curTagsLength-1; i >= 0; i-=1) {
+ var weight = readability.getClassWeight(tagsList[i]);
+ /* An element with no readability data attached counts as a content score of 0. */
+ var contentScore = (typeof tagsList[i].readability !== 'undefined') ? tagsList[i].readability.contentScore : 0;
+
+ dbg("Cleaning Conditionally " + tagsList[i] + " (" + tagsList[i].className + ":" + tagsList[i].id + ")" + ((typeof tagsList[i].readability !== 'undefined') ? (" with score " + tagsList[i].readability.contentScore) : ''));
+
+ if(weight+contentScore < 0)
+ {
+ tagsList[i].parentNode.removeChild(tagsList[i]);
+ }
+ else if ( readability.getCharCount(tagsList[i],',') < 10) {
+ /**
+ * If there are not very many commas, and the number of
+ * non-paragraph elements is more than paragraphs or other ominous signs, remove the element.
+ **/
+ var p = tagsList[i].getElementsByTagName("p").length;
+ var img = tagsList[i].getElementsByTagName("img").length;
+ /* The li count is offset by -100, so the "li > p" test below only fires when there are over 100 more list items than paragraphs. */
+ var li = tagsList[i].getElementsByTagName("li").length-100;
+ var input = tagsList[i].getElementsByTagName("input").length;
+
+ /* Count only the embeds whose src does not match the videos regexp. */
+ var embedCount = 0;
+ var embeds = tagsList[i].getElementsByTagName("embed");
+ for(var ei=0,il=embeds.length; ei < il; ei+=1) {
+ if (embeds[ei].src.search(readability.regexps.videos) === -1) {
+ embedCount+=1;
+ }
+ }
+
+ var linkDensity = readability.getLinkDensity(tagsList[i]);
+ var contentLength = readability.getInnerText(tagsList[i]).length;
+ var toRemove = false;
+
+ /* The first matching test wins; each one marks the element for removal. */
+ if ( img > p ) {
+ toRemove = true;
+ } else if(li > p && tag !== "ul" && tag !== "ol") {
+ toRemove = true;
+ } else if( input > Math.floor(p/3) ) {
+ toRemove = true;
+ } else if(contentLength < 25 && (img === 0 || img > 2) ) {
+ toRemove = true;
+ } else if(weight < 25 && linkDensity > 0.2) {
+ toRemove = true;
+ } else if(weight >= 25 && linkDensity > 0.5) {
+ toRemove = true;
+ } else if((embedCount === 1 && contentLength < 75) || embedCount > 1) {
+ toRemove = true;
+ }
+
+ if(toRemove) {
+ tagsList[i].parentNode.removeChild(tagsList[i]);
+ }
+ }
+ }
+ },
+
+ /**
+ * Clean out spurious headers from an Element. Checks things like classnames and link density.
+ *
+ * @param Element
+ * @return void
+ **/
+ cleanHeaders: function (e) {
+ for (var headerIndex = 1; headerIndex < 3; headerIndex+=1) {
+ var headers = e.getElementsByTagName('h' + headerIndex);
+ for (var i=headers.length-1; i >=0; i-=1) {
+ if (readability.getClassWeight(headers[i]) < 0 || readability.getLinkDensity(headers[i]) > 0.33) {
+ headers[i].parentNode.removeChild(headers[i]);
+ }
+ }
+ }
+ },
+
+ /*** Smooth scrolling logic ***/
+
+ /**
+ * easeInOut animation algorithm - returns an integer that says how far to move at this point in the animation.
+ * Borrowed from jQuery's easing library.
+ * @return integer
+ **/
+ easeInOut: function(start,end,totalSteps,actualStep) {
+ var delta = end - start;
+
+ if ((actualStep/=totalSteps/2) < 1) {
+ return delta/2*actualStep*actualStep + start;
+ }
+ actualStep -=1;
+ return -delta/2 * ((actualStep)*(actualStep-2) - 1) + start;
+ },
+
+ /**
+ * Helper function to, in a cross compatible way, get or set the current scroll offset of the document.
+ * @return mixed integer on get, the result of window.scrollTo on set
+ **/
+ scrollTop: function(scroll){
+ var setScroll = typeof scroll !== 'undefined';
+
+ if(setScroll) {
+ return window.scrollTo(0, scroll);
+ }
+ if(typeof window.pageYOffset !== 'undefined') {
+ return window.pageYOffset;
+ }
+ else if(document.documentElement.clientHeight) {
+ return document.documentElement.scrollTop;
+ }
+ else {
+ return document.body.scrollTop;
+ }
+ },
+
+ /**
+ * scrollTo - Smooth scroll to the point of scrollEnd in the document.
+ * @return void
+ **/
+ curScrollStep: 0,
+ scrollTo: function (scrollStart, scrollEnd, steps, interval) {
+ if(
+ (scrollStart < scrollEnd && readability.scrollTop() < scrollEnd) ||
+ (scrollStart > scrollEnd && readability.scrollTop() > scrollEnd)
+ ) {
+ readability.curScrollStep+=1;
+ if(readability.curScrollStep > steps) {
+ return;
+ }
+
+ var oldScrollTop = readability.scrollTop();
+
+ readability.scrollTop(readability.easeInOut(scrollStart, scrollEnd, steps, readability.curScrollStep));
+
+ // We're at the end of the window.
+ if(oldScrollTop === readability.scrollTop()) {
+ return;
+ }
+
+ window.setTimeout(function() {
+ readability.scrollTo(scrollStart, scrollEnd, steps, interval);
+ }, interval);
+ }
+ },
+
+
+ /**
+ * Show the email popup.
+ *
+ * Creates a DIV with id "email-container" and appends it to document.body,
+ * unless an element with that id already exists.
+ *
+ * NOTE(review): the innerHTML assigned below is an empty string in this copy. The popup markup
+ * (presumably the frame whose loads removeFrame counts) appears to be missing - confirm against upstream.
+ *
+ * @return void
+ **/
+ emailBox: function () {
+ /* Only one popup at a time: bail out if the container is already in the document. */
+ var emailContainerExists = document.getElementById('email-container');
+ if(null !== emailContainerExists)
+ {
+ return;
+ }
+
+ var emailContainer = document.createElement("DIV");
+ emailContainer.setAttribute('id', 'email-container');
+ emailContainer.innerHTML = '';
+
+ document.body.appendChild(emailContainer);
+ },
+
+ /**
+ * Close the email popup. This is a hacktackular way to check if we're in a "close loop".
+ * Since we don't have crossdomain access to the frame, we can only know when it has
+ * loaded again. If it's loaded over 3 times, we know to close the frame.
+ *
+ * @return void
+ **/
+ removeFrame: function () {
+ readability.iframeLoads+=1;
+ if (readability.iframeLoads > 3)
+ {
+ var emailContainer = document.getElementById('email-container');
+ if (null !== emailContainer) {
+ emailContainer.parentNode.removeChild(emailContainer);
+ }
+
+ readability.iframeLoads = 0;
+ }
+ },
+
+ htmlspecialchars: function (s) {
+ if (typeof(s) === "string") {
+ s = s.replace(/&/g, "&");
+ s = s.replace(/"/g, """);
+ s = s.replace(/'/g, "'");
+ s = s.replace(//g, ">");
+ }
+
+ return s;
+ },
+
+ flagIsActive: function(flag) {
+ return (readability.flags & flag) > 0;
+ },
+
+ addFlag: function(flag) {
+ readability.flags = readability.flags | flag;
+ },
+
+ removeFlag: function(flag) {
+ readability.flags = readability.flags & ~flag;
+ }
+
+};
+
+readability.init();
diff --git a/readability/readability-x.js b/readability/readability-x.js
index 1748a91..ac8316b 100644
--- a/readability/readability-x.js
+++ b/readability/readability-x.js
@@ -1,9 +1,9 @@
/*jslint undef: true, nomen: true, eqeqeq: true, plusplus: true, newcap: true, immed: true, browser: true, devel: true, passfail: false */
/*global window: false, readConvertLinksToFootnotes: false, readStyle: false, readSize: false, readMargin: false, Typekit: false, ActiveXObject: false */
-var dbg = (typeof console !== 'undefined') ? function(s) {
+var dbg = /*(typeof console !== 'undefined') ? function(s) {
console.log("Readability: " + s);
-} : function() {};
+} :*/ function() {};
/*
* Readability. An Arc90 Lab Experiment.
diff --git a/readability/readability.js b/readability/readability.js
index d0fd4f7..995015c 100755
--- a/readability/readability.js
+++ b/readability/readability.js
@@ -1,9 +1,9 @@
/*jslint undef: true, nomen: true, eqeqeq: true, plusplus: true, newcap: true, immed: true, browser: true, devel: true, passfail: false */
/*global window: false, readConvertLinksToFootnotes: false, readStyle: false, readSize: false, readMargin: false, Typekit: false, ActiveXObject: false */
-var dbg = (typeof console !== 'undefined') ? function(s) {
+var dbg = /*(typeof console !== 'undefined') ? function(s) {
console.log("Readability: " + s);
-} : function() {};
+} :*/ function() {};
/*
* Readability. An Arc90 Lab Experiment.