Skip to content

Commit

Permalink
Merge pull request #78 from nlevitt/extract-html-no-mimetype
Browse files (browse the repository at this point in the history)
treat content as html and extract links if it looks like html, even if m[imetype does not indicate html — title truncated in the original]
  • Loading branch information
vonrosen committed Jul 22, 2014

Verified

This commit was created on GitHub.com and signed with GitHub’s verified signature. The key has expired.
2 parents d0ebd40 + 4b75dfa commit ba48f96
Showing 1 changed file with 13 additions and 5 deletions.
Original file line number | Diff line number | Diff line change
@@ -694,16 +694,24 @@ protected boolean shouldExtract(CrawlURI uri) {
// assume it's okay to extract
}
}

String mime = uri.getContentType().toLowerCase();
return mime.startsWith("text/html")
if (mime.startsWith("text/html")
|| mime.startsWith("application/xhtml")
|| mime.startsWith("text/vnd.wap.wml")
|| mime.startsWith("application/vnd.wap.wml")
|| mime.startsWith("application/vnd.wap.xhtml");
|| mime.startsWith("application/vnd.wap.xhtml")) {
return true;
}

String contentPrefixLC = uri.getRecorder().getContentReplayPrefixString(1000).toLowerCase();
if (contentPrefixLC.contains("<html") || contentPrefixLC.contains("<!doctype html")) {
return true;
}

return false;
}



public boolean innerExtract(CrawlURI curi) {
if (!curi.containsContentTypeCharsetDeclaration()) {
String contentPrefix = curi.getRecorder().getContentReplayPrefixString(1000);

0 comments on commit ba48f96

Please sign in to comment.