From 4b75dfa16f78cd3d9e5db48936ff9388ca5ab2aa Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 17 Jul 2014 21:09:51 -0700 Subject: [PATCH] treat content as html and extract links if it looks like html, even if mime type belies that --- .../modules/extractor/ExtractorHTML.java | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/modules/src/main/java/org/archive/modules/extractor/ExtractorHTML.java b/modules/src/main/java/org/archive/modules/extractor/ExtractorHTML.java index c5f710c8a..f97e6d866 100644 --- a/modules/src/main/java/org/archive/modules/extractor/ExtractorHTML.java +++ b/modules/src/main/java/org/archive/modules/extractor/ExtractorHTML.java @@ -694,16 +694,24 @@ protected boolean shouldExtract(CrawlURI uri) { // assume it's okay to extract } } - + String mime = uri.getContentType().toLowerCase(); - return mime.startsWith("text/html") + if (mime.startsWith("text/html") || mime.startsWith("application/xhtml") || mime.startsWith("text/vnd.wap.wml") || mime.startsWith("application/vnd.wap.wml") - || mime.startsWith("application/vnd.wap.xhtml"); + || mime.startsWith("application/vnd.wap.xhtml")) { + return true; + } + + String contentPrefixLC = uri.getRecorder().getContentReplayPrefixString(1000).toLowerCase(); + if (contentPrefixLC.contains("