Merge pull request #78 from nlevitt/extract-html-no-mimetype

Treat content as HTML and extract links if it looks like HTML, even if the declared MIME type does not indicate HTML.
internetarchive · Jul 22, 2014 · ba48f96 · ba48f96
2 parents d0ebd40 + 4b75dfa
commit ba48f96
Showing 1 changed file with 13 additions and 5 deletions.
diff --git a/modules/src/main/java/org/archive/modules/extractor/ExtractorHTML.java b/modules/src/main/java/org/archive/modules/extractor/ExtractorHTML.java
@@ -694,16 +694,24 @@ protected boolean shouldExtract(CrawlURI uri) {
                 // assume it's okay to extract
             }
         }
-        
+
         String mime = uri.getContentType().toLowerCase();
-        return mime.startsWith("text/html")
+        if (mime.startsWith("text/html")
                 || mime.startsWith("application/xhtml")
                 || mime.startsWith("text/vnd.wap.wml")
                 || mime.startsWith("application/vnd.wap.wml")
-                || mime.startsWith("application/vnd.wap.xhtml");
+                || mime.startsWith("application/vnd.wap.xhtml")) {
+            return true;
+        }
+
+        String contentPrefixLC = uri.getRecorder().getContentReplayPrefixString(1000).toLowerCase();
+        if (contentPrefixLC.contains("<html") || contentPrefixLC.contains("<!doctype html")) {
+            return true;
+        }
+
+        return false;
     }
-
-
+
     public boolean innerExtract(CrawlURI curi) {
         if (!curi.containsContentTypeCharsetDeclaration()) {
             String contentPrefix = curi.getRecorder().getContentReplayPrefixString(1000);