Skip to content

Commit

Permalink
Merge pull request #161 from kris-sigur/NoNonSuccessLinkExtraction
Browse files Browse the repository at this point in the history
No link extraction on URIs that were not successfully downloaded
  • Loading branch information
nlevitt committed Jun 6, 2016
2 parents c80f0ce + 70867ef commit bb10b4a
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,11 @@ final protected void extract(CrawlURI uri) {

/**
* Determines if links should be extracted from the given URI. This method
* performs three checks. The first check runs only if
* performs four checks. It first checks if the URI was processed successfully,
* i.e. {@link CrawlURI#isSuccess()} returns true.
*
* <p>
* The second check runs only if
* {@link ExtractorParameters#getExtractIndependently()} is false. It checks
 * the {@link ExtractorURI#hasBeenLinkExtracted()} result. If that result is
* true, then this method returns false, as some other extractor has claimed
Expand All @@ -63,6 +67,9 @@ final protected void extract(CrawlURI uri) {
* @return true if links should be extracted from the URI, false otherwise
*/
final protected boolean shouldProcess(CrawlURI uri) {
if (!uri.isSuccess()) {
return false;
}
if (!getExtractorParameters().getExtractIndependently()
&& uri.hasBeenLinkExtracted()) {
return false;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,8 @@ public void testExtraction() throws Exception {
private void testOne(String text, String expectedURL) throws Exception {
Collection<TestData> testDataCol = makeData(text, expectedURL);
for (TestData testData: testDataCol) {
extractor.process(testData.uri);
testData.uri.setFetchStatus(200);
extractor.process(testData.uri);
HashSet<CrawlURI> expected = new HashSet<CrawlURI>();
if (testData.expectedResult != null) {
expected.add(testData.expectedResult);
Expand Down

0 comments on commit bb10b4a

Please sign in to comment.