Skip to content

Commit

Permalink
Merge pull request #161 from kris-sigur/NoNonSuccessLinkExtraction
Browse files Browse the repository at this point in the history
No link extraction on URIs that were not successfully downloaded
  • Loading branch information
nlevitt committed Jun 6, 2016
2 parents c80f0ce + 70867ef commit bb10b4a
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,11 @@ final protected void extract(CrawlURI uri) {

/**
* Determines if links should be extracted from the given URI. This method
* performs three checks. The first check runs only if
* performs four checks. It first checks if the URI was processed successfully,
* i.e. {@link CrawlURI#isSuccess()} returns true.
*
* <p>
* The second check runs only if
* {@link ExtractorParameters#getExtractIndependently()} is false. It checks
 * the {@link ExtractorURI#hasBeenLinkExtracted()} result. If that result is
* true, then this method returns false, as some other extractor has claimed
Expand All @@ -63,6 +67,9 @@ final protected void extract(CrawlURI uri) {
* @return true if links should be extracted from the URI, false otherwise
*/
final protected boolean shouldProcess(CrawlURI uri) {
if (!uri.isSuccess()) {
return false;
}
if (!getExtractorParameters().getExtractIndependently()
&& uri.hasBeenLinkExtracted()) {
return false;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,8 @@ public void testExtraction() throws Exception {
private void testOne(String text, String expectedURL) throws Exception {
Collection<TestData> testDataCol = makeData(text, expectedURL);
for (TestData testData: testDataCol) {
extractor.process(testData.uri);
testData.uri.setFetchStatus(200);
extractor.process(testData.uri);
HashSet<CrawlURI> expected = new HashSet<CrawlURI>();
if (testData.expectedResult != null) {
expected.add(testData.expectedResult);
Expand Down

0 comments on commit bb10b4a

Please sign in to comment.