Merge pull request #79 from nlevitt/revisit-redux
Revisit redux
Kristinn Sigurðsson committed Jul 21, 2014
2 parents f101bba + cb6ac8b commit d0ebd40
Showing 14 changed files with 461 additions and 265 deletions.
org/archive/crawler/util/CrawledBytesHistotable.java

@@ -19,9 +19,10 @@

package org.archive.crawler.util;

-import org.apache.commons.httpclient.HttpStatus;
import org.archive.modules.CoreAttributeConstants;
import org.archive.modules.CrawlURI;
+import org.archive.modules.revisit.IdenticalPayloadDigestRevisit;
+import org.archive.modules.revisit.ServerNotModifiedRevisit;
import org.archive.util.ArchiveUtils;
import org.archive.util.Histotable;

@@ -31,22 +32,27 @@ public class CrawledBytesHistotable extends Histotable<String>

public static final String NOTMODIFIED = "notModified";
public static final String DUPLICATE = "dupByHash";
public static final String OTHERDUPLICATE = "otherDup";
public static final String NOVEL = "novel";
public static final String NOTMODIFIEDCOUNT = "notModifiedCount";
public static final String DUPLICATECOUNT = "dupByHashCount";
+public static final String OTHERDUPLICATECOUNT = "otherDupCount";
public static final String NOVELCOUNT = "novelCount";

public CrawledBytesHistotable() {
super();
}

public void accumulate(CrawlURI curi) {
-if(curi.getFetchStatus()==HttpStatus.SC_NOT_MODIFIED) {
+if (curi.getRevisitProfile() instanceof ServerNotModifiedRevisit) {
tally(NOTMODIFIED, curi.getContentSize());
tally(NOTMODIFIEDCOUNT,1);
-} else if (curi.getAnnotations().contains("duplicate:digest")) {
+} else if (curi.getRevisitProfile() instanceof IdenticalPayloadDigestRevisit) {
tally(DUPLICATE,curi.getContentSize());
tally(DUPLICATECOUNT,1);
+} else if (curi.getRevisitProfile() != null) {
+tally(OTHERDUPLICATE, curi.getContentSize());
+tally(OTHERDUPLICATECOUNT, 1);
} else {
tally(NOVEL,curi.getContentSize());
tally(NOVELCOUNT,1);
@@ -71,15 +77,21 @@ public String summary() {
sb.append(" ");
sb.append(NOTMODIFIED);
}
+if(get(OTHERDUPLICATE)!=null) {
+sb.append(", ");
+sb.append(ArchiveUtils.formatBytesForDisplay(get(OTHERDUPLICATE)));
+sb.append(" ");
+sb.append(OTHERDUPLICATECOUNT);
+}
sb.append(")");
return sb.toString();
}

public long getTotalBytes() {
-return get(NOVEL) + get(DUPLICATE) + get(NOTMODIFIED);
+return get(NOVEL) + get(DUPLICATE) + get(NOTMODIFIED) + get(OTHERDUPLICATE);
}

public long getTotalUrls() {
-return get(NOVELCOUNT) + get(DUPLICATECOUNT) + get(NOTMODIFIEDCOUNT);
+return get(NOVELCOUNT) + get(DUPLICATECOUNT) + get(NOTMODIFIEDCOUNT) + get(OTHERDUPLICATECOUNT);
}
}
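Taken together, accumulate() now buckets traffic by revisit profile instead of status codes and annotations: no profile means novel, ServerNotModifiedRevisit means an unchanged page, IdenticalPayloadDigestRevisit means a byte-identical re-download, and any other profile lands in the new otherDup bucket. A minimal standalone sketch of the four-way tally (hypothetical demo class; the URLs and digest are illustrative, and setContentSize() is assumed to be the usual CrawlURI setter):

    import org.archive.crawler.util.CrawledBytesHistotable;
    import org.archive.modules.CrawlURI;
    import org.archive.modules.revisit.IdenticalPayloadDigestRevisit;
    import org.archive.modules.revisit.ServerNotModifiedRevisit;
    import org.archive.net.UURIFactory;

    public class CrawledBytesDemo {
        public static void main(String[] args) throws Exception {
            CrawledBytesHistotable table = new CrawledBytesHistotable();

            // No revisit profile: counted as novel.
            CrawlURI novel = new CrawlURI(UURIFactory.getInstance("http://example.com/new"));
            novel.setContentSize(1024);
            table.accumulate(novel);

            // 304-style revisit: counted under notModified.
            CrawlURI unchanged = new CrawlURI(UURIFactory.getInstance("http://example.com/same"));
            unchanged.setContentSize(180);
            unchanged.setRevisitProfile(new ServerNotModifiedRevisit());
            table.accumulate(unchanged);

            // Byte-identical payload: counted under dupByHash.
            CrawlURI dup = new CrawlURI(UURIFactory.getInstance("http://example.com/dup"));
            dup.setContentSize(2048);
            dup.setRevisitProfile(new IdenticalPayloadDigestRevisit(
                    "sha1:TQ5R6YVOZLTQENRIIENVGXHOPOFYJCNO"));
            table.accumulate(dup);

            // One-line breakdown of novel/dupByHash/notModified bytes.
            System.out.println(table.summary());
        }
    }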
29 changes: 27 additions & 2 deletions modules/src/main/java/org/archive/modules/CrawlURI.java
@@ -58,8 +58,8 @@
import static org.archive.modules.fetcher.FetchStatusCodes.S_TOO_MANY_RETRIES;
import static org.archive.modules.fetcher.FetchStatusCodes.S_UNATTEMPTED;
import static org.archive.modules.fetcher.FetchStatusCodes.S_UNFETCHABLE_URI;
-import static org.archive.modules.recrawl.RecrawlAttributeConstants.A_FETCH_HISTORY;
import static org.archive.modules.recrawl.RecrawlAttributeConstants.A_CONTENT_DIGEST_HISTORY;
+import static org.archive.modules.recrawl.RecrawlAttributeConstants.A_FETCH_HISTORY;

import java.io.IOException;
import java.io.ObjectInputStream;
@@ -87,6 +87,7 @@
import org.archive.modules.extractor.HTMLLinkContext;
import org.archive.modules.extractor.Hop;
import org.archive.modules.extractor.LinkContext;
+import org.archive.modules.revisit.RevisitProfile;
import org.archive.net.UURI;
import org.archive.net.UURIFactory;
import org.archive.spring.OverlayContext;
@@ -267,6 +268,12 @@ public static enum FetchType { HTTP_GET, HTTP_POST, UNKNOWN };
private byte[] contentDigest = null;
private String contentDigestScheme = null;


+/**
+ * If this value is non-null, a determination has been made that this CrawlURI instance is a revisit or
+ * recrawl. Details are provided by the RevisitProfile object.
+ */
+transient private RevisitProfile revisitProfile = null;

/**
* Create a new instance of CrawlURI from a {@link UURI}.
@@ -874,6 +881,8 @@ public void processingCleanup() {
extraInfo = null;
outLinks = null;

+this.revisitProfile = null;

// XXX er uh surprised this wasn't here before?
fetchType = FetchType.UNKNOWN;
}
@@ -1911,6 +1920,23 @@ public HashMap<String, Object> getContentDigestHistory() {
public boolean hasContentDigestHistory() {
return getData().get(A_CONTENT_DIGEST_HISTORY) != null;
}

+/**
+ * Indicates if this CrawlURI object has been deemed a revisit.
+ *
+ * @return true if a revisit profile has been set on this CrawlURI
+ */
+public boolean isRevisit() {
+return revisitProfile!=null;
+}

+public RevisitProfile getRevisitProfile() {
+return revisitProfile;
+}
+
+public void setRevisitProfile(RevisitProfile revisitProfile) {
+this.revisitProfile = revisitProfile;
+}


// brought over from old Link class
@Override
@@ -1970,5 +1996,4 @@ public boolean equals(Object o) {
&& equals(viaContext, u.viaContext)
&& equals(pathFromSeed, u.pathFromSeed);
}

}
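The new field is transient and reset in processingCleanup(), so a revisit determination only survives one processing pass and is never serialized with the URI. A hypothetical downstream step could branch on it like this (a sketch; the class and method names are illustrative):

    import org.archive.modules.CrawlURI;
    import org.archive.modules.revisit.RevisitProfile;

    public class RevisitLogger {
        /** Report whether the current fetch was judged a revisit, and under which profile. */
        public void log(CrawlURI curi) {
            if (curi.isRevisit()) {
                RevisitProfile profile = curi.getRevisitProfile();
                // getProfileName() names the WARC revisit profile, e.g.
                // WARCConstants.PROFILE_REVISIT_IDENTICAL_DIGEST.
                System.out.println(curi.getURI() + " revisit: " + profile.getProfileName());
            } else {
                System.out.println(curi.getURI() + " novel (no revisit profile set)");
            }
        }
    }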
modules/src/main/java/org/archive/modules/deciderules/recrawl/IdenticalDigestDecideRule.java

@@ -19,17 +19,15 @@

package org.archive.modules.deciderules.recrawl;

-import static org.archive.modules.recrawl.RecrawlAttributeConstants.A_CONTENT_DIGEST;
-
-import java.util.Map;

+import org.archive.format.warc.WARCConstants;
import org.archive.modules.CrawlURI;
import org.archive.modules.deciderules.DecideResult;
import org.archive.modules.deciderules.PredicatedDecideRule;
+import org.archive.modules.revisit.RevisitProfile;

/**
- * Rule applies configured decision to any CrawlURIs whose prior-history
- * content-digest matches the latest fetch.
+ * Rule applies the configured decision to any CrawlURI whose revisit profile
+ * matches {@link WARCConstants#PROFILE_REVISIT_IDENTICAL_DIGEST}.
*
* @author gojomo
*/
@@ -48,35 +46,28 @@ public IdenticalDigestDecideRule() {
}

/**
- * Evaluate whether given CrawlURI's content-digest exactly
- * matches that of preceding fetch.
+ * Evaluate whether the given CrawlURI's revisit profile marks it as an
+ * identical-digest revisit.
*
* @param object should be CrawlURI
- * @return true if current-fetch content-digest matches previous
+ * @return true if the CrawlURI has been flagged as an identical-digest revisit
*/
protected boolean evaluate(CrawlURI curi) {
return hasIdenticalDigest(curi);
}


/**
- * Utility method for testing if a CrawlURI's last two history
- * entries (one being the most recent fetch) have identical
- * content-digest information.
+ * Utility method for testing whether a CrawlURI's revisit profile is the
+ * identical-payload-digest profile.
*
* @param curi CrawlURI to test
- * @return true if last two history entries have identical digests,
- * otherwise false
+ * @return true if the revisit profile is identical payload digest, false otherwise
*/
public static boolean hasIdenticalDigest(CrawlURI curi) {
-Map<String,Object>[] history = curi.getFetchHistory();
-
-return history != null
-&& history[0] != null
-&& history[0].containsKey(A_CONTENT_DIGEST)
-&& history[1] != null
-&& history[1].containsKey(A_CONTENT_DIGEST)
-&& history[0].get(A_CONTENT_DIGEST).equals(history[1].get(A_CONTENT_DIGEST));
+RevisitProfile revisit = curi.getRevisitProfile();
+if (revisit == null) {
+return false;
+}
+return revisit.getProfileName().equals(WARCConstants.PROFILE_REVISIT_IDENTICAL_DIGEST);
}

}
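Since the rule now keys off the revisit profile rather than raw fetch history, any processor that attaches an IdenticalPayloadDigestRevisit satisfies it. A minimal sketch (illustrative digest; it assumes, as this commit implies, that IdenticalPayloadDigestRevisit.getProfileName() returns WARCConstants.PROFILE_REVISIT_IDENTICAL_DIGEST):

    import org.archive.modules.CrawlURI;
    import org.archive.modules.deciderules.recrawl.IdenticalDigestDecideRule;
    import org.archive.modules.revisit.IdenticalPayloadDigestRevisit;
    import org.archive.net.UURIFactory;

    public class IdenticalDigestDemo {
        public static void main(String[] args) throws Exception {
            CrawlURI curi = new CrawlURI(UURIFactory.getInstance("http://example.com/"));

            // No revisit profile yet: the rule does not match.
            System.out.println(IdenticalDigestDecideRule.hasIdenticalDigest(curi)); // false

            // Flag the URI as a byte-identical revisit: the rule now matches.
            curi.setRevisitProfile(new IdenticalPayloadDigestRevisit(
                    "sha1:TQ5R6YVOZLTQENRIIENVGXHOPOFYJCNO"));
            System.out.println(IdenticalDigestDecideRule.hasIdenticalDigest(curi)); // true
        }
    }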
52 changes: 26 additions & 26 deletions modules/src/main/java/org/archive/modules/fetcher/FetchStats.java
@@ -23,8 +23,9 @@
import java.util.LinkedHashMap;
import java.util.Map;

-import org.apache.commons.httpclient.HttpStatus;
import org.archive.modules.CrawlURI;
+import org.archive.modules.revisit.IdenticalPayloadDigestRevisit;
+import org.archive.modules.revisit.ServerNotModifiedRevisit;
import org.archive.util.ArchiveUtils;
import org.archive.util.ReportUtils;
import org.archive.util.Reporter;
@@ -67,7 +68,9 @@ public interface CollectsFetchStats {
protected long notModifiedBytes;
protected long notModifiedUrls;
protected long dupByHashBytes;
protected long dupByHashUrls;
+protected long otherDupBytes;
+protected long otherDupUrls;

protected long lastSuccessTime;

@@ -89,18 +92,6 @@ public synchronized void tally(CrawlURI curi, Stage stage) {
fetchResponses++;
totalBytes += curi.getContentSize();
successBytes += curi.getContentSize();

-if (curi.getFetchStatus() == HttpStatus.SC_NOT_MODIFIED) {
-notModifiedBytes += curi.getContentSize();
-notModifiedUrls++;
-} else if (curi.getAnnotations().contains("duplicate:digest")) {
-dupByHashBytes += curi.getContentSize();
-dupByHashUrls++;
-} else {
-novelBytes += curi.getContentSize();
-novelUrls++;
-}

lastSuccessTime = curi.getFetchCompletedTime();
break;
case DISREGARDED:
@@ -115,21 +106,22 @@
} else {
fetchResponses++;
totalBytes += curi.getContentSize();

-if (curi.getFetchStatus() == HttpStatus.SC_NOT_MODIFIED) {
-notModifiedBytes += curi.getContentSize();
-notModifiedUrls++;
-} else if (curi.getAnnotations().contains("duplicate:digest")) {
-dupByHashBytes += curi.getContentSize();
-dupByHashUrls++;
-} else {
-novelBytes += curi.getContentSize();
-novelUrls++;
-}

}
fetchFailures++;
break;
default:
break;
}

+if (curi.getRevisitProfile() instanceof ServerNotModifiedRevisit) {
+notModifiedBytes += curi.getContentSize();
+notModifiedUrls++;
+} else if (curi.getRevisitProfile() instanceof IdenticalPayloadDigestRevisit) {
+dupByHashBytes += curi.getContentSize();
+dupByHashUrls++;
+} else if (curi.getRevisitProfile() != null) {
+otherDupBytes += curi.getContentSize();
+otherDupUrls++;
+}
}

@@ -189,6 +181,14 @@ public long getDupByHashUrls() {
return dupByHashUrls;
}

+public long getOtherDupBytes() {
+return otherDupBytes;
+}
+
+public long getOtherDupUrls() {
+return otherDupUrls;
+}

/* (non-Javadoc)
* @see org.archive.util.Reporter#reportTo(java.io.PrintWriter)
*/
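As in CrawledBytesHistotable, the revisit bookkeeping has moved out of the per-stage switch, so succeeded and failed fetches are classified by the same profile checks, and the new otherDup counters are exposed through getters. A hypothetical reporting helper (a sketch; it assumes the pre-existing getDupByHashBytes()/getDupByHashUrls() getters alongside the new pair):

    import org.archive.modules.fetcher.FetchStats;

    public class DupReport {
        /** Format the duplicate breakdown a FetchStats instance has accumulated. */
        public static String dupSummary(FetchStats stats) {
            return String.format(
                    "dupByHash: %d urls / %d bytes, otherDup: %d urls / %d bytes",
                    stats.getDupByHashUrls(), stats.getDupByHashBytes(),
                    stats.getOtherDupUrls(), stats.getOtherDupBytes());
        }
    }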
modules/src/main/java/org/archive/modules/recrawl/ContentDigestHistoryLoader.java

@@ -18,8 +18,13 @@
*/
package org.archive.modules.recrawl;

+import static org.archive.modules.recrawl.RecrawlAttributeConstants.A_ORIGINAL_DATE;
+import static org.archive.modules.recrawl.RecrawlAttributeConstants.A_ORIGINAL_URL;
+import static org.archive.modules.recrawl.RecrawlAttributeConstants.A_WARC_RECORD_ID;

import org.archive.modules.CrawlURI;
import org.archive.modules.Processor;
+import org.archive.modules.revisit.IdenticalPayloadDigestRevisit;
import org.springframework.beans.factory.annotation.Autowired;

public class ContentDigestHistoryLoader extends Processor {
@@ -41,7 +46,16 @@ protected void innerProcess(CrawlURI curi) throws InterruptedException {
contentDigestHistory.load(curi);

if (!curi.getContentDigestHistory().isEmpty()) {
+IdenticalPayloadDigestRevisit revisit =
+new IdenticalPayloadDigestRevisit(curi.getContentDigestSchemeString());
+revisit.setRefersToDate((String) curi.getContentDigestHistory().get(A_ORIGINAL_DATE));
+revisit.setRefersToTargetURI((String) curi.getContentDigestHistory().get(A_ORIGINAL_URL));
+String warcRecordId = (String) curi.getContentDigestHistory().get(A_WARC_RECORD_ID);
+if (warcRecordId != null) {
+revisit.setRefersToRecordID(warcRecordId);
+}
+curi.setRevisitProfile(revisit);
curi.getAnnotations().add("duplicate:digest");
}
}
}
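A content-digest-history hit is now translated into a full WARC-style revisit record: the refers-to URI, date, and record ID are read straight out of the history map. The same construction, pulled out as a standalone helper for illustration (fromHistory() is not part of the commit; the key names come from RecrawlAttributeConstants):

    import static org.archive.modules.recrawl.RecrawlAttributeConstants.A_ORIGINAL_DATE;
    import static org.archive.modules.recrawl.RecrawlAttributeConstants.A_ORIGINAL_URL;
    import static org.archive.modules.recrawl.RecrawlAttributeConstants.A_WARC_RECORD_ID;

    import java.util.Map;

    import org.archive.modules.CrawlURI;
    import org.archive.modules.revisit.IdenticalPayloadDigestRevisit;

    public class RevisitFromHistory {
        /** Build the same revisit profile the loader attaches, given a populated history map. */
        public static IdenticalPayloadDigestRevisit fromHistory(CrawlURI curi) {
            Map<String, Object> history = curi.getContentDigestHistory();
            IdenticalPayloadDigestRevisit revisit =
                    new IdenticalPayloadDigestRevisit(curi.getContentDigestSchemeString());
            revisit.setRefersToDate((String) history.get(A_ORIGINAL_DATE));
            revisit.setRefersToTargetURI((String) history.get(A_ORIGINAL_URL));
            String warcRecordId = (String) history.get(A_WARC_RECORD_ID);
            if (warcRecordId != null) {
                revisit.setRefersToRecordID(warcRecordId);  // link back to the stored WARC record
            }
            return revisit;
        }
    }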
modules/src/main/java/org/archive/modules/recrawl/FetchHistoryProcessor.java

@@ -32,7 +32,8 @@

import org.archive.modules.CrawlURI;
import org.archive.modules.Processor;
-import org.archive.modules.deciderules.recrawl.IdenticalDigestDecideRule;
+import org.archive.modules.revisit.IdenticalPayloadDigestRevisit;
+import org.archive.modules.revisit.ServerNotModifiedRevisit;

/**
* Maintain a history of fetch information inside the CrawlURI's attributes.
@@ -72,8 +73,7 @@ protected void innerProcess(CrawlURI puri) throws InterruptedException {
// save status
latestFetch.put(A_STATUS, curi.getFetchStatus());
// save fetch start time
-latestFetch.put(A_FETCH_BEGAN_TIME,
-curi.getData().get(A_FETCH_BEGAN_TIME));
+latestFetch.put(A_FETCH_BEGAN_TIME, curi.getFetchBeginTime());
// save digest
String digest = curi.getContentDigestSchemeString();
if (digest != null) {
@@ -106,11 +106,41 @@

curi.getData().put(A_FETCH_HISTORY, history);

-if (IdenticalDigestDecideRule.hasIdenticalDigest(curi)) {
+if (curi.getFetchStatus() == 304) {
+ServerNotModifiedRevisit revisit = new ServerNotModifiedRevisit();
+revisit.setETag((String) latestFetch.get(A_ETAG_HEADER));
+revisit.setLastModified((String) latestFetch.get(A_LAST_MODIFIED_HEADER));
+curi.setRevisitProfile(revisit);
+} else if (hasIdenticalDigest(curi)) {
curi.getAnnotations().add("duplicate:digest");
+IdenticalPayloadDigestRevisit revisit =
+new IdenticalPayloadDigestRevisit((String) history[1].get(A_CONTENT_DIGEST));
+revisit.setRefersToTargetURI(curi.getURI()); // Matches are always on the same URI
+revisit.setRefersToDate((Long) history[1].get(A_FETCH_BEGAN_TIME));
+curi.setRevisitProfile(revisit);
}
}

+/**
+ * Utility method for testing if a CrawlURI's last two history
+ * entries (one being the most recent fetch) have identical
+ * content-digest information.
+ *
+ * @param curi CrawlURI to test
+ * @return true if last two history entries have identical digests,
+ * otherwise false
+ */
+public static boolean hasIdenticalDigest(CrawlURI curi) {
+Map<String,Object>[] history = curi.getFetchHistory();
+
+return history != null
+&& history[0] != null
+&& history[0].containsKey(A_CONTENT_DIGEST)
+&& history[1] != null
+&& history[1].containsKey(A_CONTENT_DIGEST)
+&& history[0].get(A_CONTENT_DIGEST).equals(history[1].get(A_CONTENT_DIGEST));
+}

/** Get or create proper-sized history array */
@SuppressWarnings("unchecked")
protected HashMap<String, Object>[] historyRealloc(CrawlURI curi) {
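hasIdenticalDigest() has moved here from IdenticalDigestDecideRule and still works by comparing the A_CONTENT_DIGEST entries of the two most recent fetch-history rows, with history[0] being the current fetch. A minimal sketch of the history shape it inspects (hand-built entries, illustrative digest):

    import static org.archive.modules.recrawl.RecrawlAttributeConstants.A_CONTENT_DIGEST;
    import static org.archive.modules.recrawl.RecrawlAttributeConstants.A_FETCH_HISTORY;

    import java.util.HashMap;

    import org.archive.modules.CrawlURI;
    import org.archive.modules.recrawl.FetchHistoryProcessor;
    import org.archive.net.UURIFactory;

    public class FetchHistoryDemo {
        public static void main(String[] args) throws Exception {
            CrawlURI curi = new CrawlURI(UURIFactory.getInstance("http://example.com/"));

            @SuppressWarnings("unchecked")
            HashMap<String, Object>[] history = new HashMap[2];
            history[0] = new HashMap<String, Object>();
            history[1] = new HashMap<String, Object>();
            history[0].put(A_CONTENT_DIGEST, "sha1:TQ5R6YVOZLTQENRIIENVGXHOPOFYJCNO");
            history[1].put(A_CONTENT_DIGEST, "sha1:TQ5R6YVOZLTQENRIIENVGXHOPOFYJCNO");
            curi.getData().put(A_FETCH_HISTORY, history);

            // True: the latest fetch and the previous one carry the same digest.
            System.out.println(FetchHistoryProcessor.hasIdenticalDigest(curi));
        }
    }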
