diff --git a/engine/src/main/java/org/archive/crawler/frontier/AssignmentLevelSurtQueueAssignmentPolicy.java b/engine/src/main/java/org/archive/crawler/frontier/AssignmentLevelSurtQueueAssignmentPolicy.java index d187bfc68..758751845 100644 --- a/engine/src/main/java/org/archive/crawler/frontier/AssignmentLevelSurtQueueAssignmentPolicy.java +++ b/engine/src/main/java/org/archive/crawler/frontier/AssignmentLevelSurtQueueAssignmentPolicy.java @@ -18,7 +18,9 @@ */ package org.archive.crawler.frontier; +import org.apache.commons.lang.StringUtils; import org.archive.modules.CrawlURI; +import org.archive.net.UURI; import org.archive.net.PublicSuffixes; /** @@ -32,9 +34,34 @@ public class AssignmentLevelSurtQueueAssignmentPolicy extends private static final long serialVersionUID = -1533545293624791702L; @Override - public String getClassKey(CrawlURI cauri) { - String candidate = super.getClassKey(cauri); - candidate = PublicSuffixes.reduceSurtToAssignmentLevel(candidate); + public String getClassKey(CrawlURI curi) { + if(getDeferToPrevious() && !StringUtils.isEmpty(curi.getClassKey())) { + return curi.getClassKey(); + } + + UURI basis = curi.getPolicyBasisUURI(); + String candidate = super.getClassKey(curi); + candidate = PublicSuffixes.reduceSurtToAssignmentLevel(candidate); + + if(!StringUtils.isEmpty(getForceQueueAssignment())) { + candidate = getForceQueueAssignment(); + } + + // all whois urls in the same queue + if (curi.getUURI().getScheme().equals("whois")) { + return "whois..."; + } + + if(StringUtils.isEmpty(candidate)) { + return DEFAULT_CLASS_KEY; + } + if(getParallelQueues()>1) { + int subqueue = getSubqueue(basis,getParallelQueues()); + if (subqueue>0) { + candidate += "+"+subqueue; + } + } + return candidate; } diff --git a/engine/src/main/java/org/archive/crawler/frontier/URIAuthorityBasedQueueAssignmentPolicy.java b/engine/src/main/java/org/archive/crawler/frontier/URIAuthorityBasedQueueAssignmentPolicy.java index 8001540db..3212c4f07 100644 --- a/engine/src/main/java/org/archive/crawler/frontier/URIAuthorityBasedQueueAssignmentPolicy.java +++ b/engine/src/main/java/org/archive/crawler/frontier/URIAuthorityBasedQueueAssignmentPolicy.java @@ -90,9 +90,12 @@ public String getClassKey(CrawlURI curi) { if(getDeferToPrevious() && !StringUtils.isEmpty(curi.getClassKey())) { return curi.getClassKey(); } + + UURI basis = curi.getPolicyBasisUURI(); + String candidate = getCoreKey(basis); if(!StringUtils.isEmpty(getForceQueueAssignment())) { - return getForceQueueAssignment(); + candidate = getForceQueueAssignment(); } // all whois urls in the same queue @@ -100,9 +103,6 @@ public String getClassKey(CrawlURI curi) { return "whois..."; } - UURI basis = curi.getPolicyBasisUURI(); - String candidate = getCoreKey(basis); - if(StringUtils.isEmpty(candidate)) { return DEFAULT_CLASS_KEY; } diff --git a/engine/src/main/java/org/archive/crawler/prefetch/QuotaEnforcer.java b/engine/src/main/java/org/archive/crawler/prefetch/QuotaEnforcer.java index d76302271..171bb5d0a 100644 --- a/engine/src/main/java/org/archive/crawler/prefetch/QuotaEnforcer.java +++ b/engine/src/main/java/org/archive/crawler/prefetch/QuotaEnforcer.java @@ -348,7 +348,7 @@ public void setGroupMaxNovelUrls(long max) { * being force-retired (if the Frontier supports this). Note that if your * queues combine URIs that are different with regard to the quota category, * the retirement may hold back URIs not in the same quota category. Default - * is false. + * is true. */ { setForceRetire(true);