diff --git a/README.md b/README.md
index 0dbbb2e..fa2c6fb 100644
--- a/README.md
+++ b/README.md
@@ -19,7 +19,7 @@ Add the following dependency to your pom.xml:
com.github.peterbencze
serritor
- 2.1.0
+ 2.1.1
```
@@ -27,7 +27,7 @@ Add the following dependency to your pom.xml:
Add the following dependency to your build.gradle:
```groovy
-compile group: 'com.github.peterbencze', name: 'serritor', version: '2.1.0'
+compile group: 'com.github.peterbencze', name: 'serritor', version: '2.1.1'
```
### Manual dependencies
diff --git a/checkstyle.xml b/checkstyle.xml
index 52ef575..f4e794f 100644
--- a/checkstyle.xml
+++ b/checkstyle.xml
@@ -1,36 +1,53 @@
+ "-//Checkstyle//DTD Checkstyle Configuration 1.3//EN"
+ "https://checkstyle.org/dtds/configuration_1_3.dtd">
+ Authors: Max Vetrenko, Ruslan Diachenko, Roman Ivanov.
+ -->
-
+
+
+
+
+
+
+
+
+
+
+
+
-
+
-
+
+
+
+
+
-
@@ -44,20 +61,28 @@
-
-
-
-
-
+
+
+
-
-
+
+
+
+
+
+
+ INSTANCE_INIT, ANNOTATION_DEF, ENUM_DEF"/>
+
+
+
+
+
+
+
+
+
+
+
@@ -124,6 +172,7 @@
value="Package name ''{0}'' must match pattern ''{1}''."/>
+
@@ -137,13 +186,17 @@
+
+
+
+
-
@@ -187,6 +240,9 @@
+
@@ -194,14 +250,26 @@
+
+
+
+
-
+ value="COMMA, SEMI, POST_INC, POST_DEC, DOT,
+ LABELED_STAT, METHOD_REF"/>
-
+
+
+
+
-
+
+
+
+
+
-
+
@@ -250,6 +323,14 @@
-
+
+
+
+
+
+
+
+
diff --git a/pom.xml b/pom.xml
index 71151fd..85f8e9d 100644
--- a/pom.xml
+++ b/pom.xml
@@ -3,7 +3,7 @@
4.0.0
com.github.peterbencze
serritor
- 2.1.0
+ 2.1.1
jar
Serritor
@@ -59,7 +59,7 @@
org.seleniumhq.selenium
htmlunit-driver
- 2.35.1
+ 2.41.0
net.lightbody.bmp
@@ -69,7 +69,7 @@
com.google.guava
guava
- 28.0-jre
+ 29.0-jre
org.eclipse.jetty
@@ -100,12 +100,12 @@
org.slf4j
slf4j-api
- 1.7.26
+ 1.7.30
com.auth0
java-jwt
- 3.8.1
+ 3.10.3
org.mindrot
@@ -127,19 +127,19 @@
com.github.tomakehurst
wiremock-jre8-standalone
- 2.23.2
+ 2.26.3
test
org.awaitility
awaitility
- 3.1.6
+ 4.0.3
test
net.jodah
failsafe
- 2.0.1
+ 2.4.0
test
@@ -149,7 +149,7 @@
org.apache.maven.plugins
maven-source-plugin
- 3.1.0
+ 3.2.1
attach-source
@@ -162,7 +162,7 @@
org.apache.maven.plugins
maven-javadoc-plugin
- 3.1.0
+ 3.2.0
attach-javadoc
@@ -181,12 +181,12 @@
org.apache.maven.plugins
maven-checkstyle-plugin
- 3.0.0
+ 3.1.1
com.puppycrawl.tools
checkstyle
- 8.20
+ 8.33
@@ -245,7 +245,7 @@
org.apache.maven.plugins
maven-site-plugin
- 3.7.1
+ 3.9.0
true
true
diff --git a/src/main/java/com/github/peterbencze/serritor/api/Crawler.java b/src/main/java/com/github/peterbencze/serritor/api/Crawler.java
index 2ab3fc6..6461c15 100644
--- a/src/main/java/com/github/peterbencze/serritor/api/Crawler.java
+++ b/src/main/java/com/github/peterbencze/serritor/api/Crawler.java
@@ -88,14 +88,14 @@ public abstract class Crawler {
private final StatsCounter statsCounter;
private final CrawlFrontier crawlFrontier;
private final CustomCallbackManager callbackManager;
+ private final AtomicBoolean isStopped;
+ private final AtomicBoolean isStopInitiated;
private BasicCookieStore cookieStore;
private CloseableHttpClient httpClient;
private BrowserMobProxyServer proxyServer;
private WebDriver webDriver;
private CrawlDelayMechanism crawlDelayMechanism;
- private AtomicBoolean isStopped;
- private AtomicBoolean isStopInitiated;
/**
* Base constructor which sets up the crawler with the provided configuration.
@@ -196,6 +196,8 @@ private void start(
if (!isResuming) {
crawlFrontier.reset();
+ statsCounter.reset();
+ config.getCrawlSeeds().forEach(request -> crawlFrontier.feedRequest(request, true));
}
cookieStore = new BasicCookieStore();
diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java b/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java
index 0550509..4bf17ed 100644
--- a/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java
+++ b/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java
@@ -42,6 +42,7 @@
public final class CrawlFrontier implements Serializable {
private static final Logger LOGGER = LoggerFactory.getLogger(CrawlFrontier.class);
+ private static final int INITIAL_CRAWL_DEPTH = 1;
private final CrawlerConfiguration config;
private final StatsCounter statsCounter;
@@ -62,8 +63,6 @@ public CrawlFrontier(final CrawlerConfiguration config, final StatsCounter stats
this.statsCounter = statsCounter;
urlFingerprints = new HashSet<>();
candidates = createPriorityQueue();
-
- feedCrawlSeeds();
}
/**
@@ -115,6 +114,8 @@ public void feedRequest(final CrawlRequest request, final boolean isCrawlSeed) {
builder.setRefererUrl(currentCandidate.getRequestUrl())
.setCrawlDepth(nextCrawlDepth);
+ } else {
+ builder.setCrawlDepth(INITIAL_CRAWL_DEPTH);
}
LOGGER.debug("Adding request to the list of crawl candidates");
@@ -149,17 +150,6 @@ public void reset() {
urlFingerprints.clear();
candidates.clear();
-
- feedCrawlSeeds();
- }
-
- /**
- * Feeds all the crawl seeds to the crawl frontier.
- */
- private void feedCrawlSeeds() {
- LOGGER.debug("Feeding crawl seeds");
-
- config.getCrawlSeeds().forEach((CrawlRequest request) -> feedRequest(request, true));
}
/**
diff --git a/src/main/java/com/github/peterbencze/serritor/internal/stats/StatsCounter.java b/src/main/java/com/github/peterbencze/serritor/internal/stats/StatsCounter.java
index 6c657fd..785a601 100644
--- a/src/main/java/com/github/peterbencze/serritor/internal/stats/StatsCounter.java
+++ b/src/main/java/com/github/peterbencze/serritor/internal/stats/StatsCounter.java
@@ -263,6 +263,25 @@ public StatsCounterSnapshot getSnapshot() {
return lock.readWithLock(() -> new StatsCounterSnapshot(this));
}
+ /**
+ * Resets stats counter to its initial state.
+ */
+ public void reset() {
+ lock.writeWithLock(() -> {
+ remainingCrawlCandidateCount = 0;
+ processedCrawlCandidateCount = 0;
+ responseSuccessCount = 0;
+ pageLoadTimeoutCount = 0;
+ requestRedirectCount = 0;
+ nonHtmlResponseCount = 0;
+ responseErrorCount = 0;
+ networkErrorCount = 0;
+ filteredDuplicateRequestCount = 0;
+ filteredOffsiteRequestCount = 0;
+ filteredCrawlDepthLimitExceedingRequestCount = 0;
+ });
+ }
+
/**
* Increments the number of processed crawl candidates.
*/
diff --git a/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java b/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java
index 2f63098..a747ca6 100644
--- a/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java
+++ b/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java
@@ -23,11 +23,11 @@
import com.github.peterbencze.serritor.api.CrawlerConfiguration;
import com.github.peterbencze.serritor.api.CrawlerConfiguration.CrawlerConfigurationBuilder;
import com.github.peterbencze.serritor.internal.stats.StatsCounter;
-import com.google.common.collect.Sets;
import java.net.URI;
import java.util.Arrays;
-import java.util.Collections;
import java.util.List;
+import java.util.Optional;
+import org.hamcrest.Matchers;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
@@ -39,10 +39,10 @@
public final class CrawlFrontierTest {
// Allowed crawl domains
- private static final String ALLOWED_CRAWL_DOMAIN_0 = "root-url-0.com";
- private static final String ALLOWED_CRAWL_DOMAIN_1 = "root-url-1.com";
+ private static final String ROOT_URL_0_DOMAIN = "root-url-0.com";
+ private static final String ROOT_URL_1_DOMAIN = "root-url-1.com";
private static final List ALLOWED_CRAWL_DOMAINS
- = Arrays.asList(ALLOWED_CRAWL_DOMAIN_0, ALLOWED_CRAWL_DOMAIN_1);
+ = Arrays.asList(ROOT_URL_0_DOMAIN, ROOT_URL_1_DOMAIN);
// Root URLs
private static final URI ROOT_URL_0
@@ -52,7 +52,7 @@ public final class CrawlFrontierTest {
private static final URI ROOT_URL_1 = URI.create("http://root-url-1.com/");
// Root URL crawl depth
- private static final int ROOT_URL_CRAWL_DEPTH = 0;
+ private static final int ROOT_URL_CRAWL_DEPTH = 1;
// Root URL priorities
private static final int ROOT_URL_0_PRIORITY = 0;
@@ -80,7 +80,7 @@ public final class CrawlFrontierTest {
= URI.create(String.format("http://root-url-1.com%s-0", CHILD_URL_PATH));
// Child URL crawl depth
- private static final int CHILD_URL_CRAWL_DEPTH = 1;
+ private static final int CHILD_URL_CRAWL_DEPTH = 2;
// Child URL priorities
private static final int CHILD_URL_0_PRIORITY = 0;
@@ -105,11 +105,9 @@ public final class CrawlFrontierTest {
private static final CrawlRequest OFFSITE_URL_CRAWL_REQUEST
= new CrawlRequestBuilder(OFFSITE_URL).setPriority(OFFSITE_URL_PRIORITY).build();
- // Max crawl depth
- private static final int MAX_CRAWL_DEPTH = 1;
-
private CrawlerConfiguration configMock;
private StatsCounter statsCounterMock;
+ private CrawlFrontier crawlFrontier;
@Before
public void before() {
@@ -120,204 +118,194 @@ public void before() {
.build());
statsCounterMock = Mockito.mock(StatsCounter.class);
+
+ crawlFrontier = new CrawlFrontier(configMock, statsCounterMock);
}
@Test
- public void testHasNextCandidateWithNonEmptyQueue() {
- CrawlFrontier crawlFrontier = new CrawlFrontier(configMock, statsCounterMock);
-
- Assert.assertTrue(crawlFrontier.hasNextCandidate());
-
- crawlFrontier.getNextCandidate();
-
- Assert.assertTrue(crawlFrontier.hasNextCandidate());
-
- crawlFrontier.getNextCandidate();
-
- Assert.assertFalse(crawlFrontier.hasNextCandidate());
-
- crawlFrontier.feedRequest(CHILD_URL_0_CRAWL_REQUEST, false);
- crawlFrontier.feedRequest(CHILD_URL_1_CRAWL_REQUEST, false);
-
- Assert.assertTrue(crawlFrontier.hasNextCandidate());
+ public void testFeedRequestWhenOffsiteRequestFilterIsDisabledAndRequestIsOffsite() {
+ Mockito.when(configMock.isOffsiteRequestFilterEnabled()).thenReturn(false);
- crawlFrontier.getNextCandidate();
+ crawlFrontier.feedRequest(OFFSITE_URL_CRAWL_REQUEST, true);
- Assert.assertTrue(crawlFrontier.hasNextCandidate());
+ Assert.assertThat(crawlFrontier.hasNextCandidate(), Matchers.is(true));
+ Assert.assertThat(crawlFrontier.getNextCandidate(), Matchers.notNullValue());
+ Mockito.verify(statsCounterMock).recordRemainingCrawlCandidate();
+ }
- crawlFrontier.getNextCandidate();
+ @Test
+ public void testFeedRequestWhenOffsiteRequestFilterIsEnabledAndRequestDomainIsNotAllowed() {
+ crawlFrontier.feedRequest(OFFSITE_URL_CRAWL_REQUEST, true);
- Assert.assertFalse(crawlFrontier.hasNextCandidate());
+ Assert.assertThat(crawlFrontier.hasNextCandidate(), Matchers.is(false));
+ Assert.assertThat(crawlFrontier.getNextCandidate(), Matchers.nullValue());
+ Mockito.verify(statsCounterMock).recordOffsiteRequest();
}
@Test
- public void testHasNextCandidateWithEmptyQueue() {
- Mockito.when(configMock.getCrawlSeeds()).thenReturn(Collections.EMPTY_SET);
+ public void testFeedRequestWhenDuplicateRequestFilterIsDisabledAndRequestIsADuplicate() {
+ Mockito.when(configMock.isDuplicateRequestFilterEnabled()).thenReturn(false);
+ crawlFrontier.feedRequest(ROOT_URL_0_CRAWL_REQUEST, true);
+ crawlFrontier.getNextCandidate();
- CrawlFrontier crawlFrontier = new CrawlFrontier(configMock, statsCounterMock);
+ crawlFrontier.feedRequest(DUPLICATE_ROOT_URL_0_CRAWL_REQUEST, false);
- Assert.assertFalse(crawlFrontier.hasNextCandidate());
+ Assert.assertThat(crawlFrontier.hasNextCandidate(), Matchers.is(true));
+ Assert.assertThat(crawlFrontier.getNextCandidate(), Matchers.notNullValue());
+ Mockito.verify(statsCounterMock, Mockito.times(2)).recordRemainingCrawlCandidate();
}
@Test
- public void testEnabledDuplicateRequestFiltering() {
- CrawlFrontier crawlFrontier = new CrawlFrontier(configMock, statsCounterMock);
+ public void testFeedRequestWhenDuplicateRequestFilterIsEnabledAndRequestIsADuplicate() {
+ crawlFrontier.feedRequest(ROOT_URL_0_CRAWL_REQUEST, true);
+ crawlFrontier.getNextCandidate();
- clearCrawlCandidateQueue(crawlFrontier);
crawlFrontier.feedRequest(DUPLICATE_ROOT_URL_0_CRAWL_REQUEST, false);
- Assert.assertFalse(crawlFrontier.hasNextCandidate());
+ Assert.assertThat(crawlFrontier.hasNextCandidate(), Matchers.is(false));
+ Assert.assertThat(crawlFrontier.getNextCandidate(), Matchers.nullValue());
+ Mockito.verify(statsCounterMock).recordDuplicateRequest();
}
@Test
- public void testDisabledDuplicateRequestFiltering() {
- Mockito.when(configMock.isDuplicateRequestFilterEnabled()).thenReturn(false);
-
- CrawlFrontier crawlFrontier = new CrawlFrontier(configMock, statsCounterMock);
+ public void testFeedRequestWhenCrawlDepthLimitIsSetAndRequestExceedsLimit() {
+ Mockito.when(configMock.getMaximumCrawlDepth()).thenReturn(1);
+ crawlFrontier.feedRequest(ROOT_URL_0_CRAWL_REQUEST, true);
+ crawlFrontier.getNextCandidate();
- clearCrawlCandidateQueue(crawlFrontier);
- crawlFrontier.feedRequest(DUPLICATE_ROOT_URL_0_CRAWL_REQUEST, true);
+ crawlFrontier.feedRequest(CHILD_URL_0_CRAWL_REQUEST, false);
- Assert.assertTrue(crawlFrontier.hasNextCandidate());
- Assert.assertEquals(DUPLICATE_ROOT_URL_0, crawlFrontier.getNextCandidate().getRequestUrl());
+ Assert.assertThat(crawlFrontier.hasNextCandidate(), Matchers.is(false));
+ Assert.assertThat(crawlFrontier.getNextCandidate(), Matchers.nullValue());
+ Mockito.verify(statsCounterMock).recordCrawlDepthLimitExceedingRequest();
}
@Test
- public void testEnabledOffsiteRequestFiltering() {
- Mockito.when(configMock.getCrawlSeeds())
- .thenReturn(Sets.newHashSet(OFFSITE_URL_CRAWL_REQUEST));
-
- CrawlFrontier crawlFrontier = new CrawlFrontier(configMock, statsCounterMock);
+ public void testFeedRequestWhenRequestIsNotDuplicateAndIsNotOffsiteAndIsInCrawlDepthLimit() {
+ Mockito.when(configMock.getMaximumCrawlDepth()).thenReturn(1);
+
+ crawlFrontier.feedRequest(ROOT_URL_0_CRAWL_REQUEST, true);
+
+ Assert.assertThat(crawlFrontier.hasNextCandidate(), Matchers.is(true));
+ CrawlCandidate candidate = crawlFrontier.getNextCandidate();
+ Assert.assertThat(candidate.getRequestUrl(), Matchers.is(ROOT_URL_0));
+ Assert.assertThat(candidate.getCrawlDepth(), Matchers.is(ROOT_URL_CRAWL_DEPTH));
+ Assert.assertThat(candidate.getPriority(), Matchers.is(ROOT_URL_0_PRIORITY));
+ Assert.assertThat(candidate.getDomain().toString(), Matchers.is(ROOT_URL_0_DOMAIN));
+ Assert.assertThat(candidate.getRefererUrl(), Matchers.nullValue());
+ Assert.assertThat(candidate.getMetadata(), Matchers.is(Optional.empty()));
+ Mockito.verify(statsCounterMock).recordRemainingCrawlCandidate();
+ }
- Assert.assertFalse(crawlFrontier.hasNextCandidate());
+ @Test
+ public void testHasNextCandidateWhenCandidateQueueIsEmpty() {
+ Assert.assertThat(crawlFrontier.hasNextCandidate(), Matchers.is(false));
}
@Test
- public void testDisabledOffsiteRequestFiltering() {
- Mockito.when(configMock.isOffsiteRequestFilterEnabled()).thenReturn(false);
- Mockito.when(configMock.getCrawlSeeds())
- .thenReturn(Sets.newHashSet(OFFSITE_URL_CRAWL_REQUEST));
+ public void testHasNextCandidateWhenCandidateQueueIsNotEmpty() {
+ crawlFrontier.feedRequest(ROOT_URL_0_CRAWL_REQUEST, true);
- CrawlFrontier crawlFrontier = new CrawlFrontier(configMock, statsCounterMock);
+ Assert.assertThat(crawlFrontier.hasNextCandidate(), Matchers.is(true));
+ }
- Assert.assertTrue(crawlFrontier.hasNextCandidate());
- Assert.assertEquals(OFFSITE_URL,
- crawlFrontier.getNextCandidate().getRequestUrl());
+ @Test
+ public void testGetNextCandidateWhenCandidateQueueIsEmpty() {
+ Assert.assertThat(crawlFrontier.getNextCandidate(), Matchers.nullValue());
}
@Test
public void testGetNextCandidateWhenUsingBreadthFirstCrawlStrategy() {
- CrawlFrontier crawlFrontier = new CrawlFrontier(configMock, statsCounterMock);
+ crawlFrontier.feedRequest(ROOT_URL_0_CRAWL_REQUEST, true);
+ crawlFrontier.feedRequest(ROOT_URL_1_CRAWL_REQUEST, true);
CrawlCandidate nextCandidate = crawlFrontier.getNextCandidate();
- Assert.assertEquals(ROOT_URL_1, nextCandidate.getRequestUrl());
- Assert.assertEquals(ROOT_URL_CRAWL_DEPTH, nextCandidate.getCrawlDepth());
- Assert.assertEquals(ROOT_URL_1_PRIORITY, nextCandidate.getPriority());
+ Assert.assertThat(nextCandidate.getRequestUrl(), Matchers.is(ROOT_URL_1));
+ Assert.assertThat(nextCandidate.getCrawlDepth(), Matchers.is(ROOT_URL_CRAWL_DEPTH));
+ Assert.assertThat(nextCandidate.getPriority(), Matchers.is(ROOT_URL_1_PRIORITY));
crawlFrontier.feedRequest(CHILD_URL_2_CRAWL_REQUEST, false);
nextCandidate = crawlFrontier.getNextCandidate();
- Assert.assertEquals(ROOT_URL_0, nextCandidate.getRequestUrl());
- Assert.assertEquals(ROOT_URL_CRAWL_DEPTH, nextCandidate.getCrawlDepth());
- Assert.assertEquals(ROOT_URL_0_PRIORITY, nextCandidate.getPriority());
+ Assert.assertThat(nextCandidate.getRequestUrl(), Matchers.is(ROOT_URL_0));
+ Assert.assertThat(nextCandidate.getCrawlDepth(), Matchers.is(ROOT_URL_CRAWL_DEPTH));
+ Assert.assertThat(nextCandidate.getPriority(), Matchers.is(ROOT_URL_0_PRIORITY));
crawlFrontier.feedRequest(CHILD_URL_0_CRAWL_REQUEST, false);
crawlFrontier.feedRequest(CHILD_URL_1_CRAWL_REQUEST, false);
nextCandidate = crawlFrontier.getNextCandidate();
- Assert.assertEquals(CHILD_URL_2, nextCandidate.getRequestUrl());
- Assert.assertEquals(CHILD_URL_CRAWL_DEPTH, nextCandidate.getCrawlDepth());
- Assert.assertEquals(CHILD_URL_2_PRIORITY, nextCandidate.getPriority());
+ Assert.assertThat(nextCandidate.getRequestUrl(), Matchers.is(CHILD_URL_2));
+ Assert.assertThat(nextCandidate.getCrawlDepth(), Matchers.is(CHILD_URL_CRAWL_DEPTH));
+ Assert.assertThat(nextCandidate.getPriority(), Matchers.is(CHILD_URL_2_PRIORITY));
// A priority queue doesn't ensure FIFO order when elements have the same depth and priority
nextCandidate = crawlFrontier.getNextCandidate();
int previousChildCandidatePriority = nextCandidate.getPriority();
- Assert.assertTrue(nextCandidate.getRequestUrl().getPath().contains(CHILD_URL_PATH));
- Assert.assertEquals(CHILD_URL_CRAWL_DEPTH, nextCandidate.getCrawlDepth());
+ Assert.assertThat(nextCandidate.getRequestUrl().getPath().contains(CHILD_URL_PATH),
+ Matchers.is(true));
+ Assert.assertThat(nextCandidate.getCrawlDepth(), Matchers.is(CHILD_URL_CRAWL_DEPTH));
nextCandidate = crawlFrontier.getNextCandidate();
- Assert.assertTrue(nextCandidate.getRequestUrl().getPath().contains(CHILD_URL_PATH));
- Assert.assertEquals(CHILD_URL_CRAWL_DEPTH, nextCandidate.getCrawlDepth());
- Assert.assertEquals(previousChildCandidatePriority, nextCandidate.getPriority());
+ Assert.assertThat(nextCandidate.getRequestUrl().getPath().contains(CHILD_URL_PATH),
+ Matchers.is(true));
+ Assert.assertThat(nextCandidate.getCrawlDepth(), Matchers.is(CHILD_URL_CRAWL_DEPTH));
+ Assert.assertThat(previousChildCandidatePriority, Matchers.is(nextCandidate.getPriority()));
- Assert.assertFalse(crawlFrontier.hasNextCandidate());
+ Assert.assertThat(crawlFrontier.hasNextCandidate(), Matchers.is(false));
+ Assert.assertThat(crawlFrontier.getNextCandidate(), Matchers.nullValue());
}
@Test
public void testGetNextCandidateWhenUsingDepthFirstCrawlStrategy() {
Mockito.when(configMock.getCrawlStrategy()).thenReturn(CrawlStrategy.DEPTH_FIRST);
-
CrawlFrontier crawlFrontier = new CrawlFrontier(configMock, statsCounterMock);
+ crawlFrontier.feedRequest(ROOT_URL_0_CRAWL_REQUEST, true);
+ crawlFrontier.feedRequest(ROOT_URL_1_CRAWL_REQUEST, true);
CrawlCandidate nextCandidate = crawlFrontier.getNextCandidate();
- Assert.assertEquals(ROOT_URL_1, nextCandidate.getRequestUrl());
- Assert.assertEquals(ROOT_URL_CRAWL_DEPTH, nextCandidate.getCrawlDepth());
- Assert.assertEquals(ROOT_URL_1_PRIORITY, nextCandidate.getPriority());
+ Assert.assertThat(nextCandidate.getRequestUrl(), Matchers.is(ROOT_URL_1));
+ Assert.assertThat(nextCandidate.getCrawlDepth(), Matchers.is(ROOT_URL_CRAWL_DEPTH));
+ Assert.assertThat(nextCandidate.getPriority(), Matchers.is(ROOT_URL_1_PRIORITY));
crawlFrontier.feedRequest(CHILD_URL_2_CRAWL_REQUEST, false);
// A priority queue doesn't ensure FIFO order when elements have the same depth and priority
nextCandidate = crawlFrontier.getNextCandidate();
- Assert.assertTrue(nextCandidate.getRequestUrl().getPath().contains(CHILD_URL_PATH));
- Assert.assertEquals(CHILD_URL_CRAWL_DEPTH, nextCandidate.getCrawlDepth());
- Assert.assertEquals(CHILD_URL_2_PRIORITY, nextCandidate.getPriority());
+ Assert.assertThat(nextCandidate.getRequestUrl().getPath().contains(CHILD_URL_PATH),
+ Matchers.is(true));
+ Assert.assertThat(nextCandidate.getCrawlDepth(), Matchers.is(CHILD_URL_CRAWL_DEPTH));
+ Assert.assertThat(nextCandidate.getPriority(), Matchers.is(CHILD_URL_2_PRIORITY));
nextCandidate = crawlFrontier.getNextCandidate();
- Assert.assertEquals(ROOT_URL_0, nextCandidate.getRequestUrl());
- Assert.assertEquals(ROOT_URL_CRAWL_DEPTH, nextCandidate.getCrawlDepth());
- Assert.assertEquals(ROOT_URL_0_PRIORITY, nextCandidate.getPriority());
+ Assert.assertThat(nextCandidate.getRequestUrl(), Matchers.is(ROOT_URL_0));
+ Assert.assertThat(nextCandidate.getCrawlDepth(), Matchers.is(ROOT_URL_CRAWL_DEPTH));
+ Assert.assertThat(nextCandidate.getPriority(), Matchers.is(ROOT_URL_0_PRIORITY));
crawlFrontier.feedRequest(CHILD_URL_0_CRAWL_REQUEST, false);
crawlFrontier.feedRequest(CHILD_URL_1_CRAWL_REQUEST, false);
nextCandidate = crawlFrontier.getNextCandidate();
- Assert.assertEquals(CHILD_URL_0, nextCandidate.getRequestUrl());
- Assert.assertEquals(CHILD_URL_CRAWL_DEPTH, nextCandidate.getCrawlDepth());
- Assert.assertEquals(CHILD_URL_0_PRIORITY, nextCandidate.getPriority());
+ Assert.assertThat(nextCandidate.getRequestUrl(), Matchers.is(CHILD_URL_0));
+ Assert.assertThat(nextCandidate.getCrawlDepth(), Matchers.is(CHILD_URL_CRAWL_DEPTH));
+ Assert.assertThat(nextCandidate.getPriority(), Matchers.is(CHILD_URL_0_PRIORITY));
nextCandidate = crawlFrontier.getNextCandidate();
- Assert.assertEquals(CHILD_URL_1, nextCandidate.getRequestUrl());
- Assert.assertEquals(CHILD_URL_CRAWL_DEPTH, nextCandidate.getCrawlDepth());
- Assert.assertEquals(CHILD_URL_1_PRIORITY, nextCandidate.getPriority());
- Assert.assertFalse(crawlFrontier.hasNextCandidate());
- }
-
- @Test
- public void testCrawlDepthLimitation() {
- Mockito.when(configMock.getMaximumCrawlDepth()).thenReturn(MAX_CRAWL_DEPTH);
-
- CrawlFrontier crawlFrontier = new CrawlFrontier(configMock, statsCounterMock);
-
- clearCrawlCandidateQueue(crawlFrontier);
- crawlFrontier.feedRequest(CHILD_URL_0_CRAWL_REQUEST, false);
-
- CrawlCandidate nextCandidate = crawlFrontier.getNextCandidate();
- Assert.assertTrue(nextCandidate.getCrawlDepth() <= MAX_CRAWL_DEPTH);
+ Assert.assertThat(nextCandidate.getRequestUrl(), Matchers.is(CHILD_URL_1));
+ Assert.assertThat(nextCandidate.getCrawlDepth(), Matchers.is(CHILD_URL_CRAWL_DEPTH));
+ Assert.assertThat(nextCandidate.getPriority(), Matchers.is(CHILD_URL_1_PRIORITY));
- crawlFrontier.feedRequest(CHILD_URL_1_CRAWL_REQUEST, false);
-
- Assert.assertFalse(crawlFrontier.hasNextCandidate());
+ Assert.assertThat(crawlFrontier.hasNextCandidate(), Matchers.is(false));
+ Assert.assertThat(crawlFrontier.getNextCandidate(), Matchers.nullValue());
}
@Test
- public void testReset() {
- CrawlFrontier crawlFrontier = new CrawlFrontier(configMock, statsCounterMock);
+ public void testResetWhenCandidateQueueIsNotEmpty() {
+ crawlFrontier.feedRequest(ROOT_URL_0_CRAWL_REQUEST, true);
crawlFrontier.reset();
- // Check if only the crawl seeds are present after the reset
- Assert.assertTrue(crawlFrontier.hasNextCandidate());
- Assert.assertEquals(ROOT_URL_1, crawlFrontier.getNextCandidate().getRequestUrl());
-
- Assert.assertTrue(crawlFrontier.hasNextCandidate());
- Assert.assertEquals(ROOT_URL_0, crawlFrontier.getNextCandidate().getRequestUrl());
-
- Assert.assertFalse(crawlFrontier.hasNextCandidate());
- }
-
- private static void clearCrawlCandidateQueue(final CrawlFrontier crawlFrontier) {
- while (crawlFrontier.hasNextCandidate()) {
- crawlFrontier.getNextCandidate();
- }
+ Assert.assertThat(crawlFrontier.hasNextCandidate(), Matchers.is(false));
+ Assert.assertThat(crawlFrontier.getNextCandidate(), Matchers.nullValue());
}
}
diff --git a/src/test/java/com/github/peterbencze/serritor/internal/stats/StatsCounterTest.java b/src/test/java/com/github/peterbencze/serritor/internal/stats/StatsCounterTest.java
index a853822..4a722fb 100644
--- a/src/test/java/com/github/peterbencze/serritor/internal/stats/StatsCounterTest.java
+++ b/src/test/java/com/github/peterbencze/serritor/internal/stats/StatsCounterTest.java
@@ -16,6 +16,7 @@
package com.github.peterbencze.serritor.internal.stats;
+import org.hamcrest.Matchers;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
@@ -133,4 +134,39 @@ public void testRecordNetworkError() {
Assert.assertEquals(processedCrawlCandidateCountBefore + 1,
statsCounter.getProcessedCrawlCandidateCount());
}
+
+ @Test
+ public void testReset() {
+ statsCounter.recordRemainingCrawlCandidate();
+ statsCounter.recordRemainingCrawlCandidate();
+ statsCounter.recordRemainingCrawlCandidate();
+ statsCounter.recordRemainingCrawlCandidate();
+ statsCounter.recordRemainingCrawlCandidate();
+ statsCounter.recordRemainingCrawlCandidate();
+ statsCounter.recordRemainingCrawlCandidate();
+ statsCounter.recordResponseSuccess();
+ statsCounter.recordPageLoadTimeout();
+ statsCounter.recordRequestRedirect();
+ statsCounter.recordNonHtmlResponse();
+ statsCounter.recordResponseError();
+ statsCounter.recordNetworkError();
+ statsCounter.recordDuplicateRequest();
+ statsCounter.recordOffsiteRequest();
+ statsCounter.recordCrawlDepthLimitExceedingRequest();
+
+ statsCounter.reset();
+
+ Assert.assertThat(statsCounter.getRemainingCrawlCandidateCount(), Matchers.is(0));
+ Assert.assertThat(statsCounter.getProcessedCrawlCandidateCount(), Matchers.is(0));
+ Assert.assertThat(statsCounter.getResponseSuccessCount(), Matchers.is(0));
+ Assert.assertThat(statsCounter.getPageLoadTimeoutCount(), Matchers.is(0));
+ Assert.assertThat(statsCounter.getRequestRedirectCount(), Matchers.is(0));
+ Assert.assertThat(statsCounter.getNonHtmlResponseCount(), Matchers.is(0));
+ Assert.assertThat(statsCounter.getResponseErrorCount(), Matchers.is(0));
+ Assert.assertThat(statsCounter.getNetworkErrorCount(), Matchers.is(0));
+ Assert.assertThat(statsCounter.getFilteredDuplicateRequestCount(), Matchers.is(0));
+ Assert.assertThat(statsCounter.getFilteredOffsiteRequestCount(), Matchers.is(0));
+ Assert.assertThat(statsCounter.getFilteredCrawlDepthLimitExceedingRequestCount(),
+ Matchers.is(0));
+ }
}
diff --git a/src/test/java/com/github/peterbencze/serritor/it/CrawlingIT.java b/src/test/java/com/github/peterbencze/serritor/it/CrawlingIT.java
index b1fe3e0..fece580 100644
--- a/src/test/java/com/github/peterbencze/serritor/it/CrawlingIT.java
+++ b/src/test/java/com/github/peterbencze/serritor/it/CrawlingIT.java
@@ -16,9 +16,11 @@
package com.github.peterbencze.serritor.it;
-import com.github.peterbencze.serritor.api.Crawler;
import com.github.peterbencze.serritor.api.Browser;
import com.github.peterbencze.serritor.api.CrawlRequest;
+import com.github.peterbencze.serritor.api.CrawlRequest.CrawlRequestBuilder;
+import com.github.peterbencze.serritor.api.CrawlStats;
+import com.github.peterbencze.serritor.api.Crawler;
import com.github.peterbencze.serritor.api.CrawlerConfiguration;
import com.github.peterbencze.serritor.api.event.NonHtmlResponseEvent;
import com.github.peterbencze.serritor.api.event.ResponseSuccessEvent;
@@ -34,13 +36,12 @@
import net.lightbody.bmp.client.ClientUtil;
import org.apache.commons.io.IOUtils;
import org.apache.http.entity.ContentType;
+import org.hamcrest.Matchers;
import org.junit.After;
import org.junit.AfterClass;
import org.junit.Assert;
import org.junit.BeforeClass;
import org.junit.Test;
-import org.openqa.selenium.htmlunit.HtmlUnitDriver;
-import org.openqa.selenium.remote.BrowserType;
import org.openqa.selenium.remote.CapabilityType;
import org.openqa.selenium.remote.DesiredCapabilities;
@@ -104,7 +105,7 @@ protected void onNonHtmlResponse(final NonHtmlResponseEvent event) {
}
@Test
- public void testResumeState() throws IOException {
+ public void testResumeState() {
WireMock.givenThat(WireMock.any(WireMock.urlEqualTo("/foo"))
.willReturn(WireMock.ok()
.withHeader("Content-Type", ContentType.TEXT_HTML.toString())));
@@ -113,8 +114,6 @@ public void testResumeState() throws IOException {
.willReturn(WireMock.ok()
.withHeader("Content-Type", ContentType.TEXT_HTML.toString())));
- File destinationFile = createTempFile();
-
CrawlerConfiguration config = new CrawlerConfiguration.CrawlerConfigurationBuilder()
.addCrawlSeed(CrawlRequest.createDefault("http://te.st/foo"))
.addCrawlSeed(CrawlRequest.createDefault("http://te.st/bar"))
@@ -144,6 +143,73 @@ protected void onResponseSuccess(final ResponseSuccessEvent event) {
Assert.assertEquals(0, WireMock.findUnmatchedRequests().size());
}
+ @Test
+ public void testCrawlerRestartWhenStateWasRestored() {
+ WireMock.givenThat(WireMock.any(WireMock.urlEqualTo("/foo"))
+ .willReturn(WireMock.ok()
+ .withHeader("Content-Type", ContentType.TEXT_HTML.toString())));
+ WireMock.givenThat(WireMock.any(WireMock.urlEqualTo("/bar"))
+ .willReturn(WireMock.ok()
+ .withHeader("Content-Type", ContentType.TEXT_HTML.toString())));
+
+ CrawlerConfiguration config = new CrawlerConfiguration.CrawlerConfigurationBuilder()
+ .addCrawlSeed(new CrawlRequestBuilder("http://te.st/foo").setPriority(1).build())
+ .addCrawlSeed(CrawlRequest.createDefault("http://te.st/bar"))
+ .build();
+ Crawler crawler = new Crawler(config) {
+ @Override
+ protected void onResponseSuccess(final ResponseSuccessEvent event) {
+ super.onResponseSuccess(event);
+
+ stop();
+ }
+
+ };
+ crawler.start(Browser.HTML_UNIT, capabilities);
+
+ CrawlStats stats = crawler.getCrawlStats();
+ Assert.assertThat(stats.getRemainingCrawlCandidateCount(), Matchers.is(1));
+ Assert.assertThat(stats.getProcessedCrawlCandidateCount(), Matchers.is(1));
+ Assert.assertThat(stats.getResponseSuccessCount(), Matchers.is(1));
+ Assert.assertThat(stats.getPageLoadTimeoutCount(), Matchers.is(0));
+ Assert.assertThat(stats.getRequestRedirectCount(), Matchers.is(0));
+ Assert.assertThat(stats.getNonHtmlResponseCount(), Matchers.is(0));
+ Assert.assertThat(stats.getResponseErrorCount(), Matchers.is(0));
+ Assert.assertThat(stats.getNetworkErrorCount(), Matchers.is(0));
+ Assert.assertThat(stats.getFilteredDuplicateRequestCount(), Matchers.is(0));
+ Assert.assertThat(stats.getFilteredOffsiteRequestCount(), Matchers.is(0));
+ Assert.assertThat(stats.getFilteredCrawlDepthLimitExceedingRequestCount(), Matchers.is(0));
+
+ WireMock.verify(1, WireMock.headRequestedFor(WireMock.urlEqualTo("/foo")));
+ WireMock.verify(1, WireMock.getRequestedFor(WireMock.urlEqualTo("/foo")));
+ WireMock.verify(0, WireMock.headRequestedFor(WireMock.urlEqualTo("/bar")));
+ WireMock.verify(0, WireMock.getRequestedFor(WireMock.urlEqualTo("/bar")));
+
+ crawler = new Crawler(crawler.getState()) {
+ };
+ crawler.start(Browser.HTML_UNIT, capabilities);
+
+ stats = crawler.getCrawlStats();
+ Assert.assertThat(stats.getRemainingCrawlCandidateCount(), Matchers.is(0));
+ Assert.assertThat(stats.getProcessedCrawlCandidateCount(), Matchers.is(2));
+ Assert.assertThat(stats.getResponseSuccessCount(), Matchers.is(2));
+ Assert.assertThat(stats.getPageLoadTimeoutCount(), Matchers.is(0));
+ Assert.assertThat(stats.getRequestRedirectCount(), Matchers.is(0));
+ Assert.assertThat(stats.getNonHtmlResponseCount(), Matchers.is(0));
+ Assert.assertThat(stats.getResponseErrorCount(), Matchers.is(0));
+ Assert.assertThat(stats.getNetworkErrorCount(), Matchers.is(0));
+ Assert.assertThat(stats.getFilteredDuplicateRequestCount(), Matchers.is(0));
+ Assert.assertThat(stats.getFilteredOffsiteRequestCount(), Matchers.is(0));
+ Assert.assertThat(stats.getFilteredCrawlDepthLimitExceedingRequestCount(), Matchers.is(0));
+
+ WireMock.verify(2, WireMock.headRequestedFor(WireMock.urlEqualTo("/foo")));
+ WireMock.verify(2, WireMock.getRequestedFor(WireMock.urlEqualTo("/foo")));
+ WireMock.verify(1, WireMock.headRequestedFor(WireMock.urlEqualTo("/bar")));
+ WireMock.verify(1, WireMock.getRequestedFor(WireMock.urlEqualTo("/bar")));
+
+ Assert.assertThat(WireMock.findUnmatchedRequests().size(), Matchers.is(0));
+ }
+
@Test
public void testHttpClientCookieSynchronization() {
WireMock.givenThat(WireMock.any(WireMock.urlEqualTo("/foo"))
@@ -235,15 +301,6 @@ private static BrowserMobProxyServer createProxyServer(final int mockWebServerPo
return server;
}
- private static HtmlUnitDriver createHtmlUnitDriver(final BrowserMobProxyServer server) {
- DesiredCapabilities capabilities = new DesiredCapabilities();
- capabilities.setBrowserName(BrowserType.HTMLUNIT);
- capabilities.setJavascriptEnabled(true);
- capabilities.setCapability(CapabilityType.PROXY, ClientUtil.createSeleniumProxy(server));
-
- return new HtmlUnitDriver(capabilities);
- }
-
private static File createTempFile() throws IOException {
File tempFile = File.createTempFile("tmp", null);
tempFile.deleteOnExit();