diff --git a/README.md b/README.md index 0dbbb2e..fa2c6fb 100644 --- a/README.md +++ b/README.md @@ -19,7 +19,7 @@ Add the following dependency to your pom.xml: com.github.peterbencze serritor - 2.1.0 + 2.1.1 ``` @@ -27,7 +27,7 @@ Add the following dependency to your pom.xml: Add the following dependency to your build.gradle: ```groovy -compile group: 'com.github.peterbencze', name: 'serritor', version: '2.1.0' +compile group: 'com.github.peterbencze', name: 'serritor', version: '2.1.1' ``` ### Manual dependencies diff --git a/checkstyle.xml b/checkstyle.xml index 52ef575..f4e794f 100644 --- a/checkstyle.xml +++ b/checkstyle.xml @@ -1,36 +1,53 @@ + "-//Checkstyle//DTD Checkstyle Configuration 1.3//EN" + "https://checkstyle.org/dtds/configuration_1_3.dtd"> + Authors: Max Vetrenko, Ruslan Diachenko, Roman Ivanov. + --> - + + + + + + + + + + + + - + - + + + + + - @@ -44,20 +61,28 @@ - - - - - + + + - - + + + + + + + INSTANCE_INIT, ANNOTATION_DEF, ENUM_DEF"/> + + + + + + + + + + + @@ -124,6 +172,7 @@ value="Package name ''{0}'' must match pattern ''{1}''."/> + @@ -137,13 +186,17 @@ + + + + - @@ -187,6 +240,9 @@ + @@ -194,14 +250,26 @@ + + + + - + value="COMMA, SEMI, POST_INC, POST_DEC, DOT, + LABELED_STAT, METHOD_REF"/> - + + + + - + + + + + - + @@ -250,6 +323,14 @@ - + + + + + + + + diff --git a/pom.xml b/pom.xml index 71151fd..85f8e9d 100644 --- a/pom.xml +++ b/pom.xml @@ -3,7 +3,7 @@ 4.0.0 com.github.peterbencze serritor - 2.1.0 + 2.1.1 jar Serritor @@ -59,7 +59,7 @@ org.seleniumhq.selenium htmlunit-driver - 2.35.1 + 2.41.0 net.lightbody.bmp @@ -69,7 +69,7 @@ com.google.guava guava - 28.0-jre + 29.0-jre org.eclipse.jetty @@ -100,12 +100,12 @@ org.slf4j slf4j-api - 1.7.26 + 1.7.30 com.auth0 java-jwt - 3.8.1 + 3.10.3 org.mindrot @@ -127,19 +127,19 @@ com.github.tomakehurst wiremock-jre8-standalone - 2.23.2 + 2.26.3 test org.awaitility awaitility - 3.1.6 + 4.0.3 test net.jodah failsafe - 2.0.1 + 2.4.0 test @@ -149,7 +149,7 @@ org.apache.maven.plugins maven-source-plugin - 3.1.0 + 3.2.1 attach-source @@ -162,7 +162,7 @@ org.apache.maven.plugins maven-javadoc-plugin - 3.1.0 + 3.2.0 attach-javadoc @@ -181,12 +181,12 @@ org.apache.maven.plugins maven-checkstyle-plugin - 3.0.0 + 3.1.1 com.puppycrawl.tools checkstyle - 8.20 + 8.33 @@ -245,7 +245,7 @@ org.apache.maven.plugins maven-site-plugin - 3.7.1 + 3.9.0 true true diff --git a/src/main/java/com/github/peterbencze/serritor/api/Crawler.java b/src/main/java/com/github/peterbencze/serritor/api/Crawler.java index 2ab3fc6..6461c15 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/Crawler.java +++ b/src/main/java/com/github/peterbencze/serritor/api/Crawler.java @@ -88,14 +88,14 @@ public abstract class Crawler { private final StatsCounter statsCounter; private final CrawlFrontier crawlFrontier; private final CustomCallbackManager callbackManager; + private final AtomicBoolean isStopped; + private final AtomicBoolean isStopInitiated; private BasicCookieStore cookieStore; private CloseableHttpClient httpClient; private BrowserMobProxyServer proxyServer; private WebDriver webDriver; private CrawlDelayMechanism crawlDelayMechanism; - private AtomicBoolean isStopped; - private AtomicBoolean isStopInitiated; /** * Base constructor which sets up the crawler with the provided configuration. @@ -196,6 +196,8 @@ private void start( if (!isResuming) { crawlFrontier.reset(); + statsCounter.reset(); + config.getCrawlSeeds().forEach(request -> crawlFrontier.feedRequest(request, true)); } cookieStore = new BasicCookieStore(); diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java b/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java index 0550509..4bf17ed 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java @@ -42,6 +42,7 @@ public final class CrawlFrontier implements Serializable { private static final Logger LOGGER = LoggerFactory.getLogger(CrawlFrontier.class); + private static final int INITIAL_CRAWL_DEPTH = 1; private final CrawlerConfiguration config; private final StatsCounter statsCounter; @@ -62,8 +63,6 @@ public CrawlFrontier(final CrawlerConfiguration config, final StatsCounter stats this.statsCounter = statsCounter; urlFingerprints = new HashSet<>(); candidates = createPriorityQueue(); - - feedCrawlSeeds(); } /** @@ -115,6 +114,8 @@ public void feedRequest(final CrawlRequest request, final boolean isCrawlSeed) { builder.setRefererUrl(currentCandidate.getRequestUrl()) .setCrawlDepth(nextCrawlDepth); + } else { + builder.setCrawlDepth(INITIAL_CRAWL_DEPTH); } LOGGER.debug("Adding request to the list of crawl candidates"); @@ -149,17 +150,6 @@ public void reset() { urlFingerprints.clear(); candidates.clear(); - - feedCrawlSeeds(); - } - - /** - * Feeds all the crawl seeds to the crawl frontier. - */ - private void feedCrawlSeeds() { - LOGGER.debug("Feeding crawl seeds"); - - config.getCrawlSeeds().forEach((CrawlRequest request) -> feedRequest(request, true)); } /** diff --git a/src/main/java/com/github/peterbencze/serritor/internal/stats/StatsCounter.java b/src/main/java/com/github/peterbencze/serritor/internal/stats/StatsCounter.java index 6c657fd..785a601 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/stats/StatsCounter.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/stats/StatsCounter.java @@ -263,6 +263,25 @@ public StatsCounterSnapshot getSnapshot() { return lock.readWithLock(() -> new StatsCounterSnapshot(this)); } + /** + * Resets stats counter to its initial state. + */ + public void reset() { + lock.writeWithLock(() -> { + remainingCrawlCandidateCount = 0; + processedCrawlCandidateCount = 0; + responseSuccessCount = 0; + pageLoadTimeoutCount = 0; + requestRedirectCount = 0; + nonHtmlResponseCount = 0; + responseErrorCount = 0; + networkErrorCount = 0; + filteredDuplicateRequestCount = 0; + filteredOffsiteRequestCount = 0; + filteredCrawlDepthLimitExceedingRequestCount = 0; + }); + } + /** * Increments the number of processed crawl candidates. */ diff --git a/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java b/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java index 2f63098..a747ca6 100644 --- a/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java +++ b/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java @@ -23,11 +23,11 @@ import com.github.peterbencze.serritor.api.CrawlerConfiguration; import com.github.peterbencze.serritor.api.CrawlerConfiguration.CrawlerConfigurationBuilder; import com.github.peterbencze.serritor.internal.stats.StatsCounter; -import com.google.common.collect.Sets; import java.net.URI; import java.util.Arrays; -import java.util.Collections; import java.util.List; +import java.util.Optional; +import org.hamcrest.Matchers; import org.junit.Assert; import org.junit.Before; import org.junit.Test; @@ -39,10 +39,10 @@ public final class CrawlFrontierTest { // Allowed crawl domains - private static final String ALLOWED_CRAWL_DOMAIN_0 = "root-url-0.com"; - private static final String ALLOWED_CRAWL_DOMAIN_1 = "root-url-1.com"; + private static final String ROOT_URL_0_DOMAIN = "root-url-0.com"; + private static final String ROOT_URL_1_DOMAIN = "root-url-1.com"; private static final List ALLOWED_CRAWL_DOMAINS - = Arrays.asList(ALLOWED_CRAWL_DOMAIN_0, ALLOWED_CRAWL_DOMAIN_1); + = Arrays.asList(ROOT_URL_0_DOMAIN, ROOT_URL_1_DOMAIN); // Root URLs private static final URI ROOT_URL_0 @@ -52,7 +52,7 @@ public final class CrawlFrontierTest { private static final URI ROOT_URL_1 = URI.create("http://root-url-1.com/"); // Root URL crawl depth - private static final int ROOT_URL_CRAWL_DEPTH = 0; + private static final int ROOT_URL_CRAWL_DEPTH = 1; // Root URL priorities private static final int ROOT_URL_0_PRIORITY = 0; @@ -80,7 +80,7 @@ public final class CrawlFrontierTest { = URI.create(String.format("http://root-url-1.com%s-0", CHILD_URL_PATH)); // Child URL crawl depth - private static final int CHILD_URL_CRAWL_DEPTH = 1; + private static final int CHILD_URL_CRAWL_DEPTH = 2; // Child URL priorities private static final int CHILD_URL_0_PRIORITY = 0; @@ -105,11 +105,9 @@ public final class CrawlFrontierTest { private static final CrawlRequest OFFSITE_URL_CRAWL_REQUEST = new CrawlRequestBuilder(OFFSITE_URL).setPriority(OFFSITE_URL_PRIORITY).build(); - // Max crawl depth - private static final int MAX_CRAWL_DEPTH = 1; - private CrawlerConfiguration configMock; private StatsCounter statsCounterMock; + private CrawlFrontier crawlFrontier; @Before public void before() { @@ -120,204 +118,194 @@ public void before() { .build()); statsCounterMock = Mockito.mock(StatsCounter.class); + + crawlFrontier = new CrawlFrontier(configMock, statsCounterMock); } @Test - public void testHasNextCandidateWithNonEmptyQueue() { - CrawlFrontier crawlFrontier = new CrawlFrontier(configMock, statsCounterMock); - - Assert.assertTrue(crawlFrontier.hasNextCandidate()); - - crawlFrontier.getNextCandidate(); - - Assert.assertTrue(crawlFrontier.hasNextCandidate()); - - crawlFrontier.getNextCandidate(); - - Assert.assertFalse(crawlFrontier.hasNextCandidate()); - - crawlFrontier.feedRequest(CHILD_URL_0_CRAWL_REQUEST, false); - crawlFrontier.feedRequest(CHILD_URL_1_CRAWL_REQUEST, false); - - Assert.assertTrue(crawlFrontier.hasNextCandidate()); + public void testFeedRequestWhenOffsiteRequestFilterIsDisabledAndRequestIsOffsite() { + Mockito.when(configMock.isOffsiteRequestFilterEnabled()).thenReturn(false); - crawlFrontier.getNextCandidate(); + crawlFrontier.feedRequest(OFFSITE_URL_CRAWL_REQUEST, true); - Assert.assertTrue(crawlFrontier.hasNextCandidate()); + Assert.assertThat(crawlFrontier.hasNextCandidate(), Matchers.is(true)); + Assert.assertThat(crawlFrontier.getNextCandidate(), Matchers.notNullValue()); + Mockito.verify(statsCounterMock).recordRemainingCrawlCandidate(); + } - crawlFrontier.getNextCandidate(); + @Test + public void testFeedRequestWhenOffsiteRequestFilterIsEnabledAndRequestDomainIsNotAllowed() { + crawlFrontier.feedRequest(OFFSITE_URL_CRAWL_REQUEST, true); - Assert.assertFalse(crawlFrontier.hasNextCandidate()); + Assert.assertThat(crawlFrontier.hasNextCandidate(), Matchers.is(false)); + Assert.assertThat(crawlFrontier.getNextCandidate(), Matchers.nullValue()); + Mockito.verify(statsCounterMock).recordOffsiteRequest(); } @Test - public void testHasNextCandidateWithEmptyQueue() { - Mockito.when(configMock.getCrawlSeeds()).thenReturn(Collections.EMPTY_SET); + public void testFeedRequestWhenDuplicateRequestFilterIsDisabledAndRequestIsADuplicate() { + Mockito.when(configMock.isDuplicateRequestFilterEnabled()).thenReturn(false); + crawlFrontier.feedRequest(ROOT_URL_0_CRAWL_REQUEST, true); + crawlFrontier.getNextCandidate(); - CrawlFrontier crawlFrontier = new CrawlFrontier(configMock, statsCounterMock); + crawlFrontier.feedRequest(DUPLICATE_ROOT_URL_0_CRAWL_REQUEST, false); - Assert.assertFalse(crawlFrontier.hasNextCandidate()); + Assert.assertThat(crawlFrontier.hasNextCandidate(), Matchers.is(true)); + Assert.assertThat(crawlFrontier.getNextCandidate(), Matchers.notNullValue()); + Mockito.verify(statsCounterMock, Mockito.times(2)).recordRemainingCrawlCandidate(); } @Test - public void testEnabledDuplicateRequestFiltering() { - CrawlFrontier crawlFrontier = new CrawlFrontier(configMock, statsCounterMock); + public void testFeedRequestWhenDuplicateRequestFilterIsEnabledAndRequestIsADuplicate() { + crawlFrontier.feedRequest(ROOT_URL_0_CRAWL_REQUEST, true); + crawlFrontier.getNextCandidate(); - clearCrawlCandidateQueue(crawlFrontier); crawlFrontier.feedRequest(DUPLICATE_ROOT_URL_0_CRAWL_REQUEST, false); - Assert.assertFalse(crawlFrontier.hasNextCandidate()); + Assert.assertThat(crawlFrontier.hasNextCandidate(), Matchers.is(false)); + Assert.assertThat(crawlFrontier.getNextCandidate(), Matchers.nullValue()); + Mockito.verify(statsCounterMock).recordDuplicateRequest(); } @Test - public void testDisabledDuplicateRequestFiltering() { - Mockito.when(configMock.isDuplicateRequestFilterEnabled()).thenReturn(false); - - CrawlFrontier crawlFrontier = new CrawlFrontier(configMock, statsCounterMock); + public void testFeedRequestWhenCrawlDepthLimitIsSetAndRequestExceedsLimit() { + Mockito.when(configMock.getMaximumCrawlDepth()).thenReturn(1); + crawlFrontier.feedRequest(ROOT_URL_0_CRAWL_REQUEST, true); + crawlFrontier.getNextCandidate(); - clearCrawlCandidateQueue(crawlFrontier); - crawlFrontier.feedRequest(DUPLICATE_ROOT_URL_0_CRAWL_REQUEST, true); + crawlFrontier.feedRequest(CHILD_URL_0_CRAWL_REQUEST, false); - Assert.assertTrue(crawlFrontier.hasNextCandidate()); - Assert.assertEquals(DUPLICATE_ROOT_URL_0, crawlFrontier.getNextCandidate().getRequestUrl()); + Assert.assertThat(crawlFrontier.hasNextCandidate(), Matchers.is(false)); + Assert.assertThat(crawlFrontier.getNextCandidate(), Matchers.nullValue()); + Mockito.verify(statsCounterMock).recordCrawlDepthLimitExceedingRequest(); } @Test - public void testEnabledOffsiteRequestFiltering() { - Mockito.when(configMock.getCrawlSeeds()) - .thenReturn(Sets.newHashSet(OFFSITE_URL_CRAWL_REQUEST)); - - CrawlFrontier crawlFrontier = new CrawlFrontier(configMock, statsCounterMock); + public void testFeedRequestWhenRequestIsNotDuplicateAndIsNotOffsiteAndIsInCrawlDepthLimit() { + Mockito.when(configMock.getMaximumCrawlDepth()).thenReturn(1); + + crawlFrontier.feedRequest(ROOT_URL_0_CRAWL_REQUEST, true); + + Assert.assertThat(crawlFrontier.hasNextCandidate(), Matchers.is(true)); + CrawlCandidate candidate = crawlFrontier.getNextCandidate(); + Assert.assertThat(candidate.getRequestUrl(), Matchers.is(ROOT_URL_0)); + Assert.assertThat(candidate.getCrawlDepth(), Matchers.is(ROOT_URL_CRAWL_DEPTH)); + Assert.assertThat(candidate.getPriority(), Matchers.is(ROOT_URL_0_PRIORITY)); + Assert.assertThat(candidate.getDomain().toString(), Matchers.is(ROOT_URL_0_DOMAIN)); + Assert.assertThat(candidate.getRefererUrl(), Matchers.nullValue()); + Assert.assertThat(candidate.getMetadata(), Matchers.is(Optional.empty())); + Mockito.verify(statsCounterMock).recordRemainingCrawlCandidate(); + } - Assert.assertFalse(crawlFrontier.hasNextCandidate()); + @Test + public void testHasNextCandidateWhenCandidateQueueIsEmpty() { + Assert.assertThat(crawlFrontier.hasNextCandidate(), Matchers.is(false)); } @Test - public void testDisabledOffsiteRequestFiltering() { - Mockito.when(configMock.isOffsiteRequestFilterEnabled()).thenReturn(false); - Mockito.when(configMock.getCrawlSeeds()) - .thenReturn(Sets.newHashSet(OFFSITE_URL_CRAWL_REQUEST)); + public void testHashNextCandidateWhenCandidateQueueIsNotEmpty() { + crawlFrontier.feedRequest(ROOT_URL_0_CRAWL_REQUEST, true); - CrawlFrontier crawlFrontier = new CrawlFrontier(configMock, statsCounterMock); + Assert.assertThat(crawlFrontier.hasNextCandidate(), Matchers.is(true)); + } - Assert.assertTrue(crawlFrontier.hasNextCandidate()); - Assert.assertEquals(OFFSITE_URL, - crawlFrontier.getNextCandidate().getRequestUrl()); + @Test + public void testGetNextCandidateWhenCandidateQueueIsEmpty() { + Assert.assertThat(crawlFrontier.getNextCandidate(), Matchers.nullValue()); } @Test public void testGetNextCandidateWhenUsingBreadthFirstCrawlStrategy() { - CrawlFrontier crawlFrontier = new CrawlFrontier(configMock, statsCounterMock); + crawlFrontier.feedRequest(ROOT_URL_0_CRAWL_REQUEST, true); + crawlFrontier.feedRequest(ROOT_URL_1_CRAWL_REQUEST, true); CrawlCandidate nextCandidate = crawlFrontier.getNextCandidate(); - Assert.assertEquals(ROOT_URL_1, nextCandidate.getRequestUrl()); - Assert.assertEquals(ROOT_URL_CRAWL_DEPTH, nextCandidate.getCrawlDepth()); - Assert.assertEquals(ROOT_URL_1_PRIORITY, nextCandidate.getPriority()); + Assert.assertThat(nextCandidate.getRequestUrl(), Matchers.is(ROOT_URL_1)); + Assert.assertThat(nextCandidate.getCrawlDepth(), Matchers.is(ROOT_URL_CRAWL_DEPTH)); + Assert.assertThat(nextCandidate.getPriority(), Matchers.is(ROOT_URL_1_PRIORITY)); crawlFrontier.feedRequest(CHILD_URL_2_CRAWL_REQUEST, false); nextCandidate = crawlFrontier.getNextCandidate(); - Assert.assertEquals(ROOT_URL_0, nextCandidate.getRequestUrl()); - Assert.assertEquals(ROOT_URL_CRAWL_DEPTH, nextCandidate.getCrawlDepth()); - Assert.assertEquals(ROOT_URL_0_PRIORITY, nextCandidate.getPriority()); + Assert.assertThat(nextCandidate.getRequestUrl(), Matchers.is(ROOT_URL_0)); + Assert.assertThat(nextCandidate.getCrawlDepth(), Matchers.is(ROOT_URL_CRAWL_DEPTH)); + Assert.assertThat(nextCandidate.getPriority(), Matchers.is(ROOT_URL_0_PRIORITY)); crawlFrontier.feedRequest(CHILD_URL_0_CRAWL_REQUEST, false); crawlFrontier.feedRequest(CHILD_URL_1_CRAWL_REQUEST, false); nextCandidate = crawlFrontier.getNextCandidate(); - Assert.assertEquals(CHILD_URL_2, nextCandidate.getRequestUrl()); - Assert.assertEquals(CHILD_URL_CRAWL_DEPTH, nextCandidate.getCrawlDepth()); - Assert.assertEquals(CHILD_URL_2_PRIORITY, nextCandidate.getPriority()); + Assert.assertThat(nextCandidate.getRequestUrl(), Matchers.is(CHILD_URL_2)); + Assert.assertThat(nextCandidate.getCrawlDepth(), Matchers.is(CHILD_URL_CRAWL_DEPTH)); + Assert.assertThat(nextCandidate.getPriority(), Matchers.is(CHILD_URL_2_PRIORITY)); // A priority queue doesn't ensure FIFO order when elements have the same depth and priority nextCandidate = crawlFrontier.getNextCandidate(); int previousChildCandidatePriority = nextCandidate.getPriority(); - Assert.assertTrue(nextCandidate.getRequestUrl().getPath().contains(CHILD_URL_PATH)); - Assert.assertEquals(CHILD_URL_CRAWL_DEPTH, nextCandidate.getCrawlDepth()); + Assert.assertThat(nextCandidate.getRequestUrl().getPath().contains(CHILD_URL_PATH), + Matchers.is(true)); + Assert.assertThat(nextCandidate.getCrawlDepth(), Matchers.is(CHILD_URL_CRAWL_DEPTH)); nextCandidate = crawlFrontier.getNextCandidate(); - Assert.assertTrue(nextCandidate.getRequestUrl().getPath().contains(CHILD_URL_PATH)); - Assert.assertEquals(CHILD_URL_CRAWL_DEPTH, nextCandidate.getCrawlDepth()); - Assert.assertEquals(previousChildCandidatePriority, nextCandidate.getPriority()); + Assert.assertThat(nextCandidate.getRequestUrl().getPath().contains(CHILD_URL_PATH), + Matchers.is(true)); + Assert.assertThat(nextCandidate.getCrawlDepth(), Matchers.is(CHILD_URL_CRAWL_DEPTH)); + Assert.assertThat(previousChildCandidatePriority, Matchers.is(nextCandidate.getPriority())); - Assert.assertFalse(crawlFrontier.hasNextCandidate()); + Assert.assertThat(crawlFrontier.hasNextCandidate(), Matchers.is(false)); + Assert.assertThat(crawlFrontier.getNextCandidate(), Matchers.nullValue()); } @Test public void testGetNextCandidateWhenUsingDepthFirstCrawlStrategy() { Mockito.when(configMock.getCrawlStrategy()).thenReturn(CrawlStrategy.DEPTH_FIRST); - CrawlFrontier crawlFrontier = new CrawlFrontier(configMock, statsCounterMock); + crawlFrontier.feedRequest(ROOT_URL_0_CRAWL_REQUEST, true); + crawlFrontier.feedRequest(ROOT_URL_1_CRAWL_REQUEST, true); CrawlCandidate nextCandidate = crawlFrontier.getNextCandidate(); - Assert.assertEquals(ROOT_URL_1, nextCandidate.getRequestUrl()); - Assert.assertEquals(ROOT_URL_CRAWL_DEPTH, nextCandidate.getCrawlDepth()); - Assert.assertEquals(ROOT_URL_1_PRIORITY, nextCandidate.getPriority()); + Assert.assertThat(nextCandidate.getRequestUrl(), Matchers.is(ROOT_URL_1)); + Assert.assertThat(nextCandidate.getCrawlDepth(), Matchers.is(ROOT_URL_CRAWL_DEPTH)); + Assert.assertThat(nextCandidate.getPriority(), Matchers.is(ROOT_URL_1_PRIORITY)); crawlFrontier.feedRequest(CHILD_URL_2_CRAWL_REQUEST, false); // A priority queue doesn't ensure FIFO order when elements have the same depth and priority nextCandidate = crawlFrontier.getNextCandidate(); - Assert.assertTrue(nextCandidate.getRequestUrl().getPath().contains(CHILD_URL_PATH)); - Assert.assertEquals(CHILD_URL_CRAWL_DEPTH, nextCandidate.getCrawlDepth()); - Assert.assertEquals(CHILD_URL_2_PRIORITY, nextCandidate.getPriority()); + Assert.assertThat(nextCandidate.getRequestUrl().getPath().contains(CHILD_URL_PATH), + Matchers.is(true)); + Assert.assertThat(nextCandidate.getCrawlDepth(), Matchers.is(CHILD_URL_CRAWL_DEPTH)); + Assert.assertThat(nextCandidate.getPriority(), Matchers.is(CHILD_URL_2_PRIORITY)); nextCandidate = crawlFrontier.getNextCandidate(); - Assert.assertEquals(ROOT_URL_0, nextCandidate.getRequestUrl()); - Assert.assertEquals(ROOT_URL_CRAWL_DEPTH, nextCandidate.getCrawlDepth()); - Assert.assertEquals(ROOT_URL_0_PRIORITY, nextCandidate.getPriority()); + Assert.assertThat(nextCandidate.getRequestUrl(), Matchers.is(ROOT_URL_0)); + Assert.assertThat(nextCandidate.getCrawlDepth(), Matchers.is(ROOT_URL_CRAWL_DEPTH)); + Assert.assertThat(nextCandidate.getPriority(), Matchers.is(ROOT_URL_0_PRIORITY)); crawlFrontier.feedRequest(CHILD_URL_0_CRAWL_REQUEST, false); crawlFrontier.feedRequest(CHILD_URL_1_CRAWL_REQUEST, false); nextCandidate = crawlFrontier.getNextCandidate(); - Assert.assertEquals(CHILD_URL_0, nextCandidate.getRequestUrl()); - Assert.assertEquals(CHILD_URL_CRAWL_DEPTH, nextCandidate.getCrawlDepth()); - Assert.assertEquals(CHILD_URL_0_PRIORITY, nextCandidate.getPriority()); + Assert.assertThat(nextCandidate.getRequestUrl(), Matchers.is(CHILD_URL_0)); + Assert.assertThat(nextCandidate.getCrawlDepth(), Matchers.is(CHILD_URL_CRAWL_DEPTH)); + Assert.assertThat(nextCandidate.getPriority(), Matchers.is(CHILD_URL_0_PRIORITY)); nextCandidate = crawlFrontier.getNextCandidate(); - Assert.assertEquals(CHILD_URL_1, nextCandidate.getRequestUrl()); - Assert.assertEquals(CHILD_URL_CRAWL_DEPTH, nextCandidate.getCrawlDepth()); - Assert.assertEquals(CHILD_URL_1_PRIORITY, nextCandidate.getPriority()); - Assert.assertFalse(crawlFrontier.hasNextCandidate()); - } - - @Test - public void testCrawlDepthLimitation() { - Mockito.when(configMock.getMaximumCrawlDepth()).thenReturn(MAX_CRAWL_DEPTH); - - CrawlFrontier crawlFrontier = new CrawlFrontier(configMock, statsCounterMock); - - clearCrawlCandidateQueue(crawlFrontier); - crawlFrontier.feedRequest(CHILD_URL_0_CRAWL_REQUEST, false); - - CrawlCandidate nextCandidate = crawlFrontier.getNextCandidate(); - Assert.assertTrue(nextCandidate.getCrawlDepth() <= MAX_CRAWL_DEPTH); + Assert.assertThat(nextCandidate.getRequestUrl(), Matchers.is(CHILD_URL_1)); + Assert.assertThat(nextCandidate.getCrawlDepth(), Matchers.is(CHILD_URL_CRAWL_DEPTH)); + Assert.assertThat(nextCandidate.getPriority(), Matchers.is(CHILD_URL_1_PRIORITY)); - crawlFrontier.feedRequest(CHILD_URL_1_CRAWL_REQUEST, false); - - Assert.assertFalse(crawlFrontier.hasNextCandidate()); + Assert.assertThat(crawlFrontier.hasNextCandidate(), Matchers.is(false)); + Assert.assertThat(crawlFrontier.getNextCandidate(), Matchers.nullValue()); } @Test - public void testReset() { - CrawlFrontier crawlFrontier = new CrawlFrontier(configMock, statsCounterMock); + public void testResetWhenCandidateQueueIsNotEmpty() { + crawlFrontier.feedRequest(ROOT_URL_0_CRAWL_REQUEST, true); crawlFrontier.reset(); - // Check if only the crawl seeds are present after the reset - Assert.assertTrue(crawlFrontier.hasNextCandidate()); - Assert.assertEquals(ROOT_URL_1, crawlFrontier.getNextCandidate().getRequestUrl()); - - Assert.assertTrue(crawlFrontier.hasNextCandidate()); - Assert.assertEquals(ROOT_URL_0, crawlFrontier.getNextCandidate().getRequestUrl()); - - Assert.assertFalse(crawlFrontier.hasNextCandidate()); - } - - private static void clearCrawlCandidateQueue(final CrawlFrontier crawlFrontier) { - while (crawlFrontier.hasNextCandidate()) { - crawlFrontier.getNextCandidate(); - } + Assert.assertThat(crawlFrontier.hasNextCandidate(), Matchers.is(false)); + Assert.assertThat(crawlFrontier.getNextCandidate(), Matchers.nullValue()); } } diff --git a/src/test/java/com/github/peterbencze/serritor/internal/stats/StatsCounterTest.java b/src/test/java/com/github/peterbencze/serritor/internal/stats/StatsCounterTest.java index a853822..4a722fb 100644 --- a/src/test/java/com/github/peterbencze/serritor/internal/stats/StatsCounterTest.java +++ b/src/test/java/com/github/peterbencze/serritor/internal/stats/StatsCounterTest.java @@ -16,6 +16,7 @@ package com.github.peterbencze.serritor.internal.stats; +import org.hamcrest.Matchers; import org.junit.Assert; import org.junit.Before; import org.junit.Test; @@ -133,4 +134,39 @@ public void testRecordNetworkError() { Assert.assertEquals(processedCrawlCandidateCountBefore + 1, statsCounter.getProcessedCrawlCandidateCount()); } + + @Test + public void testReset() { + statsCounter.recordRemainingCrawlCandidate(); + statsCounter.recordRemainingCrawlCandidate(); + statsCounter.recordRemainingCrawlCandidate(); + statsCounter.recordRemainingCrawlCandidate(); + statsCounter.recordRemainingCrawlCandidate(); + statsCounter.recordRemainingCrawlCandidate(); + statsCounter.recordRemainingCrawlCandidate(); + statsCounter.recordResponseSuccess(); + statsCounter.recordPageLoadTimeout(); + statsCounter.recordRequestRedirect(); + statsCounter.recordNonHtmlResponse(); + statsCounter.recordResponseError(); + statsCounter.recordNetworkError(); + statsCounter.recordDuplicateRequest(); + statsCounter.recordOffsiteRequest(); + statsCounter.recordCrawlDepthLimitExceedingRequest(); + + statsCounter.reset(); + + Assert.assertThat(statsCounter.getRemainingCrawlCandidateCount(), Matchers.is(0)); + Assert.assertThat(statsCounter.getProcessedCrawlCandidateCount(), Matchers.is(0)); + Assert.assertThat(statsCounter.getResponseSuccessCount(), Matchers.is(0)); + Assert.assertThat(statsCounter.getPageLoadTimeoutCount(), Matchers.is(0)); + Assert.assertThat(statsCounter.getRequestRedirectCount(), Matchers.is(0)); + Assert.assertThat(statsCounter.getNonHtmlResponseCount(), Matchers.is(0)); + Assert.assertThat(statsCounter.getResponseErrorCount(), Matchers.is(0)); + Assert.assertThat(statsCounter.getNetworkErrorCount(), Matchers.is(0)); + Assert.assertThat(statsCounter.getFilteredDuplicateRequestCount(), Matchers.is(0)); + Assert.assertThat(statsCounter.getFilteredOffsiteRequestCount(), Matchers.is(0)); + Assert.assertThat(statsCounter.getFilteredCrawlDepthLimitExceedingRequestCount(), + Matchers.is(0)); + } } diff --git a/src/test/java/com/github/peterbencze/serritor/it/CrawlingIT.java b/src/test/java/com/github/peterbencze/serritor/it/CrawlingIT.java index b1fe3e0..fece580 100644 --- a/src/test/java/com/github/peterbencze/serritor/it/CrawlingIT.java +++ b/src/test/java/com/github/peterbencze/serritor/it/CrawlingIT.java @@ -16,9 +16,11 @@ package com.github.peterbencze.serritor.it; -import com.github.peterbencze.serritor.api.Crawler; import com.github.peterbencze.serritor.api.Browser; import com.github.peterbencze.serritor.api.CrawlRequest; +import com.github.peterbencze.serritor.api.CrawlRequest.CrawlRequestBuilder; +import com.github.peterbencze.serritor.api.CrawlStats; +import com.github.peterbencze.serritor.api.Crawler; import com.github.peterbencze.serritor.api.CrawlerConfiguration; import com.github.peterbencze.serritor.api.event.NonHtmlResponseEvent; import com.github.peterbencze.serritor.api.event.ResponseSuccessEvent; @@ -34,13 +36,12 @@ import net.lightbody.bmp.client.ClientUtil; import org.apache.commons.io.IOUtils; import org.apache.http.entity.ContentType; +import org.hamcrest.Matchers; import org.junit.After; import org.junit.AfterClass; import org.junit.Assert; import org.junit.BeforeClass; import org.junit.Test; -import org.openqa.selenium.htmlunit.HtmlUnitDriver; -import org.openqa.selenium.remote.BrowserType; import org.openqa.selenium.remote.CapabilityType; import org.openqa.selenium.remote.DesiredCapabilities; @@ -104,7 +105,7 @@ protected void onNonHtmlResponse(final NonHtmlResponseEvent event) { } @Test - public void testResumeState() throws IOException { + public void testResumeState() { WireMock.givenThat(WireMock.any(WireMock.urlEqualTo("/foo")) .willReturn(WireMock.ok() .withHeader("Content-Type", ContentType.TEXT_HTML.toString()))); @@ -113,8 +114,6 @@ public void testResumeState() throws IOException { .willReturn(WireMock.ok() .withHeader("Content-Type", ContentType.TEXT_HTML.toString()))); - File destinationFile = createTempFile(); - CrawlerConfiguration config = new CrawlerConfiguration.CrawlerConfigurationBuilder() .addCrawlSeed(CrawlRequest.createDefault("http://te.st/foo")) .addCrawlSeed(CrawlRequest.createDefault("http://te.st/bar")) @@ -144,6 +143,73 @@ protected void onResponseSuccess(final ResponseSuccessEvent event) { Assert.assertEquals(0, WireMock.findUnmatchedRequests().size()); } + @Test + public void testCrawlerRestartWhenStateWasRestored() { + WireMock.givenThat(WireMock.any(WireMock.urlEqualTo("/foo")) + .willReturn(WireMock.ok() + .withHeader("Content-Type", ContentType.TEXT_HTML.toString()))); + WireMock.givenThat(WireMock.any(WireMock.urlEqualTo("/bar")) + .willReturn(WireMock.ok() + .withHeader("Content-Type", ContentType.TEXT_HTML.toString()))); + + CrawlerConfiguration config = new CrawlerConfiguration.CrawlerConfigurationBuilder() + .addCrawlSeed(new CrawlRequestBuilder("http://te.st/foo").setPriority(1).build()) + .addCrawlSeed(CrawlRequest.createDefault("http://te.st/bar")) + .build(); + Crawler crawler = new Crawler(config) { + @Override + protected void onResponseSuccess(final ResponseSuccessEvent event) { + super.onResponseSuccess(event); + + stop(); + } + + }; + crawler.start(Browser.HTML_UNIT, capabilities); + + CrawlStats stats = crawler.getCrawlStats(); + Assert.assertThat(stats.getRemainingCrawlCandidateCount(), Matchers.is(1)); + Assert.assertThat(stats.getProcessedCrawlCandidateCount(), Matchers.is(1)); + Assert.assertThat(stats.getResponseSuccessCount(), Matchers.is(1)); + Assert.assertThat(stats.getPageLoadTimeoutCount(), Matchers.is(0)); + Assert.assertThat(stats.getRequestRedirectCount(), Matchers.is(0)); + Assert.assertThat(stats.getNonHtmlResponseCount(), Matchers.is(0)); + Assert.assertThat(stats.getResponseErrorCount(), Matchers.is(0)); + Assert.assertThat(stats.getNetworkErrorCount(), Matchers.is(0)); + Assert.assertThat(stats.getFilteredDuplicateRequestCount(), Matchers.is(0)); + Assert.assertThat(stats.getFilteredOffsiteRequestCount(), Matchers.is(0)); + Assert.assertThat(stats.getFilteredCrawlDepthLimitExceedingRequestCount(), Matchers.is(0)); + + WireMock.verify(1, WireMock.headRequestedFor(WireMock.urlEqualTo("/foo"))); + WireMock.verify(1, WireMock.getRequestedFor(WireMock.urlEqualTo("/foo"))); + WireMock.verify(0, WireMock.headRequestedFor(WireMock.urlEqualTo("/bar"))); + WireMock.verify(0, WireMock.getRequestedFor(WireMock.urlEqualTo("/bar"))); + + crawler = new Crawler(crawler.getState()) { + }; + crawler.start(Browser.HTML_UNIT, capabilities); + + stats = crawler.getCrawlStats(); + Assert.assertThat(stats.getRemainingCrawlCandidateCount(), Matchers.is(0)); + Assert.assertThat(stats.getProcessedCrawlCandidateCount(), Matchers.is(2)); + Assert.assertThat(stats.getResponseSuccessCount(), Matchers.is(2)); + Assert.assertThat(stats.getPageLoadTimeoutCount(), Matchers.is(0)); + Assert.assertThat(stats.getRequestRedirectCount(), Matchers.is(0)); + Assert.assertThat(stats.getNonHtmlResponseCount(), Matchers.is(0)); + Assert.assertThat(stats.getResponseErrorCount(), Matchers.is(0)); + Assert.assertThat(stats.getNetworkErrorCount(), Matchers.is(0)); + Assert.assertThat(stats.getFilteredDuplicateRequestCount(), Matchers.is(0)); + Assert.assertThat(stats.getFilteredOffsiteRequestCount(), Matchers.is(0)); + Assert.assertThat(stats.getFilteredCrawlDepthLimitExceedingRequestCount(), Matchers.is(0)); + + WireMock.verify(2, WireMock.headRequestedFor(WireMock.urlEqualTo("/foo"))); + WireMock.verify(2, WireMock.getRequestedFor(WireMock.urlEqualTo("/foo"))); + WireMock.verify(1, WireMock.headRequestedFor(WireMock.urlEqualTo("/bar"))); + WireMock.verify(1, WireMock.getRequestedFor(WireMock.urlEqualTo("/bar"))); + + Assert.assertThat(WireMock.findUnmatchedRequests().size(), Matchers.is(0)); + } + @Test public void testHttpClientCookieSynchronization() { WireMock.givenThat(WireMock.any(WireMock.urlEqualTo("/foo")) @@ -235,15 +301,6 @@ private static BrowserMobProxyServer createProxyServer(final int mockWebServerPo return server; } - private static HtmlUnitDriver createHtmlUnitDriver(final BrowserMobProxyServer server) { - DesiredCapabilities capabilities = new DesiredCapabilities(); - capabilities.setBrowserName(BrowserType.HTMLUNIT); - capabilities.setJavascriptEnabled(true); - capabilities.setCapability(CapabilityType.PROXY, ClientUtil.createSeleniumProxy(server)); - - return new HtmlUnitDriver(capabilities); - } - private static File createTempFile() throws IOException { File tempFile = File.createTempFile("tmp", null); tempFile.deleteOnExit();