From 25b297a0adfa3a9880bdf33e99e695d3cdd56e11 Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Thu, 24 Jan 2019 21:34:49 +0100 Subject: [PATCH 01/63] Update project version --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index e4280bc..7a3b0a9 100644 --- a/pom.xml +++ b/pom.xml @@ -3,7 +3,7 @@ 4.0.0 com.github.peterbencze serritor - 1.6.0 + 2.0.0 jar Serritor From f245de7220d8de219650f0b1de3394c195435e48 Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Thu, 24 Jan 2019 22:28:17 +0100 Subject: [PATCH 02/63] Disable HttpClient redirect handling --- .../peterbencze/serritor/api/BaseCrawler.java | 1079 ++++++++--------- .../peterbencze/serritor/it/SerritorIT.java | 37 + 2 files changed, 560 insertions(+), 556 deletions(-) diff --git a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java index ab349d6..36aec4f 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java +++ b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java @@ -1,556 +1,523 @@ -/* - * Copyright 2017 Peter Bencze. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package com.github.peterbencze.serritor.api; - -import com.gargoylesoftware.htmlunit.WebClient; -import com.github.peterbencze.serritor.api.CrawlRequest.CrawlRequestBuilder; -import com.github.peterbencze.serritor.api.event.CrawlEvent; -import com.github.peterbencze.serritor.api.event.NonHtmlContentEvent; -import com.github.peterbencze.serritor.api.event.PageLoadEvent; -import com.github.peterbencze.serritor.api.event.PageLoadTimeoutEvent; -import com.github.peterbencze.serritor.api.event.RequestErrorEvent; -import com.github.peterbencze.serritor.api.event.RequestRedirectEvent; -import com.github.peterbencze.serritor.internal.CookieConverter; -import com.github.peterbencze.serritor.internal.CrawlFrontier; -import com.github.peterbencze.serritor.internal.CrawlerState; -import com.github.peterbencze.serritor.internal.crawldelaymechanism.AdaptiveCrawlDelayMechanism; -import com.github.peterbencze.serritor.internal.crawldelaymechanism.CrawlDelayMechanism; -import com.github.peterbencze.serritor.internal.crawldelaymechanism.FixedCrawlDelayMechanism; -import com.github.peterbencze.serritor.internal.crawldelaymechanism.RandomCrawlDelayMechanism; -import com.github.peterbencze.serritor.internal.event.EventCallbackManager; -import java.io.File; -import java.io.IOException; -import java.io.InputStream; -import java.io.OutputStream; -import java.net.URI; -import java.nio.charset.UnsupportedCharsetException; -import java.util.List; -import java.util.concurrent.TimeUnit; -import java.util.logging.Level; -import java.util.logging.Logger; -import org.apache.commons.io.FileUtils; -import org.apache.commons.lang3.SerializationUtils; -import org.apache.commons.lang3.Validate; -import org.apache.http.Header; -import org.apache.http.HttpEntity; -import org.apache.http.HttpResponse; -import org.apache.http.ParseException; -import org.apache.http.client.methods.CloseableHttpResponse; -import org.apache.http.client.methods.HttpGet; -import org.apache.http.client.methods.HttpHead; 
-import org.apache.http.client.protocol.HttpClientContext; -import org.apache.http.client.utils.HttpClientUtils; -import org.apache.http.entity.ContentType; -import org.apache.http.impl.client.BasicCookieStore; -import org.apache.http.impl.client.CloseableHttpClient; -import org.apache.http.impl.client.HttpClientBuilder; -import org.openqa.selenium.JavascriptExecutor; -import org.openqa.selenium.TimeoutException; -import org.openqa.selenium.WebDriver; -import org.openqa.selenium.htmlunit.HtmlUnitDriver; - -/** - * Provides a skeletal implementation of a crawler to minimize the effort for users to implement - * their own. - * - * @author Peter Bencze - */ -public abstract class BaseCrawler { - - private static final Logger LOGGER = Logger.getLogger(BaseCrawler.class.getName()); - - private CrawlerConfiguration config; - private EventCallbackManager callbackManager; - private CrawlFrontier crawlFrontier; - private BasicCookieStore cookieStore; - private CloseableHttpClient httpClient; - private WebDriver webDriver; - private CrawlDelayMechanism crawlDelayMechanism; - private boolean isStopped; - private boolean isStopping; - - /** - * Base constructor which is used to configure the crawler. - * - * @param config the configuration of the crawler - */ - protected BaseCrawler(final CrawlerConfiguration config) { - this(); - - this.config = config; - } - - /** - * Base constructor which loads a previously saved state. - * - * @param inStream the input stream from which the state should be loaded - */ - protected BaseCrawler(final InputStream inStream) { - this(); - - CrawlerState state = SerializationUtils.deserialize(inStream); - config = state.getStateObject(CrawlerConfiguration.class); - crawlFrontier = state.getStateObject(CrawlFrontier.class); - } - - /** - * Private base constructor which does simple initialization. 
- */ - private BaseCrawler() { - callbackManager = new EventCallbackManager(); - callbackManager.setDefaultEventCallback(CrawlEvent.PAGE_LOAD, this::onPageLoad); - callbackManager.setDefaultEventCallback(CrawlEvent.NON_HTML_CONTENT, - this::onNonHtmlContent); - callbackManager.setDefaultEventCallback(CrawlEvent.PAGE_LOAD_TIMEOUT, - this::onPageLoadTimeout); - callbackManager.setDefaultEventCallback(CrawlEvent.REQUEST_REDIRECT, - this::onRequestRedirect); - callbackManager.setDefaultEventCallback(CrawlEvent.REQUEST_ERROR, this::onRequestError); - - isStopping = false; - isStopped = true; - } - - /** - * Starts the crawler using HtmlUnit headless browser. This method will block until the crawler - * finishes. - */ - public final void start() { - start(new HtmlUnitDriver(true)); - } - - /** - * Starts the crawler using the browser specified by the given WebDriver instance. - * This method will block until the crawler finishes. - * - * @param webDriver the WebDriver instance to control the browser - */ - public final void start(final WebDriver webDriver) { - start(webDriver, false); - } - - /** - * Performs initialization and runs the crawler. - * - * @param isResuming indicates if a previously saved state is to be resumed - */ - private void start(final WebDriver webDriver, final boolean isResuming) { - try { - Validate.validState(isStopped, "The crawler is already running."); - this.webDriver = Validate.notNull(webDriver, "The webdriver cannot be null."); - - // If the crawl delay strategy is set to adaptive, we check if the browser supports the - // Navigation Timing API or not. However HtmlUnit requires a page to be loaded first - // before executing JavaScript, so we load a blank page. 
- if (webDriver instanceof HtmlUnitDriver - && config.getCrawlDelayStrategy().equals(CrawlDelayStrategy.ADAPTIVE)) { - webDriver.get(WebClient.ABOUT_BLANK); - } - - if (!isResuming) { - crawlFrontier = new CrawlFrontier(config); - } - - cookieStore = new BasicCookieStore(); - httpClient = HttpClientBuilder.create() - .setDefaultCookieStore(cookieStore) - .useSystemProperties() - .build(); - crawlDelayMechanism = createCrawlDelayMechanism(); - isStopped = false; - - run(); - } finally { - HttpClientUtils.closeQuietly(httpClient); - - if (this.webDriver != null) { - this.webDriver.quit(); - } - - isStopping = false; - isStopped = true; - } - } - - /** - * Saves the current state of the crawler to the given output stream. - * - * @param outStream the output stream - */ - public final void saveState(final OutputStream outStream) { - Validate.validState(crawlFrontier != null, "Cannot save state at this point."); - - CrawlerState state = new CrawlerState(); - state.putStateObject(config); - state.putStateObject(crawlFrontier); - - SerializationUtils.serialize(state, outStream); - } - - /** - * Resumes the previously loaded state using HtmlUnit headless browser. This method will block - * until the crawler finishes. - */ - public final void resumeState() { - resumeState(new HtmlUnitDriver(true)); - } - - /** - * Resumes the previously loaded state using the browser specified by the given - * WebDriver instance. This method will block until the crawler finishes. - * - * @param webDriver the WebDriver instance to control the browser - */ - public final void resumeState(final WebDriver webDriver) { - Validate.validState(crawlFrontier != null, "Cannot resume state at this point."); - - start(webDriver, true); - } - - /** - * Registers an operation which is invoked when the specific event occurs and the provided - * pattern matches the request URL. 
- * - * @param event the event for which the callback should be triggered - * @param callback the pattern matching callback to invoke - */ - protected final void registerCustomEventCallback( - final CrawlEvent event, - final PatternMatchingCallback callback) { - Validate.notNull(event, "The event cannot be null."); - Validate.notNull(callback, "The callback cannot be null."); - - callbackManager.addCustomEventCallback(event, callback); - } - - /** - * Gracefully stops the crawler. - */ - protected final void stop() { - Validate.validState(!isStopped, "The crawler is not started."); - Validate.validState(!isStopping, "The crawler is already stopping."); - - // Indicate that the crawling should be stopped - isStopping = true; - } - - /** - * Feeds a crawl request to the crawler. The crawler should be running, otherwise the request - * has to be added as a crawl seed instead. - * - * @param request the crawl request - */ - protected final void crawl(final CrawlRequest request) { - Validate.validState(!isStopped, - "The crawler is not started. Maybe you meant to add this request as a crawl seed?"); - Validate.validState(!isStopping, "Cannot add request when the crawler is stopping."); - Validate.notNull(request, "The request cannot be null."); - - crawlFrontier.feedRequest(request, false); - } - - /** - * Feeds multiple crawl requests to the crawler. The crawler should be running, otherwise the - * requests have to be added as crawl seeds instead. - * - * @param requests the list of crawl requests - */ - protected final void crawl(final List requests) { - requests.forEach(this::crawl); - } - - /** - * Downloads the file specified by the URL. 
- * - * @param source the source URL - * @param destination the destination file - * - * @throws IOException if an I/O error occurs while downloading the file - */ - protected final void downloadFile(final URI source, final File destination) throws IOException { - Validate.validState(!isStopped, "Cannot download file when the crawler is not started."); - Validate.validState(!isStopping, "Cannot download file when the crawler is stopping."); - Validate.notNull(source, "The source URL cannot be null."); - Validate.notNull(destination, "The destination file cannot be null."); - - HttpGet request = new HttpGet(source); - try (CloseableHttpResponse response = httpClient.execute(request)) { - HttpEntity entity = response.getEntity(); - if (entity != null) { - FileUtils.copyInputStreamToFile(entity.getContent(), destination); - } - } - } - - /** - * Defines the workflow of the crawler. - */ - private void run() { - onStart(); - - while (!isStopping && crawlFrontier.hasNextCandidate()) { - CrawlCandidate currentCandidate = crawlFrontier.getNextCandidate(); - String candidateUrl = currentCandidate.getRequestUrl().toString(); - HttpClientContext context = HttpClientContext.create(); - CloseableHttpResponse httpHeadResponse = null; - boolean isUnsuccessfulRequest = false; - - try { - // Send an HTTP HEAD request to determine its availability and content type - httpHeadResponse = getHttpHeadResponse(candidateUrl, context); - } catch (IOException exception) { - callbackManager.call(CrawlEvent.REQUEST_ERROR, - new RequestErrorEvent(currentCandidate, exception)); - isUnsuccessfulRequest = true; - } - - if (!isUnsuccessfulRequest) { - String responseUrl = getFinalResponseUrl(context, candidateUrl); - if (responseUrl.equals(candidateUrl)) { - String responseMimeType = getResponseMimeType(httpHeadResponse); - if (responseMimeType.equals(ContentType.TEXT_HTML.getMimeType())) { - boolean isTimedOut = false; - TimeoutException timeoutException = null; - - try { - // Open URL in browser 
- webDriver.get(candidateUrl); - } catch (TimeoutException exception) { - isTimedOut = true; - timeoutException = exception; - } - - // Ensure the HTTP client and Selenium have the same state - syncHttpClientCookies(); - - if (isTimedOut) { - callbackManager.call(CrawlEvent.PAGE_LOAD_TIMEOUT, - new PageLoadTimeoutEvent(currentCandidate, timeoutException)); - } else { - String loadedPageUrl = webDriver.getCurrentUrl(); - if (!loadedPageUrl.equals(candidateUrl)) { - // Create a new crawl request for the redirected URL (JS redirect) - handleRequestRedirect(currentCandidate, loadedPageUrl); - } else { - callbackManager.call(CrawlEvent.PAGE_LOAD, - new PageLoadEvent(currentCandidate, webDriver)); - } - } - } else { - // URLs that point to non-HTML content should not be opened in the browser - callbackManager.call(CrawlEvent.NON_HTML_CONTENT, - new NonHtmlContentEvent(currentCandidate, responseMimeType)); - } - } else { - // Create a new crawl request for the redirected URL - handleRequestRedirect(currentCandidate, responseUrl); - } - } - - HttpClientUtils.closeQuietly(httpHeadResponse); - performDelay(); - } - - onStop(); - } - - /** - * Creates the crawl delay mechanism according to the configuration. - * - * @return the created crawl delay mechanism - */ - @SuppressWarnings("checkstyle:MissingSwitchDefault") - private CrawlDelayMechanism createCrawlDelayMechanism() { - switch (config.getCrawlDelayStrategy()) { - case FIXED: - return new FixedCrawlDelayMechanism(config); - case RANDOM: - return new RandomCrawlDelayMechanism(config); - case ADAPTIVE: - return new AdaptiveCrawlDelayMechanism(config, (JavascriptExecutor) webDriver); - } - - throw new IllegalArgumentException("Unsupported crawl delay strategy."); - } - - /** - * Sends an HTTP HEAD request to the given URL and returns the response. 
- * - * @param destinationUrl the destination URL - * - * @return the HTTP HEAD response - * - * @throws IOException if an error occurs while trying to fulfill the request - */ - private CloseableHttpResponse getHttpHeadResponse( - final String destinationUrl, - final HttpClientContext context) throws IOException { - HttpHead request = new HttpHead(destinationUrl); - return httpClient.execute(request, context); - } - - /** - * If the HTTP HEAD request was redirected, it returns the final redirected URL. If not, it - * returns the original URL of the candidate. - * - * @param context the current HTTP client context - * @param candidateUrl the URL of the candidate - * - * @return the final response URL - */ - private static String getFinalResponseUrl( - final HttpClientContext context, - final String candidateUrl) { - List redirectLocations = context.getRedirectLocations(); - if (redirectLocations != null) { - return redirectLocations.get(redirectLocations.size() - 1).toString(); - } - - return candidateUrl; - } - - /** - * Returns the MIME type of the HTTP HEAD response. If the Content-Type header is not present in - * the response it returns "text/plain". - * - * @param httpHeadResponse the HTTP HEAD response - * - * @return the MIME type of the response - */ - private static String getResponseMimeType(final HttpResponse httpHeadResponse) { - Header contentTypeHeader = httpHeadResponse.getFirstHeader("Content-Type"); - if (contentTypeHeader != null) { - String contentType = contentTypeHeader.getValue(); - if (contentType != null) { - try { - return ContentType.parse(contentType).getMimeType(); - } catch (ParseException | UnsupportedCharsetException exception) { - return contentType.split(";")[0].trim(); - } - } - } - - return ContentType.DEFAULT_TEXT.getMimeType(); - } - - /** - * Creates a crawl request for the redirected URL, feeds it to the crawler and calls the - * appropriate event callback. 
- * - * @param currentCrawlCandidate the current crawl candidate - * @param redirectedUrl the URL of the redirected request - */ - private void handleRequestRedirect( - final CrawlCandidate currentCrawlCandidate, - final String redirectedUrl) { - CrawlRequestBuilder builder = new CrawlRequestBuilder(redirectedUrl) - .setPriority(currentCrawlCandidate.getPriority()); - currentCrawlCandidate.getMetadata().ifPresent(builder::setMetadata); - CrawlRequest redirectedRequest = builder.build(); - - crawlFrontier.feedRequest(redirectedRequest, false); - callbackManager.call(CrawlEvent.REQUEST_REDIRECT, - new RequestRedirectEvent(currentCrawlCandidate, redirectedRequest)); - } - - /** - * Copies all the Selenium cookies for the current domain to the HTTP client cookie store. - */ - private void syncHttpClientCookies() { - webDriver.manage() - .getCookies() - .stream() - .map(CookieConverter::convertToHttpClientCookie) - .forEach(cookieStore::addCookie); - } - - /** - * Delays the next request. - */ - private void performDelay() { - try { - TimeUnit.MILLISECONDS.sleep(crawlDelayMechanism.getDelay()); - } catch (InterruptedException ex) { - Thread.currentThread().interrupt(); - isStopping = true; - } - } - - /** - * Callback which gets called when the crawler is started. - */ - protected void onStart() { - LOGGER.info("onStart"); - } - - /** - * Callback which gets called when the browser loads the page. - * - * @param event the PageLoadEvent instance - */ - protected void onPageLoad(final PageLoadEvent event) { - LOGGER.log(Level.INFO, "onPageLoad: {0}", event.getCrawlCandidate().getRequestUrl()); - } - - /** - * Callback which gets called when the content type is not HTML. - * - * @param event the NonHtmlContentEvent instance - */ - protected void onNonHtmlContent(final NonHtmlContentEvent event) { - LOGGER.log(Level.INFO, "onNonHtmlContent: {0}", event.getCrawlCandidate().getRequestUrl()); - } - - /** - * Callback which gets called when a request error occurs. 
- * - * @param event the RequestErrorEvent instance - */ - protected void onRequestError(final RequestErrorEvent event) { - LOGGER.log(Level.INFO, "onRequestError: {0}", event.getCrawlCandidate().getRequestUrl()); - } - - /** - * Callback which gets called when a request is redirected. - * - * @param event the RequestRedirectEvent instance - */ - protected void onRequestRedirect(final RequestRedirectEvent event) { - LOGGER.log(Level.INFO, "onRequestRedirect: {0} -> {1}", - new Object[]{ - event.getCrawlCandidate().getRequestUrl(), - event.getRedirectedCrawlRequest().getRequestUrl() - }); - } - - /** - * Callback which gets called when the page does not load in the browser within the timeout - * period. - * - * @param event the PageLoadTimeoutEvent instance - */ - protected void onPageLoadTimeout(final PageLoadTimeoutEvent event) { - LOGGER.log(Level.INFO, "onPageLoadTimeout: {0}", event.getCrawlCandidate().getRequestUrl()); - } - - /** - * Callback which gets called when the crawler is stopped. - */ - protected void onStop() { - LOGGER.info("onStop"); - } -} +/* + * Copyright 2017 Peter Bencze. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.github.peterbencze.serritor.api; + +import com.gargoylesoftware.htmlunit.WebClient; +import com.github.peterbencze.serritor.api.CrawlRequest.CrawlRequestBuilder; +import com.github.peterbencze.serritor.api.event.CrawlEvent; +import com.github.peterbencze.serritor.api.event.NonHtmlContentEvent; +import com.github.peterbencze.serritor.api.event.PageLoadEvent; +import com.github.peterbencze.serritor.api.event.PageLoadTimeoutEvent; +import com.github.peterbencze.serritor.api.event.RequestErrorEvent; +import com.github.peterbencze.serritor.api.event.RequestRedirectEvent; +import com.github.peterbencze.serritor.internal.CookieConverter; +import com.github.peterbencze.serritor.internal.CrawlFrontier; +import com.github.peterbencze.serritor.internal.CrawlerState; +import com.github.peterbencze.serritor.internal.crawldelaymechanism.AdaptiveCrawlDelayMechanism; +import com.github.peterbencze.serritor.internal.crawldelaymechanism.CrawlDelayMechanism; +import com.github.peterbencze.serritor.internal.crawldelaymechanism.FixedCrawlDelayMechanism; +import com.github.peterbencze.serritor.internal.crawldelaymechanism.RandomCrawlDelayMechanism; +import com.github.peterbencze.serritor.internal.event.EventCallbackManager; +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.net.URI; +import java.nio.charset.UnsupportedCharsetException; +import java.util.List; +import java.util.concurrent.TimeUnit; +import java.util.logging.Level; +import java.util.logging.Logger; +import org.apache.commons.io.FileUtils; +import org.apache.commons.lang3.SerializationUtils; +import org.apache.commons.lang3.Validate; +import org.apache.http.Header; +import org.apache.http.HttpEntity; +import org.apache.http.HttpHeaders; +import org.apache.http.HttpResponse; +import org.apache.http.ParseException; +import org.apache.http.client.methods.CloseableHttpResponse; +import org.apache.http.client.methods.HttpGet; +import 
org.apache.http.client.methods.HttpHead; +import org.apache.http.client.utils.HttpClientUtils; +import org.apache.http.entity.ContentType; +import org.apache.http.impl.client.BasicCookieStore; +import org.apache.http.impl.client.CloseableHttpClient; +import org.apache.http.impl.client.HttpClientBuilder; +import org.eclipse.jetty.http.HttpStatus; +import org.openqa.selenium.JavascriptExecutor; +import org.openqa.selenium.TimeoutException; +import org.openqa.selenium.WebDriver; +import org.openqa.selenium.htmlunit.HtmlUnitDriver; + +/** + * Provides a skeletal implementation of a crawler to minimize the effort for users to implement + * their own. + * + * @author Peter Bencze + */ +public abstract class BaseCrawler { + + private static final Logger LOGGER = Logger.getLogger(BaseCrawler.class.getName()); + + private CrawlerConfiguration config; + private EventCallbackManager callbackManager; + private CrawlFrontier crawlFrontier; + private BasicCookieStore cookieStore; + private CloseableHttpClient httpClient; + private WebDriver webDriver; + private CrawlDelayMechanism crawlDelayMechanism; + private boolean isStopped; + private boolean isStopping; + + /** + * Base constructor which is used to configure the crawler. + * + * @param config the configuration of the crawler + */ + protected BaseCrawler(final CrawlerConfiguration config) { + this(); + + this.config = config; + } + + /** + * Base constructor which loads a previously saved state. + * + * @param inStream the input stream from which the state should be loaded + */ + protected BaseCrawler(final InputStream inStream) { + this(); + + CrawlerState state = SerializationUtils.deserialize(inStream); + config = state.getStateObject(CrawlerConfiguration.class); + crawlFrontier = state.getStateObject(CrawlFrontier.class); + } + + /** + * Private base constructor which does simple initialization. 
+ */ + private BaseCrawler() { + callbackManager = new EventCallbackManager(); + callbackManager.setDefaultEventCallback(CrawlEvent.PAGE_LOAD, this::onPageLoad); + callbackManager.setDefaultEventCallback(CrawlEvent.NON_HTML_CONTENT, + this::onNonHtmlContent); + callbackManager.setDefaultEventCallback(CrawlEvent.PAGE_LOAD_TIMEOUT, + this::onPageLoadTimeout); + callbackManager.setDefaultEventCallback(CrawlEvent.REQUEST_REDIRECT, + this::onRequestRedirect); + callbackManager.setDefaultEventCallback(CrawlEvent.REQUEST_ERROR, this::onRequestError); + + isStopping = false; + isStopped = true; + } + + /** + * Starts the crawler using HtmlUnit headless browser. This method will block until the crawler + * finishes. + */ + public final void start() { + start(new HtmlUnitDriver(true)); + } + + /** + * Starts the crawler using the browser specified by the given WebDriver instance. + * This method will block until the crawler finishes. + * + * @param webDriver the WebDriver instance to control the browser + */ + public final void start(final WebDriver webDriver) { + start(webDriver, false); + } + + /** + * Performs initialization and runs the crawler. + * + * @param isResuming indicates if a previously saved state is to be resumed + */ + private void start(final WebDriver webDriver, final boolean isResuming) { + try { + Validate.validState(isStopped, "The crawler is already running."); + this.webDriver = Validate.notNull(webDriver, "The webdriver cannot be null."); + + // If the crawl delay strategy is set to adaptive, we check if the browser supports the + // Navigation Timing API or not. However HtmlUnit requires a page to be loaded first + // before executing JavaScript, so we load a blank page. 
+ if (webDriver instanceof HtmlUnitDriver + && config.getCrawlDelayStrategy().equals(CrawlDelayStrategy.ADAPTIVE)) { + webDriver.get(WebClient.ABOUT_BLANK); + } + + if (!isResuming) { + crawlFrontier = new CrawlFrontier(config); + } + + cookieStore = new BasicCookieStore(); + httpClient = HttpClientBuilder.create() + .disableRedirectHandling() + .setDefaultCookieStore(cookieStore) + .useSystemProperties() + .build(); + crawlDelayMechanism = createCrawlDelayMechanism(); + isStopped = false; + + run(); + } finally { + HttpClientUtils.closeQuietly(httpClient); + + if (this.webDriver != null) { + this.webDriver.quit(); + } + + isStopping = false; + isStopped = true; + } + } + + /** + * Saves the current state of the crawler to the given output stream. + * + * @param outStream the output stream + */ + public final void saveState(final OutputStream outStream) { + Validate.validState(crawlFrontier != null, "Cannot save state at this point."); + + CrawlerState state = new CrawlerState(); + state.putStateObject(config); + state.putStateObject(crawlFrontier); + + SerializationUtils.serialize(state, outStream); + } + + /** + * Resumes the previously loaded state using HtmlUnit headless browser. This method will block + * until the crawler finishes. + */ + public final void resumeState() { + resumeState(new HtmlUnitDriver(true)); + } + + /** + * Resumes the previously loaded state using the browser specified by the given + * WebDriver instance. This method will block until the crawler finishes. + * + * @param webDriver the WebDriver instance to control the browser + */ + public final void resumeState(final WebDriver webDriver) { + Validate.validState(crawlFrontier != null, "Cannot resume state at this point."); + + start(webDriver, true); + } + + /** + * Registers an operation which is invoked when the specific event occurs and the provided + * pattern matches the request URL. 
+ * + * @param event the event for which the callback should be triggered + * @param callback the pattern matching callback to invoke + */ + protected final void registerCustomEventCallback( + final CrawlEvent event, + final PatternMatchingCallback callback) { + Validate.notNull(event, "The event cannot be null."); + Validate.notNull(callback, "The callback cannot be null."); + + callbackManager.addCustomEventCallback(event, callback); + } + + /** + * Gracefully stops the crawler. + */ + protected final void stop() { + Validate.validState(!isStopped, "The crawler is not started."); + Validate.validState(!isStopping, "The crawler is already stopping."); + + // Indicate that the crawling should be stopped + isStopping = true; + } + + /** + * Feeds a crawl request to the crawler. The crawler should be running, otherwise the request + * has to be added as a crawl seed instead. + * + * @param request the crawl request + */ + protected final void crawl(final CrawlRequest request) { + Validate.validState(!isStopped, + "The crawler is not started. Maybe you meant to add this request as a crawl seed?"); + Validate.validState(!isStopping, "Cannot add request when the crawler is stopping."); + Validate.notNull(request, "The request cannot be null."); + + crawlFrontier.feedRequest(request, false); + } + + /** + * Feeds multiple crawl requests to the crawler. The crawler should be running, otherwise the + * requests have to be added as crawl seeds instead. + * + * @param requests the list of crawl requests + */ + protected final void crawl(final List requests) { + requests.forEach(this::crawl); + } + + /** + * Downloads the file specified by the URL. 
+ * + * @param source the source URL + * @param destination the destination file + * + * @throws IOException if an I/O error occurs while downloading the file + */ + protected final void downloadFile(final URI source, final File destination) throws IOException { + Validate.validState(!isStopped, "Cannot download file when the crawler is not started."); + Validate.validState(!isStopping, "Cannot download file when the crawler is stopping."); + Validate.notNull(source, "The source URL cannot be null."); + Validate.notNull(destination, "The destination file cannot be null."); + + HttpGet request = new HttpGet(source); + try (CloseableHttpResponse response = httpClient.execute(request)) { + HttpEntity entity = response.getEntity(); + if (entity != null) { + FileUtils.copyInputStreamToFile(entity.getContent(), destination); + } + } + } + + /** + * Defines the workflow of the crawler. + */ + private void run() { + onStart(); + + while (!isStopping && crawlFrontier.hasNextCandidate()) { + CrawlCandidate currentCandidate = crawlFrontier.getNextCandidate(); + String candidateUrl = currentCandidate.getRequestUrl().toString(); + CloseableHttpResponse httpHeadResponse = null; + boolean isUnsuccessfulRequest = false; + + try { + // Send an HTTP HEAD request to determine its availability and content type + httpHeadResponse = httpClient.execute(new HttpHead(candidateUrl)); + } catch (IOException exception) { + callbackManager.call(CrawlEvent.REQUEST_ERROR, + new RequestErrorEvent(currentCandidate, exception)); + isUnsuccessfulRequest = true; + } + + if (!isUnsuccessfulRequest) { + int statusCode = httpHeadResponse.getStatusLine().getStatusCode(); + Header locationHeader = httpHeadResponse.getFirstHeader(HttpHeaders.LOCATION); + + if (HttpStatus.isRedirection(statusCode) && locationHeader != null) { + // Create new crawl request for the redirected URL + handleRequestRedirect(currentCandidate, locationHeader.getValue()); + } else { + String responseMimeType = 
getResponseMimeType(httpHeadResponse); + if (responseMimeType.equals(ContentType.TEXT_HTML.getMimeType())) { + boolean isTimedOut = false; + TimeoutException timeoutException = null; + + try { + // Open URL in browser + webDriver.get(candidateUrl); + } catch (TimeoutException exception) { + isTimedOut = true; + timeoutException = exception; + } + + // Ensure the HTTP client and Selenium have the same state + syncHttpClientCookies(); + + if (isTimedOut) { + callbackManager.call(CrawlEvent.PAGE_LOAD_TIMEOUT, + new PageLoadTimeoutEvent(currentCandidate, timeoutException)); + } else { + String loadedPageUrl = webDriver.getCurrentUrl(); + if (!loadedPageUrl.equals(candidateUrl)) { + // Create a new crawl request for the redirected URL (JS redirect) + handleRequestRedirect(currentCandidate, loadedPageUrl); + } else { + callbackManager.call(CrawlEvent.PAGE_LOAD, + new PageLoadEvent(currentCandidate, webDriver)); + } + } + } else { + // URLs that point to non-HTML content should not be opened in the browser + callbackManager.call(CrawlEvent.NON_HTML_CONTENT, + new NonHtmlContentEvent(currentCandidate, responseMimeType)); + } + } + } + + HttpClientUtils.closeQuietly(httpHeadResponse); + performDelay(); + } + + onStop(); + } + + /** + * Creates the crawl delay mechanism according to the configuration. + * + * @return the created crawl delay mechanism + */ + @SuppressWarnings("checkstyle:MissingSwitchDefault") + private CrawlDelayMechanism createCrawlDelayMechanism() { + switch (config.getCrawlDelayStrategy()) { + case FIXED: + return new FixedCrawlDelayMechanism(config); + case RANDOM: + return new RandomCrawlDelayMechanism(config); + case ADAPTIVE: + return new AdaptiveCrawlDelayMechanism(config, (JavascriptExecutor) webDriver); + } + + throw new IllegalArgumentException("Unsupported crawl delay strategy."); + } + + /** + * Returns the MIME type of the HTTP HEAD response. If the Content-Type header is not present in + * the response it returns "text/plain". 
+ * + * @param httpHeadResponse the HTTP HEAD response + * + * @return the MIME type of the response + */ + private static String getResponseMimeType(final HttpResponse httpHeadResponse) { + Header contentTypeHeader = httpHeadResponse.getFirstHeader("Content-Type"); + if (contentTypeHeader != null) { + String contentType = contentTypeHeader.getValue(); + if (contentType != null) { + try { + return ContentType.parse(contentType).getMimeType(); + } catch (ParseException | UnsupportedCharsetException exception) { + return contentType.split(";")[0].trim(); + } + } + } + + return ContentType.DEFAULT_TEXT.getMimeType(); + } + + /** + * Creates a crawl request for the redirected URL, feeds it to the crawler and calls the + * appropriate event callback. + * + * @param currentCrawlCandidate the current crawl candidate + * @param redirectedUrl the URL of the redirected request + */ + private void handleRequestRedirect( + final CrawlCandidate currentCrawlCandidate, + final String redirectedUrl) { + CrawlRequestBuilder builder = new CrawlRequestBuilder(redirectedUrl) + .setPriority(currentCrawlCandidate.getPriority()); + currentCrawlCandidate.getMetadata().ifPresent(builder::setMetadata); + CrawlRequest redirectedRequest = builder.build(); + + crawlFrontier.feedRequest(redirectedRequest, false); + callbackManager.call(CrawlEvent.REQUEST_REDIRECT, + new RequestRedirectEvent(currentCrawlCandidate, redirectedRequest)); + } + + /** + * Copies all the Selenium cookies for the current domain to the HTTP client cookie store. + */ + private void syncHttpClientCookies() { + webDriver.manage() + .getCookies() + .stream() + .map(CookieConverter::convertToHttpClientCookie) + .forEach(cookieStore::addCookie); + } + + /** + * Delays the next request. 
+ */ + private void performDelay() { + try { + TimeUnit.MILLISECONDS.sleep(crawlDelayMechanism.getDelay()); + } catch (InterruptedException ex) { + Thread.currentThread().interrupt(); + isStopping = true; + } + } + + /** + * Callback which gets called when the crawler is started. + */ + protected void onStart() { + LOGGER.info("onStart"); + } + + /** + * Callback which gets called when the browser loads the page. + * + * @param event the PageLoadEvent instance + */ + protected void onPageLoad(final PageLoadEvent event) { + LOGGER.log(Level.INFO, "onPageLoad: {0}", event.getCrawlCandidate().getRequestUrl()); + } + + /** + * Callback which gets called when the content type is not HTML. + * + * @param event the NonHtmlContentEvent instance + */ + protected void onNonHtmlContent(final NonHtmlContentEvent event) { + LOGGER.log(Level.INFO, "onNonHtmlContent: {0}", event.getCrawlCandidate().getRequestUrl()); + } + + /** + * Callback which gets called when a request error occurs. + * + * @param event the RequestErrorEvent instance + */ + protected void onRequestError(final RequestErrorEvent event) { + LOGGER.log(Level.INFO, "onRequestError: {0}", event.getCrawlCandidate().getRequestUrl()); + } + + /** + * Callback which gets called when a request is redirected. + * + * @param event the RequestRedirectEvent instance + */ + protected void onRequestRedirect(final RequestRedirectEvent event) { + LOGGER.log(Level.INFO, "onRequestRedirect: {0} -> {1}", + new Object[]{ + event.getCrawlCandidate().getRequestUrl(), + event.getRedirectedCrawlRequest().getRequestUrl() + }); + } + + /** + * Callback which gets called when the page does not load in the browser within the timeout + * period. + * + * @param event the PageLoadTimeoutEvent instance + */ + protected void onPageLoadTimeout(final PageLoadTimeoutEvent event) { + LOGGER.log(Level.INFO, "onPageLoadTimeout: {0}", event.getCrawlCandidate().getRequestUrl()); + } + + /** + * Callback which gets called when the crawler is stopped. 
+ */ + protected void onStop() { + LOGGER.info("onStop"); + } +} diff --git a/src/test/java/com/github/peterbencze/serritor/it/SerritorIT.java b/src/test/java/com/github/peterbencze/serritor/it/SerritorIT.java index 911b556..b102275 100644 --- a/src/test/java/com/github/peterbencze/serritor/it/SerritorIT.java +++ b/src/test/java/com/github/peterbencze/serritor/it/SerritorIT.java @@ -24,6 +24,7 @@ import com.github.tomakehurst.wiremock.WireMockServer; import com.github.tomakehurst.wiremock.client.WireMock; import com.github.tomakehurst.wiremock.core.WireMockConfiguration; +import com.google.common.net.HttpHeaders; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; @@ -182,6 +183,42 @@ public void testHttpClientCookieSynchronization() { .withCookie("foo", WireMock.equalTo("bar"))); } + @Test + public void testRedirectHandling() { + WireMock.givenThat(WireMock.any(WireMock.urlEqualTo("/foo")) + .willReturn(WireMock.permanentRedirect("http://te.st/bar"))); + + WireMock.givenThat(WireMock.any(WireMock.urlEqualTo("/bar")) + .willReturn(WireMock.ok() + .withHeader(HttpHeaders.CONTENT_TYPE, ContentType.TEXT_HTML.toString()) + .withBody(""))); + + WireMock.givenThat(WireMock.any(WireMock.urlEqualTo("/baz")) + .willReturn(WireMock.ok() + .withHeader(HttpHeaders.CONTENT_TYPE, ContentType.TEXT_HTML.toString()))); + + CrawlerConfiguration config = new CrawlerConfiguration.CrawlerConfigurationBuilder() + .addCrawlSeed(CrawlRequest.createDefault("http://te.st/foo")) + .build(); + + BaseCrawler crawler = new BaseCrawler(config) { + }; + crawler.start(htmlUnitDriver); + + WireMock.verify(1, WireMock.headRequestedFor(WireMock.urlEqualTo("/foo"))); + WireMock.verify(0, WireMock.getRequestedFor(WireMock.urlEqualTo("/foo"))); + + WireMock.verify(1, WireMock.headRequestedFor(WireMock.urlEqualTo("/bar"))); + WireMock.verify(1, WireMock.getRequestedFor(WireMock.urlEqualTo("/bar"))); + + WireMock.verify(1, 
WireMock.headRequestedFor(WireMock.urlEqualTo("/baz"))); + + // Visited 2 times because of JS redirect + WireMock.verify(2, WireMock.getRequestedFor(WireMock.urlEqualTo("/baz"))); + + Assert.assertEquals(0, WireMock.findUnmatchedRequests().size()); + } + @After public void after() { WireMock.reset(); From 85905a54a169a364e8e2ff6bf065f8fe29ca1d77 Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Wed, 30 Jan 2019 23:21:50 +0100 Subject: [PATCH 03/63] Add internal proxy server to overcome Selenium limitations --- pom.xml | 11 +- .../peterbencze/serritor/api/BaseCrawler.java | 255 ++++++++++++------ .../peterbencze/serritor/api/Browser.java | 29 ++ .../serritor/api/CompleteCrawlResponse.java | 52 ++++ .../serritor/api/PartialCrawlResponse.java | 117 ++++++++ .../serritor/api/event/CrawlEvent.java | 7 +- .../serritor/api/event/NetworkErrorEvent.java | 51 ++++ .../api/event/NonHtmlContentEvent.java | 104 +++---- .../serritor/api/event/PageLoadEvent.java | 106 ++++---- .../api/event/PageLoadTimeoutEvent.java | 23 +- .../serritor/api/event/RequestErrorEvent.java | 107 ++++---- .../api/event/RequestRedirectEvent.java | 17 +- .../serritor/api/helper/UrlFinder.java | 5 +- .../serritor/internal/WebDriverFactory.java | 101 +++++++ .../serritor/api/helper/UrlFinderTest.java | 21 +- .../peterbencze/serritor/it/SerritorIT.java | 22 +- 16 files changed, 752 insertions(+), 276 deletions(-) create mode 100644 src/main/java/com/github/peterbencze/serritor/api/Browser.java create mode 100644 src/main/java/com/github/peterbencze/serritor/api/CompleteCrawlResponse.java create mode 100644 src/main/java/com/github/peterbencze/serritor/api/PartialCrawlResponse.java create mode 100644 src/main/java/com/github/peterbencze/serritor/api/event/NetworkErrorEvent.java create mode 100644 src/main/java/com/github/peterbencze/serritor/internal/WebDriverFactory.java diff --git a/pom.xml b/pom.xml index 7a3b0a9..d97fff3 100644 --- a/pom.xml +++ b/pom.xml @@ -61,6 +61,11 @@ htmlunit-driver 2.33.0 + + 
net.lightbody.bmp + browsermob-core + 2.1.5 + com.google.guava guava @@ -78,12 +83,6 @@ 2.23.0 test - - net.lightbody.bmp - browsermob-core - 2.1.5 - test - com.github.tomakehurst wiremock diff --git a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java index 36aec4f..382c0b2 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java +++ b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java @@ -19,6 +19,7 @@ import com.gargoylesoftware.htmlunit.WebClient; import com.github.peterbencze.serritor.api.CrawlRequest.CrawlRequestBuilder; import com.github.peterbencze.serritor.api.event.CrawlEvent; +import com.github.peterbencze.serritor.api.event.NetworkErrorEvent; import com.github.peterbencze.serritor.api.event.NonHtmlContentEvent; import com.github.peterbencze.serritor.api.event.PageLoadEvent; import com.github.peterbencze.serritor.api.event.PageLoadTimeoutEvent; @@ -27,6 +28,7 @@ import com.github.peterbencze.serritor.internal.CookieConverter; import com.github.peterbencze.serritor.internal.CrawlFrontier; import com.github.peterbencze.serritor.internal.CrawlerState; +import com.github.peterbencze.serritor.internal.WebDriverFactory; import com.github.peterbencze.serritor.internal.crawldelaymechanism.AdaptiveCrawlDelayMechanism; import com.github.peterbencze.serritor.internal.crawldelaymechanism.CrawlDelayMechanism; import com.github.peterbencze.serritor.internal.crawldelaymechanism.FixedCrawlDelayMechanism; @@ -36,12 +38,16 @@ import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; +import java.net.InetSocketAddress; import java.net.URI; import java.nio.charset.UnsupportedCharsetException; import java.util.List; import java.util.concurrent.TimeUnit; import java.util.logging.Level; import java.util.logging.Logger; +import net.lightbody.bmp.BrowserMobProxyServer; +import net.lightbody.bmp.client.ClientUtil; +import 
net.lightbody.bmp.core.har.HarResponse; import org.apache.commons.io.FileUtils; import org.apache.commons.lang3.SerializationUtils; import org.apache.commons.lang3.Validate; @@ -60,9 +66,12 @@ import org.apache.http.impl.client.HttpClientBuilder; import org.eclipse.jetty.http.HttpStatus; import org.openqa.selenium.JavascriptExecutor; +import org.openqa.selenium.Proxy; import org.openqa.selenium.TimeoutException; import org.openqa.selenium.WebDriver; import org.openqa.selenium.htmlunit.HtmlUnitDriver; +import org.openqa.selenium.remote.CapabilityType; +import org.openqa.selenium.remote.DesiredCapabilities; /** * Provides a skeletal implementation of a crawler to minimize the effort for users to implement @@ -79,6 +88,7 @@ public abstract class BaseCrawler { private CrawlFrontier crawlFrontier; private BasicCookieStore cookieStore; private CloseableHttpClient httpClient; + private BrowserMobProxyServer proxyServer; private WebDriver webDriver; private CrawlDelayMechanism crawlDelayMechanism; private boolean isStopped; @@ -120,6 +130,7 @@ private BaseCrawler() { this::onPageLoadTimeout); callbackManager.setDefaultEventCallback(CrawlEvent.REQUEST_REDIRECT, this::onRequestRedirect); + callbackManager.setDefaultEventCallback(CrawlEvent.NETWORK_ERROR, this::onNetworkError); callbackManager.setDefaultEventCallback(CrawlEvent.REQUEST_ERROR, this::onRequestError); isStopping = false; @@ -127,21 +138,31 @@ private BaseCrawler() { } /** - * Starts the crawler using HtmlUnit headless browser. This method will block until the crawler - * finishes. + * Starts the crawler. The crawler will use HtmlUnit headless browser to visit URLs. This method + * will block until the crawler finishes. */ public final void start() { - start(new HtmlUnitDriver(true)); + start(Browser.HTML_UNIT); } /** - * Starts the crawler using the browser specified by the given WebDriver instance. - * This method will block until the crawler finishes. + * Starts the crawler. 
The crawler will use the specified browser to visit URLs. This method + * will block until the crawler finishes. * - * @param webDriver the WebDriver instance to control the browser + * @param browser the browser type to use for crawling */ - public final void start(final WebDriver webDriver) { - start(webDriver, false); + public final void start(final Browser browser) { + start(browser, new DesiredCapabilities()); + } + + /** + * Starts the crawler. The crawler will use the specified browser to visit URLs. + * + * @param browser the type of the browser to use for crawling + * @param capabilities the browser properties + */ + public final void start(final Browser browser, final DesiredCapabilities capabilities) { + start(browser, capabilities, false); } /** @@ -149,10 +170,29 @@ public final void start(final WebDriver webDriver) { * * @param isResuming indicates if a previously saved state is to be resumed */ - private void start(final WebDriver webDriver, final boolean isResuming) { + private void start(final Browser browser, + final DesiredCapabilities capabilities, + final boolean isResuming) { try { Validate.validState(isStopped, "The crawler is already running."); - this.webDriver = Validate.notNull(webDriver, "The webdriver cannot be null."); + + DesiredCapabilities capabilitiesClone = new DesiredCapabilities(capabilities); + proxyServer = new BrowserMobProxyServer(); + + Proxy chainedProxy = (Proxy) capabilitiesClone.getCapability(CapabilityType.PROXY); + if (chainedProxy != null && chainedProxy.getHttpProxy() != null) { + String[] urlComponents = chainedProxy.getHttpProxy().split(":"); + String host = urlComponents[0]; + int port = Integer.valueOf(urlComponents[1]); + + proxyServer.setChainedProxy(new InetSocketAddress(host, port)); + } + + proxyServer.start(); + capabilitiesClone.setCapability(CapabilityType.PROXY, + ClientUtil.createSeleniumProxy(proxyServer)); + + webDriver = WebDriverFactory.createWebDriver(browser, capabilitiesClone); // If the crawl 
delay strategy is set to adaptive, we check if the browser supports the // Navigation Timing API or not. However HtmlUnit requires a page to be loaded first @@ -179,10 +219,12 @@ private void start(final WebDriver webDriver, final boolean isResuming) { } finally { HttpClientUtils.closeQuietly(httpClient); - if (this.webDriver != null) { - this.webDriver.quit(); + if (webDriver != null) { + webDriver.quit(); } + proxyServer.stop(); + isStopping = false; isStopped = true; } @@ -204,23 +246,34 @@ public final void saveState(final OutputStream outStream) { } /** - * Resumes the previously loaded state using HtmlUnit headless browser. This method will block - * until the crawler finishes. + * Resumes the previously loaded state. The crawler will use HtmlUnit headless browser to visit + * URLs. This method will block until the crawler finishes. */ public final void resumeState() { - resumeState(new HtmlUnitDriver(true)); + resumeState(Browser.HTML_UNIT); + } + + /** + * Resumes the previously loaded state. The crawler will use the specified browser to visit + * URLs. This method will block until the crawler finishes. + * + * @param browser the type of the browser to use for crawling + */ + public final void resumeState(final Browser browser) { + resumeState(browser, new DesiredCapabilities()); } /** - * Resumes the previously loaded state using the browser specified by the given - * WebDriver instance. This method will block until the crawler finishes. + * Resumes the previously loaded state. The crawler will use the specified browser to visit + * URLs. This method will block until the crawler finishes. 
* - * @param webDriver the WebDriver instance to control the browser + * @param browser the type of the browser to use for crawling + * @param capabilities the browser properties */ - public final void resumeState(final WebDriver webDriver) { + public final void resumeState(final Browser browser, final DesiredCapabilities capabilities) { Validate.validState(crawlFrontier != null, "Cannot resume state at this point."); - start(webDriver, true); + start(browser, capabilities, true); } /** @@ -304,68 +357,101 @@ protected final void downloadFile(final URI source, final File destination) thro private void run() { onStart(); + boolean shouldPerformDelay = false; + while (!isStopping && crawlFrontier.hasNextCandidate()) { + // Do not perform delay in the first iteration + if (shouldPerformDelay) { + performDelay(); + } else { + shouldPerformDelay = true; + } + CrawlCandidate currentCandidate = crawlFrontier.getNextCandidate(); String candidateUrl = currentCandidate.getRequestUrl().toString(); CloseableHttpResponse httpHeadResponse = null; - boolean isUnsuccessfulRequest = false; try { - // Send an HTTP HEAD request to determine its availability and content type - httpHeadResponse = httpClient.execute(new HttpHead(candidateUrl)); - } catch (IOException exception) { - callbackManager.call(CrawlEvent.REQUEST_ERROR, - new RequestErrorEvent(currentCandidate, exception)); - isUnsuccessfulRequest = true; - } + try { + httpHeadResponse = httpClient.execute(new HttpHead(candidateUrl)); + } catch (IOException exception) { + callbackManager.call(CrawlEvent.NETWORK_ERROR, + new NetworkErrorEvent(currentCandidate, exception.toString())); + + continue; + } - if (!isUnsuccessfulRequest) { int statusCode = httpHeadResponse.getStatusLine().getStatusCode(); Header locationHeader = httpHeadResponse.getFirstHeader(HttpHeaders.LOCATION); - if (HttpStatus.isRedirection(statusCode) && locationHeader != null) { - // Create new crawl request for the redirected URL - 
handleRequestRedirect(currentCandidate, locationHeader.getValue()); - } else { - String responseMimeType = getResponseMimeType(httpHeadResponse); - if (responseMimeType.equals(ContentType.TEXT_HTML.getMimeType())) { - boolean isTimedOut = false; - TimeoutException timeoutException = null; - - try { - // Open URL in browser - webDriver.get(candidateUrl); - } catch (TimeoutException exception) { - isTimedOut = true; - timeoutException = exception; - } - - // Ensure the HTTP client and Selenium have the same state - syncHttpClientCookies(); - - if (isTimedOut) { - callbackManager.call(CrawlEvent.PAGE_LOAD_TIMEOUT, - new PageLoadTimeoutEvent(currentCandidate, timeoutException)); - } else { - String loadedPageUrl = webDriver.getCurrentUrl(); - if (!loadedPageUrl.equals(candidateUrl)) { - // Create a new crawl request for the redirected URL (JS redirect) - handleRequestRedirect(currentCandidate, loadedPageUrl); - } else { - callbackManager.call(CrawlEvent.PAGE_LOAD, - new PageLoadEvent(currentCandidate, webDriver)); - } - } - } else { - // URLs that point to non-HTML content should not be opened in the browser - callbackManager.call(CrawlEvent.NON_HTML_CONTENT, - new NonHtmlContentEvent(currentCandidate, responseMimeType)); - } + // Create a new crawl request for the redirected URL (HTTP redirect) + handleRequestRedirect(currentCandidate, + new PartialCrawlResponse(httpHeadResponse), locationHeader.getValue()); + + continue; + } + + String mimeType = getResponseMimeType(httpHeadResponse); + if (!mimeType.equals(ContentType.TEXT_HTML.getMimeType())) { + // URLs that point to non-HTML content should not be opened in the browser + callbackManager.call(CrawlEvent.NON_HTML_CONTENT, + new NonHtmlContentEvent(currentCandidate, + new PartialCrawlResponse(httpHeadResponse))); + + continue; } + + proxyServer.newHar(); + + try { + webDriver.get(candidateUrl); + + // Ensure HTTP client and Selenium have the same cookies + syncHttpClientCookies(); + } catch (TimeoutException 
exception) { + callbackManager.call(CrawlEvent.PAGE_LOAD_TIMEOUT, + new PageLoadTimeoutEvent(currentCandidate, + new PartialCrawlResponse(httpHeadResponse))); + + continue; + } + } finally { + HttpClientUtils.closeQuietly(httpHeadResponse); + } + + HarResponse harResponse = proxyServer.getHar() + .getLog() + .getEntries() + .get(0) + .getResponse(); + if (harResponse.getError() != null) { + callbackManager.call(CrawlEvent.NETWORK_ERROR, + new NetworkErrorEvent(currentCandidate, harResponse.getError())); + + continue; } - HttpClientUtils.closeQuietly(httpHeadResponse); - performDelay(); + int statusCode = harResponse.getStatus(); + if (HttpStatus.isClientError(statusCode) || HttpStatus.isServerError(statusCode)) { + callbackManager.call(CrawlEvent.REQUEST_ERROR, + new RequestErrorEvent(currentCandidate, + new CompleteCrawlResponse(harResponse, webDriver))); + + continue; + } + + String loadedPageUrl = webDriver.getCurrentUrl(); + if (!loadedPageUrl.equals(candidateUrl)) { + // Create a new crawl request for the redirected URL (JS redirect) + handleRequestRedirect(currentCandidate, + new PartialCrawlResponse(harResponse), loadedPageUrl); + + continue; + } + + callbackManager.call(CrawlEvent.PAGE_LOAD, + new PageLoadEvent(currentCandidate, + new CompleteCrawlResponse(harResponse, webDriver))); } onStop(); @@ -418,20 +504,23 @@ private static String getResponseMimeType(final HttpResponse httpHeadResponse) { * Creates a crawl request for the redirected URL, feeds it to the crawler and calls the * appropriate event callback. 
* - * @param currentCrawlCandidate the current crawl candidate - * @param redirectedUrl the URL of the redirected request + * @param crawlCandidate the current crawl candidate + * @param partialCrawlResponse the partial crawl response + * @param redirectedUrl the URL of the redirected request */ private void handleRequestRedirect( - final CrawlCandidate currentCrawlCandidate, + final CrawlCandidate crawlCandidate, + final PartialCrawlResponse partialCrawlResponse, final String redirectedUrl) { CrawlRequestBuilder builder = new CrawlRequestBuilder(redirectedUrl) - .setPriority(currentCrawlCandidate.getPriority()); - currentCrawlCandidate.getMetadata().ifPresent(builder::setMetadata); + .setPriority(crawlCandidate.getPriority()); + crawlCandidate.getMetadata().ifPresent(builder::setMetadata); CrawlRequest redirectedRequest = builder.build(); crawlFrontier.feedRequest(redirectedRequest, false); + callbackManager.call(CrawlEvent.REQUEST_REDIRECT, - new RequestRedirectEvent(currentCrawlCandidate, redirectedRequest)); + new RequestRedirectEvent(crawlCandidate, partialCrawlResponse, redirectedRequest)); } /** @@ -483,7 +572,17 @@ protected void onNonHtmlContent(final NonHtmlContentEvent event) { } /** - * Callback which gets called when a request error occurs. + * Callback which gets called when a network error occurs. + * + * @param event the NetworkErrorEvent instance + */ + protected void onNetworkError(final NetworkErrorEvent event) { + LOGGER.log(Level.INFO, "onNetworkError: {0}", event.getErrorMessage()); + } + + /** + * Callback which gets called when a request error (an error with HTTP status code 4xx or 5xx) + * occurs. 
* * @param event the RequestErrorEvent instance */ @@ -499,8 +598,8 @@ protected void onRequestError(final RequestErrorEvent event) { protected void onRequestRedirect(final RequestRedirectEvent event) { LOGGER.log(Level.INFO, "onRequestRedirect: {0} -> {1}", new Object[]{ - event.getCrawlCandidate().getRequestUrl(), - event.getRedirectedCrawlRequest().getRequestUrl() + event.getCrawlCandidate().getRequestUrl(), + event.getRedirectedCrawlRequest().getRequestUrl() }); } diff --git a/src/main/java/com/github/peterbencze/serritor/api/Browser.java b/src/main/java/com/github/peterbencze/serritor/api/Browser.java new file mode 100644 index 0000000..912586f --- /dev/null +++ b/src/main/java/com/github/peterbencze/serritor/api/Browser.java @@ -0,0 +1,29 @@ +/* + * Copyright 2019 Peter Bencze. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.github.peterbencze.serritor.api; + +/** + * Supported browsers that can be used for crawling. + * + * @author Peter Bencze + */ +public enum Browser { + + HTML_UNIT, + CHROME, + FIREFOX +} diff --git a/src/main/java/com/github/peterbencze/serritor/api/CompleteCrawlResponse.java b/src/main/java/com/github/peterbencze/serritor/api/CompleteCrawlResponse.java new file mode 100644 index 0000000..9f0fc34 --- /dev/null +++ b/src/main/java/com/github/peterbencze/serritor/api/CompleteCrawlResponse.java @@ -0,0 +1,52 @@ +/* + * Copyright 2019 Peter Bencze. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.github.peterbencze.serritor.api; + +import net.lightbody.bmp.core.har.HarResponse; +import org.openqa.selenium.WebDriver; + +/** + * Represents a complete crawl response that provides access to the HTTP header information and the + * {@link WebDriver} instance to interact with the browser. + * + * @author Peter Bencze + */ +public final class CompleteCrawlResponse extends PartialCrawlResponse { + + private final WebDriver webDriver; + + /** + * Creates a {@link CompleteCrawlResponse} instance from an HAR capture. + * + * @param harResponse the har capture + * @param webDriver the WebDriver instance + */ + public CompleteCrawlResponse(final HarResponse harResponse, final WebDriver webDriver) { + super(harResponse); + + this.webDriver = webDriver; + } + + /** + * Returns the WebDriver instance to interact with the browser. + * + * @return the WebDriver instance + */ + public WebDriver getWebDriver() { + return webDriver; + } +} diff --git a/src/main/java/com/github/peterbencze/serritor/api/PartialCrawlResponse.java b/src/main/java/com/github/peterbencze/serritor/api/PartialCrawlResponse.java new file mode 100644 index 0000000..f53e245 --- /dev/null +++ b/src/main/java/com/github/peterbencze/serritor/api/PartialCrawlResponse.java @@ -0,0 +1,117 @@ +/* + * Copyright 2019 Peter Bencze. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.github.peterbencze.serritor.api; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Optional; +import java.util.stream.Collectors; +import net.lightbody.bmp.core.har.HarResponse; +import org.apache.http.Header; +import org.apache.http.HttpResponse; +import org.apache.http.message.BasicHeader; + +/** + * Represents a partial response that only contains HTTP header information. + * + * @author Peter Bencze + */ +public class PartialCrawlResponse { + + private final int statusCode; + private final String statusText; + private final List
headers; + + /** + * Creates a {@link PartialCrawlResponse} instance from an HTTP response message. + * + * @param httpResponse the HTTP response message + */ + public PartialCrawlResponse(final HttpResponse httpResponse) { + statusCode = httpResponse.getStatusLine().getStatusCode(); + statusText = httpResponse.getStatusLine().getReasonPhrase(); + headers = Arrays.asList(httpResponse.getAllHeaders()); + } + + /** + * Creates a {@link PartialCrawlResponse} instance from an HAR capture. + * + * @param harResponse the HAR capture + */ + public PartialCrawlResponse(final HarResponse harResponse) { + statusCode = harResponse.getStatus(); + statusText = harResponse.getStatusText(); + headers = new ArrayList<>(); + harResponse.getHeaders() + .forEach(header -> headers.add(new BasicHeader(header.getName(), + header.getValue()))); + } + + /** + * Returns the HTTP status code of the response. + * + * @return the HTTP status code + */ + public int getStatusCode() { + return statusCode; + } + + /** + * Returns the status message corresponding to the status code of the response. + * + * @return the status message + */ + public String getStatusText() { + return statusText; + } + + /** + * Returns all the headers of the response. + * + * @return all the headers + */ + public List
getAllHeaders() { + return headers; + } + + /** + * Returns all the headers with the specified name of the response. + * + * @param name the name of the headers + * + * @return all the headers with the specified name + */ + public List
getHeaders(final String name) { + return headers.stream() + .filter(header -> name.equals(header.getName())) + .collect(Collectors.toList()); + } + + /** + * Returns the first header with the specified name of the response. + * + * @param name the name of the header + * + * @return the first header with the specified name + */ + public Optional
getFirstHeader(final String name) { + return headers.stream() + .filter(header -> name.equals(header.getName())) + .findFirst(); + } +} diff --git a/src/main/java/com/github/peterbencze/serritor/api/event/CrawlEvent.java b/src/main/java/com/github/peterbencze/serritor/api/event/CrawlEvent.java index a349530..acdf56c 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/event/CrawlEvent.java +++ b/src/main/java/com/github/peterbencze/serritor/api/event/CrawlEvent.java @@ -41,7 +41,12 @@ public enum CrawlEvent { */ REQUEST_REDIRECT, /** - * Event which gets triggered when a request error occurs. + * Event which gets triggered when a network error occurs. + */ + NETWORK_ERROR, + /** + * Event which gets triggered when a request error (an error with HTTP status code 4xx or 5xx) + * occurs. */ REQUEST_ERROR } diff --git a/src/main/java/com/github/peterbencze/serritor/api/event/NetworkErrorEvent.java b/src/main/java/com/github/peterbencze/serritor/api/event/NetworkErrorEvent.java new file mode 100644 index 0000000..f6e1aff --- /dev/null +++ b/src/main/java/com/github/peterbencze/serritor/api/event/NetworkErrorEvent.java @@ -0,0 +1,51 @@ +/* + * Copyright 2019 Peter Bencze. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.github.peterbencze.serritor.api.event; + +import com.github.peterbencze.serritor.api.CrawlCandidate; +import com.github.peterbencze.serritor.internal.event.EventObject; + +/** + * Event which gets delivered when a network error occurs. + * + * @author Peter Bencze + */ +public final class NetworkErrorEvent extends EventObject { + + private final String errorMessage; + + /** + * Creates a {@link NetworkErrorEvent} instance. + * + * @param crawlCandidate the current crawl candidate + * @param errorMessage the network error message + */ + public NetworkErrorEvent(final CrawlCandidate crawlCandidate, final String errorMessage) { + super(crawlCandidate); + + this.errorMessage = errorMessage; + } + + /** + * Returns the network error message. + * + * @return the network error message + */ + public String getErrorMessage() { + return errorMessage; + } +} diff --git a/src/main/java/com/github/peterbencze/serritor/api/event/NonHtmlContentEvent.java b/src/main/java/com/github/peterbencze/serritor/api/event/NonHtmlContentEvent.java index 5877424..99b26d2 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/event/NonHtmlContentEvent.java +++ b/src/main/java/com/github/peterbencze/serritor/api/event/NonHtmlContentEvent.java @@ -1,50 +1,54 @@ -/* - * Copyright 2017 Peter Bencze. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package com.github.peterbencze.serritor.api.event; - -import com.github.peterbencze.serritor.api.CrawlCandidate; -import com.github.peterbencze.serritor.internal.event.EventObject; - -/** - * Event which gets delivered when the MIME type of the response is not "text/html". - * - * @author Peter Bencze - */ -public final class NonHtmlContentEvent extends EventObject { - - private final String mimeType; - - /** - * Creates a {@link NonHtmlContentEvent} instance. - * - * @param crawlCandidate the current crawl candidate - * @param mimeType the MIME type of the response - */ - public NonHtmlContentEvent(final CrawlCandidate crawlCandidate, final String mimeType) { - super(crawlCandidate); - this.mimeType = mimeType; - } - - /** - * Returns the MIME type of the response. - * - * @return the MIME type of the response - */ - public String getMimeType() { - return mimeType; - } -} +/* + * Copyright 2017 Peter Bencze. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.github.peterbencze.serritor.api.event; + +import com.github.peterbencze.serritor.api.CrawlCandidate; +import com.github.peterbencze.serritor.api.PartialCrawlResponse; +import com.github.peterbencze.serritor.internal.event.EventObject; + +/** + * Event which gets delivered when the MIME type of the response is not "text/html". 
+ * + * @author Peter Bencze + */ +public final class NonHtmlContentEvent extends EventObject { + + private final PartialCrawlResponse partialCrawlResponse; + + /** + * Creates a {@link NonHtmlContentEvent} instance. + * + * @param crawlCandidate the current crawl candidate + * @param partialCrawlResponse the partial crawl response + */ + public NonHtmlContentEvent( + final CrawlCandidate crawlCandidate, + final PartialCrawlResponse partialCrawlResponse) { + super(crawlCandidate); + + this.partialCrawlResponse = partialCrawlResponse; + } + + /** + * Returns the partial crawl response. + * + * @return the partial crawl response + */ + public PartialCrawlResponse getPartialCrawlResponse() { + return partialCrawlResponse; + } +} diff --git a/src/main/java/com/github/peterbencze/serritor/api/event/PageLoadEvent.java b/src/main/java/com/github/peterbencze/serritor/api/event/PageLoadEvent.java index 50abac7..4657e4f 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/event/PageLoadEvent.java +++ b/src/main/java/com/github/peterbencze/serritor/api/event/PageLoadEvent.java @@ -1,52 +1,54 @@ -/* - * Copyright 2017 Peter Bencze. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package com.github.peterbencze.serritor.api.event; - -import com.github.peterbencze.serritor.api.CrawlCandidate; -import com.github.peterbencze.serritor.internal.event.EventObject; -import org.openqa.selenium.WebDriver; - -/** - * Event which gets delivered when the browser loads the page. - * - * @author Peter Bencze - */ -public final class PageLoadEvent extends EventObject { - - private final WebDriver webDriver; - - /** - * Creates a {@link PageLoadEvent} instance. - * - * @param crawlCandidate the current crawl candidate - * @param webDriver the WebDriver to control the browser - */ - public PageLoadEvent(final CrawlCandidate crawlCandidate, final WebDriver webDriver) { - super(crawlCandidate); - - this.webDriver = webDriver; - } - - /** - * Returns the WebDriver to control the browser. - * - * @return the WebDriver to control the browser - */ - public WebDriver getWebDriver() { - return webDriver; - } -} +/* + * Copyright 2017 Peter Bencze. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.github.peterbencze.serritor.api.event; + +import com.github.peterbencze.serritor.api.CompleteCrawlResponse; +import com.github.peterbencze.serritor.api.CrawlCandidate; +import com.github.peterbencze.serritor.internal.event.EventObject; + +/** + * Event which gets delivered when the browser loads the page. 
+ * + * @author Peter Bencze + */ +public final class PageLoadEvent extends EventObject { + + private final CompleteCrawlResponse completeCrawlResponse; + + /** + * Creates a {@link PageLoadEvent} instance. + * + * @param crawlCandidate the current crawl candidate + * @param completeCrawlResponse the complete crawl response + */ + public PageLoadEvent( + final CrawlCandidate crawlCandidate, + final CompleteCrawlResponse completeCrawlResponse) { + super(crawlCandidate); + + this.completeCrawlResponse = completeCrawlResponse; + } + + /** + * Returns the complete crawl response. + * + * @return the complete crawl response + */ + public CompleteCrawlResponse getCompleteCrawlResponse() { + return completeCrawlResponse; + } +} diff --git a/src/main/java/com/github/peterbencze/serritor/api/event/PageLoadTimeoutEvent.java b/src/main/java/com/github/peterbencze/serritor/api/event/PageLoadTimeoutEvent.java index 775e01a..77b5984 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/event/PageLoadTimeoutEvent.java +++ b/src/main/java/com/github/peterbencze/serritor/api/event/PageLoadTimeoutEvent.java @@ -17,8 +17,8 @@ package com.github.peterbencze.serritor.api.event; import com.github.peterbencze.serritor.api.CrawlCandidate; +import com.github.peterbencze.serritor.api.PartialCrawlResponse; import com.github.peterbencze.serritor.internal.event.EventObject; -import org.openqa.selenium.TimeoutException; /** * Event which gets delivered when a page does not load in the browser within the timeout period. @@ -27,27 +27,28 @@ */ public final class PageLoadTimeoutEvent extends EventObject { - private final TimeoutException exception; + private final PartialCrawlResponse partialCrawlResponse; /** * Creates a {@link PageLoadTimeoutEvent} instance. 
* - * @param crawlCandidate the current crawl candidate - * @param exception the thrown exception + * @param crawlCandidate the current crawl candidate + * @param partialCrawlResponse the partial crawl response */ - public PageLoadTimeoutEvent(final CrawlCandidate crawlCandidate, - final TimeoutException exception) { + public PageLoadTimeoutEvent( + final CrawlCandidate crawlCandidate, + final PartialCrawlResponse partialCrawlResponse) { super(crawlCandidate); - this.exception = exception; + this.partialCrawlResponse = partialCrawlResponse; } /** - * Returns the thrown exception. + * Returns the partial crawl response. * - * @return the thrown exception + * @return the partial crawl response */ - public TimeoutException getException() { - return exception; + public PartialCrawlResponse getPartialCrawlResponse() { + return partialCrawlResponse; } } diff --git a/src/main/java/com/github/peterbencze/serritor/api/event/RequestErrorEvent.java b/src/main/java/com/github/peterbencze/serritor/api/event/RequestErrorEvent.java index 12bc08c..ef36537 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/event/RequestErrorEvent.java +++ b/src/main/java/com/github/peterbencze/serritor/api/event/RequestErrorEvent.java @@ -1,52 +1,55 @@ -/* - * Copyright 2017 Peter Bencze. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package com.github.peterbencze.serritor.api.event; - -import com.github.peterbencze.serritor.api.CrawlCandidate; -import com.github.peterbencze.serritor.internal.event.EventObject; -import java.io.IOException; - -/** - * Event which gets delivered when a request error occurs. - * - * @author Peter Bencze - */ -public final class RequestErrorEvent extends EventObject { - - private final IOException exception; - - /** - * Creates a {@link RequestErrorEvent} instance. - * - * @param crawlCandidate the current crawl candidate - * @param exception the thrown exception - */ - public RequestErrorEvent(final CrawlCandidate crawlCandidate, final IOException exception) { - super(crawlCandidate); - - this.exception = exception; - } - - /** - * Returns the thrown exception. - * - * @return the thrown exception - */ - public IOException getException() { - return exception; - } -} +/* + * Copyright 2017 Peter Bencze. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.github.peterbencze.serritor.api.event; + +import com.github.peterbencze.serritor.api.CompleteCrawlResponse; +import com.github.peterbencze.serritor.api.CrawlCandidate; +import com.github.peterbencze.serritor.internal.event.EventObject; + +/** + * Event which gets delivered when a request error (an error with HTTP status code 4xx or 5xx) + * occurs. 
+ * + * @author Peter Bencze + */ +public final class RequestErrorEvent extends EventObject { + + private final CompleteCrawlResponse completeCrawlResponse; + + /** + * Creates a {@link RequestErrorEvent} instance. + * + * @param crawlCandidate the current crawl candidate + * @param completeCrawlResponse the complete crawl response + */ + public RequestErrorEvent( + final CrawlCandidate crawlCandidate, + final CompleteCrawlResponse completeCrawlResponse) { + super(crawlCandidate); + + this.completeCrawlResponse = completeCrawlResponse; + } + + /** + * Returns the complete crawl response. + * + * @return the complete crawl response + */ + public CompleteCrawlResponse getCompleteCrawlResponse() { + return completeCrawlResponse; + } +} diff --git a/src/main/java/com/github/peterbencze/serritor/api/event/RequestRedirectEvent.java b/src/main/java/com/github/peterbencze/serritor/api/event/RequestRedirectEvent.java index f321bc3..98e68cd 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/event/RequestRedirectEvent.java +++ b/src/main/java/com/github/peterbencze/serritor/api/event/RequestRedirectEvent.java @@ -18,6 +18,7 @@ import com.github.peterbencze.serritor.api.CrawlCandidate; import com.github.peterbencze.serritor.api.CrawlRequest; +import com.github.peterbencze.serritor.api.PartialCrawlResponse; import com.github.peterbencze.serritor.internal.event.EventObject; /** @@ -27,21 +28,35 @@ */ public final class RequestRedirectEvent extends EventObject { + private final PartialCrawlResponse partialCrawlResponse; private final CrawlRequest redirectedCrawlRequest; /** * Creates a {@link RequestRedirectEvent} instance. 
* * @param crawlCandidate the current crawl candidate + * @param partialCrawlResponse the partial crawl response * @param redirectedCrawlRequest the crawl request for the redirected URL */ - public RequestRedirectEvent(final CrawlCandidate crawlCandidate, + public RequestRedirectEvent( + final CrawlCandidate crawlCandidate, + final PartialCrawlResponse partialCrawlResponse, final CrawlRequest redirectedCrawlRequest) { super(crawlCandidate); + this.partialCrawlResponse = partialCrawlResponse; this.redirectedCrawlRequest = redirectedCrawlRequest; } + /** + * Returns the partial crawl response. + * + * @return the partial crawl response + */ + public PartialCrawlResponse getPartialCrawlResponse() { + return partialCrawlResponse; + } + /** * Returns the crawl request for the redirected URL. * diff --git a/src/main/java/com/github/peterbencze/serritor/api/helper/UrlFinder.java b/src/main/java/com/github/peterbencze/serritor/api/helper/UrlFinder.java index e3658f7..5dc8786 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/helper/UrlFinder.java +++ b/src/main/java/com/github/peterbencze/serritor/api/helper/UrlFinder.java @@ -75,7 +75,7 @@ public List findUrlsInPage(final PageLoadEvent event) { // Find elements using the specified locating mechanisms Set extractedElements = locatingMechanisms.stream() - .map(event.getWebDriver()::findElements) + .map(event.getCompleteCrawlResponse().getWebDriver()::findElements) .flatMap(List::stream) .collect(Collectors.toSet()); @@ -89,8 +89,7 @@ public List findUrlsInPage(final PageLoadEvent event) { .forEach(foundUrls::add); }); - return foundUrls.stream() - .collect(Collectors.toList()); + return new ArrayList<>(foundUrls); } /** diff --git a/src/main/java/com/github/peterbencze/serritor/internal/WebDriverFactory.java b/src/main/java/com/github/peterbencze/serritor/internal/WebDriverFactory.java new file mode 100644 index 0000000..4f41e62 --- /dev/null +++ 
b/src/main/java/com/github/peterbencze/serritor/internal/WebDriverFactory.java @@ -0,0 +1,101 @@ +/* + * Copyright 2019 Peter Bencze. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.github.peterbencze.serritor.internal; + +import com.github.peterbencze.serritor.api.Browser; +import org.openqa.selenium.Capabilities; +import org.openqa.selenium.WebDriver; +import org.openqa.selenium.chrome.ChromeDriver; +import org.openqa.selenium.chrome.ChromeOptions; +import org.openqa.selenium.firefox.FirefoxDriver; +import org.openqa.selenium.firefox.FirefoxOptions; +import org.openqa.selenium.htmlunit.HtmlUnitDriver; +import org.openqa.selenium.remote.DesiredCapabilities; + +/** + * Provides preconfigured {@link WebDriver} instances. + * + * @author Peter Bencze + */ +public final class WebDriverFactory { + + /** + * Creates the specific WebDriver instance with the provided properties. 
+ * + * @param browser the type of the browser + * @param capabilities the browser properties + * + * @return the preconfigured WebDriver instance + */ + @SuppressWarnings("checkstyle:MissingSwitchDefault") + public static WebDriver createWebDriver(final Browser browser, + final Capabilities capabilities) { + switch (browser) { + case HTML_UNIT: + return createHtmlUnitDriver(capabilities); + case CHROME: + return createChromeDriver(capabilities); + case FIREFOX: + return createFirefoxDriver(capabilities); + } + + throw new IllegalArgumentException("Unsupported browser."); + } + + /** + * Creates a HtmlUnitDriver instance with the provided properties. + * + * @param extraCapabilities the browser properties + * + * @return the preconfigured HtmlUnitDriver instance + */ + private static HtmlUnitDriver createHtmlUnitDriver(final Capabilities extraCapabilities) { + DesiredCapabilities capabilities = DesiredCapabilities.htmlUnit(); + capabilities.merge(extraCapabilities); + capabilities.setJavascriptEnabled(true); + + return new HtmlUnitDriver(capabilities); + } + + /** + * Creates a ChromeDriver instance with the provided properties. + * + * @param extraCapabilities the browser properties + * + * @return the preconfigured ChromeDriver instance + */ + private static ChromeDriver createChromeDriver(final Capabilities extraCapabilities) { + ChromeOptions options = new ChromeOptions(); + options.merge(extraCapabilities); + + return new ChromeDriver(options); + } + + /** + * Creates a FirefoxDriver instance with the provided properties. 
+ * + * @param extraCapabilities the browser properties + * + * @return the preconfigured FirefoxDriver instance + */ + private static FirefoxDriver createFirefoxDriver(final Capabilities extraCapabilities) { + FirefoxOptions options = new FirefoxOptions(); + options.merge(extraCapabilities); + + return new FirefoxDriver(options); + } +} diff --git a/src/test/java/com/github/peterbencze/serritor/api/helper/UrlFinderTest.java b/src/test/java/com/github/peterbencze/serritor/api/helper/UrlFinderTest.java index d33da36..09c3296 100644 --- a/src/test/java/com/github/peterbencze/serritor/api/helper/UrlFinderTest.java +++ b/src/test/java/com/github/peterbencze/serritor/api/helper/UrlFinderTest.java @@ -16,6 +16,7 @@ package com.github.peterbencze.serritor.api.helper; +import com.github.peterbencze.serritor.api.CompleteCrawlResponse; import com.github.peterbencze.serritor.api.event.PageLoadEvent; import com.github.peterbencze.serritor.api.helper.UrlFinder.UrlFinderBuilder; import java.util.Arrays; @@ -43,30 +44,28 @@ public final class UrlFinderTest { private static final String INVALID_URL = "invalid-url"; private static final String URL_WITH_INVALID_DOMAIN = "http://invalid.domain"; - private WebDriver mockedDriver; private PageLoadEvent mockedEvent; - private WebElement mockedElementWithValidUrl; - private WebElement mockedElementWithInvalidUrlFormat; - private WebElement mockedElementWithInvalidDomain; private UrlFinder urlFinder; @Before public void initialize() { - mockedEvent = Mockito.mock(PageLoadEvent.class); + WebDriver mockedDriver = Mockito.mock(WebDriver.class); + + CompleteCrawlResponse mockedCrawlResponse = Mockito.mock(CompleteCrawlResponse.class); + Mockito.when(mockedCrawlResponse.getWebDriver()).thenReturn(mockedDriver); - mockedDriver = Mockito.mock(WebDriver.class); - Mockito.when(mockedEvent.getWebDriver()) - .thenReturn(mockedDriver); + mockedEvent = Mockito.mock(PageLoadEvent.class); + 
Mockito.when(mockedEvent.getCompleteCrawlResponse()).thenReturn(mockedCrawlResponse); - mockedElementWithValidUrl = Mockito.mock(WebElement.class); + WebElement mockedElementWithValidUrl = Mockito.mock(WebElement.class); Mockito.when(mockedElementWithValidUrl.getAttribute(Mockito.eq(ATTRIBUTE))) .thenReturn(VALID_URL); - mockedElementWithInvalidUrlFormat = Mockito.mock(WebElement.class); + WebElement mockedElementWithInvalidUrlFormat = Mockito.mock(WebElement.class); Mockito.when(mockedElementWithInvalidUrlFormat.getAttribute(Mockito.eq(ATTRIBUTE))) .thenReturn(INVALID_URL); - mockedElementWithInvalidDomain = Mockito.mock(WebElement.class); + WebElement mockedElementWithInvalidDomain = Mockito.mock(WebElement.class); Mockito.when(mockedElementWithInvalidDomain.getAttribute(Mockito.eq(ATTRIBUTE))) .thenReturn(URL_WITH_INVALID_DOMAIN); diff --git a/src/test/java/com/github/peterbencze/serritor/it/SerritorIT.java b/src/test/java/com/github/peterbencze/serritor/it/SerritorIT.java index b102275..3b61ac3 100644 --- a/src/test/java/com/github/peterbencze/serritor/it/SerritorIT.java +++ b/src/test/java/com/github/peterbencze/serritor/it/SerritorIT.java @@ -17,6 +17,7 @@ package com.github.peterbencze.serritor.it; import com.github.peterbencze.serritor.api.BaseCrawler; +import com.github.peterbencze.serritor.api.Browser; import com.github.peterbencze.serritor.api.CrawlRequest; import com.github.peterbencze.serritor.api.CrawlerConfiguration; import com.github.peterbencze.serritor.api.event.NonHtmlContentEvent; @@ -39,7 +40,6 @@ import org.junit.After; import org.junit.AfterClass; import org.junit.Assert; -import org.junit.Before; import org.junit.BeforeClass; import org.junit.Test; import org.openqa.selenium.htmlunit.HtmlUnitDriver; @@ -57,7 +57,7 @@ public class SerritorIT { private static WireMockServer mockServer; private static BrowserMobProxyServer proxyServer; - private HtmlUnitDriver htmlUnitDriver; + private static DesiredCapabilities capabilities; @BeforeClass 
public static void beforeClass() { @@ -66,11 +66,10 @@ public static void beforeClass() { proxyServer = createProxyServer(mockServer.port()); System.setProperty("http.proxyHost", "localhost"); System.setProperty("http.proxyPort", String.valueOf(proxyServer.getPort())); - } - @Before - public void before() { - htmlUnitDriver = createHtmlUnitDriver(proxyServer); + capabilities = new DesiredCapabilities(); + capabilities.setCapability(CapabilityType.PROXY, + ClientUtil.createSeleniumProxy(proxyServer)); } @Test @@ -98,7 +97,7 @@ protected void onNonHtmlContent(final NonHtmlContentEvent event) { } } }; - crawler.start(htmlUnitDriver); + crawler.start(Browser.HTML_UNIT, capabilities); WireMock.verify(1, WireMock.headRequestedFor(WireMock.urlEqualTo("/foo"))); WireMock.verify(1, WireMock.getRequestedFor(WireMock.urlEqualTo("/foo"))); @@ -143,10 +142,11 @@ protected void onPageLoad(final PageLoadEvent event) { } }; - crawler.start(htmlUnitDriver); + crawler.start(Browser.HTML_UNIT, capabilities); + crawler = new BaseCrawler(new FileInputStream(destinationFile)) { }; - crawler.resumeState(createHtmlUnitDriver(proxyServer)); + crawler.resumeState(Browser.HTML_UNIT, capabilities); WireMock.verify(1, WireMock.headRequestedFor(WireMock.urlEqualTo("/foo"))); WireMock.verify(1, WireMock.getRequestedFor(WireMock.urlEqualTo("/foo"))); @@ -177,7 +177,7 @@ public void testHttpClientCookieSynchronization() { BaseCrawler crawler = new BaseCrawler(config) { }; - crawler.start(htmlUnitDriver); + crawler.start(Browser.HTML_UNIT, capabilities); WireMock.verify(WireMock.headRequestedFor(WireMock.urlEqualTo("/bar")) .withCookie("foo", WireMock.equalTo("bar"))); @@ -203,7 +203,7 @@ public void testRedirectHandling() { BaseCrawler crawler = new BaseCrawler(config) { }; - crawler.start(htmlUnitDriver); + crawler.start(Browser.HTML_UNIT, capabilities); WireMock.verify(1, WireMock.headRequestedFor(WireMock.urlEqualTo("/foo"))); WireMock.verify(0, 
WireMock.getRequestedFor(WireMock.urlEqualTo("/foo"))); From 9005cac60ff543ded8002c2f884f7f4c8f72fc18 Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Thu, 31 Jan 2019 20:38:35 +0100 Subject: [PATCH 04/63] Change the input parameter of UrlFinder --- .../serritor/api/helper/UrlFinder.java | 8 ++++---- .../serritor/api/helper/UrlFinderTest.java | 20 ++++++++----------- 2 files changed, 12 insertions(+), 16 deletions(-) diff --git a/src/main/java/com/github/peterbencze/serritor/api/helper/UrlFinder.java b/src/main/java/com/github/peterbencze/serritor/api/helper/UrlFinder.java index 5dc8786..1d57e7b 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/helper/UrlFinder.java +++ b/src/main/java/com/github/peterbencze/serritor/api/helper/UrlFinder.java @@ -16,7 +16,7 @@ package com.github.peterbencze.serritor.api.helper; -import com.github.peterbencze.serritor.api.event.PageLoadEvent; +import com.github.peterbencze.serritor.api.CompleteCrawlResponse; import com.google.common.collect.Sets; import com.google.common.net.InternetDomainName; import java.net.URI; @@ -66,16 +66,16 @@ public static UrlFinder createDefault() { /** * Returns a list of validated URLs found in the page's HTML source. 
* - * @param event the PageLoadEvent instance + * @param completeCrawlResponse the complete crawl response * * @return the list of found URLs */ - public List findUrlsInPage(final PageLoadEvent event) { + public List findUrlsInPage(final CompleteCrawlResponse completeCrawlResponse) { Set foundUrls = new HashSet<>(); // Find elements using the specified locating mechanisms Set extractedElements = locatingMechanisms.stream() - .map(event.getCompleteCrawlResponse().getWebDriver()::findElements) + .map(completeCrawlResponse.getWebDriver()::findElements) .flatMap(List::stream) .collect(Collectors.toSet()); diff --git a/src/test/java/com/github/peterbencze/serritor/api/helper/UrlFinderTest.java b/src/test/java/com/github/peterbencze/serritor/api/helper/UrlFinderTest.java index 09c3296..3bc144b 100644 --- a/src/test/java/com/github/peterbencze/serritor/api/helper/UrlFinderTest.java +++ b/src/test/java/com/github/peterbencze/serritor/api/helper/UrlFinderTest.java @@ -17,7 +17,6 @@ package com.github.peterbencze.serritor.api.helper; import com.github.peterbencze.serritor.api.CompleteCrawlResponse; -import com.github.peterbencze.serritor.api.event.PageLoadEvent; import com.github.peterbencze.serritor.api.helper.UrlFinder.UrlFinderBuilder; import java.util.Arrays; import java.util.List; @@ -44,19 +43,13 @@ public final class UrlFinderTest { private static final String INVALID_URL = "invalid-url"; private static final String URL_WITH_INVALID_DOMAIN = "http://invalid.domain"; - private PageLoadEvent mockedEvent; + private CompleteCrawlResponse mockedCrawlResponse; private UrlFinder urlFinder; @Before public void initialize() { WebDriver mockedDriver = Mockito.mock(WebDriver.class); - CompleteCrawlResponse mockedCrawlResponse = Mockito.mock(CompleteCrawlResponse.class); - Mockito.when(mockedCrawlResponse.getWebDriver()).thenReturn(mockedDriver); - - mockedEvent = Mockito.mock(PageLoadEvent.class); - 
Mockito.when(mockedEvent.getCompleteCrawlResponse()).thenReturn(mockedCrawlResponse); - WebElement mockedElementWithValidUrl = Mockito.mock(WebElement.class); Mockito.when(mockedElementWithValidUrl.getAttribute(Mockito.eq(ATTRIBUTE))) .thenReturn(VALID_URL); @@ -69,17 +62,20 @@ public void initialize() { Mockito.when(mockedElementWithInvalidDomain.getAttribute(Mockito.eq(ATTRIBUTE))) .thenReturn(URL_WITH_INVALID_DOMAIN); - List elementList - = Arrays.asList(mockedElementWithValidUrl, mockedElementWithInvalidUrlFormat, - mockedElementWithInvalidDomain); + List elementList = Arrays.asList(mockedElementWithValidUrl, + mockedElementWithInvalidUrlFormat, mockedElementWithInvalidDomain); Mockito.when(mockedDriver.findElements(By.tagName(TAG_NAME))) .thenReturn(elementList); + mockedCrawlResponse = Mockito.mock(CompleteCrawlResponse.class); + Mockito.when(mockedCrawlResponse.getWebDriver()).thenReturn(mockedDriver); + urlFinder = new UrlFinderBuilder(URL_PATTERN).build(); } @Test public void testFindUrlsInPage() { - Assert.assertEquals(Arrays.asList(VALID_URL), urlFinder.findUrlsInPage(mockedEvent)); + Assert.assertEquals(Arrays.asList(VALID_URL), + urlFinder.findUrlsInPage(mockedCrawlResponse)); } } From 5cee9fe2803caa376b0e8b8fae3c6cf59e486200 Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Thu, 31 Jan 2019 20:55:47 +0100 Subject: [PATCH 05/63] Maximize browser window on start --- .../java/com/github/peterbencze/serritor/api/BaseCrawler.java | 1 + 1 file changed, 1 insertion(+) diff --git a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java index 382c0b2..8b7e156 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java +++ b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java @@ -193,6 +193,7 @@ private void start(final Browser browser, ClientUtil.createSeleniumProxy(proxyServer)); webDriver = WebDriverFactory.createWebDriver(browser, 
capabilitiesClone); + webDriver.manage().window().maximize(); // If the crawl delay strategy is set to adaptive, we check if the browser supports the // Navigation Timing API or not. However HtmlUnit requires a page to be loaded first From 16468a93d9373596e4256f2f3cc86a5ba062eca2 Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Fri, 1 Feb 2019 23:35:31 +0100 Subject: [PATCH 06/63] Add slash to URIs with empty path --- .../peterbencze/serritor/api/CrawlRequest.java | 14 +++++++++++++- .../serritor/internal/CrawlFrontierTest.java | 8 ++++---- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/src/main/java/com/github/peterbencze/serritor/api/CrawlRequest.java b/src/main/java/com/github/peterbencze/serritor/api/CrawlRequest.java index 29d0386..936ef0d 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/CrawlRequest.java +++ b/src/main/java/com/github/peterbencze/serritor/api/CrawlRequest.java @@ -21,8 +21,11 @@ import java.io.ObjectInputStream; import java.io.Serializable; import java.net.URI; +import java.net.URISyntaxException; import java.util.Optional; +import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.Validate; +import org.apache.http.client.utils.URIBuilder; /** * Represents a crawl request that may be completed by the crawler. 
If request filtering is enabled, @@ -122,7 +125,16 @@ public static final class CrawlRequestBuilder { * @param requestUrl the request URL */ public CrawlRequestBuilder(final URI requestUrl) { - this.requestUrl = requestUrl; + if (StringUtils.isEmpty(requestUrl.getPath())) { + try { + // Define a non-empty path for the URI + this.requestUrl = new URIBuilder(requestUrl).setPath("/").build(); + } catch (URISyntaxException e) { + throw new IllegalArgumentException(e.getMessage(), e); + } + } else { + this.requestUrl = requestUrl; + } // Extract the domain from the request URL domain = InternetDomainName.from(requestUrl.getHost()); diff --git a/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java b/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java index f6e862f..5cb556c 100644 --- a/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java +++ b/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java @@ -46,10 +46,10 @@ public final class CrawlFrontierTest { // Root URLs private static final URI ROOT_URL_0 - = URI.create("http://root-url-0.com?param1=foo¶m2=bar#fragment"); + = URI.create("http://root-url-0.com/?param1=foo¶m2=bar#fragment"); private static final URI DUPLICATE_ROOT_URL_0 - = URI.create("https://root-url-0.com?param2=bar¶m1=foo"); - private static final URI ROOT_URL_1 = URI.create("http://root-url-1.com"); + = URI.create("https://root-url-0.com/?param2=bar¶m1=foo"); + private static final URI ROOT_URL_1 = URI.create("http://root-url-1.com/"); // Root URL crawl depth private static final int ROOT_URL_CRAWL_DEPTH = 0; @@ -96,7 +96,7 @@ public final class CrawlFrontierTest { = new CrawlRequestBuilder(CHILD_URL_2).setPriority(CHILD_URL_2_PRIORITY).build(); // Offsite URL - private static final URI OFFSITE_URL = URI.create("http://offsite-url.com"); + private static final URI OFFSITE_URL = URI.create("http://offsite-url.com/"); // Offsite URL priority private static 
final int OFFSITE_URL_PRIORITY = 0; From 257532910e5b3afe9b958e01071906193e9d0d89 Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Fri, 1 Feb 2019 23:38:55 +0100 Subject: [PATCH 07/63] Check JS redirect before status code --- .../peterbencze/serritor/api/BaseCrawler.java | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java index 8b7e156..65369ff 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java +++ b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java @@ -432,15 +432,6 @@ private void run() { continue; } - int statusCode = harResponse.getStatus(); - if (HttpStatus.isClientError(statusCode) || HttpStatus.isServerError(statusCode)) { - callbackManager.call(CrawlEvent.REQUEST_ERROR, - new RequestErrorEvent(currentCandidate, - new CompleteCrawlResponse(harResponse, webDriver))); - - continue; - } - String loadedPageUrl = webDriver.getCurrentUrl(); if (!loadedPageUrl.equals(candidateUrl)) { // Create a new crawl request for the redirected URL (JS redirect) @@ -450,6 +441,15 @@ private void run() { continue; } + int statusCode = harResponse.getStatus(); + if (HttpStatus.isClientError(statusCode) || HttpStatus.isServerError(statusCode)) { + callbackManager.call(CrawlEvent.REQUEST_ERROR, + new RequestErrorEvent(currentCandidate, + new CompleteCrawlResponse(harResponse, webDriver))); + + continue; + } + callbackManager.call(CrawlEvent.PAGE_LOAD, new PageLoadEvent(currentCandidate, new CompleteCrawlResponse(harResponse, webDriver))); From 3078c6cd2451579487a6c32dc7aede7d5faf52e4 Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Fri, 1 Feb 2019 23:44:36 +0100 Subject: [PATCH 08/63] Remove unnecessary path check --- .../serritor/internal/CrawlFrontier.java | 379 +++++++++--------- 1 file changed, 187 insertions(+), 192 deletions(-) diff --git 
a/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java b/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java index e08fbc8..33d8f7d 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java @@ -1,192 +1,187 @@ -/* - * Copyright 2017 Peter Bencze. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.github.peterbencze.serritor.internal; - -import com.github.peterbencze.serritor.api.CrawlCandidate; -import com.github.peterbencze.serritor.api.CrawlCandidate.CrawlCandidateBuilder; -import com.github.peterbencze.serritor.api.CrawlRequest; -import com.github.peterbencze.serritor.api.CrawlerConfiguration; -import java.io.Serializable; -import java.net.URI; -import java.util.Arrays; -import java.util.Comparator; -import java.util.HashSet; -import java.util.List; -import java.util.PriorityQueue; -import java.util.Queue; -import java.util.Set; -import java.util.function.Function; -import org.apache.commons.codec.digest.DigestUtils; - -/** - * Manages crawl requests and provides crawl candidates to the crawler. 
- * - * @author Peter Bencze - */ -public final class CrawlFrontier implements Serializable { - - private final CrawlerConfiguration config; - private final Set urlFingerprints; - private final Queue candidates; - - private CrawlCandidate currentCandidate; - - /** - * Creates a {@link CrawlFrontier} instance. - * - * @param config the crawler configuration - */ - public CrawlFrontier(final CrawlerConfiguration config) { - this.config = config; - urlFingerprints = new HashSet<>(); - candidates = createPriorityQueue(); - - config.getCrawlSeeds() - .forEach((CrawlRequest request) -> { - feedRequest(request, true); - }); - } - - /** - * Feeds a crawl request to the frontier. - * - * @param request the crawl request - * @param isCrawlSeed indicates if the request is a crawl seed - */ - public void feedRequest(final CrawlRequest request, final boolean isCrawlSeed) { - if (config.isOffsiteRequestFilteringEnabled()) { - boolean inCrawlDomain = false; - - for (CrawlDomain allowedCrawlDomain : config.getAllowedCrawlDomains()) { - if (allowedCrawlDomain.contains(request.getDomain())) { - inCrawlDomain = true; - break; - } - } - - if (!inCrawlDomain) { - return; - } - } - - if (config.isDuplicateRequestFilteringEnabled()) { - String urlFingerprint = createFingerprintForUrl(request.getRequestUrl()); - - if (urlFingerprints.contains(urlFingerprint)) { - return; - } - - urlFingerprints.add(urlFingerprint); - } - - CrawlCandidateBuilder builder = new CrawlCandidateBuilder(request); - - if (!isCrawlSeed) { - int crawlDepthLimit = config.getMaximumCrawlDepth(); - int nextCrawlDepth = currentCandidate.getCrawlDepth() + 1; - - if (crawlDepthLimit != 0 && nextCrawlDepth > crawlDepthLimit) { - return; - } - - builder = builder - .setRefererUrl(currentCandidate.getRequestUrl()) - .setCrawlDepth(nextCrawlDepth); - } - - candidates.add(builder.build()); - } - - /** - * Indicates if there are any candidates left in the queue. 
- * - * @return true if there are candidates in the queue, false otherwise - */ - public boolean hasNextCandidate() { - return !candidates.isEmpty(); - } - - /** - * Returns the next crawl candidate from the queue. - * - * @return the next crawl candidate from the queue - */ - public CrawlCandidate getNextCandidate() { - currentCandidate = candidates.poll(); - return currentCandidate; - } - - /** - * Creates the fingerprint of the given URL. If the URL contains query parameters, it sorts - * them. This way URLs with different order of query parameters get the same fingerprint. - * - * @param url the URL for which the fingerprint is created - * - * @return the fingerprint of the URL - */ - private static String createFingerprintForUrl(final URI url) { - StringBuilder truncatedUrl = new StringBuilder(url.getHost()); - - String path = url.getPath(); - if (path != null) { - truncatedUrl.append(path); - } - - String query = url.getQuery(); - if (query != null) { - truncatedUrl.append("?"); - - String[] queryParams = url.getQuery().split("&"); - - List queryParamList = Arrays.asList(queryParams); - queryParamList.stream() - .sorted(String::compareToIgnoreCase) - .forEachOrdered(truncatedUrl::append); - } - - return DigestUtils.sha256Hex(truncatedUrl.toString()); - } - - /** - * Creates a priority queue using the strategy specified in the configuration. 
- * - * @return the priority queue using the strategy specified in the configuration - */ - @SuppressWarnings("checkstyle:MissingSwitchDefault") - private PriorityQueue createPriorityQueue() { - Function crawlDepthGetter - = (Function & Serializable) CrawlCandidate::getCrawlDepth; - Function priorityGetter - = (Function & Serializable) CrawlCandidate::getPriority; - - switch (config.getCrawlStrategy()) { - case BREADTH_FIRST: - Comparator breadthFirstComparator = Comparator.comparing(crawlDepthGetter) - .thenComparing(priorityGetter, Comparator.reverseOrder()); - - return new PriorityQueue<>(breadthFirstComparator); - case DEPTH_FIRST: - Comparator depthFirstComparator - = Comparator.comparing(crawlDepthGetter, Comparator.reverseOrder()) - .thenComparing(priorityGetter, Comparator.reverseOrder()); - - return new PriorityQueue<>(depthFirstComparator); - } - - throw new IllegalArgumentException("Unsupported crawl strategy."); - } -} +/* + * Copyright 2017 Peter Bencze. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.github.peterbencze.serritor.internal; + +import com.github.peterbencze.serritor.api.CrawlCandidate; +import com.github.peterbencze.serritor.api.CrawlCandidate.CrawlCandidateBuilder; +import com.github.peterbencze.serritor.api.CrawlRequest; +import com.github.peterbencze.serritor.api.CrawlerConfiguration; +import java.io.Serializable; +import java.net.URI; +import java.util.Arrays; +import java.util.Comparator; +import java.util.HashSet; +import java.util.List; +import java.util.PriorityQueue; +import java.util.Queue; +import java.util.Set; +import java.util.function.Function; +import org.apache.commons.codec.digest.DigestUtils; + +/** + * Manages crawl requests and provides crawl candidates to the crawler. + * + * @author Peter Bencze + */ +public final class CrawlFrontier implements Serializable { + + private final CrawlerConfiguration config; + private final Set<String> urlFingerprints; + private final Queue<CrawlCandidate> candidates; + + private CrawlCandidate currentCandidate; + + /** + * Creates a {@link CrawlFrontier} instance. + * + * @param config the crawler configuration + */ + public CrawlFrontier(final CrawlerConfiguration config) { + this.config = config; + urlFingerprints = new HashSet<>(); + candidates = createPriorityQueue(); + + config.getCrawlSeeds() + .forEach((CrawlRequest request) -> { + feedRequest(request, true); + }); + } + + /** + * Feeds a crawl request to the frontier. 
+ * + * @param request the crawl request + * @param isCrawlSeed indicates if the request is a crawl seed + */ + public void feedRequest(final CrawlRequest request, final boolean isCrawlSeed) { + if (config.isOffsiteRequestFilteringEnabled()) { + boolean inCrawlDomain = false; + + for (CrawlDomain allowedCrawlDomain : config.getAllowedCrawlDomains()) { + if (allowedCrawlDomain.contains(request.getDomain())) { + inCrawlDomain = true; + break; + } + } + + if (!inCrawlDomain) { + return; + } + } + + if (config.isDuplicateRequestFilteringEnabled()) { + String urlFingerprint = createFingerprintForUrl(request.getRequestUrl()); + + if (urlFingerprints.contains(urlFingerprint)) { + return; + } + + urlFingerprints.add(urlFingerprint); + } + + CrawlCandidateBuilder builder = new CrawlCandidateBuilder(request); + + if (!isCrawlSeed) { + int crawlDepthLimit = config.getMaximumCrawlDepth(); + int nextCrawlDepth = currentCandidate.getCrawlDepth() + 1; + + if (crawlDepthLimit != 0 && nextCrawlDepth > crawlDepthLimit) { + return; + } + + builder = builder + .setRefererUrl(currentCandidate.getRequestUrl()) + .setCrawlDepth(nextCrawlDepth); + } + + candidates.add(builder.build()); + } + + /** + * Indicates if there are any candidates left in the queue. + * + * @return true if there are candidates in the queue, false otherwise + */ + public boolean hasNextCandidate() { + return !candidates.isEmpty(); + } + + /** + * Returns the next crawl candidate from the queue. + * + * @return the next crawl candidate from the queue + */ + public CrawlCandidate getNextCandidate() { + currentCandidate = candidates.poll(); + return currentCandidate; + } + + /** + * Creates the fingerprint of the given URL. If the URL contains query parameters, it sorts + * them. This way URLs with different order of query parameters get the same fingerprint. 
+ * + * @param url the URL for which the fingerprint is created + * + * @return the fingerprint of the URL + */ + private static String createFingerprintForUrl(final URI url) { + StringBuilder truncatedUrl = new StringBuilder(url.getHost()).append(url.getPath()); + + String query = url.getQuery(); + if (query != null) { + truncatedUrl.append("?"); + + String[] queryParams = url.getQuery().split("&"); + + List<String> queryParamList = Arrays.asList(queryParams); + queryParamList.stream() + .sorted(String::compareToIgnoreCase) + .forEachOrdered(truncatedUrl::append); + } + + return DigestUtils.sha256Hex(truncatedUrl.toString()); + } + + /** + * Creates a priority queue using the strategy specified in the configuration. + * + * @return the priority queue using the strategy specified in the configuration + */ + @SuppressWarnings("checkstyle:MissingSwitchDefault") + private PriorityQueue<CrawlCandidate> createPriorityQueue() { + Function<CrawlCandidate, Integer> crawlDepthGetter + = (Function<CrawlCandidate, Integer> & Serializable) CrawlCandidate::getCrawlDepth; + Function<CrawlCandidate, Integer> priorityGetter + = (Function<CrawlCandidate, Integer> & Serializable) CrawlCandidate::getPriority; + + switch (config.getCrawlStrategy()) { + case BREADTH_FIRST: + Comparator<CrawlCandidate> breadthFirstComparator = Comparator.comparing(crawlDepthGetter) + .thenComparing(priorityGetter, Comparator.reverseOrder()); + + return new PriorityQueue<>(breadthFirstComparator); + case DEPTH_FIRST: + Comparator<CrawlCandidate> depthFirstComparator + = Comparator.comparing(crawlDepthGetter, Comparator.reverseOrder()) + .thenComparing(priorityGetter, Comparator.reverseOrder()); + + return new PriorityQueue<>(depthFirstComparator); + } + + throw new IllegalArgumentException("Unsupported crawl strategy."); + } +} From dd16f59f6e8f6d5c9c42126ccdf55031803e1e71 Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Fri, 1 Feb 2019 23:47:18 +0100 Subject: [PATCH 09/63] Remove unnecessary assignment --- .../github/peterbencze/serritor/internal/CrawlFrontier.java | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git 
a/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java b/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java index 33d8f7d..78901a3 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java @@ -103,8 +103,7 @@ public void feedRequest(final CrawlRequest request, final boolean isCrawlSeed) { return; } - builder = builder - .setRefererUrl(currentCandidate.getRequestUrl()) + builder.setRefererUrl(currentCandidate.getRequestUrl()) .setCrawlDepth(nextCrawlDepth); } @@ -177,7 +176,7 @@ private PriorityQueue createPriorityQueue() { case DEPTH_FIRST: Comparator depthFirstComparator = Comparator.comparing(crawlDepthGetter, Comparator.reverseOrder()) - .thenComparing(priorityGetter, Comparator.reverseOrder()); + .thenComparing(priorityGetter, Comparator.reverseOrder()); return new PriorityQueue<>(depthFirstComparator); } From 0f822baed4b390ec1984aefce7bf05560dd62e31 Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Sun, 3 Feb 2019 11:31:59 +0100 Subject: [PATCH 10/63] Use the runtime class type of the event when registering custom callbacks --- .../peterbencze/serritor/api/BaseCrawler.java | 35 ++++++----- .../serritor/api/PatternMatchingCallback.java | 15 +++-- .../serritor/api/event/CrawlEvent.java | 52 ---------------- .../internal/event/EventCallbackManager.java | 51 ++++++++-------- .../event/EventCallbackManagerTest.java | 59 ++++++++----------- 5 files changed, 75 insertions(+), 137 deletions(-) delete mode 100644 src/main/java/com/github/peterbencze/serritor/api/event/CrawlEvent.java diff --git a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java index 65369ff..aea7c8c 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java +++ b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java @@ -18,7 +18,6 
@@ import com.gargoylesoftware.htmlunit.WebClient; import com.github.peterbencze.serritor.api.CrawlRequest.CrawlRequestBuilder; -import com.github.peterbencze.serritor.api.event.CrawlEvent; import com.github.peterbencze.serritor.api.event.NetworkErrorEvent; import com.github.peterbencze.serritor.api.event.NonHtmlContentEvent; import com.github.peterbencze.serritor.api.event.PageLoadEvent; @@ -34,6 +33,7 @@ import com.github.peterbencze.serritor.internal.crawldelaymechanism.FixedCrawlDelayMechanism; import com.github.peterbencze.serritor.internal.crawldelaymechanism.RandomCrawlDelayMechanism; import com.github.peterbencze.serritor.internal.event.EventCallbackManager; +import com.github.peterbencze.serritor.internal.event.EventObject; import java.io.File; import java.io.IOException; import java.io.InputStream; @@ -123,15 +123,14 @@ protected BaseCrawler(final InputStream inStream) { */ private BaseCrawler() { callbackManager = new EventCallbackManager(); - callbackManager.setDefaultEventCallback(CrawlEvent.PAGE_LOAD, this::onPageLoad); - callbackManager.setDefaultEventCallback(CrawlEvent.NON_HTML_CONTENT, - this::onNonHtmlContent); - callbackManager.setDefaultEventCallback(CrawlEvent.PAGE_LOAD_TIMEOUT, + callbackManager.setDefaultEventCallback(PageLoadEvent.class, this::onPageLoad); + callbackManager.setDefaultEventCallback(NonHtmlContentEvent.class, this::onNonHtmlContent); + callbackManager.setDefaultEventCallback(PageLoadTimeoutEvent.class, this::onPageLoadTimeout); - callbackManager.setDefaultEventCallback(CrawlEvent.REQUEST_REDIRECT, + callbackManager.setDefaultEventCallback(RequestRedirectEvent.class, this::onRequestRedirect); - callbackManager.setDefaultEventCallback(CrawlEvent.NETWORK_ERROR, this::onNetworkError); - callbackManager.setDefaultEventCallback(CrawlEvent.REQUEST_ERROR, this::onRequestError); + callbackManager.setDefaultEventCallback(NetworkErrorEvent.class, this::onNetworkError); + callbackManager.setDefaultEventCallback(RequestErrorEvent.class, 
this::onRequestError); isStopping = false; isStopped = true; @@ -284,9 +283,9 @@ public final void resumeState(final Browser browser, final DesiredCapabilities c * @param event the event for which the callback should be triggered * @param callback the pattern matching callback to invoke */ - protected final void registerCustomEventCallback( - final CrawlEvent event, - final PatternMatchingCallback callback) { + protected final void registerCustomEventCallback( + final Class event, + final PatternMatchingCallback callback) { Validate.notNull(event, "The event cannot be null."); Validate.notNull(callback, "The callback cannot be null."); @@ -376,7 +375,7 @@ private void run() { try { httpHeadResponse = httpClient.execute(new HttpHead(candidateUrl)); } catch (IOException exception) { - callbackManager.call(CrawlEvent.NETWORK_ERROR, + callbackManager.call(NetworkErrorEvent.class, new NetworkErrorEvent(currentCandidate, exception.toString())); continue; @@ -395,7 +394,7 @@ private void run() { String mimeType = getResponseMimeType(httpHeadResponse); if (!mimeType.equals(ContentType.TEXT_HTML.getMimeType())) { // URLs that point to non-HTML content should not be opened in the browser - callbackManager.call(CrawlEvent.NON_HTML_CONTENT, + callbackManager.call(NonHtmlContentEvent.class, new NonHtmlContentEvent(currentCandidate, new PartialCrawlResponse(httpHeadResponse))); @@ -410,7 +409,7 @@ private void run() { // Ensure HTTP client and Selenium have the same cookies syncHttpClientCookies(); } catch (TimeoutException exception) { - callbackManager.call(CrawlEvent.PAGE_LOAD_TIMEOUT, + callbackManager.call(PageLoadTimeoutEvent.class, new PageLoadTimeoutEvent(currentCandidate, new PartialCrawlResponse(httpHeadResponse))); @@ -426,7 +425,7 @@ private void run() { .get(0) .getResponse(); if (harResponse.getError() != null) { - callbackManager.call(CrawlEvent.NETWORK_ERROR, + callbackManager.call(NetworkErrorEvent.class, new NetworkErrorEvent(currentCandidate, 
harResponse.getError())); continue; @@ -443,14 +442,14 @@ private void run() { int statusCode = harResponse.getStatus(); if (HttpStatus.isClientError(statusCode) || HttpStatus.isServerError(statusCode)) { - callbackManager.call(CrawlEvent.REQUEST_ERROR, + callbackManager.call(RequestErrorEvent.class, new RequestErrorEvent(currentCandidate, new CompleteCrawlResponse(harResponse, webDriver))); continue; } - callbackManager.call(CrawlEvent.PAGE_LOAD, + callbackManager.call(PageLoadEvent.class, new PageLoadEvent(currentCandidate, new CompleteCrawlResponse(harResponse, webDriver))); } @@ -520,7 +519,7 @@ private void handleRequestRedirect( crawlFrontier.feedRequest(redirectedRequest, false); - callbackManager.call(CrawlEvent.REQUEST_REDIRECT, + callbackManager.call(RequestRedirectEvent.class, new RequestRedirectEvent(crawlCandidate, partialCrawlResponse, redirectedRequest)); } diff --git a/src/main/java/com/github/peterbencze/serritor/api/PatternMatchingCallback.java b/src/main/java/com/github/peterbencze/serritor/api/PatternMatchingCallback.java index 3c6a686..b3b5140 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/PatternMatchingCallback.java +++ b/src/main/java/com/github/peterbencze/serritor/api/PatternMatchingCallback.java @@ -25,21 +25,22 @@ * Represents an operation which is invoked when the specified regex pattern matches the request * URL. * + * @param the type of the input to the operation + * * @author Peter Bencze */ -public final class PatternMatchingCallback { +public final class PatternMatchingCallback { private final Pattern urlPattern; - private final Consumer callback; + private final Consumer callback; /** * Creates a {@link PatternMatchingCallback} instance. 
* - * @param the type of the input to the operation * @param urlPattern the regex pattern used for matching on request URLs * @param callback the operation to be performed when the pattern matches */ - public PatternMatchingCallback( + public PatternMatchingCallback( final Pattern urlPattern, final Consumer callback) { Validate.notNull(urlPattern, "The pattern cannot be null."); @@ -61,11 +62,9 @@ public Pattern getUrlPattern() { /** * Returns the operation to be performed when the pattern matches. * - * @param the type of the input to the operation - * * @return the operation to be performed when the pattern matches */ - public Consumer getCallback() { - return (Consumer) callback; + public Consumer getCallback() { + return callback; } } diff --git a/src/main/java/com/github/peterbencze/serritor/api/event/CrawlEvent.java b/src/main/java/com/github/peterbencze/serritor/api/event/CrawlEvent.java deleted file mode 100644 index acdf56c..0000000 --- a/src/main/java/com/github/peterbencze/serritor/api/event/CrawlEvent.java +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Copyright 2018 Peter Bencze. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.github.peterbencze.serritor.api.event; - -/** - * Represents events occurred during the crawling. - * - * @author Peter Bencze - */ -public enum CrawlEvent { - - /** - * Event which gets triggered when the browser loads the page. 
- */ - PAGE_LOAD, - /** - * Event which gets triggered when the MIME type of the response is not "text/html". - */ - NON_HTML_CONTENT, - /** - * Event which gets triggered when a page does not load in the browser within the timeout - * period. - */ - PAGE_LOAD_TIMEOUT, - /** - * Event which gets triggered when a request is redirected. - */ - REQUEST_REDIRECT, - /** - * Event which gets triggered when a network error occurs. - */ - NETWORK_ERROR, - /** - * Event which gets triggered when a request error (an error with HTTP status code 4xx or 5xx) - * occurs. - */ - REQUEST_ERROR -} diff --git a/src/main/java/com/github/peterbencze/serritor/internal/event/EventCallbackManager.java b/src/main/java/com/github/peterbencze/serritor/internal/event/EventCallbackManager.java index b6c280f..d270bd9 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/event/EventCallbackManager.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/event/EventCallbackManager.java @@ -17,7 +17,6 @@ package com.github.peterbencze.serritor.internal.event; import com.github.peterbencze.serritor.api.PatternMatchingCallback; -import com.github.peterbencze.serritor.api.event.CrawlEvent; import java.util.ArrayList; import java.util.HashMap; import java.util.List; @@ -34,8 +33,10 @@ */ public final class EventCallbackManager { - private final Map> defaultCallbacks; - private final Map> customCallbacks; + private final Map, + Consumer> defaultCallbacks; + private final Map, + List>> customCallbacks; /** * Creates an {@link EventCallbackManager} instance. @@ -48,26 +49,26 @@ public EventCallbackManager() { /** * Sets the default callback for the specific event. 
* - * @param the type of the input to the operation - * @param event the event for which the callback should be invoked - * @param callback the operation to be performed + * @param the type of the input to the operation + * @param eventClass the runtime class of the event for which the callback should be invoked + * @param callback the operation to be performed */ public void setDefaultEventCallback( - final CrawlEvent event, + final Class eventClass, final Consumer callback) { - defaultCallbacks.put(event, callback); + defaultCallbacks.put(eventClass, callback); } /** * Associates a pattern matching callback with the specific event. * - * @param event the event for which the callback should be invoked - * @param callback the pattern matching callback to invoke + * @param eventClass the runtime class of the event for which the callback should be invoked + * @param callback the pattern matching callback to invoke */ - public void addCustomEventCallback( - final CrawlEvent event, - final PatternMatchingCallback callback) { - customCallbacks.computeIfAbsent(event, key -> new ArrayList<>()).add(callback); + public void addCustomEventCallback( + final Class eventClass, + final PatternMatchingCallback callback) { + customCallbacks.computeIfAbsent(eventClass, key -> new ArrayList<>()).add(callback); } /** @@ -76,28 +77,30 @@ public void addCustomEventCallback( * request. 
* * @param the type of the input to the operation - * @param event the event for which the callback should be invoked + * @param eventClass the runtime class of the event for which the callback should be invoked * @param eventObject the input parameter for the callback */ - public void call(final CrawlEvent event, final T eventObject) { - if (!customCallbacks.containsKey(event)) { - ((Consumer) defaultCallbacks.get(event)).accept(eventObject); + @SuppressWarnings("unchecked") + public void call(final Class eventClass, final T eventObject) { + if (!customCallbacks.containsKey(eventClass)) { + ((Consumer) defaultCallbacks.get(eventClass)).accept(eventObject); return; } String requestUrl = eventObject.getCrawlCandidate().getRequestUrl().toString(); - List applicableCallbacks = customCallbacks.get(event) - .stream() - .filter(callback -> callback.getUrlPattern().matcher(requestUrl).matches()) - .collect(Collectors.toList()); + List> applicableCallbacks = + customCallbacks.get(eventClass) + .stream() + .filter(callback -> callback.getUrlPattern().matcher(requestUrl).matches()) + .collect(Collectors.toList()); if (applicableCallbacks.isEmpty()) { - ((Consumer) defaultCallbacks.get(event)).accept(eventObject); + ((Consumer) defaultCallbacks.get(eventClass)).accept(eventObject); return; } applicableCallbacks.stream() .map(PatternMatchingCallback::getCallback) - .forEach(op -> op.accept(eventObject)); + .forEach(op -> ((Consumer) op).accept(eventObject)); } } diff --git a/src/test/java/com/github/peterbencze/serritor/internal/event/EventCallbackManagerTest.java b/src/test/java/com/github/peterbencze/serritor/internal/event/EventCallbackManagerTest.java index 65b7d1f..382f227 100644 --- a/src/test/java/com/github/peterbencze/serritor/internal/event/EventCallbackManagerTest.java +++ b/src/test/java/com/github/peterbencze/serritor/internal/event/EventCallbackManagerTest.java @@ -18,7 +18,6 @@ import com.github.peterbencze.serritor.api.CrawlCandidate; import 
com.github.peterbencze.serritor.api.PatternMatchingCallback; -import com.github.peterbencze.serritor.api.event.CrawlEvent; import com.github.peterbencze.serritor.api.event.PageLoadEvent; import java.net.URI; import java.util.function.Consumer; @@ -49,9 +48,8 @@ public void before() { @Test public void testCallWithNoCustomEventCallback() { - callbackManager.setDefaultEventCallback(CrawlEvent.PAGE_LOAD, - mockedDefaultPageLoadCallback); - callbackManager.call(CrawlEvent.PAGE_LOAD, mockedPageLoadEvent); + callbackManager.setDefaultEventCallback(PageLoadEvent.class, mockedDefaultPageLoadCallback); + callbackManager.call(PageLoadEvent.class, mockedPageLoadEvent); Mockito.verify(mockedDefaultPageLoadCallback, Mockito.times(1)).accept(mockedPageLoadEvent); } @@ -60,24 +58,22 @@ public void testCallWithNoCustomEventCallback() { public void testCallWithNoApplicableCustomEventCallback() { Consumer mockedCustomPageLoadCallback = Mockito.mock(Consumer.class); - PatternMatchingCallback mockedPatternMatchingCallback + PatternMatchingCallback mockedPatternMatchingCallback = Mockito.mock(PatternMatchingCallback.class); Pattern mockedPattern = createMockedPattern(false); Mockito.when(mockedPatternMatchingCallback.getUrlPattern()).thenReturn(mockedPattern); - Mockito.doReturn(mockedCustomPageLoadCallback) - .when(mockedPatternMatchingCallback) - .getCallback(); + Mockito.when(mockedPatternMatchingCallback.getCallback()) + .thenReturn(mockedCustomPageLoadCallback); CrawlCandidate mockedCrawlCandidate = createMockedCrawlCandidate(); Mockito.when(mockedPageLoadEvent.getCrawlCandidate()).thenReturn(mockedCrawlCandidate); - callbackManager.setDefaultEventCallback(CrawlEvent.PAGE_LOAD, - mockedDefaultPageLoadCallback); - callbackManager.addCustomEventCallback(CrawlEvent.PAGE_LOAD, mockedPatternMatchingCallback); - callbackManager.call(CrawlEvent.PAGE_LOAD, mockedPageLoadEvent); + callbackManager.setDefaultEventCallback(PageLoadEvent.class, mockedDefaultPageLoadCallback); + 
callbackManager.addCustomEventCallback(PageLoadEvent.class, mockedPatternMatchingCallback); + callbackManager.call(PageLoadEvent.class, mockedPageLoadEvent); Mockito.verify(mockedDefaultPageLoadCallback, Mockito.times(1)).accept(mockedPageLoadEvent); Mockito.verify(mockedCustomPageLoadCallback, Mockito.never()).accept(mockedPageLoadEvent); @@ -87,24 +83,22 @@ public void testCallWithNoApplicableCustomEventCallback() { public void testCallWithSingleApplicableCustomEventCallback() { Consumer mockedCustomPageLoadCallback = Mockito.mock(Consumer.class); - PatternMatchingCallback mockedPatternMatchingCallback + PatternMatchingCallback mockedPatternMatchingCallback = Mockito.mock(PatternMatchingCallback.class); Pattern mockedPattern = createMockedPattern(true); Mockito.when(mockedPatternMatchingCallback.getUrlPattern()).thenReturn(mockedPattern); - Mockito.doReturn(mockedCustomPageLoadCallback) - .when(mockedPatternMatchingCallback) - .getCallback(); + Mockito.when(mockedPatternMatchingCallback.getCallback()) + .thenReturn(mockedCustomPageLoadCallback); CrawlCandidate mockedCrawlCandidate = createMockedCrawlCandidate(); Mockito.when(mockedPageLoadEvent.getCrawlCandidate()).thenReturn(mockedCrawlCandidate); - callbackManager.setDefaultEventCallback(CrawlEvent.PAGE_LOAD, - mockedDefaultPageLoadCallback); - callbackManager.addCustomEventCallback(CrawlEvent.PAGE_LOAD, mockedPatternMatchingCallback); - callbackManager.call(CrawlEvent.PAGE_LOAD, mockedPageLoadEvent); + callbackManager.setDefaultEventCallback(PageLoadEvent.class, mockedDefaultPageLoadCallback); + callbackManager.addCustomEventCallback(PageLoadEvent.class, mockedPatternMatchingCallback); + callbackManager.call(PageLoadEvent.class, mockedPageLoadEvent); Mockito.verify(mockedDefaultPageLoadCallback, Mockito.never()).accept(mockedPageLoadEvent); Mockito.verify(mockedCustomPageLoadCallback, Mockito.times(1)).accept(mockedPageLoadEvent); @@ -114,34 +108,29 @@ public void 
testCallWithSingleApplicableCustomEventCallback() { public void testCallWithMultipleApplicableCustomEventCallback() { Consumer mockedCustomPageLoadCallback = Mockito.mock(Consumer.class); - PatternMatchingCallback mockedPatternMatchingCallback1 + PatternMatchingCallback mockedPatternMatchingCallback1 = Mockito.mock(PatternMatchingCallback.class); - PatternMatchingCallback mockedPatternMatchingCallback2 + PatternMatchingCallback mockedPatternMatchingCallback2 = Mockito.mock(PatternMatchingCallback.class); Pattern mockedPattern = createMockedPattern(true); Mockito.when(mockedPatternMatchingCallback1.getUrlPattern()).thenReturn(mockedPattern); - Mockito.doReturn(mockedCustomPageLoadCallback) - .when(mockedPatternMatchingCallback1) - .getCallback(); + Mockito.when(mockedPatternMatchingCallback1.getCallback()) + .thenReturn(mockedCustomPageLoadCallback); Mockito.when(mockedPatternMatchingCallback2.getUrlPattern()).thenReturn(mockedPattern); - Mockito.doReturn(mockedCustomPageLoadCallback) - .when(mockedPatternMatchingCallback2) - .getCallback(); + Mockito.when(mockedPatternMatchingCallback2.getCallback()) + .thenReturn(mockedCustomPageLoadCallback); CrawlCandidate mockedCrawlCandidate = createMockedCrawlCandidate(); Mockito.when(mockedPageLoadEvent.getCrawlCandidate()).thenReturn(mockedCrawlCandidate); - callbackManager.setDefaultEventCallback(CrawlEvent.PAGE_LOAD, - mockedDefaultPageLoadCallback); - callbackManager.addCustomEventCallback(CrawlEvent.PAGE_LOAD, - mockedPatternMatchingCallback1); - callbackManager.addCustomEventCallback(CrawlEvent.PAGE_LOAD, - mockedPatternMatchingCallback2); - callbackManager.call(CrawlEvent.PAGE_LOAD, mockedPageLoadEvent); + callbackManager.setDefaultEventCallback(PageLoadEvent.class, mockedDefaultPageLoadCallback); + callbackManager.addCustomEventCallback(PageLoadEvent.class, mockedPatternMatchingCallback1); + callbackManager.addCustomEventCallback(PageLoadEvent.class, mockedPatternMatchingCallback2); + 
callbackManager.call(PageLoadEvent.class, mockedPageLoadEvent); Mockito.verify(mockedDefaultPageLoadCallback, Mockito.never()).accept(mockedPageLoadEvent); Mockito.verify(mockedCustomPageLoadCallback, Mockito.times(2)).accept(mockedPageLoadEvent); From b4adee86f717c0c2816d44bd2d2acef72156a371 Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Sun, 3 Feb 2019 11:38:21 +0100 Subject: [PATCH 11/63] Fix missing Javadoc params --- .../github/peterbencze/serritor/api/BaseCrawler.java | 11 ++++++----- .../serritor/internal/event/EventCallbackManager.java | 1 + 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java index aea7c8c..37727c3 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java +++ b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java @@ -280,16 +280,17 @@ public final void resumeState(final Browser browser, final DesiredCapabilities c * Registers an operation which is invoked when the specific event occurs and the provided * pattern matches the request URL. 
* - * @param event the event for which the callback should be triggered - * @param callback the pattern matching callback to invoke + * @param the type of the input to the operation + * @param eventClass the runtime class of the event for which the callback should be invoked + * @param callback the pattern matching callback to invoke */ protected final void registerCustomEventCallback( - final Class event, + final Class eventClass, final PatternMatchingCallback callback) { - Validate.notNull(event, "The event cannot be null."); + Validate.notNull(eventClass, "The event cannot be null."); Validate.notNull(callback, "The callback cannot be null."); - callbackManager.addCustomEventCallback(event, callback); + callbackManager.addCustomEventCallback(eventClass, callback); } /** diff --git a/src/main/java/com/github/peterbencze/serritor/internal/event/EventCallbackManager.java b/src/main/java/com/github/peterbencze/serritor/internal/event/EventCallbackManager.java index d270bd9..f87002a 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/event/EventCallbackManager.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/event/EventCallbackManager.java @@ -62,6 +62,7 @@ public void setDefaultEventCallback( /** * Associates a pattern matching callback with the specific event. 
* + * @param the type of the input to the operation * @param eventClass the runtime class of the event for which the callback should be invoked * @param callback the pattern matching callback to invoke */ From 76082838647e9529c826404e86b474ec8114b23e Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Sun, 3 Feb 2019 15:31:00 +0100 Subject: [PATCH 12/63] Add onBrowserInit callback to configure the browser before the crawling begins --- .../peterbencze/serritor/api/BaseCrawler.java | 18 +++++++++++++++++- .../serritor/api/CrawlerConfiguration.java | 2 ++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java index 37727c3..c88c184 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java +++ b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java @@ -69,6 +69,7 @@ import org.openqa.selenium.Proxy; import org.openqa.selenium.TimeoutException; import org.openqa.selenium.WebDriver; +import org.openqa.selenium.WebDriver.Options; import org.openqa.selenium.htmlunit.HtmlUnitDriver; import org.openqa.selenium.remote.CapabilityType; import org.openqa.selenium.remote.DesiredCapabilities; @@ -192,7 +193,7 @@ private void start(final Browser browser, ClientUtil.createSeleniumProxy(proxyServer)); webDriver = WebDriverFactory.createWebDriver(browser, capabilitiesClone); - webDriver.manage().window().maximize(); + onBrowserInit(webDriver.manage()); // If the crawl delay strategy is set to adaptive, we check if the browser supports the // Navigation Timing API or not. However HtmlUnit requires a page to be loaded first @@ -547,6 +548,21 @@ private void performDelay() { } } + /** + * Callback which is used to configure the browser before the crawling begins. 
+ * + * @param options an interface for managing stuff you would do in a browser menu + */ + protected void onBrowserInit(final Options options) { + LOGGER.info("onBrowserInit"); + + options.timeouts() + .pageLoadTimeout(CrawlerConfiguration.DEFAULT_PAGE_LOAD_TIMEOUT_IN_MILLIS, + TimeUnit.MILLISECONDS); + + options.window().maximize(); + } + /** * Callback which gets called when the crawler is started. */ diff --git a/src/main/java/com/github/peterbencze/serritor/api/CrawlerConfiguration.java b/src/main/java/com/github/peterbencze/serritor/api/CrawlerConfiguration.java index d5aef15..3e76c56 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/CrawlerConfiguration.java +++ b/src/main/java/com/github/peterbencze/serritor/api/CrawlerConfiguration.java @@ -32,6 +32,8 @@ */ public final class CrawlerConfiguration implements Serializable { + public static final long DEFAULT_PAGE_LOAD_TIMEOUT_IN_MILLIS = Duration.ofMinutes(3).toMillis(); + private final Set allowedCrawlDomains; private final Set crawlSeeds; private final CrawlStrategy crawlStrategy; From 3cb725c8b3abd1f03679a23ae9f9af88ec4c376b Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Sun, 3 Feb 2019 18:30:50 +0100 Subject: [PATCH 13/63] Update dependency versions --- pom.xml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pom.xml b/pom.xml index d97fff3..8063ae7 100644 --- a/pom.xml +++ b/pom.xml @@ -54,12 +54,12 @@ org.seleniumhq.selenium selenium-java - 3.14.0 + 3.141.59 org.seleniumhq.selenium htmlunit-driver - 2.33.0 + 2.33.3 net.lightbody.bmp @@ -69,7 +69,7 @@ com.google.guava guava - 27.0-jre + 27.0.1-jre junit From 1d543081bd3547976f37ef6047270f3176fe6023 Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Sun, 3 Feb 2019 18:34:48 +0100 Subject: [PATCH 14/63] Fix issue of Chrome and Firefox options not getting processed --- .../serritor/internal/WebDriverFactory.java | 34 ++++++++++++++----- 1 file changed, 25 insertions(+), 9 deletions(-) diff --git 
a/src/main/java/com/github/peterbencze/serritor/internal/WebDriverFactory.java b/src/main/java/com/github/peterbencze/serritor/internal/WebDriverFactory.java index 4f41e62..214d8d3 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/WebDriverFactory.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/WebDriverFactory.java @@ -17,6 +17,8 @@ package com.github.peterbencze.serritor.internal; import com.github.peterbencze.serritor.api.Browser; +import java.util.List; +import java.util.Map; import org.openqa.selenium.Capabilities; import org.openqa.selenium.WebDriver; import org.openqa.selenium.chrome.ChromeDriver; @@ -74,13 +76,30 @@ private static HtmlUnitDriver createHtmlUnitDriver(final Capabilities extraCapab /** * Creates a ChromeDriver instance with the provided properties. * - * @param extraCapabilities the browser properties + * @param capabilities the browser properties * * @return the preconfigured ChromeDriver instance */ - private static ChromeDriver createChromeDriver(final Capabilities extraCapabilities) { + private static ChromeDriver createChromeDriver(final Capabilities capabilities) { ChromeOptions options = new ChromeOptions(); - options.merge(extraCapabilities); + options.merge(capabilities); + + // This should have been implemented in ChromeOptions, just like in FirefoxOptions... 
+ Object capability = capabilities.getCapability(ChromeOptions.CAPABILITY); + if (capability instanceof Map) { + Map extraOptions = (Map) capability; + extraOptions.forEach((key, value) -> { + if ("binary".equals(key)) { + options.setBinary((String) extraOptions.get("binary")); + } else if ("args".equals(key)) { + options.addArguments((List) value); + } else if ("extensions".equals(key)) { + options.addEncodedExtensions((List) value); + } else { + options.setExperimentalOption((String) key, value); + } + }); + } return new ChromeDriver(options); } @@ -88,14 +107,11 @@ private static ChromeDriver createChromeDriver(final Capabilities extraCapabilit /** * Creates a FirefoxDriver instance with the provided properties. * - * @param extraCapabilities the browser properties + * @param capabilities the browser properties * * @return the preconfigured FirefoxDriver instance */ - private static FirefoxDriver createFirefoxDriver(final Capabilities extraCapabilities) { - FirefoxOptions options = new FirefoxOptions(); - options.merge(extraCapabilities); - - return new FirefoxDriver(options); + private static FirefoxDriver createFirefoxDriver(final Capabilities capabilities) { + return new FirefoxDriver(new FirefoxOptions(capabilities)); } } From 42e02e52218b3ad04b3f895b723e26dde3941a3c Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Sun, 3 Feb 2019 18:38:20 +0100 Subject: [PATCH 15/63] Ignore unchecked conversion warnings --- .../github/peterbencze/serritor/internal/WebDriverFactory.java | 1 + 1 file changed, 1 insertion(+) diff --git a/src/main/java/com/github/peterbencze/serritor/internal/WebDriverFactory.java b/src/main/java/com/github/peterbencze/serritor/internal/WebDriverFactory.java index 214d8d3..7ea162f 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/WebDriverFactory.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/WebDriverFactory.java @@ -80,6 +80,7 @@ private static HtmlUnitDriver createHtmlUnitDriver(final Capabilities 
extraCapab * * @return the preconfigured ChromeDriver instance */ + @SuppressWarnings("unchecked") private static ChromeDriver createChromeDriver(final Capabilities capabilities) { ChromeOptions options = new ChromeOptions(); options.merge(capabilities); From 78617c0726b115a97d1c5272fbe4943a4dafe5cc Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Thu, 7 Feb 2019 21:12:43 +0100 Subject: [PATCH 16/63] Use browser enum for comparison --- .../com/github/peterbencze/serritor/api/BaseCrawler.java | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java index c88c184..37b728e 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java +++ b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java @@ -70,7 +70,6 @@ import org.openqa.selenium.TimeoutException; import org.openqa.selenium.WebDriver; import org.openqa.selenium.WebDriver.Options; -import org.openqa.selenium.htmlunit.HtmlUnitDriver; import org.openqa.selenium.remote.CapabilityType; import org.openqa.selenium.remote.DesiredCapabilities; @@ -198,8 +197,8 @@ private void start(final Browser browser, // If the crawl delay strategy is set to adaptive, we check if the browser supports the // Navigation Timing API or not. However HtmlUnit requires a page to be loaded first // before executing JavaScript, so we load a blank page. 
- if (webDriver instanceof HtmlUnitDriver - && config.getCrawlDelayStrategy().equals(CrawlDelayStrategy.ADAPTIVE)) { + if (Browser.HTML_UNIT.equals(browser) + && CrawlDelayStrategy.ADAPTIVE.equals(config.getCrawlDelayStrategy())) { webDriver.get(WebClient.ABOUT_BLANK); } From 80cd9bb86e38f326029aa2136fa35e93c58073bc Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Thu, 7 Feb 2019 21:16:45 +0100 Subject: [PATCH 17/63] Add missing parameter validations --- .../java/com/github/peterbencze/serritor/api/BaseCrawler.java | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java index 37b728e..577532c 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java +++ b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java @@ -161,6 +161,9 @@ public final void start(final Browser browser) { * @param capabilities the browser properties */ public final void start(final Browser browser, final DesiredCapabilities capabilities) { + Validate.notNull(browser, "The browser parameter cannot be null."); + Validate.notNull(capabilities, "The capabilities parameter cannot be null."); + start(browser, capabilities, false); } From 304f3663cc6e1415ab496ef9a2f4d48b3eb17eef Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Thu, 7 Feb 2019 21:37:07 +0100 Subject: [PATCH 18/63] Modify validation error messages --- .../peterbencze/serritor/api/BaseCrawler.java | 10 +++++----- .../peterbencze/serritor/api/CrawlRequest.java | 7 +++++-- .../serritor/api/CrawlerConfiguration.java | 15 +++++++++------ .../serritor/api/PatternMatchingCallback.java | 4 ++-- .../serritor/api/helper/UrlFinder.java | 14 ++++++++++---- 5 files changed, 31 insertions(+), 19 deletions(-) diff --git a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java index 577532c..50915d1 100644 
--- a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java +++ b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java @@ -290,8 +290,8 @@ public final void resumeState(final Browser browser, final DesiredCapabilities c protected final void registerCustomEventCallback( final Class eventClass, final PatternMatchingCallback callback) { - Validate.notNull(eventClass, "The event cannot be null."); - Validate.notNull(callback, "The callback cannot be null."); + Validate.notNull(eventClass, "The eventClass parameter cannot be null."); + Validate.notNull(callback, "The callback parameter cannot be null."); callbackManager.addCustomEventCallback(eventClass, callback); } @@ -317,7 +317,7 @@ protected final void crawl(final CrawlRequest request) { Validate.validState(!isStopped, "The crawler is not started. Maybe you meant to add this request as a crawl seed?"); Validate.validState(!isStopping, "Cannot add request when the crawler is stopping."); - Validate.notNull(request, "The request cannot be null."); + Validate.notNull(request, "The request parameter cannot be null."); crawlFrontier.feedRequest(request, false); } @@ -343,8 +343,8 @@ protected final void crawl(final List requests) { protected final void downloadFile(final URI source, final File destination) throws IOException { Validate.validState(!isStopped, "Cannot download file when the crawler is not started."); Validate.validState(!isStopping, "Cannot download file when the crawler is stopping."); - Validate.notNull(source, "The source URL cannot be null."); - Validate.notNull(destination, "The destination file cannot be null."); + Validate.notNull(source, "The source parameter cannot be null."); + Validate.notNull(destination, "The destination parameter cannot be null."); HttpGet request = new HttpGet(source); try (CloseableHttpResponse response = httpClient.execute(request)) { diff --git a/src/main/java/com/github/peterbencze/serritor/api/CrawlRequest.java 
b/src/main/java/com/github/peterbencze/serritor/api/CrawlRequest.java index 936ef0d..3a04894 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/CrawlRequest.java +++ b/src/main/java/com/github/peterbencze/serritor/api/CrawlRequest.java @@ -125,6 +125,8 @@ public static final class CrawlRequestBuilder { * @param requestUrl the request URL */ public CrawlRequestBuilder(final URI requestUrl) { + Validate.notNull(requestUrl, "The requestUrl parameter cannot be null."); + if (StringUtils.isEmpty(requestUrl.getPath())) { try { // Define a non-empty path for the URI @@ -149,7 +151,8 @@ public CrawlRequestBuilder(final URI requestUrl) { * @param requestUrl the request URL */ public CrawlRequestBuilder(final String requestUrl) { - this(URI.create(requestUrl)); + this(URI.create(Validate.notNull(requestUrl, + "The requestUrl parameter cannot be null"))); } /** @@ -172,7 +175,7 @@ public CrawlRequestBuilder setPriority(final int priority) { * @return the CrawlRequestBuilder instance */ public CrawlRequestBuilder setMetadata(final Serializable metadata) { - this.metadata = Validate.notNull(metadata, "The metadata cannot be null."); + this.metadata = Validate.notNull(metadata, "The metadata parameter cannot be null."); return this; } diff --git a/src/main/java/com/github/peterbencze/serritor/api/CrawlerConfiguration.java b/src/main/java/com/github/peterbencze/serritor/api/CrawlerConfiguration.java index 3e76c56..197f0ab 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/CrawlerConfiguration.java +++ b/src/main/java/com/github/peterbencze/serritor/api/CrawlerConfiguration.java @@ -233,7 +233,7 @@ public CrawlerConfigurationBuilder addAllowedCrawlDomains( * @return the CrawlerConfigurationBuilder instance */ public CrawlerConfigurationBuilder addCrawlSeed(final CrawlRequest request) { - Validate.notNull(request, "The request cannot be null."); + Validate.notNull(request, "The request parameter cannot be null."); crawlSeeds.add(request); return this; @@ 
-261,7 +261,7 @@ public CrawlerConfigurationBuilder addCrawlSeeds(final List reques * @return the CrawlerConfigurationBuilder instance */ public CrawlerConfigurationBuilder setCrawlStrategy(final CrawlStrategy strategy) { - Validate.notNull(strategy, "The strategy cannot be null."); + Validate.notNull(strategy, "The strategy parameter cannot be null."); crawlStrategy = strategy; return this; @@ -319,7 +319,7 @@ public CrawlerConfigurationBuilder setMaximumCrawlDepth(final int maxCrawlDepth) */ public CrawlerConfigurationBuilder setCrawlDelayStrategy( final CrawlDelayStrategy strategy) { - Validate.notNull(strategy, "The strategy cannot be null."); + Validate.notNull(strategy, "The strategy parameter cannot be null."); crawlDelayStrategy = strategy; return this; @@ -334,7 +334,8 @@ public CrawlerConfigurationBuilder setCrawlDelayStrategy( */ public CrawlerConfigurationBuilder setFixedCrawlDelayDuration( final Duration fixedCrawlDelayDuration) { - Validate.notNull(fixedCrawlDelayDuration, "The duration cannot be null."); + Validate.notNull(fixedCrawlDelayDuration, + "The fixedCrawlDelayDuration parameter cannot be null."); fixedCrawlDelayDurationInMillis = fixedCrawlDelayDuration.toMillis(); return this; @@ -349,7 +350,8 @@ public CrawlerConfigurationBuilder setFixedCrawlDelayDuration( */ public CrawlerConfigurationBuilder setMinimumCrawlDelayDuration( final Duration minCrawlDelayDuration) { - Validate.notNull(minCrawlDelayDuration, "The duration cannot be null."); + Validate.notNull(minCrawlDelayDuration, + "The minCrawlDelayDuration parameter cannot be null."); Validate.isTrue(!minCrawlDelayDuration.isNegative(), "The minimum crawl delay cannot be negative."); @@ -371,7 +373,8 @@ public CrawlerConfigurationBuilder setMinimumCrawlDelayDuration( */ public CrawlerConfigurationBuilder setMaximumCrawlDelayDuration( final Duration maxCrawlDelayDuration) { - Validate.notNull(maxCrawlDelayDuration, "The duration cannot be null."); + Validate.notNull(maxCrawlDelayDuration, 
+ "The maxCrawlDelayDuration parameter cannot be null."); long maxDelayDurationInMillis = maxCrawlDelayDuration.toMillis(); diff --git a/src/main/java/com/github/peterbencze/serritor/api/PatternMatchingCallback.java b/src/main/java/com/github/peterbencze/serritor/api/PatternMatchingCallback.java index b3b5140..c0e87e1 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/PatternMatchingCallback.java +++ b/src/main/java/com/github/peterbencze/serritor/api/PatternMatchingCallback.java @@ -43,8 +43,8 @@ public final class PatternMatchingCallback { public PatternMatchingCallback( final Pattern urlPattern, final Consumer callback) { - Validate.notNull(urlPattern, "The pattern cannot be null."); - Validate.notNull(callback, "The callback cannot be null."); + Validate.notNull(urlPattern, "The urlPattern parameter cannot be null."); + Validate.notNull(callback, "The callback parameter cannot be null."); this.urlPattern = urlPattern; this.callback = callback; diff --git a/src/main/java/com/github/peterbencze/serritor/api/helper/UrlFinder.java b/src/main/java/com/github/peterbencze/serritor/api/helper/UrlFinder.java index 1d57e7b..6e2e62b 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/helper/UrlFinder.java +++ b/src/main/java/com/github/peterbencze/serritor/api/helper/UrlFinder.java @@ -71,6 +71,9 @@ public static UrlFinder createDefault() { * @return the list of found URLs */ public List findUrlsInPage(final CompleteCrawlResponse completeCrawlResponse) { + Validate.notNull(completeCrawlResponse, + "The completeCrawlResponse parameter cannot be null."); + Set foundUrls = new HashSet<>(); // Find elements using the specified locating mechanisms @@ -147,7 +150,8 @@ public UrlFinderBuilder(final Pattern urlPattern) { * @param urlPatterns the list of patterns to use to find URLs */ public UrlFinderBuilder(final List urlPatterns) { - Validate.noNullElements(urlPatterns, "URL patterns cannot be null."); + Validate.noNullElements(urlPatterns, + "The 
urlPatterns parameter cannot be null or contain null elements."); this.urlPatterns = Sets.newHashSet(urlPatterns); locatingMechanisms = DEFAULT_LOCATING_MECHANISMS; @@ -176,7 +180,8 @@ public UrlFinderBuilder setLocatingMechanism(final By locatingMechanism) { * @return the UrlFinderBuilder instance */ public UrlFinderBuilder setLocatingMechanisms(final List locatingMechanisms) { - Validate.noNullElements(locatingMechanisms, "Locating mechanisms cannot be null."); + Validate.noNullElements(locatingMechanisms, + "The locatingMechanisms parameter cannot be null or contain null elements."); this.locatingMechanisms = Sets.newHashSet(locatingMechanisms); return this; @@ -190,7 +195,8 @@ public UrlFinderBuilder setLocatingMechanisms(final List locatingMechanisms) * @return the UrlFinderBuilder instance */ public UrlFinderBuilder setAttributes(final List attributes) { - Validate.noNullElements(attributes, "Attributes cannot be null."); + Validate.noNullElements(attributes, + "The attributes parameter cannot be null or contain null elements."); this.attributes = Sets.newHashSet(attributes); return this; @@ -215,7 +221,7 @@ public UrlFinderBuilder setAttribute(final String attribute) { * @return the UrlFinderBuilder instance */ public UrlFinderBuilder setValidator(final Predicate validator) { - Validate.notNull(validator, "The validator function cannot be null."); + Validate.notNull(validator, "The validator parameter cannot be null."); this.validator = validator; return this; From c525ff68258df4685811f67cfc49fb29ff4aebe9 Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Thu, 7 Feb 2019 21:53:19 +0100 Subject: [PATCH 19/63] Use wiremock standalone version to avoid future dependency collisions --- pom.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pom.xml b/pom.xml index 8063ae7..0ddab49 100644 --- a/pom.xml +++ b/pom.xml @@ -85,8 +85,8 @@ com.github.tomakehurst - wiremock - 2.19.0 + wiremock-jre8-standalone + 2.21.0 test From 
4103c18d383d4940fa9a19e350f032171d970219 Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Thu, 7 Feb 2019 22:07:15 +0100 Subject: [PATCH 20/63] Add missing final modifier --- .../java/com/github/peterbencze/serritor/api/BaseCrawler.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java index 50915d1..e4fec76 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java +++ b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java @@ -83,8 +83,9 @@ public abstract class BaseCrawler { private static final Logger LOGGER = Logger.getLogger(BaseCrawler.class.getName()); + private final EventCallbackManager callbackManager; + private CrawlerConfiguration config; - private EventCallbackManager callbackManager; private CrawlFrontier crawlFrontier; private BasicCookieStore cookieStore; private CloseableHttpClient httpClient; From fba7ed186142b0d584d0aef1358f1c6b91b74712 Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Fri, 8 Feb 2019 23:53:29 +0100 Subject: [PATCH 21/63] Modify the way a crawling session can be resumed --- .../peterbencze/serritor/api/BaseCrawler.java | 90 ++++---- .../{internal => api}/CrawlerState.java | 25 +-- .../serritor/internal/CrawlFrontier.java | 25 ++- .../serritor/internal/CrawlFrontierTest.java | 196 +++++++++--------- .../peterbencze/serritor/it/SerritorIT.java | 13 +- 5 files changed, 174 insertions(+), 175 deletions(-) rename src/main/java/com/github/peterbencze/serritor/{internal => api}/CrawlerState.java (61%) diff --git a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java index e4fec76..7acc3cf 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java +++ b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java @@ -26,7 +26,6 @@ import 
com.github.peterbencze.serritor.api.event.RequestRedirectEvent; import com.github.peterbencze.serritor.internal.CookieConverter; import com.github.peterbencze.serritor.internal.CrawlFrontier; -import com.github.peterbencze.serritor.internal.CrawlerState; import com.github.peterbencze.serritor.internal.WebDriverFactory; import com.github.peterbencze.serritor.internal.crawldelaymechanism.AdaptiveCrawlDelayMechanism; import com.github.peterbencze.serritor.internal.crawldelaymechanism.CrawlDelayMechanism; @@ -36,12 +35,13 @@ import com.github.peterbencze.serritor.internal.event.EventObject; import java.io.File; import java.io.IOException; -import java.io.InputStream; -import java.io.OutputStream; +import java.io.Serializable; import java.net.InetSocketAddress; import java.net.URI; import java.nio.charset.UnsupportedCharsetException; +import java.util.HashMap; import java.util.List; +import java.util.Map; import java.util.concurrent.TimeUnit; import java.util.logging.Level; import java.util.logging.Logger; @@ -49,7 +49,6 @@ import net.lightbody.bmp.client.ClientUtil; import net.lightbody.bmp.core.har.HarResponse; import org.apache.commons.io.FileUtils; -import org.apache.commons.lang3.SerializationUtils; import org.apache.commons.lang3.Validate; import org.apache.http.Header; import org.apache.http.HttpEntity; @@ -83,15 +82,15 @@ public abstract class BaseCrawler { private static final Logger LOGGER = Logger.getLogger(BaseCrawler.class.getName()); + private final CrawlerConfiguration config; + private final CrawlFrontier crawlFrontier; private final EventCallbackManager callbackManager; + private final CrawlDelayMechanism crawlDelayMechanism; - private CrawlerConfiguration config; - private CrawlFrontier crawlFrontier; private BasicCookieStore cookieStore; private CloseableHttpClient httpClient; private BrowserMobProxyServer proxyServer; private WebDriver webDriver; - private CrawlDelayMechanism crawlDelayMechanism; private boolean isStopped; private boolean isStopping; 
@@ -101,28 +100,29 @@ public abstract class BaseCrawler { * @param config the configuration of the crawler */ protected BaseCrawler(final CrawlerConfiguration config) { - this(); - - this.config = config; + this(config, new CrawlFrontier(config)); } /** * Base constructor which loads a previously saved state. * - * @param inStream the input stream from which the state should be loaded + * @param state the state to be loaded */ - protected BaseCrawler(final InputStream inStream) { - this(); - - CrawlerState state = SerializationUtils.deserialize(inStream); - config = state.getStateObject(CrawlerConfiguration.class); - crawlFrontier = state.getStateObject(CrawlFrontier.class); + protected BaseCrawler(final CrawlerState state) { + this(state.getStateObject(CrawlerConfiguration.class), + state.getStateObject(CrawlFrontier.class)); } /** - * Private base constructor which does simple initialization. + * Private base constructor. + * + * @param config the configuration of the crawler + * @param crawlFrontier the crawl frontier */ - private BaseCrawler() { + private BaseCrawler(final CrawlerConfiguration config, final CrawlFrontier crawlFrontier) { + this.config = config; + this.crawlFrontier = crawlFrontier; + callbackManager = new EventCallbackManager(); callbackManager.setDefaultEventCallback(PageLoadEvent.class, this::onPageLoad); callbackManager.setDefaultEventCallback(NonHtmlContentEvent.class, this::onNonHtmlContent); @@ -133,6 +133,8 @@ private BaseCrawler() { callbackManager.setDefaultEventCallback(NetworkErrorEvent.class, this::onNetworkError); callbackManager.setDefaultEventCallback(RequestErrorEvent.class, this::onRequestError); + crawlDelayMechanism = createCrawlDelayMechanism(); + isStopping = false; isStopped = true; } @@ -206,17 +208,17 @@ private void start(final Browser browser, webDriver.get(WebClient.ABOUT_BLANK); } - if (!isResuming) { - crawlFrontier = new CrawlFrontier(config); - } - cookieStore = new BasicCookieStore(); httpClient = 
HttpClientBuilder.create() .disableRedirectHandling() .setDefaultCookieStore(cookieStore) .useSystemProperties() .build(); - crawlDelayMechanism = createCrawlDelayMechanism(); + + if (!isResuming) { + crawlFrontier.reset(); + } + isStopped = false; run(); @@ -235,48 +237,44 @@ private void start(final Browser browser, } /** - * Saves the current state of the crawler to the given output stream. + * Returns the current state of the crawler. * - * @param outStream the output stream + * @return the current state of the crawler */ - public final void saveState(final OutputStream outStream) { - Validate.validState(crawlFrontier != null, "Cannot save state at this point."); - - CrawlerState state = new CrawlerState(); - state.putStateObject(config); - state.putStateObject(crawlFrontier); + public final CrawlerState getState() { + Map, Serializable> stateObjects = new HashMap<>(); + stateObjects.put(config.getClass(), config); + stateObjects.put(crawlFrontier.getClass(), crawlFrontier); - SerializationUtils.serialize(state, outStream); + return new CrawlerState(stateObjects); } /** - * Resumes the previously loaded state. The crawler will use HtmlUnit headless browser to visit - * URLs. This method will block until the crawler finishes. + * Resumes the crawl. The crawler will use HtmlUnit headless browser to visit URLs. This method + * will block until the crawler finishes. */ - public final void resumeState() { - resumeState(Browser.HTML_UNIT); + public final void resume() { + resume(Browser.HTML_UNIT); } /** - * Resumes the previously loaded state. The crawler will use the specified browser to visit - * URLs. This method will block until the crawler finishes. + * Resumes the crawl. The crawler will use the specified browser to visit URLs. This method will + * block until the crawler finishes. 
* * @param browser the type of the browser to use for crawling */ - public final void resumeState(final Browser browser) { - resumeState(browser, new DesiredCapabilities()); + public final void resume(final Browser browser) { + resume(browser, new DesiredCapabilities()); } /** - * Resumes the previously loaded state. The crawler will use the specified browser to visit - * URLs. This method will block until the crawler finishes. + * Resumes the crawl. The crawler will use the specified browser to visit URLs. This method will + * block until the crawler finishes. * * @param browser the type of the browser to use for crawling * @param capabilities the browser properties */ - public final void resumeState(final Browser browser, final DesiredCapabilities capabilities) { - Validate.validState(crawlFrontier != null, "Cannot resume state at this point."); - + public final void resume(final Browser browser, final DesiredCapabilities capabilities) { start(browser, capabilities, true); } diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CrawlerState.java b/src/main/java/com/github/peterbencze/serritor/api/CrawlerState.java similarity index 61% rename from src/main/java/com/github/peterbencze/serritor/internal/CrawlerState.java rename to src/main/java/com/github/peterbencze/serritor/api/CrawlerState.java index f82a61a..1cef38e 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/CrawlerState.java +++ b/src/main/java/com/github/peterbencze/serritor/api/CrawlerState.java @@ -14,36 +14,29 @@ * limitations under the License. */ -package com.github.peterbencze.serritor.internal; +package com.github.peterbencze.serritor.api; import java.io.Serializable; -import java.util.HashMap; +import java.util.Map; /** - * Represents the state of a crawling session. More specifically, it contains a set of state objects - * that can be later used to resume that session. + * Represents the current state of the crawling session. 
More specifically, it contains a set of + * state objects that can be later reused to resume that particular session. * * @author Peter Bencze */ public final class CrawlerState implements Serializable { - private final HashMap, Serializable> stateObjects; + private final Map, Serializable> stateObjects; /** * Creates a {@link CrawlerState} instance. - */ - public CrawlerState() { - stateObjects = new HashMap<>(); - } - - /** - * Inserts the specified state object and its corresponding runtime class into the internal map - * used for storing these objects. * - * @param stateObject the state object that is required for resuming the crawling session + * @param stateObjects the map of state objects (that are required for resuming a crawl) and + * their corresponding runtime classes */ - public void putStateObject(final Serializable stateObject) { - stateObjects.put(stateObject.getClass(), stateObject); + public CrawlerState(final Map, Serializable> stateObjects) { + this.stateObjects = stateObjects; } /** diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java b/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java index 78901a3..79ff6a6 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java @@ -55,10 +55,7 @@ public CrawlFrontier(final CrawlerConfiguration config) { urlFingerprints = new HashSet<>(); candidates = createPriorityQueue(); - config.getCrawlSeeds() - .forEach((CrawlRequest request) -> { - feedRequest(request, true); - }); + feedCrawlSeeds(); } /** @@ -129,6 +126,26 @@ public CrawlCandidate getNextCandidate() { return currentCandidate; } + /** + * Resets the crawl frontier to its initial state. + */ + public void reset() { + urlFingerprints.clear(); + candidates.clear(); + + feedCrawlSeeds(); + } + + /** + * Feeds all the crawl seeds to the crawl frontier. 
+ */ + private void feedCrawlSeeds() { + config.getCrawlSeeds() + .forEach((CrawlRequest request) -> { + feedRequest(request, true); + }); + } + /** * Creates the fingerprint of the given URL. If the URL contains query parameters, it sorts * them. This way URLs with different order of query parameters get the same fingerprint. diff --git a/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java b/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java index 5cb556c..8d147b6 100644 --- a/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java +++ b/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java @@ -22,6 +22,7 @@ import com.github.peterbencze.serritor.api.CrawlStrategy; import com.github.peterbencze.serritor.api.CrawlerConfiguration; import com.github.peterbencze.serritor.api.CrawlerConfiguration.CrawlerConfigurationBuilder; +import com.google.common.collect.Sets; import java.net.URI; import java.util.Arrays; import java.util.Collections; @@ -109,210 +110,209 @@ public final class CrawlFrontierTest { private static final int MAX_CRAWL_DEPTH = 1; private CrawlerConfiguration config; - private CrawlFrontier frontier; @Before - public void initialize() { + public void before() { config = Mockito.spy(new CrawlerConfigurationBuilder().setOffsiteRequestFiltering(true) .addAllowedCrawlDomains(ALLOWED_CRAWL_DOMAINS) .addCrawlSeeds(CRAWL_SEEDS) .build()); - - frontier = new CrawlFrontier(config); } @Test - public void testHasNextCandidateWithCandidatesInQueue() { - Assert.assertTrue(frontier.hasNextCandidate()); + public void testHasNextCandidateWithNonEmptyQueue() { + CrawlFrontier crawlFrontier = new CrawlFrontier(config); + + Assert.assertTrue(crawlFrontier.hasNextCandidate()); - frontier.getNextCandidate(); + crawlFrontier.getNextCandidate(); - Assert.assertTrue(frontier.hasNextCandidate()); + Assert.assertTrue(crawlFrontier.hasNextCandidate()); - frontier.getNextCandidate(); 
+ crawlFrontier.getNextCandidate(); - Assert.assertFalse(frontier.hasNextCandidate()); + Assert.assertFalse(crawlFrontier.hasNextCandidate()); - frontier.feedRequest(CHILD_URL_0_CRAWL_REQUEST, false); - frontier.feedRequest(CHILD_URL_1_CRAWL_REQUEST, false); + crawlFrontier.feedRequest(CHILD_URL_0_CRAWL_REQUEST, false); + crawlFrontier.feedRequest(CHILD_URL_1_CRAWL_REQUEST, false); - Assert.assertTrue(frontier.hasNextCandidate()); + Assert.assertTrue(crawlFrontier.hasNextCandidate()); - frontier.getNextCandidate(); + crawlFrontier.getNextCandidate(); - Assert.assertTrue(frontier.hasNextCandidate()); + Assert.assertTrue(crawlFrontier.hasNextCandidate()); - frontier.getNextCandidate(); + crawlFrontier.getNextCandidate(); - Assert.assertFalse(frontier.hasNextCandidate()); + Assert.assertFalse(crawlFrontier.hasNextCandidate()); } @Test public void testHasNextCandidateWithEmptyQueue() { - Mockito.when(config.getCrawlSeeds()) - .thenReturn(Collections.EMPTY_SET); + Mockito.when(config.getCrawlSeeds()).thenReturn(Collections.EMPTY_SET); - // Create crawl frontier without crawl seeds - frontier = new CrawlFrontier(config); + CrawlFrontier crawlFrontier = new CrawlFrontier(config); - Assert.assertFalse(frontier.hasNextCandidate()); + Assert.assertFalse(crawlFrontier.hasNextCandidate()); } @Test public void testEnabledDuplicateRequestFiltering() { - clearCrawlCandidateQueue(); + CrawlFrontier crawlFrontier = new CrawlFrontier(config); - frontier.feedRequest(DUPLICATE_ROOT_URL_0_CRAWL_REQUEST, false); + clearCrawlCandidateQueue(crawlFrontier); + crawlFrontier.feedRequest(DUPLICATE_ROOT_URL_0_CRAWL_REQUEST, false); - Assert.assertFalse(frontier.hasNextCandidate()); + Assert.assertFalse(crawlFrontier.hasNextCandidate()); } @Test public void testDisabledDuplicateRequestFiltering() { - // Disable duplicate request filtering - Mockito.when(config.isDuplicateRequestFilteringEnabled()) - .thenReturn(false); + 
Mockito.when(config.isDuplicateRequestFilteringEnabled()).thenReturn(false); - clearCrawlCandidateQueue(); + CrawlFrontier crawlFrontier = new CrawlFrontier(config); - frontier.feedRequest(DUPLICATE_ROOT_URL_0_CRAWL_REQUEST, true); + clearCrawlCandidateQueue(crawlFrontier); + crawlFrontier.feedRequest(DUPLICATE_ROOT_URL_0_CRAWL_REQUEST, true); - Assert.assertTrue(frontier.hasNextCandidate()); - Assert.assertEquals(DUPLICATE_ROOT_URL_0, frontier.getNextCandidate().getRequestUrl()); + Assert.assertTrue(crawlFrontier.hasNextCandidate()); + Assert.assertEquals(DUPLICATE_ROOT_URL_0, crawlFrontier.getNextCandidate().getRequestUrl()); } @Test public void testEnabledOffsiteRequestFiltering() { - clearCrawlCandidateQueue(); + Mockito.when(config.getCrawlSeeds()).thenReturn(Sets.newHashSet(OFFSITE_URL_CRAWL_REQUEST)); - frontier.feedRequest(OFFSITE_URL_CRAWL_REQUEST, false); + CrawlFrontier crawlFrontier = new CrawlFrontier(config); - Assert.assertFalse(frontier.hasNextCandidate()); + Assert.assertFalse(crawlFrontier.hasNextCandidate()); } @Test public void testDisabledOffsiteRequestFiltering() { - // Disable offsite request filtering - Mockito.when(config.isOffsiteRequestFilteringEnabled()) - .thenReturn(false); - - clearCrawlCandidateQueue(); + Mockito.when(config.isOffsiteRequestFilteringEnabled()).thenReturn(false); + Mockito.when(config.getCrawlSeeds()).thenReturn(Sets.newHashSet(OFFSITE_URL_CRAWL_REQUEST)); - frontier.feedRequest(OFFSITE_URL_CRAWL_REQUEST, false); + CrawlFrontier crawlFrontier = new CrawlFrontier(config); - Assert.assertTrue(frontier.hasNextCandidate()); - Assert.assertEquals(OFFSITE_URL.toString(), - frontier.getNextCandidate().getRequestUrl().toString()); + Assert.assertTrue(crawlFrontier.hasNextCandidate()); + Assert.assertEquals(OFFSITE_URL, + crawlFrontier.getNextCandidate().getRequestUrl()); } @Test - public void testGetNextCandidateUsingBreadthFirstCrawlStrategy() { - CrawlCandidate nextCandidate = frontier.getNextCandidate(); + public void 
testGetNextCandidateWhenUsingBreadthFirstCrawlStrategy() { + CrawlFrontier crawlFrontier = new CrawlFrontier(config); + CrawlCandidate nextCandidate = crawlFrontier.getNextCandidate(); Assert.assertEquals(ROOT_URL_1, nextCandidate.getRequestUrl()); Assert.assertEquals(ROOT_URL_CRAWL_DEPTH, nextCandidate.getCrawlDepth()); Assert.assertEquals(ROOT_URL_1_PRIORITY, nextCandidate.getPriority()); - frontier.feedRequest(CHILD_URL_2_CRAWL_REQUEST, false); - - nextCandidate = frontier.getNextCandidate(); + crawlFrontier.feedRequest(CHILD_URL_2_CRAWL_REQUEST, false); + nextCandidate = crawlFrontier.getNextCandidate(); Assert.assertEquals(ROOT_URL_0, nextCandidate.getRequestUrl()); Assert.assertEquals(ROOT_URL_CRAWL_DEPTH, nextCandidate.getCrawlDepth()); Assert.assertEquals(ROOT_URL_0_PRIORITY, nextCandidate.getPriority()); - frontier.feedRequest(CHILD_URL_0_CRAWL_REQUEST, false); - frontier.feedRequest(CHILD_URL_1_CRAWL_REQUEST, false); - - nextCandidate = frontier.getNextCandidate(); + crawlFrontier.feedRequest(CHILD_URL_0_CRAWL_REQUEST, false); + crawlFrontier.feedRequest(CHILD_URL_1_CRAWL_REQUEST, false); - Assert.assertEquals(CHILD_URL_2.toString(), nextCandidate.getRequestUrl().toString()); + nextCandidate = crawlFrontier.getNextCandidate(); + Assert.assertEquals(CHILD_URL_2, nextCandidate.getRequestUrl()); Assert.assertEquals(CHILD_URL_CRAWL_DEPTH, nextCandidate.getCrawlDepth()); Assert.assertEquals(CHILD_URL_2_PRIORITY, nextCandidate.getPriority()); - // a priority queue doesn't ensure FIFO order when elements have the same depth and priority - nextCandidate = frontier.getNextCandidate(); - - Assert.assertTrue(nextCandidate.getRequestUrl().toString().contains(CHILD_URL_PATH)); - Assert.assertEquals(CHILD_URL_CRAWL_DEPTH, nextCandidate.getCrawlDepth()); - + // A priority queue doesn't ensure FIFO order when elements have the same depth and priority + nextCandidate = crawlFrontier.getNextCandidate(); int previousChildCandidatePriority = nextCandidate.getPriority(); + 
Assert.assertTrue(nextCandidate.getRequestUrl().getPath().contains(CHILD_URL_PATH)); + Assert.assertEquals(CHILD_URL_CRAWL_DEPTH, nextCandidate.getCrawlDepth()); - nextCandidate = frontier.getNextCandidate(); - - Assert.assertTrue(nextCandidate.getRequestUrl().toString().contains(CHILD_URL_PATH)); + nextCandidate = crawlFrontier.getNextCandidate(); + Assert.assertTrue(nextCandidate.getRequestUrl().getPath().contains(CHILD_URL_PATH)); Assert.assertEquals(CHILD_URL_CRAWL_DEPTH, nextCandidate.getCrawlDepth()); Assert.assertEquals(previousChildCandidatePriority, nextCandidate.getPriority()); - Assert.assertFalse(frontier.hasNextCandidate()); + + Assert.assertFalse(crawlFrontier.hasNextCandidate()); } @Test - public void testGetNextCandidateUsingDepthFirstCrawlStrategy() { - Mockito.when(config.getCrawlStrategy()) - .thenReturn(CrawlStrategy.DEPTH_FIRST); - - // Create frontier with depth-first crawl strategy - frontier = new CrawlFrontier(config); + public void testGetNextCandidateWhenUsingDepthFirstCrawlStrategy() { + Mockito.when(config.getCrawlStrategy()).thenReturn(CrawlStrategy.DEPTH_FIRST); - CrawlCandidate nextCandidate = frontier.getNextCandidate(); + CrawlFrontier crawlFrontier = new CrawlFrontier(config); + CrawlCandidate nextCandidate = crawlFrontier.getNextCandidate(); Assert.assertEquals(ROOT_URL_1, nextCandidate.getRequestUrl()); Assert.assertEquals(ROOT_URL_CRAWL_DEPTH, nextCandidate.getCrawlDepth()); Assert.assertEquals(ROOT_URL_1_PRIORITY, nextCandidate.getPriority()); - frontier.feedRequest(CHILD_URL_2_CRAWL_REQUEST, false); + crawlFrontier.feedRequest(CHILD_URL_2_CRAWL_REQUEST, false); - // a priority queue doesn't ensure FIFO order when elements have the same depth and priority - nextCandidate = frontier.getNextCandidate(); - - Assert.assertTrue(nextCandidate.getRequestUrl().toString().contains(CHILD_URL_PATH)); + // A priority queue doesn't ensure FIFO order when elements have the same depth and priority + nextCandidate = 
crawlFrontier.getNextCandidate(); + Assert.assertTrue(nextCandidate.getRequestUrl().getPath().contains(CHILD_URL_PATH)); Assert.assertEquals(CHILD_URL_CRAWL_DEPTH, nextCandidate.getCrawlDepth()); Assert.assertEquals(CHILD_URL_2_PRIORITY, nextCandidate.getPriority()); - nextCandidate = frontier.getNextCandidate(); - + nextCandidate = crawlFrontier.getNextCandidate(); Assert.assertEquals(ROOT_URL_0, nextCandidate.getRequestUrl()); Assert.assertEquals(ROOT_URL_CRAWL_DEPTH, nextCandidate.getCrawlDepth()); Assert.assertEquals(ROOT_URL_0_PRIORITY, nextCandidate.getPriority()); - frontier.feedRequest(CHILD_URL_0_CRAWL_REQUEST, false); - frontier.feedRequest(CHILD_URL_1_CRAWL_REQUEST, false); - - nextCandidate = frontier.getNextCandidate(); + crawlFrontier.feedRequest(CHILD_URL_0_CRAWL_REQUEST, false); + crawlFrontier.feedRequest(CHILD_URL_1_CRAWL_REQUEST, false); - Assert.assertEquals(CHILD_URL_0.toString(), nextCandidate.getRequestUrl().toString()); + nextCandidate = crawlFrontier.getNextCandidate(); + Assert.assertEquals(CHILD_URL_0, nextCandidate.getRequestUrl()); Assert.assertEquals(CHILD_URL_CRAWL_DEPTH, nextCandidate.getCrawlDepth()); Assert.assertEquals(CHILD_URL_0_PRIORITY, nextCandidate.getPriority()); - nextCandidate = frontier.getNextCandidate(); - - Assert.assertEquals(CHILD_URL_1.toString(), nextCandidate.getRequestUrl().toString()); + nextCandidate = crawlFrontier.getNextCandidate(); + Assert.assertEquals(CHILD_URL_1, nextCandidate.getRequestUrl()); Assert.assertEquals(CHILD_URL_CRAWL_DEPTH, nextCandidate.getCrawlDepth()); Assert.assertEquals(CHILD_URL_1_PRIORITY, nextCandidate.getPriority()); - Assert.assertFalse(frontier.hasNextCandidate()); + Assert.assertFalse(crawlFrontier.hasNextCandidate()); } @Test public void testCrawlDepthLimitation() { - Mockito.when(config.getMaximumCrawlDepth()) - .thenReturn(MAX_CRAWL_DEPTH); - - clearCrawlCandidateQueue(); + Mockito.when(config.getMaximumCrawlDepth()).thenReturn(MAX_CRAWL_DEPTH); - 
frontier.feedRequest(CHILD_URL_0_CRAWL_REQUEST, false); + CrawlFrontier crawlFrontier = new CrawlFrontier(config); - CrawlCandidate nextCandidate = frontier.getNextCandidate(); + clearCrawlCandidateQueue(crawlFrontier); + crawlFrontier.feedRequest(CHILD_URL_0_CRAWL_REQUEST, false); + CrawlCandidate nextCandidate = crawlFrontier.getNextCandidate(); Assert.assertTrue(nextCandidate.getCrawlDepth() <= MAX_CRAWL_DEPTH); - frontier.feedRequest(CHILD_URL_1_CRAWL_REQUEST, false); + crawlFrontier.feedRequest(CHILD_URL_1_CRAWL_REQUEST, false); + + Assert.assertFalse(crawlFrontier.hasNextCandidate()); + } + + @Test + public void testReset() { + CrawlFrontier crawlFrontier = new CrawlFrontier(config); + + crawlFrontier.reset(); + + // Check if only the crawl seeds are present after the reset + Assert.assertTrue(crawlFrontier.hasNextCandidate()); + Assert.assertEquals(ROOT_URL_1, crawlFrontier.getNextCandidate().getRequestUrl()); + + Assert.assertTrue(crawlFrontier.hasNextCandidate()); + Assert.assertEquals(ROOT_URL_0, crawlFrontier.getNextCandidate().getRequestUrl()); - Assert.assertFalse(frontier.hasNextCandidate()); + Assert.assertFalse(crawlFrontier.hasNextCandidate()); } - private void clearCrawlCandidateQueue() { - while (frontier.hasNextCandidate()) { - frontier.getNextCandidate(); + private static void clearCrawlCandidateQueue(final CrawlFrontier crawlFrontier) { + while (crawlFrontier.hasNextCandidate()) { + crawlFrontier.getNextCandidate(); } } } diff --git a/src/test/java/com/github/peterbencze/serritor/it/SerritorIT.java b/src/test/java/com/github/peterbencze/serritor/it/SerritorIT.java index 3b61ac3..ac20b35 100644 --- a/src/test/java/com/github/peterbencze/serritor/it/SerritorIT.java +++ b/src/test/java/com/github/peterbencze/serritor/it/SerritorIT.java @@ -27,9 +27,6 @@ import com.github.tomakehurst.wiremock.core.WireMockConfiguration; import com.google.common.net.HttpHeaders; import java.io.File; -import java.io.FileInputStream; -import 
java.io.FileNotFoundException; -import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.nio.charset.Charset; @@ -132,21 +129,15 @@ public void testResumeState() throws IOException { protected void onPageLoad(final PageLoadEvent event) { super.onPageLoad(event); - try { - saveState(new FileOutputStream(destinationFile)); - } catch (FileNotFoundException ex) { - Assert.fail(ex.getMessage()); - } - stop(); } }; crawler.start(Browser.HTML_UNIT, capabilities); - crawler = new BaseCrawler(new FileInputStream(destinationFile)) { + crawler = new BaseCrawler(crawler.getState()) { }; - crawler.resumeState(Browser.HTML_UNIT, capabilities); + crawler.resume(Browser.HTML_UNIT, capabilities); WireMock.verify(1, WireMock.headRequestedFor(WireMock.urlEqualTo("/foo"))); WireMock.verify(1, WireMock.getRequestedFor(WireMock.urlEqualTo("/foo"))); From 1f4f380aad8c01ee9caea020c75c0ca3c99c04ed Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Sat, 9 Feb 2019 22:21:01 +0100 Subject: [PATCH 22/63] Remove redundant constructor --- .../peterbencze/serritor/api/BaseCrawler.java | 35 ++++++------------- .../serritor/api/CrawlerState.java | 26 ++++++++++---- 2 files changed, 30 insertions(+), 31 deletions(-) diff --git a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java index 7acc3cf..3812b22 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java +++ b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java @@ -35,13 +35,11 @@ import com.github.peterbencze.serritor.internal.event.EventObject; import java.io.File; import java.io.IOException; -import java.io.Serializable; import java.net.InetSocketAddress; import java.net.URI; import java.nio.charset.UnsupportedCharsetException; -import java.util.HashMap; +import java.util.Arrays; import java.util.List; -import java.util.Map; import java.util.concurrent.TimeUnit; import 
java.util.logging.Level; import java.util.logging.Logger; @@ -95,33 +93,24 @@ public abstract class BaseCrawler { private boolean isStopping; /** - * Base constructor which is used to configure the crawler. + * Base constructor which sets up the crawler with the provided configuration. * * @param config the configuration of the crawler */ protected BaseCrawler(final CrawlerConfiguration config) { - this(config, new CrawlFrontier(config)); + this(new CrawlerState(config)); } /** - * Base constructor which loads a previously saved state. + * Base constructor which restores the crawler to the provided state. * - * @param state the state to be loaded + * @param state the state to restore the crawler to */ protected BaseCrawler(final CrawlerState state) { - this(state.getStateObject(CrawlerConfiguration.class), - state.getStateObject(CrawlFrontier.class)); - } - - /** - * Private base constructor. - * - * @param config the configuration of the crawler - * @param crawlFrontier the crawl frontier - */ - private BaseCrawler(final CrawlerConfiguration config, final CrawlFrontier crawlFrontier) { - this.config = config; - this.crawlFrontier = crawlFrontier; + this.config = state.getStateObject(CrawlerConfiguration.class) + .orElseThrow(() -> new IllegalArgumentException("Invalid crawler state provided")); + this.crawlFrontier = state.getStateObject(CrawlFrontier.class) + .orElseGet(() -> new CrawlFrontier(config)); callbackManager = new EventCallbackManager(); callbackManager.setDefaultEventCallback(PageLoadEvent.class, this::onPageLoad); @@ -242,11 +231,7 @@ private void start(final Browser browser, * @return the current state of the crawler */ public final CrawlerState getState() { - Map, Serializable> stateObjects = new HashMap<>(); - stateObjects.put(config.getClass(), config); - stateObjects.put(crawlFrontier.getClass(), crawlFrontier); - - return new CrawlerState(stateObjects); + return new CrawlerState(Arrays.asList(config, crawlFrontier)); } /** diff --git 
a/src/main/java/com/github/peterbencze/serritor/api/CrawlerState.java b/src/main/java/com/github/peterbencze/serritor/api/CrawlerState.java index 1cef38e..b50ceff 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/CrawlerState.java +++ b/src/main/java/com/github/peterbencze/serritor/api/CrawlerState.java @@ -17,7 +17,11 @@ package com.github.peterbencze.serritor.api; import java.io.Serializable; +import java.util.Collections; +import java.util.List; import java.util.Map; +import java.util.Optional; +import java.util.stream.Collectors; /** * Represents the current state of the crawling session. More specifically, it contains a set of @@ -32,11 +36,21 @@ public final class CrawlerState implements Serializable { /** * Creates a {@link CrawlerState} instance. * - * @param stateObjects the map of state objects (that are required for resuming a crawl) and - * their corresponding runtime classes + * @param stateObj the state object that is required to restore the state of the crawler */ - public CrawlerState(final Map, Serializable> stateObjects) { - this.stateObjects = stateObjects; + public CrawlerState(final Serializable stateObj) { + this(Collections.singletonList(stateObj)); + } + + /** + * Creates a {@link CrawlerState} instance. 
+ * + * @param stateObjs the list of state objects that are required to restore the state of the + * crawler + */ + public CrawlerState(final List stateObjs) { + this.stateObjects = stateObjs.stream() + .collect(Collectors.toMap(Serializable::getClass, stateObj -> stateObj)); } /** @@ -47,7 +61,7 @@ public CrawlerState(final Map, Serializable> state * * @return the state object specified by its class */ - public T getStateObject(final Class stateObjectClass) { - return (T) stateObjects.get(stateObjectClass); + public Optional getStateObject(final Class stateObjectClass) { + return Optional.ofNullable((T) stateObjects.get(stateObjectClass)); } } From 39b43ddcd65deb52d3f4ce087025aab5b1443858 Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Sun, 10 Feb 2019 00:37:14 +0100 Subject: [PATCH 23/63] Add missing validation --- .../java/com/github/peterbencze/serritor/api/BaseCrawler.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java index 3812b22..22b3632 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java +++ b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java @@ -98,7 +98,7 @@ public abstract class BaseCrawler { * @param config the configuration of the crawler */ protected BaseCrawler(final CrawlerConfiguration config) { - this(new CrawlerState(config)); + this(new CrawlerState(Validate.notNull(config, "The config parameter cannot be null"))); } /** @@ -107,6 +107,8 @@ protected BaseCrawler(final CrawlerConfiguration config) { * @param state the state to restore the crawler to */ protected BaseCrawler(final CrawlerState state) { + Validate.notNull(state, "The state parameter cannot be null"); + this.config = state.getStateObject(CrawlerConfiguration.class) .orElseThrow(() -> new IllegalArgumentException("Invalid crawler state provided")); this.crawlFrontier = 
state.getStateObject(CrawlFrontier.class) From 875321c9082fdad905bdbe2e779804e9e3225764 Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Tue, 12 Feb 2019 23:19:32 +0100 Subject: [PATCH 24/63] Add RESTful crawler skeletal implementation --- pom.xml | 10 ++ .../peterbencze/serritor/api/BaseCrawler.java | 47 ++++--- .../serritor/api/CrawlRequest.java | 4 + .../serritor/api/RestServerConfiguration.java | 87 ++++++++++++ .../serritor/api/RestfulBaseCrawler.java | 125 ++++++++++++++++++ .../serritor/internal/CrawlDomain.java | 11 ++ 6 files changed, 266 insertions(+), 18 deletions(-) create mode 100644 src/main/java/com/github/peterbencze/serritor/api/RestServerConfiguration.java create mode 100644 src/main/java/com/github/peterbencze/serritor/api/RestfulBaseCrawler.java diff --git a/pom.xml b/pom.xml index 0ddab49..41d009c 100644 --- a/pom.xml +++ b/pom.xml @@ -71,6 +71,16 @@ guava 27.0.1-jre + + io.javalin + javalin + 2.6.0 + + + org.slf4j + slf4j-simple + 1.7.25 + junit junit diff --git a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java index 22b3632..df54991 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java +++ b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java @@ -41,6 +41,7 @@ import java.util.Arrays; import java.util.List; import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicBoolean; import java.util.logging.Level; import java.util.logging.Logger; import net.lightbody.bmp.BrowserMobProxyServer; @@ -89,8 +90,8 @@ public abstract class BaseCrawler { private CloseableHttpClient httpClient; private BrowserMobProxyServer proxyServer; private WebDriver webDriver; - private boolean isStopped; - private boolean isStopping; + private AtomicBoolean isStopped; + private AtomicBoolean isStopping; /** * Base constructor which sets up the crawler with the provided configuration. 
@@ -126,8 +127,17 @@ protected BaseCrawler(final CrawlerState state) { crawlDelayMechanism = createCrawlDelayMechanism(); - isStopping = false; - isStopped = true; + isStopping = new AtomicBoolean(false); + isStopped = new AtomicBoolean(true); + } + + /** + * Returns the configuration of the crawler. + * + * @return the configuration of the crawler + */ + public final CrawlerConfiguration getCrawlerConfiguration() { + return config; } /** @@ -170,7 +180,9 @@ private void start(final Browser browser, final DesiredCapabilities capabilities, final boolean isResuming) { try { - Validate.validState(isStopped, "The crawler is already running."); + Validate.validState(isStopped.get(), "The crawler is already running."); + + isStopped.set(false); DesiredCapabilities capabilitiesClone = new DesiredCapabilities(capabilities); proxyServer = new BrowserMobProxyServer(); @@ -210,8 +222,6 @@ private void start(final Browser browser, crawlFrontier.reset(); } - isStopped = false; - run(); } finally { HttpClientUtils.closeQuietly(httpClient); @@ -222,8 +232,8 @@ private void start(final Browser browser, proxyServer.stop(); - isStopping = false; - isStopped = true; + isStopping.set(false); + isStopped.set(true); } } @@ -286,11 +296,10 @@ protected final void registerCustomEventCallback( * Gracefully stops the crawler. */ protected final void stop() { - Validate.validState(!isStopped, "The crawler is not started."); - Validate.validState(!isStopping, "The crawler is already stopping."); + Validate.validState(!isStopped.get(), "The crawler is not started."); // Indicate that the crawling should be stopped - isStopping = true; + isStopping.set(true); } /** @@ -300,9 +309,9 @@ protected final void stop() { * @param request the crawl request */ protected final void crawl(final CrawlRequest request) { - Validate.validState(!isStopped, + Validate.validState(!isStopped.get(), "The crawler is not started. 
Maybe you meant to add this request as a crawl seed?"); - Validate.validState(!isStopping, "Cannot add request when the crawler is stopping."); + Validate.validState(!isStopping.get(), "Cannot add request when the crawler is stopping."); Validate.notNull(request, "The request parameter cannot be null."); crawlFrontier.feedRequest(request, false); @@ -327,8 +336,10 @@ protected final void crawl(final List requests) { * @throws IOException if an I/O error occurs while downloading the file */ protected final void downloadFile(final URI source, final File destination) throws IOException { - Validate.validState(!isStopped, "Cannot download file when the crawler is not started."); - Validate.validState(!isStopping, "Cannot download file when the crawler is stopping."); + Validate.validState(!isStopped.get(), + "Cannot download file when the crawler is not started."); + Validate.validState(!isStopping.get(), + "Cannot download file when the crawler is stopping."); Validate.notNull(source, "The source parameter cannot be null."); Validate.notNull(destination, "The destination parameter cannot be null."); @@ -349,7 +360,7 @@ private void run() { boolean shouldPerformDelay = false; - while (!isStopping && crawlFrontier.hasNextCandidate()) { + while (!isStopping.get() && crawlFrontier.hasNextCandidate()) { // Do not perform delay in the first iteration if (shouldPerformDelay) { performDelay(); @@ -532,7 +543,7 @@ private void performDelay() { TimeUnit.MILLISECONDS.sleep(crawlDelayMechanism.getDelay()); } catch (InterruptedException ex) { Thread.currentThread().interrupt(); - isStopping = true; + isStopping.set(true); } } diff --git a/src/main/java/com/github/peterbencze/serritor/api/CrawlRequest.java b/src/main/java/com/github/peterbencze/serritor/api/CrawlRequest.java index 3a04894..5ec8ae0 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/CrawlRequest.java +++ b/src/main/java/com/github/peterbencze/serritor/api/CrawlRequest.java @@ -16,6 +16,7 @@ package 
com.github.peterbencze.serritor.api; +import com.fasterxml.jackson.annotation.JsonIgnore; import com.google.common.net.InternetDomainName; import java.io.IOException; import java.io.ObjectInputStream; @@ -37,8 +38,11 @@ public final class CrawlRequest implements Serializable { private final URI requestUrl; private final int priority; + + @JsonIgnore private final Serializable metadata; + @JsonIgnore private transient InternetDomainName domain; private CrawlRequest(final CrawlRequestBuilder builder) { diff --git a/src/main/java/com/github/peterbencze/serritor/api/RestServerConfiguration.java b/src/main/java/com/github/peterbencze/serritor/api/RestServerConfiguration.java new file mode 100644 index 0000000..7486858 --- /dev/null +++ b/src/main/java/com/github/peterbencze/serritor/api/RestServerConfiguration.java @@ -0,0 +1,87 @@ +/* + * Copyright 2019 Peter Bencze. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.github.peterbencze.serritor.api; + +/** + * Contains the settings of the REST server. + * + * @author Peter Bencze + */ +public final class RestServerConfiguration { + + private final int port; + + private RestServerConfiguration(final RestServerConfigurationBuilder builder) { + port = builder.port; + } + + /** + * Creates a default configuration for the REST server. 
+ * + * @return a default configuration for the REST server + */ + public static RestServerConfiguration createDefault() { + return new RestServerConfigurationBuilder().build(); + } + + /** + * Returns the port of the REST server. + * + * @return the port of the REST server + */ + public int getPort() { + return port; + } + + /** + * Builds {@link RestServerConfiguration} instances. + */ + public static final class RestServerConfigurationBuilder { + + private static final int DEFAULT_PORT = 8080; + + private int port; + + /** + * Creates a {@link RestServerConfigurationBuilder} instance. + */ + public RestServerConfigurationBuilder() { + port = DEFAULT_PORT; + } + + /** + * Sets the port of the REST server. + * + * @param port the port number + * + * @return the RestServerConfigurationBuilder instance + */ + public RestServerConfigurationBuilder setPort(final int port) { + this.port = port; + return this; + } + + /** + * Builds the configured RestServerConfiguration instance. + * + * @return the configured RestServerConfiguration instance + */ + public RestServerConfiguration build() { + return new RestServerConfiguration(this); + } + } +} diff --git a/src/main/java/com/github/peterbencze/serritor/api/RestfulBaseCrawler.java b/src/main/java/com/github/peterbencze/serritor/api/RestfulBaseCrawler.java new file mode 100644 index 0000000..3f35d82 --- /dev/null +++ b/src/main/java/com/github/peterbencze/serritor/api/RestfulBaseCrawler.java @@ -0,0 +1,125 @@ +/* + * Copyright 2019 Peter Bencze. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.github.peterbencze.serritor.api; + +import io.javalin.Javalin; +import io.javalin.apibuilder.ApiBuilder; + +/** + * Provides a skeletal implementation of a crawler to minimize the effort for users to implement + * their own. It also exposes a REST API that can be used to interact with the crawler while it is + * running. + * + * @author Peter Bencze + */ +public abstract class RestfulBaseCrawler extends BaseCrawler { + + private final RestServerConfiguration config; + private final Javalin restServer; + + /** + * Base constructor which sets up the crawler with the provided configuration. The REST server + * is initialized with the default settings. + * + * @param config the configuration of the crawler + */ + protected RestfulBaseCrawler(final CrawlerConfiguration config) { + this(RestServerConfiguration.createDefault(), config); + } + + /** + * Base constructor that sets up the REST server and the crawler with the provided + * configurations. + * + * @param restServerConfig the configuration of the REST server + * @param crawlerConfig the configuration of the crawler + */ + protected RestfulBaseCrawler(final RestServerConfiguration restServerConfig, + final CrawlerConfiguration crawlerConfig) { + this(restServerConfig, new CrawlerState(crawlerConfig)); + } + + /** + * Base constructor which restores the crawler to the provided state. The REST server is + * initialized with the default settings. + * + * @param state the state to restore the crawler to + */ + protected RestfulBaseCrawler(final CrawlerState state) { + this(RestServerConfiguration.createDefault(), state); + } + + /** + * Base constructor that sets up the REST server with the provided configuration and restores + * the crawler to the provided state. 
+ * + * @param config the configuration of the REST server + * @param state the state to restore the crawler to + */ + protected RestfulBaseCrawler(final RestServerConfiguration config, final CrawlerState state) { + super(state); + + this.config = config; + restServer = Javalin.create(); + + configureRoutes(); + } + + /** + * Returns the configuration of the REST server. + * + * @return the configuration of the REST server + */ + public final RestServerConfiguration getRestServerConfiguration() { + return config; + } + + /** + * {@inheritDoc} + */ + @Override + protected void onStart() { + super.onStart(); + + restServer.start(config.getPort()); + } + + /** + * {@inheritDoc} + */ + @Override + protected void onStop() { + super.onStop(); + + restServer.stop(); + } + + /** + * Sets up the routes. + */ + private void configureRoutes() { + restServer.routes(() -> { + ApiBuilder.path("api", () -> { + ApiBuilder.path("crawler", () -> { + ApiBuilder.delete(ctx -> stop()); + + ApiBuilder.get("config", ctx -> ctx.json(getCrawlerConfiguration())); + }); + }); + }); + } +} diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CrawlDomain.java b/src/main/java/com/github/peterbencze/serritor/internal/CrawlDomain.java index 3fec9fa..3138f07 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/CrawlDomain.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/CrawlDomain.java @@ -27,6 +27,7 @@ */ public final class CrawlDomain implements Serializable { + private final String domain; private final ImmutableList parts; /** @@ -35,9 +36,19 @@ public final class CrawlDomain implements Serializable { * @param domain an immutable well-formed internet domain name */ public CrawlDomain(final InternetDomainName domain) { + this.domain = domain.toString(); parts = domain.parts(); } + /** + * Returns the domain name, normalized to all lower case. 
+ * + * @return the domain name + */ + public String getDomain() { + return domain; + } + /** * Indicates if two CrawlDomain instances are equal. Crawl domains with the same * domain name are considered equal. From 7d87c438505179985d30395c82c7dab9b0480e89 Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Wed, 20 Feb 2019 19:37:22 +0100 Subject: [PATCH 25/63] Fix crawl delay mechanism initialization --- .../peterbencze/serritor/api/BaseCrawler.java | 36 +++++++++++-------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java index df54991..74b3a30 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java +++ b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java @@ -84,12 +84,12 @@ public abstract class BaseCrawler { private final CrawlerConfiguration config; private final CrawlFrontier crawlFrontier; private final EventCallbackManager callbackManager; - private final CrawlDelayMechanism crawlDelayMechanism; private BasicCookieStore cookieStore; private CloseableHttpClient httpClient; private BrowserMobProxyServer proxyServer; private WebDriver webDriver; + private CrawlDelayMechanism crawlDelayMechanism; private AtomicBoolean isStopped; private AtomicBoolean isStopping; @@ -125,8 +125,6 @@ protected BaseCrawler(final CrawlerState state) { callbackManager.setDefaultEventCallback(NetworkErrorEvent.class, this::onNetworkError); callbackManager.setDefaultEventCallback(RequestErrorEvent.class, this::onRequestError); - crawlDelayMechanism = createCrawlDelayMechanism(); - isStopping = new AtomicBoolean(false); isStopped = new AtomicBoolean(true); } @@ -184,9 +182,19 @@ private void start(final Browser browser, isStopped.set(false); - DesiredCapabilities capabilitiesClone = new DesiredCapabilities(capabilities); + cookieStore = new BasicCookieStore(); + httpClient = HttpClientBuilder.create() + 
.disableRedirectHandling() + .setDefaultCookieStore(cookieStore) + .useSystemProperties() + .build(); + proxyServer = new BrowserMobProxyServer(); + // Create a copy of the original capabilities before we make changes to it (we don't + // want to cause any unwanted side effects) + DesiredCapabilities capabilitiesClone = new DesiredCapabilities(capabilities); + Proxy chainedProxy = (Proxy) capabilitiesClone.getCapability(CapabilityType.PROXY); if (chainedProxy != null && chainedProxy.getHttpProxy() != null) { String[] urlComponents = chainedProxy.getHttpProxy().split(":"); @@ -203,6 +211,10 @@ private void start(final Browser browser, webDriver = WebDriverFactory.createWebDriver(browser, capabilitiesClone); onBrowserInit(webDriver.manage()); + if (!isResuming) { + crawlFrontier.reset(); + } + // If the crawl delay strategy is set to adaptive, we check if the browser supports the // Navigation Timing API or not. However HtmlUnit requires a page to be loaded first // before executing JavaScript, so we load a blank page. 
@@ -211,16 +223,8 @@ private void start(final Browser browser, webDriver.get(WebClient.ABOUT_BLANK); } - cookieStore = new BasicCookieStore(); - httpClient = HttpClientBuilder.create() - .disableRedirectHandling() - .setDefaultCookieStore(cookieStore) - .useSystemProperties() - .build(); - - if (!isResuming) { - crawlFrontier.reset(); - } + // Must be created here (the adaptive crawl delay strategy depends on the WebDriver) + crawlDelayMechanism = createCrawlDelayMechanism(); run(); } finally { @@ -230,7 +234,9 @@ private void start(final Browser browser, webDriver.quit(); } - proxyServer.stop(); + if (proxyServer != null && proxyServer.isStarted()) { + proxyServer.stop(); + } isStopping.set(false); isStopped.set(true); From bd7e896117443b2b098835afd35354ef6de7a650 Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Wed, 20 Feb 2019 20:49:58 +0100 Subject: [PATCH 26/63] Do not throw exception when stop is initiated --- .../peterbencze/serritor/api/BaseCrawler.java | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java index 74b3a30..c16520f 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java +++ b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java @@ -91,7 +91,7 @@ public abstract class BaseCrawler { private WebDriver webDriver; private CrawlDelayMechanism crawlDelayMechanism; private AtomicBoolean isStopped; - private AtomicBoolean isStopping; + private AtomicBoolean isStopInitiated; /** * Base constructor which sets up the crawler with the provided configuration. 
@@ -125,7 +125,7 @@ protected BaseCrawler(final CrawlerState state) { callbackManager.setDefaultEventCallback(NetworkErrorEvent.class, this::onNetworkError); callbackManager.setDefaultEventCallback(RequestErrorEvent.class, this::onRequestError); - isStopping = new AtomicBoolean(false); + isStopInitiated = new AtomicBoolean(false); isStopped = new AtomicBoolean(true); } @@ -238,7 +238,7 @@ private void start(final Browser browser, proxyServer.stop(); } - isStopping.set(false); + isStopInitiated.set(false); isStopped.set(true); } } @@ -305,7 +305,7 @@ protected final void stop() { Validate.validState(!isStopped.get(), "The crawler is not started."); // Indicate that the crawling should be stopped - isStopping.set(true); + isStopInitiated.set(true); } /** @@ -317,7 +317,6 @@ protected final void stop() { protected final void crawl(final CrawlRequest request) { Validate.validState(!isStopped.get(), "The crawler is not started. Maybe you meant to add this request as a crawl seed?"); - Validate.validState(!isStopping.get(), "Cannot add request when the crawler is stopping."); Validate.notNull(request, "The request parameter cannot be null."); crawlFrontier.feedRequest(request, false); @@ -344,8 +343,6 @@ protected final void crawl(final List requests) { protected final void downloadFile(final URI source, final File destination) throws IOException { Validate.validState(!isStopped.get(), "Cannot download file when the crawler is not started."); - Validate.validState(!isStopping.get(), - "Cannot download file when the crawler is stopping."); Validate.notNull(source, "The source parameter cannot be null."); Validate.notNull(destination, "The destination parameter cannot be null."); @@ -366,7 +363,7 @@ private void run() { boolean shouldPerformDelay = false; - while (!isStopping.get() && crawlFrontier.hasNextCandidate()) { + while (!isStopInitiated.get() && crawlFrontier.hasNextCandidate()) { // Do not perform delay in the first iteration if (shouldPerformDelay) { 
performDelay(); @@ -549,7 +546,7 @@ private void performDelay() { TimeUnit.MILLISECONDS.sleep(crawlDelayMechanism.getDelay()); } catch (InterruptedException ex) { Thread.currentThread().interrupt(); - isStopping.set(true); + isStopInitiated.set(true); } } From 5ce6c89673d58a2f4e96a5ef84540ff1b0565be5 Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Wed, 20 Feb 2019 21:24:31 +0100 Subject: [PATCH 27/63] Use slf4j instead of builtin logger --- .../peterbencze/serritor/api/BaseCrawler.java | 23 ++++++++----------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java index c16520f..2f95a44 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java +++ b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java @@ -42,8 +42,6 @@ import java.util.List; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicBoolean; -import java.util.logging.Level; -import java.util.logging.Logger; import net.lightbody.bmp.BrowserMobProxyServer; import net.lightbody.bmp.client.ClientUtil; import net.lightbody.bmp.core.har.HarResponse; @@ -70,6 +68,8 @@ import org.openqa.selenium.WebDriver.Options; import org.openqa.selenium.remote.CapabilityType; import org.openqa.selenium.remote.DesiredCapabilities; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * Provides a skeletal implementation of a crawler to minimize the effort for users to implement @@ -79,7 +79,7 @@ */ public abstract class BaseCrawler { - private static final Logger LOGGER = Logger.getLogger(BaseCrawler.class.getName()); + private static final Logger LOGGER = LoggerFactory.getLogger(BaseCrawler.class); private final CrawlerConfiguration config; private final CrawlFrontier crawlFrontier; @@ -578,7 +578,7 @@ protected void onStart() { * @param event the PageLoadEvent instance */ protected void onPageLoad(final 
PageLoadEvent event) { - LOGGER.log(Level.INFO, "onPageLoad: {0}", event.getCrawlCandidate().getRequestUrl()); + LOGGER.info("onPageLoad: {}", event.getCrawlCandidate().getRequestUrl()); } /** @@ -587,7 +587,7 @@ protected void onPageLoad(final PageLoadEvent event) { * @param event the NonHtmlContentEvent instance */ protected void onNonHtmlContent(final NonHtmlContentEvent event) { - LOGGER.log(Level.INFO, "onNonHtmlContent: {0}", event.getCrawlCandidate().getRequestUrl()); + LOGGER.info("onNonHtmlContent: {}", event.getCrawlCandidate().getRequestUrl()); } /** @@ -596,7 +596,7 @@ protected void onNonHtmlContent(final NonHtmlContentEvent event) { * @param event the NetworkErrorEvent instance */ protected void onNetworkError(final NetworkErrorEvent event) { - LOGGER.log(Level.INFO, "onNetworkError: {0}", event.getErrorMessage()); + LOGGER.info("onNetworkError: {}", event.getErrorMessage()); } /** @@ -606,7 +606,7 @@ protected void onNetworkError(final NetworkErrorEvent event) { * @param event the RequestErrorEvent instance */ protected void onRequestError(final RequestErrorEvent event) { - LOGGER.log(Level.INFO, "onRequestError: {0}", event.getCrawlCandidate().getRequestUrl()); + LOGGER.info("onRequestError: {}", event.getCrawlCandidate().getRequestUrl()); } /** @@ -615,11 +615,8 @@ protected void onRequestError(final RequestErrorEvent event) { * @param event the RequestRedirectEvent instance */ protected void onRequestRedirect(final RequestRedirectEvent event) { - LOGGER.log(Level.INFO, "onRequestRedirect: {0} -> {1}", - new Object[]{ - event.getCrawlCandidate().getRequestUrl(), - event.getRedirectedCrawlRequest().getRequestUrl() - }); + LOGGER.info("onRequestRedirect: {} -> {}", event.getCrawlCandidate().getRequestUrl(), + event.getRedirectedCrawlRequest().getRequestUrl()); } /** @@ -629,7 +626,7 @@ protected void onRequestRedirect(final RequestRedirectEvent event) { * @param event the PageLoadTimeoutEvent instance */ protected void onPageLoadTimeout(final 
PageLoadTimeoutEvent event) { - LOGGER.log(Level.INFO, "onPageLoadTimeout: {0}", event.getCrawlCandidate().getRequestUrl()); + LOGGER.info("onPageLoadTimeout: {}", event.getCrawlCandidate().getRequestUrl()); } /** From 5d5e4a6984f3e8eeee0c03ae5e9ba5cbebf82af1 Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Wed, 20 Feb 2019 21:44:35 +0100 Subject: [PATCH 28/63] Check HAR redirect URL instead of loaded page URL --- .../com/github/peterbencze/serritor/api/BaseCrawler.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java index 2f95a44..7670f51 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java +++ b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java @@ -435,11 +435,11 @@ private void run() { continue; } - String loadedPageUrl = webDriver.getCurrentUrl(); - if (!loadedPageUrl.equals(candidateUrl)) { + String redirectUrl = harResponse.getRedirectURL(); + if (!redirectUrl.isEmpty()) { // Create a new crawl request for the redirected URL (JS redirect) handleRequestRedirect(currentCandidate, - new PartialCrawlResponse(harResponse), loadedPageUrl); + new PartialCrawlResponse(harResponse), redirectUrl); continue; } From dfa15f4dfe4ef419d005ad16fa332c382db8bcc1 Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Wed, 20 Feb 2019 22:48:42 +0100 Subject: [PATCH 29/63] Re-add loaded page URL check --- .../peterbencze/serritor/api/BaseCrawler.java | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java index 7670f51..d650792 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java +++ b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java @@ -386,9 +386,10 @@ private void run() { 
} int statusCode = httpHeadResponse.getStatusLine().getStatusCode(); + + // Check if there was an HTTP redirect Header locationHeader = httpHeadResponse.getFirstHeader(HttpHeaders.LOCATION); if (HttpStatus.isRedirection(statusCode) && locationHeader != null) { - // Create a new crawl request for the redirected URL (HTTP redirect) handleRequestRedirect(currentCandidate, new PartialCrawlResponse(httpHeadResponse), locationHeader.getValue()); @@ -435,9 +436,15 @@ private void run() { continue; } + // We need to check both the redirect URL in the HAR response and the URL of the + // loaded page to see if there was a JS redirect String redirectUrl = harResponse.getRedirectURL(); - if (!redirectUrl.isEmpty()) { - // Create a new crawl request for the redirected URL (JS redirect) + String loadedPageUrl = webDriver.getCurrentUrl(); + if (!redirectUrl.isEmpty() || !loadedPageUrl.equals(candidateUrl)) { + if (redirectUrl.isEmpty()) { + redirectUrl = loadedPageUrl; + } + handleRequestRedirect(currentCandidate, new PartialCrawlResponse(harResponse), redirectUrl); @@ -510,13 +517,13 @@ private static String getResponseMimeType(final HttpResponse httpHeadResponse) { * * @param crawlCandidate the current crawl candidate * @param partialCrawlResponse the partial crawl response - * @param redirectedUrl the URL of the redirected request + * @param redirectUrl the redirect URL */ private void handleRequestRedirect( final CrawlCandidate crawlCandidate, final PartialCrawlResponse partialCrawlResponse, - final String redirectedUrl) { - CrawlRequestBuilder builder = new CrawlRequestBuilder(redirectedUrl) + final String redirectUrl) { + CrawlRequestBuilder builder = new CrawlRequestBuilder(redirectUrl) .setPriority(crawlCandidate.getPriority()); crawlCandidate.getMetadata().ifPresent(builder::setMetadata); CrawlRequest redirectedRequest = builder.build(); From 9b26e86def86949f7f28285821572b4d908c5bb5 Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Fri, 22 Feb 2019 20:50:16 +0100 
Subject: [PATCH 30/63] Rename filter setters and getters in config --- .../serritor/api/CrawlerConfiguration.java | 54 +++++++++---------- .../serritor/internal/CrawlFrontier.java | 4 +- .../serritor/internal/CrawlFrontierTest.java | 6 +-- 3 files changed, 31 insertions(+), 33 deletions(-) diff --git a/src/main/java/com/github/peterbencze/serritor/api/CrawlerConfiguration.java b/src/main/java/com/github/peterbencze/serritor/api/CrawlerConfiguration.java index 197f0ab..1c26a72 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/CrawlerConfiguration.java +++ b/src/main/java/com/github/peterbencze/serritor/api/CrawlerConfiguration.java @@ -37,8 +37,8 @@ public final class CrawlerConfiguration implements Serializable { private final Set allowedCrawlDomains; private final Set crawlSeeds; private final CrawlStrategy crawlStrategy; - private final boolean filterDuplicateRequests; - private final boolean filterOffsiteRequests; + private final boolean isDuplicateRequestFilterEnabled; + private final boolean isOffsiteRequestFilterEnabled; private final int maxCrawlDepth; private final CrawlDelayStrategy crawlDelayStrategy; private final long fixedCrawlDelayDurationInMillis; @@ -49,8 +49,8 @@ private CrawlerConfiguration(final CrawlerConfigurationBuilder builder) { allowedCrawlDomains = builder.allowedCrawlDomains; crawlSeeds = builder.crawlSeeds; crawlStrategy = builder.crawlStrategy; - filterDuplicateRequests = builder.filterDuplicateRequests; - filterOffsiteRequests = builder.filterOffsiteRequests; + isDuplicateRequestFilterEnabled = builder.isDuplicateRequestFilterEnabled; + isOffsiteRequestFilterEnabled = builder.isOffsiteRequestFilterEnabled; maxCrawlDepth = builder.maxCrawlDepth; crawlDelayStrategy = builder.crawlDelayStrategy; fixedCrawlDelayDurationInMillis = builder.fixedCrawlDelayDurationInMillis; @@ -86,21 +86,21 @@ public CrawlStrategy getCrawlStrategy() { } /** - * Indicates if duplicate request filtering is enabled. 
+ * Indicates if the duplicate request filter is enabled. * * @return true if enabled, false otherwise */ - public boolean isDuplicateRequestFilteringEnabled() { - return filterDuplicateRequests; + public boolean isDuplicateRequestFilterEnabled() { + return isDuplicateRequestFilterEnabled; } /** - * Indicates if offsite request filtering is enabled. + * Indicates if the offsite request filter is enabled. * * @return true if enabled, false otherwise */ - public boolean isOffsiteRequestFilteringEnabled() { - return filterOffsiteRequests; + public boolean isOffsiteRequestFilterEnabled() { + return isOffsiteRequestFilterEnabled; } /** @@ -154,8 +154,8 @@ public long getMaximumCrawlDelayDurationInMillis() { public static final class CrawlerConfigurationBuilder { private static final CrawlStrategy DEFAULT_CRAWL_STRATEGY = CrawlStrategy.BREADTH_FIRST; - private static final boolean FILTER_DUPLICATE_REQUESTS_BY_DEFAULT = true; - private static final boolean FILTER_OFFSITE_REQUESTS_BY_DEFAULT = false; + private static final boolean IS_DUPLICATE_REQUEST_FILTER_ENABLED_BY_DEFAULT = true; + private static final boolean IS_OFFSITE_REQUEST_FILTER_ENABLED_BY_DEFAULT = false; private static final int DEFAULT_MAX_CRAWL_DEPTH = 0; private static final CrawlDelayStrategy DEFAULT_CRAWL_DELAY = CrawlDelayStrategy.FIXED; private static final long DEFAULT_FIXED_CRAWL_DELAY_IN_MILLIS @@ -169,8 +169,8 @@ public static final class CrawlerConfigurationBuilder { private final Set crawlSeeds; private CrawlStrategy crawlStrategy; - private boolean filterDuplicateRequests; - private boolean filterOffsiteRequests; + private boolean isDuplicateRequestFilterEnabled; + private boolean isOffsiteRequestFilterEnabled; private int maxCrawlDepth; private CrawlDelayStrategy crawlDelayStrategy; private long fixedCrawlDelayDurationInMillis; @@ -185,8 +185,8 @@ public CrawlerConfigurationBuilder() { allowedCrawlDomains = new HashSet<>(); crawlSeeds = new HashSet<>(); crawlStrategy = DEFAULT_CRAWL_STRATEGY; - 
filterDuplicateRequests = FILTER_DUPLICATE_REQUESTS_BY_DEFAULT; - filterOffsiteRequests = FILTER_OFFSITE_REQUESTS_BY_DEFAULT; + isDuplicateRequestFilterEnabled = IS_DUPLICATE_REQUEST_FILTER_ENABLED_BY_DEFAULT; + isOffsiteRequestFilterEnabled = IS_OFFSITE_REQUEST_FILTER_ENABLED_BY_DEFAULT; maxCrawlDepth = DEFAULT_MAX_CRAWL_DEPTH; crawlDelayStrategy = DEFAULT_CRAWL_DELAY; fixedCrawlDelayDurationInMillis = DEFAULT_FIXED_CRAWL_DELAY_IN_MILLIS; @@ -268,30 +268,28 @@ public CrawlerConfigurationBuilder setCrawlStrategy(final CrawlStrategy strategy } /** - * Enables or disables duplicate request filtering. + * Enables or disables the duplicate request filter. * - * @param filterDuplicateRequests true means enabled, false means - * disabled + * @param filterEnabled true enables, false disables the filter * * @return the CrawlerConfigurationBuilder instance */ - public CrawlerConfigurationBuilder setDuplicateRequestFiltering( - final boolean filterDuplicateRequests) { - this.filterDuplicateRequests = filterDuplicateRequests; + public CrawlerConfigurationBuilder setDuplicateRequestFilterEnabled( + final boolean filterEnabled) { + this.isDuplicateRequestFilterEnabled = filterEnabled; return this; } /** - * Enables or disables offsite request filtering. + * Enables or disables the offsite request filter. 
* - * @param filterOffsiteRequests true means enabled, false means - * disabled + * @param filterEnabled true enables, false disables the filter * * @return the CrawlerConfigurationBuilder instance */ - public CrawlerConfigurationBuilder setOffsiteRequestFiltering( - final boolean filterOffsiteRequests) { - this.filterOffsiteRequests = filterOffsiteRequests; + public CrawlerConfigurationBuilder setOffsiteRequestFilterEnabled( + final boolean filterEnabled) { + this.isOffsiteRequestFilterEnabled = filterEnabled; return this; } diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java b/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java index 79ff6a6..2c80052 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java @@ -65,7 +65,7 @@ public CrawlFrontier(final CrawlerConfiguration config) { * @param isCrawlSeed indicates if the request is a crawl seed */ public void feedRequest(final CrawlRequest request, final boolean isCrawlSeed) { - if (config.isOffsiteRequestFilteringEnabled()) { + if (config.isOffsiteRequestFilterEnabled()) { boolean inCrawlDomain = false; for (CrawlDomain allowedCrawlDomain : config.getAllowedCrawlDomains()) { @@ -80,7 +80,7 @@ public void feedRequest(final CrawlRequest request, final boolean isCrawlSeed) { } } - if (config.isDuplicateRequestFilteringEnabled()) { + if (config.isDuplicateRequestFilterEnabled()) { String urlFingerprint = createFingerprintForUrl(request.getRequestUrl()); if (urlFingerprints.contains(urlFingerprint)) { diff --git a/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java b/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java index 8d147b6..681c569 100644 --- a/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java +++ 
b/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java @@ -113,7 +113,7 @@ public final class CrawlFrontierTest { @Before public void before() { - config = Mockito.spy(new CrawlerConfigurationBuilder().setOffsiteRequestFiltering(true) + config = Mockito.spy(new CrawlerConfigurationBuilder().setOffsiteRequestFilterEnabled(true) .addAllowedCrawlDomains(ALLOWED_CRAWL_DOMAINS) .addCrawlSeeds(CRAWL_SEEDS) .build()); @@ -168,7 +168,7 @@ public void testEnabledDuplicateRequestFiltering() { @Test public void testDisabledDuplicateRequestFiltering() { - Mockito.when(config.isDuplicateRequestFilteringEnabled()).thenReturn(false); + Mockito.when(config.isDuplicateRequestFilterEnabled()).thenReturn(false); CrawlFrontier crawlFrontier = new CrawlFrontier(config); @@ -190,7 +190,7 @@ public void testEnabledOffsiteRequestFiltering() { @Test public void testDisabledOffsiteRequestFiltering() { - Mockito.when(config.isOffsiteRequestFilteringEnabled()).thenReturn(false); + Mockito.when(config.isOffsiteRequestFilterEnabled()).thenReturn(false); Mockito.when(config.getCrawlSeeds()).thenReturn(Sets.newHashSet(OFFSITE_URL_CRAWL_REQUEST)); CrawlFrontier crawlFrontier = new CrawlFrontier(config); From e10e22cb82d6a7d9c985054406629efb99457cc9 Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Sun, 3 Mar 2019 21:51:50 +0100 Subject: [PATCH 31/63] Add serializable stopwatch implementation --- .../internal/stopwatch/Stopwatch.java | 95 ++++++++++++++++ .../internal/stopwatch/TimeSource.java | 35 ++++++ .../internal/stopwatch/UtcTimeSource.java | 37 +++++++ .../internal/stopwatch/StopwatchTest.java | 102 ++++++++++++++++++ 4 files changed, 269 insertions(+) create mode 100644 src/main/java/com/github/peterbencze/serritor/internal/stopwatch/Stopwatch.java create mode 100644 src/main/java/com/github/peterbencze/serritor/internal/stopwatch/TimeSource.java create mode 100644 src/main/java/com/github/peterbencze/serritor/internal/stopwatch/UtcTimeSource.java create mode 100644 
src/test/java/com/github/peterbencze/serritor/internal/stopwatch/StopwatchTest.java diff --git a/src/main/java/com/github/peterbencze/serritor/internal/stopwatch/Stopwatch.java b/src/main/java/com/github/peterbencze/serritor/internal/stopwatch/Stopwatch.java new file mode 100644 index 0000000..1cc49b1 --- /dev/null +++ b/src/main/java/com/github/peterbencze/serritor/internal/stopwatch/Stopwatch.java @@ -0,0 +1,95 @@ +/* + * Copyright 2019 Peter Bencze. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.github.peterbencze.serritor.internal.stopwatch; + +import java.io.Serializable; +import java.time.Duration; +import java.time.Instant; +import org.apache.commons.lang3.Validate; + +/** + * A serializable stopwatch implementation that can be used to measure elapsed time. + * + * @author Peter Bencze + */ +public final class Stopwatch implements Serializable { + + private TimeSource timeSource; + private Instant startTime; + private Duration elapsedDuration; + private boolean isRunning; + + /** + * Creates a {@link Stopwatch} instance. + * + * @param timeSource a source providing access to the current instant + */ + public Stopwatch(final TimeSource timeSource) { + this.timeSource = timeSource; + elapsedDuration = Duration.ZERO; + isRunning = false; + } + + /** + * Creates a {@link Stopwatch} instance. + */ + public Stopwatch() { + this(new UtcTimeSource()); + } + + /** + * Starts the stopwatch. 
+ */ + public void start() { + Validate.validState(!isRunning, "The stopwatch is already running."); + + startTime = timeSource.getTime(); + isRunning = true; + } + + /** + * Indicates if the stopwatch is running. + * + * @return true if the stopwatch is running, false otherwise + */ + public boolean isRunning() { + return isRunning; + } + + /** + * Stops the stopwatch. + */ + public void stop() { + Validate.validState(isRunning, "The stopwatch is not running."); + + elapsedDuration = elapsedDuration.plus(Duration.between(startTime, timeSource.getTime())); + isRunning = false; + } + + /** + * Returns the current elapsed duration. + * + * @return the current elapsed duration + */ + public Duration getElapsedDuration() { + if (isRunning) { + return Duration.between(startTime, timeSource.getTime()).plus(elapsedDuration); + } + + return elapsedDuration; + } +} diff --git a/src/main/java/com/github/peterbencze/serritor/internal/stopwatch/TimeSource.java b/src/main/java/com/github/peterbencze/serritor/internal/stopwatch/TimeSource.java new file mode 100644 index 0000000..dd08797 --- /dev/null +++ b/src/main/java/com/github/peterbencze/serritor/internal/stopwatch/TimeSource.java @@ -0,0 +1,35 @@ +/* + * Copyright 2019 Peter Bencze. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.github.peterbencze.serritor.internal.stopwatch; + +import java.io.Serializable; +import java.time.Instant; + +/** + * A source providing access to the current instant. 
All implementations should be serializable. + * + * @author Peter Bencze + */ +public interface TimeSource extends Serializable { + + /** + * Returns the current instant. + * + * @return the current instant + */ + Instant getTime(); +} diff --git a/src/main/java/com/github/peterbencze/serritor/internal/stopwatch/UtcTimeSource.java b/src/main/java/com/github/peterbencze/serritor/internal/stopwatch/UtcTimeSource.java new file mode 100644 index 0000000..d1b7b5f --- /dev/null +++ b/src/main/java/com/github/peterbencze/serritor/internal/stopwatch/UtcTimeSource.java @@ -0,0 +1,37 @@ +/* + * Copyright 2019 Peter Bencze. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.github.peterbencze.serritor.internal.stopwatch; + +import java.time.Instant; + +/** + * A source providing access to the current UTC instant. + * + * @author Peter Bencze + */ +public final class UtcTimeSource implements TimeSource { + + /** + * Returns the current instant from the system UTC clock. 
+ * + * @return the current instant from the system UTC clock + */ + @Override + public Instant getTime() { + return Instant.now(); + } +} diff --git a/src/test/java/com/github/peterbencze/serritor/internal/stopwatch/StopwatchTest.java b/src/test/java/com/github/peterbencze/serritor/internal/stopwatch/StopwatchTest.java new file mode 100644 index 0000000..e28130c --- /dev/null +++ b/src/test/java/com/github/peterbencze/serritor/internal/stopwatch/StopwatchTest.java @@ -0,0 +1,102 @@ +/* + * Copyright 2019 Peter Bencze. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.github.peterbencze.serritor.internal.stopwatch; + +import java.time.Duration; +import java.time.Instant; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; +import org.mockito.Mockito; + +/** + * Test cases for {@link Stopwatch} + * + * @author Peter Bencze + */ +public final class StopwatchTest { + + private TimeSource timeSourceMock; + private Stopwatch stopwatch; + + @Before + public void before() { + timeSourceMock = Mockito.mock(UtcTimeSource.class); + Mockito.doCallRealMethod().when(timeSourceMock).getTime(); + + stopwatch = new Stopwatch(timeSourceMock); + } + + @Test + public void testStartWhenStopwatchIsNotRunning() { + stopwatch.start(); + + Assert.assertTrue(stopwatch.isRunning()); + } + + @Test(expected = IllegalStateException.class) + public void testStartWhenStopwatchIsAlreadyRunning() { + stopwatch.start(); + stopwatch.start(); + } + + @Test + public void testStopWhenStopwatchIsRunning() { + stopwatch.start(); + stopwatch.stop(); + + Assert.assertFalse(stopwatch.isRunning()); + } + + @Test(expected = IllegalStateException.class) + public void testStopWhenStopwatchIsNotRunning() { + stopwatch.stop(); + } + + @Test + public void testGetElapsedDurationWhenStopwatchHasNotYetBeenStarted() { + Assert.assertTrue(stopwatch.getElapsedDuration().isZero()); + } + + @Test + public void testGetElapsedDurationWhenStopwatchIsRunning() { + Instant now = Instant.now(); + Instant oneMinuteLater = now.plus(Duration.ofMinutes(1)); + Instant twoMinutesLater = now.plus(Duration.ofMinutes(2)); + + Mockito.when(timeSourceMock.getTime()).thenReturn(now, oneMinuteLater, twoMinutesLater); + + stopwatch.start(); + + Assert.assertEquals(Duration.ofMinutes(1), stopwatch.getElapsedDuration()); + Assert.assertEquals(Duration.ofMinutes(2), stopwatch.getElapsedDuration()); + } + + @Test + public void testGetElapsedDurationWhenStopwatchIsStopped() { + Instant now = Instant.now(); + Instant oneMinuteLater = now.plus(Duration.ofMinutes(1)); + 
+ Mockito.when(timeSourceMock.getTime()).thenReturn(now, oneMinuteLater); + + stopwatch.start(); + stopwatch.stop(); + + Assert.assertEquals(Duration.ofMinutes(1), stopwatch.getElapsedDuration()); + Assert.assertEquals(Duration.ofMinutes(1), stopwatch.getElapsedDuration()); + } +} From 64cfede9b136f58606d693138d71b1908bfc37d4 Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Wed, 13 Mar 2019 23:43:38 +0100 Subject: [PATCH 32/63] Add the possibility to retrieve stats about the crawl progress --- pom.xml | 5 + .../peterbencze/serritor/api/BaseCrawler.java | 169 ++++++++--- .../peterbencze/serritor/api/CrawlStats.java | 278 +++++++++++++++++ .../serritor/api/RestfulBaseCrawler.java | 7 + .../serritor/internal/CrawlFrontier.java | 30 +- .../internal/stats/DurationSerializer.java | 56 ++++ .../serritor/internal/stats/StatsCounter.java | 282 ++++++++++++++++++ .../internal/stats/StatsCounterSnapshot.java | 154 ++++++++++ .../internal/stopwatch/Stopwatch.java | 36 ++- .../FunctionalReentrantReadWriteLock.java | 93 ++++++ .../serritor/internal/CrawlFrontierTest.java | 45 +-- .../internal/stats/StatsCounterTest.java | 136 +++++++++ 12 files changed, 1205 insertions(+), 86 deletions(-) create mode 100644 src/main/java/com/github/peterbencze/serritor/api/CrawlStats.java create mode 100644 src/main/java/com/github/peterbencze/serritor/internal/stats/DurationSerializer.java create mode 100644 src/main/java/com/github/peterbencze/serritor/internal/stats/StatsCounter.java create mode 100644 src/main/java/com/github/peterbencze/serritor/internal/stats/StatsCounterSnapshot.java create mode 100644 src/main/java/com/github/peterbencze/serritor/internal/util/FunctionalReentrantReadWriteLock.java create mode 100644 src/test/java/com/github/peterbencze/serritor/internal/stats/StatsCounterTest.java diff --git a/pom.xml b/pom.xml index 41d009c..1f899f3 100644 --- a/pom.xml +++ b/pom.xml @@ -76,6 +76,11 @@ javalin 2.6.0 + + com.fasterxml.jackson.datatype + jackson-datatype-jdk8 + 2.8.9 + 
org.slf4j slf4j-simple diff --git a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java index d650792..0561bc9 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java +++ b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java @@ -33,6 +33,8 @@ import com.github.peterbencze.serritor.internal.crawldelaymechanism.RandomCrawlDelayMechanism; import com.github.peterbencze.serritor.internal.event.EventCallbackManager; import com.github.peterbencze.serritor.internal.event.EventObject; +import com.github.peterbencze.serritor.internal.stats.StatsCounter; +import com.github.peterbencze.serritor.internal.stopwatch.Stopwatch; import java.io.File; import java.io.IOException; import java.net.InetSocketAddress; @@ -82,6 +84,8 @@ public abstract class BaseCrawler { private static final Logger LOGGER = LoggerFactory.getLogger(BaseCrawler.class); private final CrawlerConfiguration config; + private final Stopwatch runTimeStopwatch; + private final StatsCounter statsCounter; private final CrawlFrontier crawlFrontier; private final EventCallbackManager callbackManager; @@ -110,10 +114,12 @@ protected BaseCrawler(final CrawlerConfiguration config) { protected BaseCrawler(final CrawlerState state) { Validate.notNull(state, "The state parameter cannot be null"); - this.config = state.getStateObject(CrawlerConfiguration.class) + config = state.getStateObject(CrawlerConfiguration.class) .orElseThrow(() -> new IllegalArgumentException("Invalid crawler state provided")); - this.crawlFrontier = state.getStateObject(CrawlFrontier.class) - .orElseGet(() -> new CrawlFrontier(config)); + runTimeStopwatch = state.getStateObject(Stopwatch.class).orElseGet(Stopwatch::new); + statsCounter = state.getStateObject(StatsCounter.class).orElseGet(StatsCounter::new); + crawlFrontier = state.getStateObject(CrawlFrontier.class) + .orElseGet(() -> new CrawlFrontier(config, statsCounter)); 
callbackManager = new EventCallbackManager(); callbackManager.setDefaultEventCallback(PageLoadEvent.class, this::onPageLoad); @@ -138,6 +144,15 @@ public final CrawlerConfiguration getCrawlerConfiguration() { return config; } + /** + * Returns summary statistics about the crawl progress. + * + * @return summary statistics about the crawl progress + */ + public final CrawlStats getCrawlStats() { + return new CrawlStats(runTimeStopwatch.getElapsedDuration(), statsCounter.getSnapshot()); + } + /** * Starts the crawler. The crawler will use HtmlUnit headless browser to visit URLs. This method * will block until the crawler finishes. @@ -182,6 +197,8 @@ private void start(final Browser browser, isStopped.set(false); + runTimeStopwatch.start(); + cookieStore = new BasicCookieStore(); httpClient = HttpClientBuilder.create() .disableRedirectHandling() @@ -238,6 +255,8 @@ private void start(final Browser browser, proxyServer.stop(); } + runTimeStopwatch.stop(); + isStopInitiated.set(false); isStopped.set(true); } @@ -249,7 +268,8 @@ private void start(final Browser browser, * @return the current state of the crawler */ public final CrawlerState getState() { - return new CrawlerState(Arrays.asList(config, crawlFrontier)); + return new CrawlerState(Arrays.asList(config, crawlFrontier, runTimeStopwatch, + statsCounter)); } /** @@ -379,8 +399,8 @@ private void run() { try { httpHeadResponse = httpClient.execute(new HttpHead(candidateUrl)); } catch (IOException exception) { - callbackManager.call(NetworkErrorEvent.class, - new NetworkErrorEvent(currentCandidate, exception.toString())); + handleNetworkError(new NetworkErrorEvent(currentCandidate, + exception.toString())); continue; } @@ -390,8 +410,13 @@ private void run() { // Check if there was an HTTP redirect Header locationHeader = httpHeadResponse.getFirstHeader(HttpHeaders.LOCATION); if (HttpStatus.isRedirection(statusCode) && locationHeader != null) { - handleRequestRedirect(currentCandidate, - new 
PartialCrawlResponse(httpHeadResponse), locationHeader.getValue()); + // Create a new crawl request for the redirected URL (HTTP redirect) + CrawlRequest redirectedRequest = + createCrawlRequestForRedirect(currentCandidate, + locationHeader.getValue()); + + handleRequestRedirect(new RequestRedirectEvent(currentCandidate, + new PartialCrawlResponse(httpHeadResponse), redirectedRequest)); continue; } @@ -399,9 +424,8 @@ private void run() { String mimeType = getResponseMimeType(httpHeadResponse); if (!mimeType.equals(ContentType.TEXT_HTML.getMimeType())) { // URLs that point to non-HTML content should not be opened in the browser - callbackManager.call(NonHtmlContentEvent.class, - new NonHtmlContentEvent(currentCandidate, - new PartialCrawlResponse(httpHeadResponse))); + handleNonHtmlContent(new NonHtmlContentEvent(currentCandidate, + new PartialCrawlResponse(httpHeadResponse))); continue; } @@ -414,9 +438,8 @@ private void run() { // Ensure HTTP client and Selenium have the same cookies syncHttpClientCookies(); } catch (TimeoutException exception) { - callbackManager.call(PageLoadTimeoutEvent.class, - new PageLoadTimeoutEvent(currentCandidate, - new PartialCrawlResponse(httpHeadResponse))); + handlePageLoadTimeout(new PageLoadTimeoutEvent(currentCandidate, + new PartialCrawlResponse(httpHeadResponse))); continue; } @@ -430,8 +453,7 @@ private void run() { .get(0) .getResponse(); if (harResponse.getError() != null) { - callbackManager.call(NetworkErrorEvent.class, - new NetworkErrorEvent(currentCandidate, harResponse.getError())); + handleNetworkError(new NetworkErrorEvent(currentCandidate, harResponse.getError())); continue; } @@ -445,24 +467,24 @@ private void run() { redirectUrl = loadedPageUrl; } - handleRequestRedirect(currentCandidate, - new PartialCrawlResponse(harResponse), redirectUrl); + CrawlRequest request = createCrawlRequestForRedirect(currentCandidate, redirectUrl); + + handleRequestRedirect(new RequestRedirectEvent(currentCandidate, + new 
PartialCrawlResponse(harResponse), request)); continue; } int statusCode = harResponse.getStatus(); if (HttpStatus.isClientError(statusCode) || HttpStatus.isServerError(statusCode)) { - callbackManager.call(RequestErrorEvent.class, - new RequestErrorEvent(currentCandidate, - new CompleteCrawlResponse(harResponse, webDriver))); + handleRequestError(new RequestErrorEvent(currentCandidate, + new CompleteCrawlResponse(harResponse, webDriver))); continue; } - callbackManager.call(PageLoadEvent.class, - new PageLoadEvent(currentCandidate, - new CompleteCrawlResponse(harResponse, webDriver))); + handlePageLoad(new PageLoadEvent(currentCandidate, + new CompleteCrawlResponse(harResponse, webDriver))); } onStop(); @@ -512,26 +534,74 @@ private static String getResponseMimeType(final HttpResponse httpHeadResponse) { } /** - * Creates a crawl request for the redirected URL, feeds it to the crawler and calls the - * appropriate event callback. + * Handles network errors that occur during the crawl. * - * @param crawlCandidate the current crawl candidate - * @param partialCrawlResponse the partial crawl response - * @param redirectUrl the redirect URL + * @param event the event which gets delivered when a network error occurs */ - private void handleRequestRedirect( - final CrawlCandidate crawlCandidate, - final PartialCrawlResponse partialCrawlResponse, - final String redirectUrl) { - CrawlRequestBuilder builder = new CrawlRequestBuilder(redirectUrl) - .setPriority(crawlCandidate.getPriority()); - crawlCandidate.getMetadata().ifPresent(builder::setMetadata); - CrawlRequest redirectedRequest = builder.build(); + private void handleNetworkError(final NetworkErrorEvent event) { + callbackManager.call(NetworkErrorEvent.class, event); + + statsCounter.recordNetworkError(); + } + + /** + * Handles request redirects that occur during the crawl. 
+ * + * @param event the event which gets delivered when a request is redirected + */ + private void handleRequestRedirect(final RequestRedirectEvent event) { + crawl(event.getRedirectedCrawlRequest()); + + callbackManager.call(RequestRedirectEvent.class, event); + + statsCounter.recordRequestRedirect(); + } + + /** + * Handles responses with non-HTML content that occur during the crawl. + * + * @param event the event which gets delivered when the MIME type of the response is not + * text/html + */ + private void handleNonHtmlContent(final NonHtmlContentEvent event) { + callbackManager.call(NonHtmlContentEvent.class, event); + + statsCounter.recordNonHtmlContent(); + } - crawlFrontier.feedRequest(redirectedRequest, false); + /** + * Handles page load timeout that occur during the crawl. + * + * @param event the event which gets delivered when a page does not load in the browser within + * the timeout period + */ + private void handlePageLoadTimeout(final PageLoadTimeoutEvent event) { + callbackManager.call(PageLoadTimeoutEvent.class, event); - callbackManager.call(RequestRedirectEvent.class, - new RequestRedirectEvent(crawlCandidate, partialCrawlResponse, redirectedRequest)); + statsCounter.recordPageLoadTimeout(); + } + + /** + * Handles request errors that occur during the crawl. + * + * @param event the event which gets delivered when a request error (an error with HTTP status + * code 4xx or 5xx) occurs + */ + private void handleRequestError(final RequestErrorEvent event) { + callbackManager.call(RequestErrorEvent.class, event); + + statsCounter.recordRequestError(); + } + + /** + * Handles successful page loads that occur during the crawl. 
+ * + * @param event the event which gets delivered when the browser loads the page + */ + private void handlePageLoad(final PageLoadEvent event) { + callbackManager.call(PageLoadEvent.class, event); + + statsCounter.recordPageLoad(); } /** @@ -557,6 +627,25 @@ private void performDelay() { } } + /** + * Helper method that is used to create crawl requests for redirects. The newly created request + * will have the same attributes as the redirected one. + * + * @param currentCandidate the current crawl candidate + * @param redirectUrl the redirect URL + * + * @return the crawl request for the redirect URL + */ + private static CrawlRequest createCrawlRequestForRedirect( + final CrawlCandidate currentCandidate, + final String redirectUrl) { + CrawlRequestBuilder builder = new CrawlRequestBuilder(redirectUrl) + .setPriority(currentCandidate.getPriority()); + currentCandidate.getMetadata().ifPresent(builder::setMetadata); + + return builder.build(); + } + /** * Callback which is used to configure the browser before the crawling begins. * diff --git a/src/main/java/com/github/peterbencze/serritor/api/CrawlStats.java b/src/main/java/com/github/peterbencze/serritor/api/CrawlStats.java new file mode 100644 index 0000000..34e1f9d --- /dev/null +++ b/src/main/java/com/github/peterbencze/serritor/api/CrawlStats.java @@ -0,0 +1,278 @@ +/* + * Copyright 2019 Peter Bencze. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.github.peterbencze.serritor.api; + +import com.fasterxml.jackson.annotation.JsonPropertyOrder; +import com.fasterxml.jackson.databind.annotation.JsonSerialize; +import com.github.peterbencze.serritor.internal.stats.DurationSerializer; +import com.github.peterbencze.serritor.internal.stats.StatsCounterSnapshot; +import java.time.Duration; +import java.util.Optional; +import org.apache.commons.lang3.Validate; +import org.apache.commons.lang3.builder.ToStringBuilder; +import org.apache.commons.lang3.builder.ToStringStyle; +import org.apache.commons.lang3.time.DurationFormatUtils; + +/** + * Summary statistics about the crawl progress. + * + * @author Peter Bencze + */ +@JsonPropertyOrder({ + "runDuration", + "crawlRate", + "remainingDurationEstimate", + "remainingCrawlCandidateCount", + "processedCrawlCandidateCount", + "pageLoadCount", + "pageLoadTimeoutCount", + "requestRedirectCount", + "nonHtmlContentCount", + "requestErrorCount", + "networkErrorCount", + "filteredDuplicateRequestCount", + "filteredOffsiteRequestCount", + "filteredCrawlDepthLimitExceedingRequestCount" +}) +public final class CrawlStats { + + private final Duration runDuration; + private final StatsCounterSnapshot statsCounterSnapshot; + + // Derived stats + private final double crawlRate; + private final Duration remainingDurationEstimate; + + /** + * Creates a {@link CrawlStats} instance. 
+ * + * @param runDuration the current run duration + * @param statsCounterSnapshot a snapshot of the stats counter values + */ + public CrawlStats(final Duration runDuration, final StatsCounterSnapshot statsCounterSnapshot) { + this.runDuration = runDuration; + this.statsCounterSnapshot = statsCounterSnapshot; + + // Calculate derived stats + crawlRate = calculateCrawlRate(runDuration, getProcessedCrawlCandidateCount()); + + // Remaining duration can only be calculated when at least one crawl candidate has been + // processed + if (getProcessedCrawlCandidateCount() > 0) { + remainingDurationEstimate = calculateRemainingDurationEstimate(crawlRate, + getRemainingCrawlCandidateCount()); + } else { + remainingDurationEstimate = null; + } + } + + /** + * Returns the current run duration. + * + * @return the current run duration + */ + @JsonSerialize(using = DurationSerializer.class) + public Duration getRunDuration() { + return runDuration; + } + + /** + * Returns the number of crawl candidates processed per minute. + * + * @return the number of crawl candidates processed per minute + */ + public double getCrawlRate() { + return crawlRate; + } + + /** + * Returns the remaining duration estimate. + * + *

Note: At least one crawl candidate needs to be processed before it is possible to + * calculate an estimate. + * + * @return the remaining duration estimate + */ + @JsonSerialize(contentUsing = DurationSerializer.class) + public Optional getRemainingDurationEstimate() { + return Optional.ofNullable(remainingDurationEstimate); + } + + /** + * Returns the number of remaining crawl candidates. + * + * @return the number of remaining crawl candidates + */ + public int getRemainingCrawlCandidateCount() { + return statsCounterSnapshot.getRemainingCrawlCandidateCount(); + } + + /** + * Returns the number of processed crawl candidates. + * + * @return the number of processed crawl candidates + */ + public int getProcessedCrawlCandidateCount() { + return statsCounterSnapshot.getProcessedCrawlCandidateCount(); + } + + /** + * Returns the number of successful page loads that occurred during the crawl. + * + * @return the number of successful page loads that occurred during the crawl + */ + public int getPageLoadCount() { + return statsCounterSnapshot.getPageLoadCount(); + } + + /** + * Returns the number of page load timeouts that occurred during the crawl. + * + * @return the number of page load timeouts that occurred during the crawl + */ + public int getPageLoadTimeoutCount() { + return statsCounterSnapshot.getPageLoadTimeoutCount(); + } + + /** + * Returns the number of request redirects that occurred during the crawl. + * + * @return the number of request redirects that occurred during the crawl. + */ + public int getRequestRedirectCount() { + return statsCounterSnapshot.getRequestRedirectCount(); + } + + /** + * Returns the number of responses with non-HTML content that occurred during the crawl. + * + * @return the number of responses with non-HTML content that occurred during the crawl + */ + public int getNonHtmlContentCount() { + return statsCounterSnapshot.getNonHtmlContentCount(); + } + + /** + * Returns the number of request errors that occurred during the crawl. 
+ * + * @return the number of request errors that occurred during the crawl + */ + public int getRequestErrorCount() { + return statsCounterSnapshot.getRequestErrorCount(); + } + + /** + * Returns the number of network errors that occurred during the crawl. + * + * @return the number of network errors that occurred during the crawl + */ + public int getNetworkErrorCount() { + return statsCounterSnapshot.getNetworkErrorCount(); + } + + /** + * Returns the number of filtered duplicate requests. + * + * @return the number of filtered duplicate requests + */ + public int getFilteredDuplicateRequestCount() { + return statsCounterSnapshot.getFilteredDuplicateRequestCount(); + } + + /** + * Returns the number of filtered offsite requests. + * + * @return the number of filtered offsite requests + */ + public int getFilteredOffsiteRequestCount() { + return statsCounterSnapshot.getFilteredOffsiteRequestCount(); + } + + /** + * Returns the number of filtered crawl depth limit exceeding requests. + * + * @return the number of filtered crawl depth limit exceeding requests + */ + public int getFilteredCrawlDepthLimitExceedingRequestCount() { + return statsCounterSnapshot.getFilteredCrawlDepthLimitExceedingRequestCount(); + } + + /** + * Returns a string representation of the statistics. 
+ * + * @return a string representation of the statistics + */ + @Override + public String toString() { + return new ToStringBuilder(this, ToStringStyle.SHORT_PREFIX_STYLE) + .append("runDuration", + DurationFormatUtils.formatDurationWords(runDuration.toMillis(), true, true)) + .append("crawlRate", crawlRate) + .append("remainingDurationEstimate", + DurationFormatUtils.formatDurationWords( + remainingDurationEstimate.toMillis(), true, true)) + .append("remainingCrawlCandidateCount", getRemainingCrawlCandidateCount()) + .append("processedCrawlCandidateCount", getProcessedCrawlCandidateCount()) + .append("pageLoadCount", getPageLoadCount()) + .append("pageLoadTimeoutCount", getPageLoadTimeoutCount()) + .append("requestRedirectCount", getRequestRedirectCount()) + .append("nonHtmlContentCount", getNonHtmlContentCount()) + .append("requestErrorCount", getRequestErrorCount()) + .append("networkErrorCount", getNetworkErrorCount()) + .append("filteredDuplicateRequestCount", getFilteredDuplicateRequestCount()) + .append("filteredOffsiteRequestCount", getFilteredOffsiteRequestCount()) + .append("filteredCrawlDepthLimitExceedingRequestCount", + getFilteredCrawlDepthLimitExceedingRequestCount()) + .toString(); + } + + /** + * Calculates the number of crawl candidates processed per minute. + * + * @param processedCrawlCandidateCount the number of processed crawl candidates + * @param runDuration the current run duration + * + * @return the number of crawl candidates processed per minute + */ + private static double calculateCrawlRate( + final Duration runDuration, + final int processedCrawlCandidateCount) { + long runDurationInMinutes = runDuration.toMinutes(); + if (runDurationInMinutes == 0) { + return processedCrawlCandidateCount; + } + + return (double) processedCrawlCandidateCount / runDurationInMinutes; + } + + /** + * Calculates the remaining duration estimate. 
+ * + * @param remainingCrawlCandidateCount the number of remaining crawl candidates + * @param crawlRate the number of crawl candidates processed per minute + * + * @return the remaining duration estimate + */ + private static Duration calculateRemainingDurationEstimate( + final double crawlRate, + final int remainingCrawlCandidateCount) { + Validate.finite(crawlRate, "The crawlRate parameter must be finite."); + Validate.isTrue(crawlRate > 0, "The crawlRate parameter must be larger than 0."); + + return Duration.ofMinutes((long) Math.ceil(remainingCrawlCandidateCount / crawlRate)); + } +} diff --git a/src/main/java/com/github/peterbencze/serritor/api/RestfulBaseCrawler.java b/src/main/java/com/github/peterbencze/serritor/api/RestfulBaseCrawler.java index 3f35d82..95be1fc 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/RestfulBaseCrawler.java +++ b/src/main/java/com/github/peterbencze/serritor/api/RestfulBaseCrawler.java @@ -16,8 +16,11 @@ package com.github.peterbencze.serritor.api; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.datatype.jdk8.Jdk8Module; import io.javalin.Javalin; import io.javalin.apibuilder.ApiBuilder; +import io.javalin.json.JavalinJackson; /** * Provides a skeletal implementation of a crawler to minimize the effort for users to implement @@ -76,6 +79,8 @@ protected RestfulBaseCrawler(final RestServerConfiguration config, final Crawler this.config = config; restServer = Javalin.create(); + JavalinJackson.configure(new ObjectMapper().registerModule(new Jdk8Module())); + configureRoutes(); } @@ -118,6 +123,8 @@ private void configureRoutes() { ApiBuilder.delete(ctx -> stop()); ApiBuilder.get("config", ctx -> ctx.json(getCrawlerConfiguration())); + + ApiBuilder.get("stats", ctx -> ctx.json(getCrawlStats())); }); }); }); diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java b/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java index 
2c80052..804fc4d 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java @@ -20,6 +20,7 @@ import com.github.peterbencze.serritor.api.CrawlCandidate.CrawlCandidateBuilder; import com.github.peterbencze.serritor.api.CrawlRequest; import com.github.peterbencze.serritor.api.CrawlerConfiguration; +import com.github.peterbencze.serritor.internal.stats.StatsCounter; import java.io.Serializable; import java.net.URI; import java.util.Arrays; @@ -40,6 +41,7 @@ public final class CrawlFrontier implements Serializable { private final CrawlerConfiguration config; + private final StatsCounter statsCounter; private final Set urlFingerprints; private final Queue candidates; @@ -48,10 +50,13 @@ public final class CrawlFrontier implements Serializable { /** * Creates a {@link CrawlFrontier} instance. * - * @param config the crawler configuration + * @param config the crawler configuration + * @param statsCounter the stats counter which accumulates statistics during the operation of + * the crawler */ - public CrawlFrontier(final CrawlerConfiguration config) { + public CrawlFrontier(final CrawlerConfiguration config, final StatsCounter statsCounter) { this.config = config; + this.statsCounter = statsCounter; urlFingerprints = new HashSet<>(); candidates = createPriorityQueue(); @@ -66,24 +71,20 @@ public CrawlFrontier(final CrawlerConfiguration config) { */ public void feedRequest(final CrawlRequest request, final boolean isCrawlSeed) { if (config.isOffsiteRequestFilterEnabled()) { - boolean inCrawlDomain = false; - - for (CrawlDomain allowedCrawlDomain : config.getAllowedCrawlDomains()) { - if (allowedCrawlDomain.contains(request.getDomain())) { - inCrawlDomain = true; - break; - } - } + boolean inCrawlDomain = config.getAllowedCrawlDomains() + .stream() + .anyMatch(crawlDomain -> crawlDomain.contains(request.getDomain())); if (!inCrawlDomain) { + 
statsCounter.recordOffsiteRequest(); return; } } if (config.isDuplicateRequestFilterEnabled()) { String urlFingerprint = createFingerprintForUrl(request.getRequestUrl()); - if (urlFingerprints.contains(urlFingerprint)) { + statsCounter.recordDuplicateRequest(); return; } @@ -97,6 +98,7 @@ public void feedRequest(final CrawlRequest request, final boolean isCrawlSeed) { int nextCrawlDepth = currentCandidate.getCrawlDepth() + 1; if (crawlDepthLimit != 0 && nextCrawlDepth > crawlDepthLimit) { + statsCounter.recordCrawlDepthLimitExceedingRequest(); return; } @@ -105,6 +107,7 @@ public void feedRequest(final CrawlRequest request, final boolean isCrawlSeed) { } candidates.add(builder.build()); + statsCounter.recordRemainingCrawlCandidate(); } /** @@ -140,10 +143,7 @@ public void reset() { * Feeds all the crawl seeds to the crawl frontier. */ private void feedCrawlSeeds() { - config.getCrawlSeeds() - .forEach((CrawlRequest request) -> { - feedRequest(request, true); - }); + config.getCrawlSeeds().forEach((CrawlRequest request) -> feedRequest(request, true)); } /** diff --git a/src/main/java/com/github/peterbencze/serritor/internal/stats/DurationSerializer.java b/src/main/java/com/github/peterbencze/serritor/internal/stats/DurationSerializer.java new file mode 100644 index 0000000..849edb6 --- /dev/null +++ b/src/main/java/com/github/peterbencze/serritor/internal/stats/DurationSerializer.java @@ -0,0 +1,56 @@ +/* + * Copyright 2019 Peter Bencze. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.github.peterbencze.serritor.internal.stats;
+
+import com.fasterxml.jackson.core.JsonGenerator;
+import com.fasterxml.jackson.databind.SerializerProvider;
+import com.fasterxml.jackson.databind.ser.std.StdSerializer;
+import java.io.IOException;
+import java.time.Duration;
+import org.apache.commons.lang3.time.DurationFormatUtils;
+
+/**
+ * A custom serializer mechanism that is used to serialize durations in crawl statistics. With this
+ * the given duration can be represented both in words and in numerical format.
+ */
+public final class DurationSerializer extends StdSerializer<Duration> {
+
+    /**
+     * Creates a {@link DurationSerializer} instance.
+     */
+    public DurationSerializer() {
+        super(Duration.class);
+    }
+
+    /**
+     * {@inheritDoc}
+     */
+    @Override
+    public void serialize(
+            final Duration value,
+            final JsonGenerator gen,
+            final SerializerProvider provider) throws IOException {
+        long durationInMillis = value.toMillis();
+        String durationInWords = DurationFormatUtils.formatDurationWords(durationInMillis, true,
+                true);
+
+        gen.writeStartObject();
+        gen.writeStringField("inWords", durationInWords);
+        gen.writeNumberField("inMillis", durationInMillis);
+        gen.writeEndObject();
+    }
+}
diff --git a/src/main/java/com/github/peterbencze/serritor/internal/stats/StatsCounter.java b/src/main/java/com/github/peterbencze/serritor/internal/stats/StatsCounter.java
new file mode 100644
index 0000000..64183dd
--- /dev/null
+++ b/src/main/java/com/github/peterbencze/serritor/internal/stats/StatsCounter.java
@@ -0,0 +1,282 @@
+/*
+ * Copyright 2019 Peter Bencze.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.github.peterbencze.serritor.internal.stats; + +import com.github.peterbencze.serritor.internal.util.FunctionalReentrantReadWriteLock; +import java.io.Serializable; +import org.apache.commons.lang3.Validate; + +/** + * Accumulates statistics during the operation of the crawler. + */ +public final class StatsCounter implements Serializable { + + private final FunctionalReentrantReadWriteLock lock; + + private int remainingCrawlCandidateCount; + private int processedCrawlCandidateCount; + private int pageLoadCount; + private int pageLoadTimeoutCount; + private int requestRedirectCount; + private int nonHtmlContentCount; + private int requestErrorCount; + private int networkErrorCount; + private int filteredDuplicateRequestCount; + private int filteredOffsiteRequestCount; + private int filteredCrawlDepthLimitExceedingRequestCount; + + /** + * Creates a {@link StatsCounter} instance. + */ + public StatsCounter() { + lock = new FunctionalReentrantReadWriteLock(); + } + + /** + * Returns the number of remaining crawl candidates. + * + * @return the number of remaining crawl candidates + */ + public int getRemainingCrawlCandidateCount() { + return lock.readWithLock(() -> remainingCrawlCandidateCount); + } + + /** + * Records an added crawl candidate. This should be called when a crawl candidate is added to + * the crawl frontier. + */ + public void recordRemainingCrawlCandidate() { + lock.writeWithLock(() -> ++remainingCrawlCandidateCount); + } + + /** + * Returns the number of processed crawl candidates. 
+ * + * @return the number of processed crawl candidates + */ + public int getProcessedCrawlCandidateCount() { + return lock.readWithLock(() -> processedCrawlCandidateCount); + } + + /** + * Returns the number of successful page loads that occurred during the crawl. + * + * @return the number of successful page loads that occurred during the crawl + */ + public int getPageLoadCount() { + return lock.readWithLock(() -> pageLoadCount); + } + + /** + * Records a successful page load. This should be called when the status code of the response is + * successful. + */ + public void recordPageLoad() { + lock.writeWithLock(() -> { + decrementRemainingCrawlCandidateCount(); + + ++pageLoadCount; + incrementProcessedCrawlCandidateCount(); + }); + } + + /** + * Returns the number of page load timeouts that occurred during the crawl. + * + * @return the number of page load timeouts that occurred during the crawl + */ + public int getPageLoadTimeoutCount() { + return lock.readWithLock(() -> pageLoadTimeoutCount); + } + + /** + * Records a page load timeout. This should be called when a page does not load in the browser + * within the timeout period. + */ + public void recordPageLoadTimeout() { + lock.writeWithLock(() -> { + decrementRemainingCrawlCandidateCount(); + + ++pageLoadTimeoutCount; + incrementProcessedCrawlCandidateCount(); + }); + } + + /** + * Returns the number of request redirects that occurred during the crawl. + * + * @return the number of request redirects that occurred during the crawl. + */ + public int getRequestRedirectCount() { + return lock.readWithLock(() -> requestRedirectCount); + } + + /** + * Records a request redirect. This should be called when a request is redirected. 
+ */ + public void recordRequestRedirect() { + lock.writeWithLock(() -> { + decrementRemainingCrawlCandidateCount(); + + ++requestRedirectCount; + incrementProcessedCrawlCandidateCount(); + }); + } + + /** + * Returns the number of responses with non-HTML content that occurred during the crawl. + * + * @return the number of responses with non-HTML content that occurred during the crawl + */ + public int getNonHtmlContentCount() { + return lock.readWithLock(() -> nonHtmlContentCount); + } + + /** + * Records a response with non-HTML content. This should be called when the MIME type of a + * response is not text/html. + */ + public void recordNonHtmlContent() { + lock.writeWithLock(() -> { + decrementRemainingCrawlCandidateCount(); + + ++nonHtmlContentCount; + incrementProcessedCrawlCandidateCount(); + }); + } + + /** + * Returns the number of request errors that occurred during the crawl. + * + * @return the number of request errors that occurred during the crawl + */ + public int getRequestErrorCount() { + return lock.readWithLock(() -> requestErrorCount); + } + + /** + * Records an error response. This should be called when the status code of the response is 4xx + * or 5xx. + */ + public void recordRequestError() { + lock.writeWithLock(() -> { + decrementRemainingCrawlCandidateCount(); + + ++requestErrorCount; + incrementProcessedCrawlCandidateCount(); + }); + } + + /** + * Returns the number of network errors that occurred during the crawl. + * + * @return the number of network errors that occurred during the crawl + */ + public int getNetworkErrorCount() { + return lock.readWithLock(() -> networkErrorCount); + } + + /** + * Records a network error. This should be called when a network error occurs while trying to + * fulfill a request. 
+ */ + public void recordNetworkError() { + lock.writeWithLock(() -> { + decrementRemainingCrawlCandidateCount(); + + ++networkErrorCount; + incrementProcessedCrawlCandidateCount(); + }); + } + + /** + * Returns the number of filtered duplicate requests. + * + * @return the number of filtered duplicate requests + */ + public int getFilteredDuplicateRequestCount() { + return lock.readWithLock(() -> filteredDuplicateRequestCount); + } + + /** + * Records a duplicate request. This should be called when the duplicate request filter is + * enabled and a duplicate request is encountered. + */ + public void recordDuplicateRequest() { + lock.writeWithLock(() -> ++filteredDuplicateRequestCount); + } + + /** + * Returns the number of filtered offsite requests. + * + * @return the number of filtered offsite requests + */ + public int getFilteredOffsiteRequestCount() { + return lock.readWithLock(() -> filteredOffsiteRequestCount); + } + + /** + * Records an offsite request. This should be called when the offsite request filter is enabled + * and an offsite request is encountered. + */ + public void recordOffsiteRequest() { + lock.writeWithLock(() -> ++filteredOffsiteRequestCount); + } + + /** + * Returns the number of filtered crawl depth limit exceeding requests. + * + * @return the number of filtered crawl depth limit exceeding requests + */ + public int getFilteredCrawlDepthLimitExceedingRequestCount() { + return lock.readWithLock(() -> filteredCrawlDepthLimitExceedingRequestCount); + } + + /** + * Records a crawl depth limit exceeding request. This should be called when a crawl depth limit + * is set and the request's crawl depth exceeds this limit. + */ + public void recordCrawlDepthLimitExceedingRequest() { + lock.writeWithLock(() -> ++filteredCrawlDepthLimitExceedingRequestCount); + } + + /** + * Returns a snapshot of this counter's values. 
+ * + * @return a snapshot of this counter's values + */ + public StatsCounterSnapshot getSnapshot() { + return lock.readWithLock(() -> new StatsCounterSnapshot(this)); + } + + /** + * Increments the number of processed crawl candidates. + */ + private void incrementProcessedCrawlCandidateCount() { + ++processedCrawlCandidateCount; + } + + /** + * Decrements the number of remaining crawl candidates. This number cannot be negative. + */ + private void decrementRemainingCrawlCandidateCount() { + Validate.validState(remainingCrawlCandidateCount > 0, + "The number of remaining crawl candidates cannot be negative."); + + --remainingCrawlCandidateCount; + } +} diff --git a/src/main/java/com/github/peterbencze/serritor/internal/stats/StatsCounterSnapshot.java b/src/main/java/com/github/peterbencze/serritor/internal/stats/StatsCounterSnapshot.java new file mode 100644 index 0000000..c92dc5b --- /dev/null +++ b/src/main/java/com/github/peterbencze/serritor/internal/stats/StatsCounterSnapshot.java @@ -0,0 +1,154 @@ +/* + * Copyright 2019 Peter Bencze. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.github.peterbencze.serritor.internal.stats; + +/** + * Represents a snapshot of the stats counter values. 
+ */ +public final class StatsCounterSnapshot { + + private final int remainingCrawlCandidateCount; + private final int processedCrawlCandidateCount; + private final int pageLoadCount; + private final int pageLoadTimeoutCount; + private final int requestRedirectCount; + private final int nonHtmlContentCount; + private final int requestErrorCount; + private final int networkErrorCount; + private final int filteredDuplicateRequestCount; + private final int filteredOffsiteRequestCount; + private final int filteredCrawlDepthLimitExceedingRequestCount; + + /** + * Creates a {@link StatsCounterSnapshot} instance. + * + * @param statsCounter the stats counter object to create the snapshot from + */ + public StatsCounterSnapshot(final StatsCounter statsCounter) { + remainingCrawlCandidateCount = statsCounter.getRemainingCrawlCandidateCount(); + processedCrawlCandidateCount = statsCounter.getProcessedCrawlCandidateCount(); + pageLoadCount = statsCounter.getPageLoadCount(); + pageLoadTimeoutCount = statsCounter.getPageLoadTimeoutCount(); + requestRedirectCount = statsCounter.getRequestRedirectCount(); + nonHtmlContentCount = statsCounter.getNonHtmlContentCount(); + requestErrorCount = statsCounter.getRequestErrorCount(); + networkErrorCount = statsCounter.getNetworkErrorCount(); + filteredDuplicateRequestCount = statsCounter.getFilteredDuplicateRequestCount(); + filteredOffsiteRequestCount = statsCounter.getFilteredOffsiteRequestCount(); + filteredCrawlDepthLimitExceedingRequestCount = + statsCounter.getFilteredCrawlDepthLimitExceedingRequestCount(); + } + + /** + * Returns the number of remaining crawl candidates. + * + * @return the number of remaining crawl candidates + */ + public int getRemainingCrawlCandidateCount() { + return remainingCrawlCandidateCount; + } + + /** + * Returns the number of processed crawl candidates. 
+ * + * @return the number of processed crawl candidates + */ + public int getProcessedCrawlCandidateCount() { + return processedCrawlCandidateCount; + } + + /** + * Returns the number of successful page loads that occurred during the crawl. + * + * @return the number of successful page loads that occurred during the crawl + */ + public int getPageLoadCount() { + return pageLoadCount; + } + + /** + * Returns the number of page load timeouts that occurred during the crawl. + * + * @return the number of page load timeouts that occurred during the crawl + */ + public int getPageLoadTimeoutCount() { + return pageLoadTimeoutCount; + } + + /** + * Returns the number of request redirects that occurred during the crawl. + * + * @return the number of request redirects that occurred during the crawl. + */ + public int getRequestRedirectCount() { + return requestRedirectCount; + } + + /** + * Returns the number of responses with non-HTML content that occurred during the crawl. + * + * @return the number of responses with non-HTML content that occurred during the crawl + */ + public int getNonHtmlContentCount() { + return nonHtmlContentCount; + } + + /** + * Returns the number of request errors that occurred during the crawl. + * + * @return the number of request errors that occurred during the crawl + */ + public int getRequestErrorCount() { + return requestErrorCount; + } + + /** + * Returns the number of network errors that occurred during the crawl. + * + * @return the number of network errors that occurred during the crawl + */ + public int getNetworkErrorCount() { + return networkErrorCount; + } + + /** + * Returns the number of filtered duplicate requests. + * + * @return the number of filtered duplicate requests + */ + public int getFilteredDuplicateRequestCount() { + return filteredDuplicateRequestCount; + } + + /** + * Returns the number of filtered offsite requests. 
+ * + * @return the number of filtered offsite requests + */ + public int getFilteredOffsiteRequestCount() { + return filteredOffsiteRequestCount; + } + + /** + * Returns the number of filtered crawl depth limit exceeding requests. + * + * @return the number of filtered crawl depth limit exceeding requests + */ + public int getFilteredCrawlDepthLimitExceedingRequestCount() { + return filteredCrawlDepthLimitExceedingRequestCount; + } +} diff --git a/src/main/java/com/github/peterbencze/serritor/internal/stopwatch/Stopwatch.java b/src/main/java/com/github/peterbencze/serritor/internal/stopwatch/Stopwatch.java index 1cc49b1..051d18c 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/stopwatch/Stopwatch.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/stopwatch/Stopwatch.java @@ -16,18 +16,22 @@ package com.github.peterbencze.serritor.internal.stopwatch; +import com.github.peterbencze.serritor.internal.util.FunctionalReentrantReadWriteLock; import java.io.Serializable; import java.time.Duration; import java.time.Instant; import org.apache.commons.lang3.Validate; /** - * A serializable stopwatch implementation that can be used to measure elapsed time. + * A serializable and thread-safe stopwatch implementation that can be used to measure elapsed + * time. * * @author Peter Bencze */ public final class Stopwatch implements Serializable { + private final FunctionalReentrantReadWriteLock lock; + private TimeSource timeSource; private Instant startTime; private Duration elapsedDuration; @@ -40,6 +44,7 @@ public final class Stopwatch implements Serializable { */ public Stopwatch(final TimeSource timeSource) { this.timeSource = timeSource; + lock = new FunctionalReentrantReadWriteLock(); elapsedDuration = Duration.ZERO; isRunning = false; } @@ -55,10 +60,12 @@ public Stopwatch() { * Starts the stopwatch. 
*/ public void start() { - Validate.validState(!isRunning, "The stopwatch is already running."); + lock.writeWithLock(() -> { + Validate.validState(!isRunning, "The stopwatch is already running."); - startTime = timeSource.getTime(); - isRunning = true; + startTime = timeSource.getTime(); + isRunning = true; + }); } /** @@ -67,17 +74,20 @@ public void start() { * @return true if the stopwatch is running, false otherwise */ public boolean isRunning() { - return isRunning; + return lock.readWithLock(() -> isRunning); } /** * Stops the stopwatch. */ public void stop() { - Validate.validState(isRunning, "The stopwatch is not running."); + lock.writeWithLock(() -> { + Validate.validState(isRunning, "The stopwatch is not running."); - elapsedDuration = elapsedDuration.plus(Duration.between(startTime, timeSource.getTime())); - isRunning = false; + elapsedDuration = elapsedDuration.plus(Duration.between(startTime, + timeSource.getTime())); + isRunning = false; + }); } /** @@ -86,10 +96,12 @@ public void stop() { * @return the current elapsed duration */ public Duration getElapsedDuration() { - if (isRunning) { - return Duration.between(startTime, timeSource.getTime()).plus(elapsedDuration); - } + return lock.readWithLock(() -> { + if (isRunning) { + return Duration.between(startTime, timeSource.getTime()).plus(elapsedDuration); + } - return elapsedDuration; + return elapsedDuration; + }); } } diff --git a/src/main/java/com/github/peterbencze/serritor/internal/util/FunctionalReentrantReadWriteLock.java b/src/main/java/com/github/peterbencze/serritor/internal/util/FunctionalReentrantReadWriteLock.java new file mode 100644 index 0000000..9519752 --- /dev/null +++ b/src/main/java/com/github/peterbencze/serritor/internal/util/FunctionalReentrantReadWriteLock.java @@ -0,0 +1,93 @@ +/* + * Copyright 2019 Peter Bencze. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.github.peterbencze.serritor.internal.util;
+
+import java.util.concurrent.locks.ReentrantReadWriteLock;
+import java.util.function.Supplier;
+
+/**
+ * An implementation of {@link ReentrantReadWriteLock} that adds the possibility of specifying
+ * actions (in a functional manner) which are executed under the lock.
+ */
+public final class FunctionalReentrantReadWriteLock extends ReentrantReadWriteLock {
+
+    /**
+     * Executes the given action under the read lock of this lock.
+     *
+     * @param action the action to execute
+     * @param <T> the type of result supplied by the action
+     *
+     * @return the result of the action
+     */
+    public <T> T readWithLock(final Supplier<T> action) {
+        readLock().lock();
+
+        try {
+            return action.get();
+        } finally {
+            readLock().unlock();
+        }
+    }
+
+    /**
+     * Executes the given action under the read lock of this lock.
+     *
+     * @param action the action to execute
+     */
+    public void readWithLock(final Runnable action) {
+        readLock().lock();
+
+        try {
+            action.run();
+        } finally {
+            readLock().unlock();
+        }
+    }
+
+    /**
+     * Executes the given action under the write lock of this lock.
+     *
+     * @param action the action to execute
+     * @param <T> the type of result supplied by the action
+     *
+     * @return the result of the action
+     */
+    public <T> T writeWithLock(final Supplier<T> action) {
+        writeLock().lock();
+
+        try {
+            return action.get();
+        } finally {
+            writeLock().unlock();
+        }
+    }
+
+    /**
+     * Executes the given action under the write lock of this lock.
+ * + * @param action the action to execute + */ + public void writeWithLock(final Runnable action) { + writeLock().lock(); + + try { + action.run(); + } finally { + writeLock().unlock(); + } + } +} diff --git a/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java b/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java index 681c569..5f6bcf1 100644 --- a/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java +++ b/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java @@ -22,6 +22,7 @@ import com.github.peterbencze.serritor.api.CrawlStrategy; import com.github.peterbencze.serritor.api.CrawlerConfiguration; import com.github.peterbencze.serritor.api.CrawlerConfiguration.CrawlerConfigurationBuilder; +import com.github.peterbencze.serritor.internal.stats.StatsCounter; import com.google.common.collect.Sets; import java.net.URI; import java.util.Arrays; @@ -109,19 +110,23 @@ public final class CrawlFrontierTest { // Max crawl depth private static final int MAX_CRAWL_DEPTH = 1; - private CrawlerConfiguration config; + private CrawlerConfiguration configMock; + private StatsCounter statsCounterMock; @Before public void before() { - config = Mockito.spy(new CrawlerConfigurationBuilder().setOffsiteRequestFilterEnabled(true) + configMock = Mockito.spy(new CrawlerConfigurationBuilder() + .setOffsiteRequestFilterEnabled(true) .addAllowedCrawlDomains(ALLOWED_CRAWL_DOMAINS) .addCrawlSeeds(CRAWL_SEEDS) .build()); + + statsCounterMock = Mockito.mock(StatsCounter.class); } @Test public void testHasNextCandidateWithNonEmptyQueue() { - CrawlFrontier crawlFrontier = new CrawlFrontier(config); + CrawlFrontier crawlFrontier = new CrawlFrontier(configMock, statsCounterMock); Assert.assertTrue(crawlFrontier.hasNextCandidate()); @@ -149,16 +154,16 @@ public void testHasNextCandidateWithNonEmptyQueue() { @Test public void testHasNextCandidateWithEmptyQueue() { - 
Mockito.when(config.getCrawlSeeds()).thenReturn(Collections.EMPTY_SET); + Mockito.when(configMock.getCrawlSeeds()).thenReturn(Collections.EMPTY_SET); - CrawlFrontier crawlFrontier = new CrawlFrontier(config); + CrawlFrontier crawlFrontier = new CrawlFrontier(configMock, statsCounterMock); Assert.assertFalse(crawlFrontier.hasNextCandidate()); } @Test public void testEnabledDuplicateRequestFiltering() { - CrawlFrontier crawlFrontier = new CrawlFrontier(config); + CrawlFrontier crawlFrontier = new CrawlFrontier(configMock, statsCounterMock); clearCrawlCandidateQueue(crawlFrontier); crawlFrontier.feedRequest(DUPLICATE_ROOT_URL_0_CRAWL_REQUEST, false); @@ -168,9 +173,9 @@ public void testEnabledDuplicateRequestFiltering() { @Test public void testDisabledDuplicateRequestFiltering() { - Mockito.when(config.isDuplicateRequestFilterEnabled()).thenReturn(false); + Mockito.when(configMock.isDuplicateRequestFilterEnabled()).thenReturn(false); - CrawlFrontier crawlFrontier = new CrawlFrontier(config); + CrawlFrontier crawlFrontier = new CrawlFrontier(configMock, statsCounterMock); clearCrawlCandidateQueue(crawlFrontier); crawlFrontier.feedRequest(DUPLICATE_ROOT_URL_0_CRAWL_REQUEST, true); @@ -181,19 +186,21 @@ public void testDisabledDuplicateRequestFiltering() { @Test public void testEnabledOffsiteRequestFiltering() { - Mockito.when(config.getCrawlSeeds()).thenReturn(Sets.newHashSet(OFFSITE_URL_CRAWL_REQUEST)); + Mockito.when(configMock.getCrawlSeeds()) + .thenReturn(Sets.newHashSet(OFFSITE_URL_CRAWL_REQUEST)); - CrawlFrontier crawlFrontier = new CrawlFrontier(config); + CrawlFrontier crawlFrontier = new CrawlFrontier(configMock, statsCounterMock); Assert.assertFalse(crawlFrontier.hasNextCandidate()); } @Test public void testDisabledOffsiteRequestFiltering() { - Mockito.when(config.isOffsiteRequestFilterEnabled()).thenReturn(false); - Mockito.when(config.getCrawlSeeds()).thenReturn(Sets.newHashSet(OFFSITE_URL_CRAWL_REQUEST)); + 
Mockito.when(configMock.isOffsiteRequestFilterEnabled()).thenReturn(false); + Mockito.when(configMock.getCrawlSeeds()) + .thenReturn(Sets.newHashSet(OFFSITE_URL_CRAWL_REQUEST)); - CrawlFrontier crawlFrontier = new CrawlFrontier(config); + CrawlFrontier crawlFrontier = new CrawlFrontier(configMock, statsCounterMock); Assert.assertTrue(crawlFrontier.hasNextCandidate()); Assert.assertEquals(OFFSITE_URL, @@ -202,7 +209,7 @@ public void testDisabledOffsiteRequestFiltering() { @Test public void testGetNextCandidateWhenUsingBreadthFirstCrawlStrategy() { - CrawlFrontier crawlFrontier = new CrawlFrontier(config); + CrawlFrontier crawlFrontier = new CrawlFrontier(configMock, statsCounterMock); CrawlCandidate nextCandidate = crawlFrontier.getNextCandidate(); Assert.assertEquals(ROOT_URL_1, nextCandidate.getRequestUrl()); @@ -240,9 +247,9 @@ public void testGetNextCandidateWhenUsingBreadthFirstCrawlStrategy() { @Test public void testGetNextCandidateWhenUsingDepthFirstCrawlStrategy() { - Mockito.when(config.getCrawlStrategy()).thenReturn(CrawlStrategy.DEPTH_FIRST); + Mockito.when(configMock.getCrawlStrategy()).thenReturn(CrawlStrategy.DEPTH_FIRST); - CrawlFrontier crawlFrontier = new CrawlFrontier(config); + CrawlFrontier crawlFrontier = new CrawlFrontier(configMock, statsCounterMock); CrawlCandidate nextCandidate = crawlFrontier.getNextCandidate(); Assert.assertEquals(ROOT_URL_1, nextCandidate.getRequestUrl()); @@ -279,9 +286,9 @@ public void testGetNextCandidateWhenUsingDepthFirstCrawlStrategy() { @Test public void testCrawlDepthLimitation() { - Mockito.when(config.getMaximumCrawlDepth()).thenReturn(MAX_CRAWL_DEPTH); + Mockito.when(configMock.getMaximumCrawlDepth()).thenReturn(MAX_CRAWL_DEPTH); - CrawlFrontier crawlFrontier = new CrawlFrontier(config); + CrawlFrontier crawlFrontier = new CrawlFrontier(configMock, statsCounterMock); clearCrawlCandidateQueue(crawlFrontier); crawlFrontier.feedRequest(CHILD_URL_0_CRAWL_REQUEST, false); @@ -296,7 +303,7 @@ public void 
testCrawlDepthLimitation() { @Test public void testReset() { - CrawlFrontier crawlFrontier = new CrawlFrontier(config); + CrawlFrontier crawlFrontier = new CrawlFrontier(configMock, statsCounterMock); crawlFrontier.reset(); diff --git a/src/test/java/com/github/peterbencze/serritor/internal/stats/StatsCounterTest.java b/src/test/java/com/github/peterbencze/serritor/internal/stats/StatsCounterTest.java new file mode 100644 index 0000000..e89e2fc --- /dev/null +++ b/src/test/java/com/github/peterbencze/serritor/internal/stats/StatsCounterTest.java @@ -0,0 +1,136 @@ +/* + * Copyright 2019 Peter Bencze. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.github.peterbencze.serritor.internal.stats; + +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +/** + * Test cases for {@link StatsCounter}. 
+ */ +public final class StatsCounterTest { + + private StatsCounter statsCounter; + + @Before + public void before() { + statsCounter = new StatsCounter(); + } + + @Test + public void testRecordPageLoad() { + statsCounter.recordRemainingCrawlCandidate(); + + int remainingCrawlCandidateCountBefore = statsCounter.getRemainingCrawlCandidateCount(); + int pageLoadCountBefore = statsCounter.getPageLoadCount(); + int processedCrawlCandidateCountBefore = statsCounter.getProcessedCrawlCandidateCount(); + + statsCounter.recordPageLoad(); + + Assert.assertEquals(remainingCrawlCandidateCountBefore - 1, + statsCounter.getRemainingCrawlCandidateCount()); + Assert.assertEquals(pageLoadCountBefore + 1, statsCounter.getPageLoadCount()); + Assert.assertEquals(processedCrawlCandidateCountBefore + 1, + statsCounter.getProcessedCrawlCandidateCount()); + } + + @Test + public void testRecordPageLoadTimeout() { + statsCounter.recordRemainingCrawlCandidate(); + + int remainingCrawlCandidateCountBefore = statsCounter.getRemainingCrawlCandidateCount(); + int pageLoadTimeoutCountBefore = statsCounter.getPageLoadTimeoutCount(); + int processedCrawlCandidateCountBefore = statsCounter.getProcessedCrawlCandidateCount(); + + statsCounter.recordPageLoad(); + + Assert.assertEquals(remainingCrawlCandidateCountBefore - 1, + statsCounter.getRemainingCrawlCandidateCount()); + Assert.assertEquals(pageLoadTimeoutCountBefore + 1, statsCounter.getPageLoadCount()); + Assert.assertEquals(processedCrawlCandidateCountBefore + 1, + statsCounter.getProcessedCrawlCandidateCount()); + } + + @Test + public void testRecordRequestRedirect() { + statsCounter.recordRemainingCrawlCandidate(); + + int remainingCrawlCandidateCountBefore = statsCounter.getRemainingCrawlCandidateCount(); + int requestRedirectCountBefore = statsCounter.getRequestRedirectCount(); + int processedCrawlCandidateCountBefore = statsCounter.getProcessedCrawlCandidateCount(); + + statsCounter.recordPageLoad(); + + 
Assert.assertEquals(remainingCrawlCandidateCountBefore - 1, + statsCounter.getRemainingCrawlCandidateCount()); + Assert.assertEquals(requestRedirectCountBefore + 1, statsCounter.getRequestRedirectCount()); + Assert.assertEquals(processedCrawlCandidateCountBefore + 1, + statsCounter.getProcessedCrawlCandidateCount()); + } + + @Test + public void testRecordNonHtmlContent() { + statsCounter.recordRemainingCrawlCandidate(); + + int remainingCrawlCandidateCountBefore = statsCounter.getRemainingCrawlCandidateCount(); + int nonHtmlContentCount = statsCounter.getNonHtmlContentCount(); + int processedCrawlCandidateCountBefore = statsCounter.getProcessedCrawlCandidateCount(); + + statsCounter.recordNonHtmlContent(); + + Assert.assertEquals(remainingCrawlCandidateCountBefore - 1, + statsCounter.getRemainingCrawlCandidateCount()); + Assert.assertEquals(nonHtmlContentCount + 1, statsCounter.getNonHtmlContentCount()); + Assert.assertEquals(processedCrawlCandidateCountBefore + 1, + statsCounter.getProcessedCrawlCandidateCount()); + } + + @Test + public void testRecordRequestError() { + statsCounter.recordRemainingCrawlCandidate(); + + int remainingCrawlCandidateCountBefore = statsCounter.getRemainingCrawlCandidateCount(); + int requestErrorCount = statsCounter.getRequestErrorCount(); + int processedCrawlCandidateCountBefore = statsCounter.getProcessedCrawlCandidateCount(); + + statsCounter.recordRequestError(); + + Assert.assertEquals(remainingCrawlCandidateCountBefore - 1, + statsCounter.getRemainingCrawlCandidateCount()); + Assert.assertEquals(requestErrorCount + 1, statsCounter.getRequestErrorCount()); + Assert.assertEquals(processedCrawlCandidateCountBefore + 1, + statsCounter.getProcessedCrawlCandidateCount()); + } + + @Test + public void testRecordNetworkError() { + statsCounter.recordRemainingCrawlCandidate(); + + int remainingCrawlCandidateCountBefore = statsCounter.getRemainingCrawlCandidateCount(); + int networkErrorCount = statsCounter.getNetworkErrorCount(); + int 
processedCrawlCandidateCountBefore = statsCounter.getProcessedCrawlCandidateCount(); + + statsCounter.recordNetworkError(); + + Assert.assertEquals(remainingCrawlCandidateCountBefore - 1, + statsCounter.getRemainingCrawlCandidateCount()); + Assert.assertEquals(networkErrorCount + 1, statsCounter.getNetworkErrorCount()); + Assert.assertEquals(processedCrawlCandidateCountBefore + 1, + statsCounter.getProcessedCrawlCandidateCount()); + } +} From 50050a16339f7bbda609902e8138c0cf5ea4bfae Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Thu, 14 Mar 2019 01:12:57 +0100 Subject: [PATCH 33/63] Add relative redirect URL handling --- .../com/github/peterbencze/serritor/api/BaseCrawler.java | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java index 0561bc9..f809c0c 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java +++ b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java @@ -639,7 +639,10 @@ private void performDelay() { private static CrawlRequest createCrawlRequestForRedirect( final CrawlCandidate currentCandidate, final String redirectUrl) { - CrawlRequestBuilder builder = new CrawlRequestBuilder(redirectUrl) + // Handle relative redirect URLs + URI resolvedUrl = currentCandidate.getRequestUrl().resolve(redirectUrl); + + CrawlRequestBuilder builder = new CrawlRequestBuilder(resolvedUrl) .setPriority(currentCandidate.getPriority()); currentCandidate.getMetadata().ifPresent(builder::setMetadata); From e763e1af4880b10d625e91d91e98ad154f974342 Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Thu, 14 Mar 2019 01:27:54 +0100 Subject: [PATCH 34/63] Use parseInt instead of valueOf to parse proxy port number --- .../java/com/github/peterbencze/serritor/api/BaseCrawler.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java index f809c0c..1a4f69c 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java +++ b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java @@ -216,7 +216,7 @@ private void start(final Browser browser, if (chainedProxy != null && chainedProxy.getHttpProxy() != null) { String[] urlComponents = chainedProxy.getHttpProxy().split(":"); String host = urlComponents[0]; - int port = Integer.valueOf(urlComponents[1]); + int port = Integer.parseInt(urlComponents[1]); proxyServer.setChainedProxy(new InetSocketAddress(host, port)); } From 6540ce35bd1889038c1d01b7315fc1ed32a28f43 Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Thu, 14 Mar 2019 21:59:06 +0100 Subject: [PATCH 35/63] Find HAR entry by candidate URL --- .../com/github/peterbencze/serritor/api/BaseCrawler.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java index 1a4f69c..24ec67b 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java +++ b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java @@ -447,10 +447,10 @@ private void run() { HttpClientUtils.closeQuietly(httpHeadResponse); } - HarResponse harResponse = proxyServer.getHar() - .getLog() - .getEntries() - .get(0) + HarResponse harResponse = proxyServer.getHar().getLog().getEntries().stream() + .filter(harEntry -> candidateUrl.equals(harEntry.getRequest().getUrl())) + .findFirst() + .orElseThrow(() -> new IllegalStateException("No HAR entry for candidate URL")) .getResponse(); if (harResponse.getError() != null) { handleNetworkError(new NetworkErrorEvent(currentCandidate, harResponse.getError())); From d005988e6db42347f1c5395ccbad6b62a566b968 Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: 
Thu, 14 Mar 2019 23:03:27 +0100 Subject: [PATCH 36/63] Move stopwatch to util package --- .../java/com/github/peterbencze/serritor/api/BaseCrawler.java | 2 +- .../serritor/internal/{ => util}/stopwatch/Stopwatch.java | 2 +- .../serritor/internal/{ => util}/stopwatch/TimeSource.java | 2 +- .../serritor/internal/{ => util}/stopwatch/UtcTimeSource.java | 2 +- .../serritor/internal/{ => util}/stopwatch/StopwatchTest.java | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) rename src/main/java/com/github/peterbencze/serritor/internal/{ => util}/stopwatch/Stopwatch.java (97%) rename src/main/java/com/github/peterbencze/serritor/internal/{ => util}/stopwatch/TimeSource.java (93%) rename src/main/java/com/github/peterbencze/serritor/internal/{ => util}/stopwatch/UtcTimeSource.java (93%) rename src/test/java/com/github/peterbencze/serritor/internal/{ => util}/stopwatch/StopwatchTest.java (97%) diff --git a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java index 24ec67b..026193e 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java +++ b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java @@ -34,7 +34,7 @@ import com.github.peterbencze.serritor.internal.event.EventCallbackManager; import com.github.peterbencze.serritor.internal.event.EventObject; import com.github.peterbencze.serritor.internal.stats.StatsCounter; -import com.github.peterbencze.serritor.internal.stopwatch.Stopwatch; +import com.github.peterbencze.serritor.internal.util.stopwatch.Stopwatch; import java.io.File; import java.io.IOException; import java.net.InetSocketAddress; diff --git a/src/main/java/com/github/peterbencze/serritor/internal/stopwatch/Stopwatch.java b/src/main/java/com/github/peterbencze/serritor/internal/util/stopwatch/Stopwatch.java similarity index 97% rename from src/main/java/com/github/peterbencze/serritor/internal/stopwatch/Stopwatch.java rename to 
src/main/java/com/github/peterbencze/serritor/internal/util/stopwatch/Stopwatch.java index 051d18c..c78efd1 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/stopwatch/Stopwatch.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/util/stopwatch/Stopwatch.java @@ -14,7 +14,7 @@ * limitations under the License. */ -package com.github.peterbencze.serritor.internal.stopwatch; +package com.github.peterbencze.serritor.internal.util.stopwatch; import com.github.peterbencze.serritor.internal.util.FunctionalReentrantReadWriteLock; import java.io.Serializable; diff --git a/src/main/java/com/github/peterbencze/serritor/internal/stopwatch/TimeSource.java b/src/main/java/com/github/peterbencze/serritor/internal/util/stopwatch/TimeSource.java similarity index 93% rename from src/main/java/com/github/peterbencze/serritor/internal/stopwatch/TimeSource.java rename to src/main/java/com/github/peterbencze/serritor/internal/util/stopwatch/TimeSource.java index dd08797..3f74564 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/stopwatch/TimeSource.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/util/stopwatch/TimeSource.java @@ -14,7 +14,7 @@ * limitations under the License. 
*/ -package com.github.peterbencze.serritor.internal.stopwatch; +package com.github.peterbencze.serritor.internal.util.stopwatch; import java.io.Serializable; import java.time.Instant; diff --git a/src/main/java/com/github/peterbencze/serritor/internal/stopwatch/UtcTimeSource.java b/src/main/java/com/github/peterbencze/serritor/internal/util/stopwatch/UtcTimeSource.java similarity index 93% rename from src/main/java/com/github/peterbencze/serritor/internal/stopwatch/UtcTimeSource.java rename to src/main/java/com/github/peterbencze/serritor/internal/util/stopwatch/UtcTimeSource.java index d1b7b5f..263ca35 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/stopwatch/UtcTimeSource.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/util/stopwatch/UtcTimeSource.java @@ -14,7 +14,7 @@ * limitations under the License. */ -package com.github.peterbencze.serritor.internal.stopwatch; +package com.github.peterbencze.serritor.internal.util.stopwatch; import java.time.Instant; diff --git a/src/test/java/com/github/peterbencze/serritor/internal/stopwatch/StopwatchTest.java b/src/test/java/com/github/peterbencze/serritor/internal/util/stopwatch/StopwatchTest.java similarity index 97% rename from src/test/java/com/github/peterbencze/serritor/internal/stopwatch/StopwatchTest.java rename to src/test/java/com/github/peterbencze/serritor/internal/util/stopwatch/StopwatchTest.java index e28130c..dd302f6 100644 --- a/src/test/java/com/github/peterbencze/serritor/internal/stopwatch/StopwatchTest.java +++ b/src/test/java/com/github/peterbencze/serritor/internal/util/stopwatch/StopwatchTest.java @@ -14,7 +14,7 @@ * limitations under the License. 
*/ -package com.github.peterbencze.serritor.internal.stopwatch; +package com.github.peterbencze.serritor.internal.util.stopwatch; import java.time.Duration; import java.time.Instant; From 5322589eaf456cd237c4e6118e06829b78cd9673 Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Thu, 14 Mar 2019 23:22:51 +0100 Subject: [PATCH 37/63] Update checkstyle --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 1f899f3..878508b 100644 --- a/pom.xml +++ b/pom.xml @@ -148,7 +148,7 @@ com.puppycrawl.tools checkstyle - 8.14 + 8.18 From 9b8ecca492e4f67476a463d3b22f44d826d69f77 Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Thu, 14 Mar 2019 23:42:01 +0100 Subject: [PATCH 38/63] Move cookie converter to util package --- .../peterbencze/serritor/api/BaseCrawler.java | 2 +- .../internal/{ => util}/CookieConverter.java | 10 ++++++++-- .../internal/{ => util}/CookieConverterTest.java | 13 ++----------- 3 files changed, 11 insertions(+), 14 deletions(-) rename src/main/java/com/github/peterbencze/serritor/internal/{ => util}/CookieConverter.java (88%) rename src/test/java/com/github/peterbencze/serritor/internal/{ => util}/CookieConverterTest.java (80%) diff --git a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java index 026193e..6747a1f 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java +++ b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java @@ -24,7 +24,6 @@ import com.github.peterbencze.serritor.api.event.PageLoadTimeoutEvent; import com.github.peterbencze.serritor.api.event.RequestErrorEvent; import com.github.peterbencze.serritor.api.event.RequestRedirectEvent; -import com.github.peterbencze.serritor.internal.CookieConverter; import com.github.peterbencze.serritor.internal.CrawlFrontier; import com.github.peterbencze.serritor.internal.WebDriverFactory; import 
com.github.peterbencze.serritor.internal.crawldelaymechanism.AdaptiveCrawlDelayMechanism; @@ -34,6 +33,7 @@ import com.github.peterbencze.serritor.internal.event.EventCallbackManager; import com.github.peterbencze.serritor.internal.event.EventObject; import com.github.peterbencze.serritor.internal.stats.StatsCounter; +import com.github.peterbencze.serritor.internal.util.CookieConverter; import com.github.peterbencze.serritor.internal.util.stopwatch.Stopwatch; import java.io.File; import java.io.IOException; diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CookieConverter.java b/src/main/java/com/github/peterbencze/serritor/internal/util/CookieConverter.java similarity index 88% rename from src/main/java/com/github/peterbencze/serritor/internal/CookieConverter.java rename to src/main/java/com/github/peterbencze/serritor/internal/util/CookieConverter.java index 5bbf1f3..4b104e1 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/CookieConverter.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/util/CookieConverter.java @@ -14,14 +14,14 @@ * limitations under the License. */ -package com.github.peterbencze.serritor.internal; +package com.github.peterbencze.serritor.internal.util; import org.apache.commons.lang3.StringUtils; import org.apache.http.impl.cookie.BasicClientCookie; import org.openqa.selenium.Cookie; /** - * Converts Selenium cookies to HTTP client ones and vice-versa. + * Converts Selenium cookies to HTTP client ones. * * @author Peter Bencze */ @@ -29,6 +29,12 @@ public final class CookieConverter { private static final String HTTP_ONLY_ATTRIBUTE = "httponly"; + /** + * Private constructor to hide the implicit public one. + */ + private CookieConverter() { + } + /** * Converts a Selenium cookie to a HTTP client one. 
* diff --git a/src/test/java/com/github/peterbencze/serritor/internal/CookieConverterTest.java b/src/test/java/com/github/peterbencze/serritor/internal/util/CookieConverterTest.java similarity index 80% rename from src/test/java/com/github/peterbencze/serritor/internal/CookieConverterTest.java rename to src/test/java/com/github/peterbencze/serritor/internal/util/CookieConverterTest.java index a07ac29..e651673 100644 --- a/src/test/java/com/github/peterbencze/serritor/internal/CookieConverterTest.java +++ b/src/test/java/com/github/peterbencze/serritor/internal/util/CookieConverterTest.java @@ -14,10 +14,9 @@ * limitations under the License. */ -package com.github.peterbencze.serritor.internal; +package com.github.peterbencze.serritor.internal.util; import java.util.Date; -import org.apache.commons.lang3.StringUtils; import org.apache.http.impl.cookie.BasicClientCookie; import org.junit.Assert; import org.junit.Before; @@ -41,19 +40,11 @@ public final class CookieConverterTest { private static final String HTTP_ONLY_ATTRIBUTE = "httponly"; private Cookie seleniumCookie; - private BasicClientCookie httpClientCookie; @Before - public void initialize() { + public void before() { seleniumCookie = new Cookie(NAME, VALUE, DOMAIN, PATH, EXPIRY_DATE, IS_SECURE, IS_HTTP_ONLY); - - httpClientCookie = new BasicClientCookie(NAME, VALUE); - httpClientCookie.setDomain(seleniumCookie.getDomain()); - httpClientCookie.setPath(seleniumCookie.getPath()); - httpClientCookie.setExpiryDate(seleniumCookie.getExpiry()); - httpClientCookie.setSecure(seleniumCookie.isSecure()); - httpClientCookie.setAttribute(HTTP_ONLY_ATTRIBUTE, StringUtils.EMPTY); } @Test From b8e2661b40a04fb90a49b53f98caa739b0b400dd Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Fri, 15 Mar 2019 11:23:45 +0100 Subject: [PATCH 39/63] Refact custom callback manager --- .../peterbencze/serritor/api/BaseCrawler.java | 35 ++-- .../serritor/api/PatternMatchingCallback.java | 2 +- .../serritor/api/event/NetworkErrorEvent.java 
| 2 +- .../api/event/NonHtmlContentEvent.java | 2 +- .../serritor/api/event/PageLoadEvent.java | 2 +- .../api/event/PageLoadTimeoutEvent.java | 2 +- .../serritor/api/event/RequestErrorEvent.java | 2 +- .../api/event/RequestRedirectEvent.java | 2 +- .../internal/CustomCallbackManager.java | 86 ++++++++++ .../internal/{event => }/EventObject.java | 94 +++++------ .../internal/event/EventCallbackManager.java | 107 ------------ .../internal/CustomCallbackManagerTest.java | 145 ++++++++++++++++ .../event/EventCallbackManagerTest.java | 156 ------------------ 13 files changed, 300 insertions(+), 337 deletions(-) create mode 100644 src/main/java/com/github/peterbencze/serritor/internal/CustomCallbackManager.java rename src/main/java/com/github/peterbencze/serritor/internal/{event => }/EventObject.java (92%) delete mode 100644 src/main/java/com/github/peterbencze/serritor/internal/event/EventCallbackManager.java create mode 100644 src/test/java/com/github/peterbencze/serritor/internal/CustomCallbackManagerTest.java delete mode 100644 src/test/java/com/github/peterbencze/serritor/internal/event/EventCallbackManagerTest.java diff --git a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java index 6747a1f..274e39b 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java +++ b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java @@ -25,13 +25,13 @@ import com.github.peterbencze.serritor.api.event.RequestErrorEvent; import com.github.peterbencze.serritor.api.event.RequestRedirectEvent; import com.github.peterbencze.serritor.internal.CrawlFrontier; +import com.github.peterbencze.serritor.internal.CustomCallbackManager; +import com.github.peterbencze.serritor.internal.EventObject; import com.github.peterbencze.serritor.internal.WebDriverFactory; import com.github.peterbencze.serritor.internal.crawldelaymechanism.AdaptiveCrawlDelayMechanism; import 
com.github.peterbencze.serritor.internal.crawldelaymechanism.CrawlDelayMechanism; import com.github.peterbencze.serritor.internal.crawldelaymechanism.FixedCrawlDelayMechanism; import com.github.peterbencze.serritor.internal.crawldelaymechanism.RandomCrawlDelayMechanism; -import com.github.peterbencze.serritor.internal.event.EventCallbackManager; -import com.github.peterbencze.serritor.internal.event.EventObject; import com.github.peterbencze.serritor.internal.stats.StatsCounter; import com.github.peterbencze.serritor.internal.util.CookieConverter; import com.github.peterbencze.serritor.internal.util.stopwatch.Stopwatch; @@ -87,7 +87,7 @@ public abstract class BaseCrawler { private final Stopwatch runTimeStopwatch; private final StatsCounter statsCounter; private final CrawlFrontier crawlFrontier; - private final EventCallbackManager callbackManager; + private final CustomCallbackManager callbackManager; private BasicCookieStore cookieStore; private CloseableHttpClient httpClient; @@ -121,15 +121,7 @@ protected BaseCrawler(final CrawlerState state) { crawlFrontier = state.getStateObject(CrawlFrontier.class) .orElseGet(() -> new CrawlFrontier(config, statsCounter)); - callbackManager = new EventCallbackManager(); - callbackManager.setDefaultEventCallback(PageLoadEvent.class, this::onPageLoad); - callbackManager.setDefaultEventCallback(NonHtmlContentEvent.class, this::onNonHtmlContent); - callbackManager.setDefaultEventCallback(PageLoadTimeoutEvent.class, - this::onPageLoadTimeout); - callbackManager.setDefaultEventCallback(RequestRedirectEvent.class, - this::onRequestRedirect); - callbackManager.setDefaultEventCallback(NetworkErrorEvent.class, this::onNetworkError); - callbackManager.setDefaultEventCallback(RequestErrorEvent.class, this::onRequestError); + callbackManager = new CustomCallbackManager(); isStopInitiated = new AtomicBoolean(false); isStopped = new AtomicBoolean(true); @@ -309,13 +301,13 @@ public final void resume(final Browser browser, final 
DesiredCapabilities capabi * @param eventClass the runtime class of the event for which the callback should be invoked * @param callback the pattern matching callback to invoke */ - protected final void registerCustomEventCallback( + protected final void registerCustomCallback( final Class eventClass, final PatternMatchingCallback callback) { Validate.notNull(eventClass, "The eventClass parameter cannot be null."); Validate.notNull(callback, "The callback parameter cannot be null."); - callbackManager.addCustomEventCallback(eventClass, callback); + callbackManager.addCustomCallback(eventClass, callback); } /** @@ -539,7 +531,7 @@ private static String getResponseMimeType(final HttpResponse httpHeadResponse) { * @param event the event which gets delivered when a network error occurs */ private void handleNetworkError(final NetworkErrorEvent event) { - callbackManager.call(NetworkErrorEvent.class, event); + callbackManager.callCustomOrDefault(NetworkErrorEvent.class, event, this::onNetworkError); statsCounter.recordNetworkError(); } @@ -552,7 +544,8 @@ private void handleNetworkError(final NetworkErrorEvent event) { private void handleRequestRedirect(final RequestRedirectEvent event) { crawl(event.getRedirectedCrawlRequest()); - callbackManager.call(RequestRedirectEvent.class, event); + callbackManager.callCustomOrDefault(RequestRedirectEvent.class, event, + this::onRequestRedirect); statsCounter.recordRequestRedirect(); } @@ -564,7 +557,8 @@ private void handleRequestRedirect(final RequestRedirectEvent event) { * text/html */ private void handleNonHtmlContent(final NonHtmlContentEvent event) { - callbackManager.call(NonHtmlContentEvent.class, event); + callbackManager.callCustomOrDefault(NonHtmlContentEvent.class, event, + this::onNonHtmlContent); statsCounter.recordNonHtmlContent(); } @@ -576,7 +570,8 @@ private void handleNonHtmlContent(final NonHtmlContentEvent event) { * the timeout period */ private void handlePageLoadTimeout(final PageLoadTimeoutEvent event) { 
- callbackManager.call(PageLoadTimeoutEvent.class, event); + callbackManager.callCustomOrDefault(PageLoadTimeoutEvent.class, event, + this::onPageLoadTimeout); statsCounter.recordPageLoadTimeout(); } @@ -588,7 +583,7 @@ private void handlePageLoadTimeout(final PageLoadTimeoutEvent event) { * code 4xx or 5xx) occurs */ private void handleRequestError(final RequestErrorEvent event) { - callbackManager.call(RequestErrorEvent.class, event); + callbackManager.callCustomOrDefault(RequestErrorEvent.class, event, this::onRequestError); statsCounter.recordRequestError(); } @@ -599,7 +594,7 @@ private void handleRequestError(final RequestErrorEvent event) { * @param event the event which gets delivered when the browser loads the page */ private void handlePageLoad(final PageLoadEvent event) { - callbackManager.call(PageLoadEvent.class, event); + callbackManager.callCustomOrDefault(PageLoadEvent.class, event, this::onPageLoad); statsCounter.recordPageLoad(); } diff --git a/src/main/java/com/github/peterbencze/serritor/api/PatternMatchingCallback.java b/src/main/java/com/github/peterbencze/serritor/api/PatternMatchingCallback.java index c0e87e1..063e4e9 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/PatternMatchingCallback.java +++ b/src/main/java/com/github/peterbencze/serritor/api/PatternMatchingCallback.java @@ -16,7 +16,7 @@ package com.github.peterbencze.serritor.api; -import com.github.peterbencze.serritor.internal.event.EventObject; +import com.github.peterbencze.serritor.internal.EventObject; import java.util.function.Consumer; import java.util.regex.Pattern; import org.apache.commons.lang3.Validate; diff --git a/src/main/java/com/github/peterbencze/serritor/api/event/NetworkErrorEvent.java b/src/main/java/com/github/peterbencze/serritor/api/event/NetworkErrorEvent.java index f6e1aff..0c248d6 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/event/NetworkErrorEvent.java +++ 
b/src/main/java/com/github/peterbencze/serritor/api/event/NetworkErrorEvent.java @@ -17,7 +17,7 @@ package com.github.peterbencze.serritor.api.event; import com.github.peterbencze.serritor.api.CrawlCandidate; -import com.github.peterbencze.serritor.internal.event.EventObject; +import com.github.peterbencze.serritor.internal.EventObject; /** * Event which gets delivered when a network error occurs. diff --git a/src/main/java/com/github/peterbencze/serritor/api/event/NonHtmlContentEvent.java b/src/main/java/com/github/peterbencze/serritor/api/event/NonHtmlContentEvent.java index 99b26d2..95259a9 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/event/NonHtmlContentEvent.java +++ b/src/main/java/com/github/peterbencze/serritor/api/event/NonHtmlContentEvent.java @@ -18,7 +18,7 @@ import com.github.peterbencze.serritor.api.CrawlCandidate; import com.github.peterbencze.serritor.api.PartialCrawlResponse; -import com.github.peterbencze.serritor.internal.event.EventObject; +import com.github.peterbencze.serritor.internal.EventObject; /** * Event which gets delivered when the MIME type of the response is not "text/html". diff --git a/src/main/java/com/github/peterbencze/serritor/api/event/PageLoadEvent.java b/src/main/java/com/github/peterbencze/serritor/api/event/PageLoadEvent.java index 4657e4f..4ccf8e3 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/event/PageLoadEvent.java +++ b/src/main/java/com/github/peterbencze/serritor/api/event/PageLoadEvent.java @@ -18,7 +18,7 @@ import com.github.peterbencze.serritor.api.CompleteCrawlResponse; import com.github.peterbencze.serritor.api.CrawlCandidate; -import com.github.peterbencze.serritor.internal.event.EventObject; +import com.github.peterbencze.serritor.internal.EventObject; /** * Event which gets delivered when the browser loads the page. 
diff --git a/src/main/java/com/github/peterbencze/serritor/api/event/PageLoadTimeoutEvent.java b/src/main/java/com/github/peterbencze/serritor/api/event/PageLoadTimeoutEvent.java index 77b5984..e50b068 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/event/PageLoadTimeoutEvent.java +++ b/src/main/java/com/github/peterbencze/serritor/api/event/PageLoadTimeoutEvent.java @@ -18,7 +18,7 @@ import com.github.peterbencze.serritor.api.CrawlCandidate; import com.github.peterbencze.serritor.api.PartialCrawlResponse; -import com.github.peterbencze.serritor.internal.event.EventObject; +import com.github.peterbencze.serritor.internal.EventObject; /** * Event which gets delivered when a page does not load in the browser within the timeout period. diff --git a/src/main/java/com/github/peterbencze/serritor/api/event/RequestErrorEvent.java b/src/main/java/com/github/peterbencze/serritor/api/event/RequestErrorEvent.java index ef36537..8c576f0 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/event/RequestErrorEvent.java +++ b/src/main/java/com/github/peterbencze/serritor/api/event/RequestErrorEvent.java @@ -18,7 +18,7 @@ import com.github.peterbencze.serritor.api.CompleteCrawlResponse; import com.github.peterbencze.serritor.api.CrawlCandidate; -import com.github.peterbencze.serritor.internal.event.EventObject; +import com.github.peterbencze.serritor.internal.EventObject; /** * Event which gets delivered when a request error (an error with HTTP status code 4xx or 5xx) diff --git a/src/main/java/com/github/peterbencze/serritor/api/event/RequestRedirectEvent.java b/src/main/java/com/github/peterbencze/serritor/api/event/RequestRedirectEvent.java index 98e68cd..a9f4154 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/event/RequestRedirectEvent.java +++ b/src/main/java/com/github/peterbencze/serritor/api/event/RequestRedirectEvent.java @@ -19,7 +19,7 @@ import com.github.peterbencze.serritor.api.CrawlCandidate; import 
com.github.peterbencze.serritor.api.CrawlRequest; import com.github.peterbencze.serritor.api.PartialCrawlResponse; -import com.github.peterbencze.serritor.internal.event.EventObject; +import com.github.peterbencze.serritor.internal.EventObject; /** * Event which gets delivered when a request is redirected. diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CustomCallbackManager.java b/src/main/java/com/github/peterbencze/serritor/internal/CustomCallbackManager.java new file mode 100644 index 0000000..289dc22 --- /dev/null +++ b/src/main/java/com/github/peterbencze/serritor/internal/CustomCallbackManager.java @@ -0,0 +1,86 @@ +/* + * Copyright 2018 Peter Bencze. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.github.peterbencze.serritor.internal; + +import com.github.peterbencze.serritor.api.PatternMatchingCallback; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.function.Consumer; +import java.util.stream.Collectors; + +/** + * Manages custom callbacks associated with events. + */ +public final class CustomCallbackManager { + + private final Map, + List>> customCallbacks; + + /** + * Creates a {@link CustomCallbackManager} instance. + */ + public CustomCallbackManager() { + customCallbacks = new HashMap<>(); + } + + /** + * Associates a pattern matching callback with the specific event. 
+ * + * @param the type of the input to the operation + * @param eventClass the runtime class of the event for which the callback should be invoked + * @param callback the pattern matching callback to invoke + */ + public void addCustomCallback( + final Class eventClass, + final PatternMatchingCallback callback) { + customCallbacks.computeIfAbsent(eventClass, key -> new ArrayList<>()).add(callback); + } + + /** + * Invokes all the custom callbacks associated with the event whose pattern matches the request + * URL. If no custom callbacks are registered for the event, it calls the provided default + * callback instead. + * + * @param the type of the input to the operation + * @param eventClass the runtime class of the event for which the callback should be + * invoked + * @param eventObject the input parameter for the callback + * @param defaultCallback the default callback for the event + */ + @SuppressWarnings("unchecked") + public void callCustomOrDefault( + final Class eventClass, + final T eventObject, + final Consumer defaultCallback) { + String requestUrl = eventObject.getCrawlCandidate().getRequestUrl().toString(); + List> applicableCustomCallbacks = + customCallbacks.getOrDefault(eventClass, Collections.emptyList()) + .stream() + .filter(callback -> callback.getUrlPattern().matcher(requestUrl).matches()) + .map(PatternMatchingCallback::getCallback) + .collect(Collectors.toList()); + + if (!applicableCustomCallbacks.isEmpty()) { + applicableCustomCallbacks.forEach(op -> ((Consumer) op).accept(eventObject)); + } else { + defaultCallback.accept(eventObject); + } + } +} diff --git a/src/main/java/com/github/peterbencze/serritor/internal/event/EventObject.java b/src/main/java/com/github/peterbencze/serritor/internal/EventObject.java similarity index 92% rename from src/main/java/com/github/peterbencze/serritor/internal/event/EventObject.java rename to src/main/java/com/github/peterbencze/serritor/internal/EventObject.java index dea995c..ab17594 100644 --- 
a/src/main/java/com/github/peterbencze/serritor/internal/event/EventObject.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/EventObject.java @@ -1,47 +1,47 @@ -/* - * Copyright 2017 Peter Bencze. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.github.peterbencze.serritor.internal.event; - -import com.github.peterbencze.serritor.api.CrawlCandidate; - -/** - * Base class from which all event objects shall be derived. - * - * @author Peter Bencze - */ -public abstract class EventObject { - - private final CrawlCandidate crawlCandidate; - - /** - * Base constructor of all event objects. - * - * @param crawlCandidate the current crawl candidate - */ - protected EventObject(final CrawlCandidate crawlCandidate) { - this.crawlCandidate = crawlCandidate; - } - - /** - * Returns the current crawl candidate. - * - * @return the current crawl candidate - */ - public final CrawlCandidate getCrawlCandidate() { - return crawlCandidate; - } -} +/* + * Copyright 2017 Peter Bencze. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.github.peterbencze.serritor.internal; + +import com.github.peterbencze.serritor.api.CrawlCandidate; + +/** + * Base class from which all event objects shall be derived. + * + * @author Peter Bencze + */ +public abstract class EventObject { + + private final CrawlCandidate crawlCandidate; + + /** + * Base constructor of all event objects. + * + * @param crawlCandidate the current crawl candidate + */ + protected EventObject(final CrawlCandidate crawlCandidate) { + this.crawlCandidate = crawlCandidate; + } + + /** + * Returns the current crawl candidate. + * + * @return the current crawl candidate + */ + public final CrawlCandidate getCrawlCandidate() { + return crawlCandidate; + } +} diff --git a/src/main/java/com/github/peterbencze/serritor/internal/event/EventCallbackManager.java b/src/main/java/com/github/peterbencze/serritor/internal/event/EventCallbackManager.java deleted file mode 100644 index f87002a..0000000 --- a/src/main/java/com/github/peterbencze/serritor/internal/event/EventCallbackManager.java +++ /dev/null @@ -1,107 +0,0 @@ -/* - * Copyright 2018 Peter Bencze. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package com.github.peterbencze.serritor.internal.event; - -import com.github.peterbencze.serritor.api.PatternMatchingCallback; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.function.Consumer; -import java.util.stream.Collectors; - -/** - * Manages the default and custom callbacks associated with specific events. When an event occurs, - * it calls the default callback for it, or the associated custom ones whose pattern matches the URL - * of the request. - * - * @author Peter Bencze - */ -public final class EventCallbackManager { - - private final Map, - Consumer> defaultCallbacks; - private final Map, - List>> customCallbacks; - - /** - * Creates an {@link EventCallbackManager} instance. - */ - public EventCallbackManager() { - defaultCallbacks = new HashMap<>(); - customCallbacks = new HashMap<>(); - } - - /** - * Sets the default callback for the specific event. - * - * @param the type of the input to the operation - * @param eventClass the runtime class of the event for which the callback should be invoked - * @param callback the operation to be performed - */ - public void setDefaultEventCallback( - final Class eventClass, - final Consumer callback) { - defaultCallbacks.put(eventClass, callback); - } - - /** - * Associates a pattern matching callback with the specific event. - * - * @param the type of the input to the operation - * @param eventClass the runtime class of the event for which the callback should be invoked - * @param callback the pattern matching callback to invoke - */ - public void addCustomEventCallback( - final Class eventClass, - final PatternMatchingCallback callback) { - customCallbacks.computeIfAbsent(eventClass, key -> new ArrayList<>()).add(callback); - } - - /** - * Invokes the default callback for the specific event, if no custom callbacks are registered - * for it. 
Otherwise, it calls all the associated callbacks whose pattern matches the URL of the - * request. - * - * @param the type of the input to the operation - * @param eventClass the runtime class of the event for which the callback should be invoked - * @param eventObject the input parameter for the callback - */ - @SuppressWarnings("unchecked") - public void call(final Class eventClass, final T eventObject) { - if (!customCallbacks.containsKey(eventClass)) { - ((Consumer) defaultCallbacks.get(eventClass)).accept(eventObject); - return; - } - - String requestUrl = eventObject.getCrawlCandidate().getRequestUrl().toString(); - List> applicableCallbacks = - customCallbacks.get(eventClass) - .stream() - .filter(callback -> callback.getUrlPattern().matcher(requestUrl).matches()) - .collect(Collectors.toList()); - - if (applicableCallbacks.isEmpty()) { - ((Consumer) defaultCallbacks.get(eventClass)).accept(eventObject); - return; - } - - applicableCallbacks.stream() - .map(PatternMatchingCallback::getCallback) - .forEach(op -> ((Consumer) op).accept(eventObject)); - } -} diff --git a/src/test/java/com/github/peterbencze/serritor/internal/CustomCallbackManagerTest.java b/src/test/java/com/github/peterbencze/serritor/internal/CustomCallbackManagerTest.java new file mode 100644 index 0000000..c867f9a --- /dev/null +++ b/src/test/java/com/github/peterbencze/serritor/internal/CustomCallbackManagerTest.java @@ -0,0 +1,145 @@ +/* + * Copyright 2018 Peter Bencze. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.github.peterbencze.serritor.internal; + +import com.github.peterbencze.serritor.api.CrawlCandidate; +import com.github.peterbencze.serritor.api.PatternMatchingCallback; +import com.github.peterbencze.serritor.api.event.PageLoadEvent; +import java.net.URI; +import java.util.function.Consumer; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import org.junit.Before; +import org.junit.Test; +import org.mockito.Mockito; + +/** + * Test cases for {@link CustomCallbackManager}. + */ +public final class CustomCallbackManagerTest { + + private CustomCallbackManager callbackManager; + private Consumer defaultPageLoadCallbackMock; + private PageLoadEvent pageLoadEventMock; + + @Before + public void before() { + callbackManager = new CustomCallbackManager(); + + defaultPageLoadCallbackMock = Mockito.mock(Consumer.class); + pageLoadEventMock = Mockito.mock(PageLoadEvent.class); + } + + @Test + public void testCallWithNoCustomCallback() { + CrawlCandidate crawlCandidateMock = createCrawlCandidateMock(); + Mockito.when(pageLoadEventMock.getCrawlCandidate()).thenReturn(crawlCandidateMock); + + callbackManager.callCustomOrDefault(PageLoadEvent.class, pageLoadEventMock, + defaultPageLoadCallbackMock); + + Mockito.verify(defaultPageLoadCallbackMock, Mockito.times(1)).accept(pageLoadEventMock); + } + + @Test + public void testCallWithNoApplicableCustomCallback() { + PatternMatchingCallback patternMatchingCallbackMock + = Mockito.mock(PatternMatchingCallback.class); + Pattern patternMock = createPatternMock(false); + Consumer customPageLoadCallbackMock = Mockito.mock(Consumer.class); + Mockito.when(patternMatchingCallbackMock.getUrlPattern()).thenReturn(patternMock); + Mockito.when(patternMatchingCallbackMock.getCallback()) + .thenReturn(customPageLoadCallbackMock); + + CrawlCandidate crawlCandidateMock = createCrawlCandidateMock(); + 
Mockito.when(pageLoadEventMock.getCrawlCandidate()).thenReturn(crawlCandidateMock); + + callbackManager.addCustomCallback(PageLoadEvent.class, patternMatchingCallbackMock); + callbackManager.callCustomOrDefault(PageLoadEvent.class, pageLoadEventMock, + defaultPageLoadCallbackMock); + + Mockito.verify(defaultPageLoadCallbackMock, Mockito.times(1)).accept(pageLoadEventMock); + Mockito.verify(customPageLoadCallbackMock, Mockito.never()).accept(pageLoadEventMock); + } + + @Test + public void testCallWithSingleApplicableCustomCallback() { + Pattern patternMock = createPatternMock(true); + Consumer customPageLoadCallbackMock = Mockito.mock(Consumer.class); + PatternMatchingCallback patternMatchingCallbackMock + = Mockito.mock(PatternMatchingCallback.class); + Mockito.when(patternMatchingCallbackMock.getUrlPattern()).thenReturn(patternMock); + Mockito.when(patternMatchingCallbackMock.getCallback()) + .thenReturn(customPageLoadCallbackMock); + + CrawlCandidate crawlCandidateMock = createCrawlCandidateMock(); + Mockito.when(pageLoadEventMock.getCrawlCandidate()).thenReturn(crawlCandidateMock); + + callbackManager.addCustomCallback(PageLoadEvent.class, patternMatchingCallbackMock); + callbackManager.callCustomOrDefault(PageLoadEvent.class, pageLoadEventMock, + defaultPageLoadCallbackMock); + + Mockito.verify(defaultPageLoadCallbackMock, Mockito.never()).accept(pageLoadEventMock); + Mockito.verify(customPageLoadCallbackMock, Mockito.times(1)).accept(pageLoadEventMock); + } + + @Test + public void testCallWithMultipleApplicableCustomCallback() { + Pattern patternMock = createPatternMock(true); + Consumer customPageLoadCallbackMock = Mockito.mock(Consumer.class); + + PatternMatchingCallback patternMatchingCallbackMock1 + = Mockito.mock(PatternMatchingCallback.class); + Mockito.when(patternMatchingCallbackMock1.getUrlPattern()).thenReturn(patternMock); + Mockito.when(patternMatchingCallbackMock1.getCallback()) + .thenReturn(customPageLoadCallbackMock); + + 
PatternMatchingCallback patternMatchingCallbackMock2 + = Mockito.mock(PatternMatchingCallback.class); + Mockito.when(patternMatchingCallbackMock2.getUrlPattern()).thenReturn(patternMock); + Mockito.when(patternMatchingCallbackMock2.getCallback()) + .thenReturn(customPageLoadCallbackMock); + + CrawlCandidate crawlCandidateMock = createCrawlCandidateMock(); + Mockito.when(pageLoadEventMock.getCrawlCandidate()).thenReturn(crawlCandidateMock); + + callbackManager.addCustomCallback(PageLoadEvent.class, patternMatchingCallbackMock1); + callbackManager.addCustomCallback(PageLoadEvent.class, patternMatchingCallbackMock2); + callbackManager.callCustomOrDefault(PageLoadEvent.class, pageLoadEventMock, + defaultPageLoadCallbackMock); + + Mockito.verify(defaultPageLoadCallbackMock, Mockito.never()).accept(pageLoadEventMock); + Mockito.verify(customPageLoadCallbackMock, Mockito.times(2)).accept(pageLoadEventMock); + } + + private static Pattern createPatternMock(final boolean shouldMatch) { + Matcher matcherMock = Mockito.mock(Matcher.class); + Mockito.when(matcherMock.matches()).thenReturn(shouldMatch); + + Pattern patternMock = Mockito.mock(Pattern.class); + Mockito.when(patternMock.matcher(Mockito.anyString())).thenReturn(matcherMock); + + return patternMock; + } + + private static CrawlCandidate createCrawlCandidateMock() { + CrawlCandidate mockedCrawlCandidate = Mockito.mock(CrawlCandidate.class); + Mockito.when(mockedCrawlCandidate.getRequestUrl()).thenReturn(Mockito.mock(URI.class)); + + return mockedCrawlCandidate; + } +} diff --git a/src/test/java/com/github/peterbencze/serritor/internal/event/EventCallbackManagerTest.java b/src/test/java/com/github/peterbencze/serritor/internal/event/EventCallbackManagerTest.java deleted file mode 100644 index 382f227..0000000 --- a/src/test/java/com/github/peterbencze/serritor/internal/event/EventCallbackManagerTest.java +++ /dev/null @@ -1,156 +0,0 @@ -/* - * Copyright 2018 Peter Bencze. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.github.peterbencze.serritor.internal.event; - -import com.github.peterbencze.serritor.api.CrawlCandidate; -import com.github.peterbencze.serritor.api.PatternMatchingCallback; -import com.github.peterbencze.serritor.api.event.PageLoadEvent; -import java.net.URI; -import java.util.function.Consumer; -import java.util.regex.Matcher; -import java.util.regex.Pattern; -import org.junit.Before; -import org.junit.Test; -import org.mockito.Mockito; - -/** - * Test cases for {@link EventCallbackManager}. 
- * - * @author Peter Bencze - */ -public final class EventCallbackManagerTest { - - private EventCallbackManager callbackManager; - private Consumer mockedDefaultPageLoadCallback; - private PageLoadEvent mockedPageLoadEvent; - - @Before - public void before() { - callbackManager = new EventCallbackManager(); - - mockedDefaultPageLoadCallback = Mockito.mock(Consumer.class); - mockedPageLoadEvent = Mockito.mock(PageLoadEvent.class); - } - - @Test - public void testCallWithNoCustomEventCallback() { - callbackManager.setDefaultEventCallback(PageLoadEvent.class, mockedDefaultPageLoadCallback); - callbackManager.call(PageLoadEvent.class, mockedPageLoadEvent); - - Mockito.verify(mockedDefaultPageLoadCallback, Mockito.times(1)).accept(mockedPageLoadEvent); - } - - @Test - public void testCallWithNoApplicableCustomEventCallback() { - Consumer mockedCustomPageLoadCallback = Mockito.mock(Consumer.class); - - PatternMatchingCallback mockedPatternMatchingCallback - = Mockito.mock(PatternMatchingCallback.class); - - Pattern mockedPattern = createMockedPattern(false); - - Mockito.when(mockedPatternMatchingCallback.getUrlPattern()).thenReturn(mockedPattern); - Mockito.when(mockedPatternMatchingCallback.getCallback()) - .thenReturn(mockedCustomPageLoadCallback); - - CrawlCandidate mockedCrawlCandidate = createMockedCrawlCandidate(); - - Mockito.when(mockedPageLoadEvent.getCrawlCandidate()).thenReturn(mockedCrawlCandidate); - - callbackManager.setDefaultEventCallback(PageLoadEvent.class, mockedDefaultPageLoadCallback); - callbackManager.addCustomEventCallback(PageLoadEvent.class, mockedPatternMatchingCallback); - callbackManager.call(PageLoadEvent.class, mockedPageLoadEvent); - - Mockito.verify(mockedDefaultPageLoadCallback, Mockito.times(1)).accept(mockedPageLoadEvent); - Mockito.verify(mockedCustomPageLoadCallback, Mockito.never()).accept(mockedPageLoadEvent); - } - - @Test - public void testCallWithSingleApplicableCustomEventCallback() { - Consumer mockedCustomPageLoadCallback = 
Mockito.mock(Consumer.class); - - PatternMatchingCallback mockedPatternMatchingCallback - = Mockito.mock(PatternMatchingCallback.class); - - Pattern mockedPattern = createMockedPattern(true); - - Mockito.when(mockedPatternMatchingCallback.getUrlPattern()).thenReturn(mockedPattern); - Mockito.when(mockedPatternMatchingCallback.getCallback()) - .thenReturn(mockedCustomPageLoadCallback); - - CrawlCandidate mockedCrawlCandidate = createMockedCrawlCandidate(); - - Mockito.when(mockedPageLoadEvent.getCrawlCandidate()).thenReturn(mockedCrawlCandidate); - - callbackManager.setDefaultEventCallback(PageLoadEvent.class, mockedDefaultPageLoadCallback); - callbackManager.addCustomEventCallback(PageLoadEvent.class, mockedPatternMatchingCallback); - callbackManager.call(PageLoadEvent.class, mockedPageLoadEvent); - - Mockito.verify(mockedDefaultPageLoadCallback, Mockito.never()).accept(mockedPageLoadEvent); - Mockito.verify(mockedCustomPageLoadCallback, Mockito.times(1)).accept(mockedPageLoadEvent); - } - - @Test - public void testCallWithMultipleApplicableCustomEventCallback() { - Consumer mockedCustomPageLoadCallback = Mockito.mock(Consumer.class); - - PatternMatchingCallback mockedPatternMatchingCallback1 - = Mockito.mock(PatternMatchingCallback.class); - PatternMatchingCallback mockedPatternMatchingCallback2 - = Mockito.mock(PatternMatchingCallback.class); - - Pattern mockedPattern = createMockedPattern(true); - - Mockito.when(mockedPatternMatchingCallback1.getUrlPattern()).thenReturn(mockedPattern); - Mockito.when(mockedPatternMatchingCallback1.getCallback()) - .thenReturn(mockedCustomPageLoadCallback); - - Mockito.when(mockedPatternMatchingCallback2.getUrlPattern()).thenReturn(mockedPattern); - Mockito.when(mockedPatternMatchingCallback2.getCallback()) - .thenReturn(mockedCustomPageLoadCallback); - - CrawlCandidate mockedCrawlCandidate = createMockedCrawlCandidate(); - - Mockito.when(mockedPageLoadEvent.getCrawlCandidate()).thenReturn(mockedCrawlCandidate); - - 
callbackManager.setDefaultEventCallback(PageLoadEvent.class, mockedDefaultPageLoadCallback); - callbackManager.addCustomEventCallback(PageLoadEvent.class, mockedPatternMatchingCallback1); - callbackManager.addCustomEventCallback(PageLoadEvent.class, mockedPatternMatchingCallback2); - callbackManager.call(PageLoadEvent.class, mockedPageLoadEvent); - - Mockito.verify(mockedDefaultPageLoadCallback, Mockito.never()).accept(mockedPageLoadEvent); - Mockito.verify(mockedCustomPageLoadCallback, Mockito.times(2)).accept(mockedPageLoadEvent); - } - - private static Pattern createMockedPattern(boolean shouldMatch) { - Matcher mockedMatcher = Mockito.mock(Matcher.class); - Mockito.when(mockedMatcher.matches()).thenReturn(shouldMatch); - - Pattern mockedPattern = Mockito.mock(Pattern.class); - Mockito.when(mockedPattern.matcher(Mockito.anyString())) - .thenReturn(mockedMatcher); - - return mockedPattern; - } - - private static CrawlCandidate createMockedCrawlCandidate() { - CrawlCandidate mockedCrawlCandidate = Mockito.mock(CrawlCandidate.class); - Mockito.when(mockedCrawlCandidate.getRequestUrl()).thenReturn(Mockito.mock(URI.class)); - - return mockedCrawlCandidate; - } -} From 19ab8438f1f708d187b51e78f4689b27b7b7c876 Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Fri, 15 Mar 2019 12:36:27 +0100 Subject: [PATCH 40/63] Always call onStop even if an unhandled exception is thrown --- .../peterbencze/serritor/api/BaseCrawler.java | 30 ++++++++++--------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java index 274e39b..e608803 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java +++ b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java @@ -235,22 +235,28 @@ private void start(final Browser browser, // Must be created here (the adaptive crawl delay strategy depends on the WebDriver) 
crawlDelayMechanism = createCrawlDelayMechanism(); + onStart(); + run(); } finally { - HttpClientUtils.closeQuietly(httpClient); + try { + onStop(); + } finally { + HttpClientUtils.closeQuietly(httpClient); - if (webDriver != null) { - webDriver.quit(); - } + if (webDriver != null) { + webDriver.quit(); + } - if (proxyServer != null && proxyServer.isStarted()) { - proxyServer.stop(); - } + if (proxyServer != null && proxyServer.isStarted()) { + proxyServer.stop(); + } - runTimeStopwatch.stop(); + runTimeStopwatch.stop(); - isStopInitiated.set(false); - isStopped.set(true); + isStopInitiated.set(false); + isStopped.set(true); + } } } @@ -371,8 +377,6 @@ protected final void downloadFile(final URI source, final File destination) thro * Defines the workflow of the crawler. */ private void run() { - onStart(); - boolean shouldPerformDelay = false; while (!isStopInitiated.get() && crawlFrontier.hasNextCandidate()) { @@ -478,8 +482,6 @@ private void run() { handlePageLoad(new PageLoadEvent(currentCandidate, new CompleteCrawlResponse(harResponse, webDriver))); } - - onStop(); } /** From 3922c85c57f436c202465f0fd949a67c718b2c3f Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Fri, 15 Mar 2019 12:42:58 +0100 Subject: [PATCH 41/63] Change exception message --- .../java/com/github/peterbencze/serritor/api/BaseCrawler.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java index e608803..35c9c08 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java +++ b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java @@ -446,7 +446,7 @@ private void run() { HarResponse harResponse = proxyServer.getHar().getLog().getEntries().stream() .filter(harEntry -> candidateUrl.equals(harEntry.getRequest().getUrl())) .findFirst() - .orElseThrow(() -> new IllegalStateException("No HAR entry for candidate URL")) 
+ .orElseThrow(() -> new IllegalStateException("No HAR entry for request URL")) .getResponse(); if (harResponse.getError() != null) { handleNetworkError(new NetworkErrorEvent(currentCandidate, harResponse.getError())); From e6cfd038811a6815aa8dda69f652951d5dd89353 Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Sat, 16 Mar 2019 12:29:56 +0100 Subject: [PATCH 42/63] Change the way a fingerprint is generated for a URL --- .../serritor/internal/CrawlFrontier.java | 34 ++++++++++--------- .../serritor/internal/CrawlFrontierTest.java | 4 +-- 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java b/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java index 804fc4d..ea1ead5 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java @@ -23,7 +23,6 @@ import com.github.peterbencze.serritor.internal.stats.StatsCounter; import java.io.Serializable; import java.net.URI; -import java.util.Arrays; import java.util.Comparator; import java.util.HashSet; import java.util.List; @@ -32,11 +31,11 @@ import java.util.Set; import java.util.function.Function; import org.apache.commons.codec.digest.DigestUtils; +import org.apache.http.NameValuePair; +import org.apache.http.client.utils.URIBuilder; /** * Manages crawl requests and provides crawl candidates to the crawler. - * - * @author Peter Bencze */ public final class CrawlFrontier implements Serializable { @@ -147,29 +146,32 @@ private void feedCrawlSeeds() { } /** - * Creates the fingerprint of the given URL. If the URL contains query parameters, it sorts - * them. This way URLs with different order of query parameters get the same fingerprint. + * Creates the fingerprint of the given URL. If the URL contains query params, it sorts them by + * key and value. 
This way URLs that have the same query params but in different order will have + * the same fingerprint. Fragments are ignored. * * @param url the URL for which the fingerprint is created * * @return the fingerprint of the URL */ private static String createFingerprintForUrl(final URI url) { - StringBuilder truncatedUrl = new StringBuilder(url.getHost()).append(url.getPath()); + URIBuilder builder = new URIBuilder(url); - String query = url.getQuery(); - if (query != null) { - truncatedUrl.append("?"); + // Change scheme and host to lowercase + builder.setScheme(builder.getScheme().toLowerCase()) + .setHost(builder.getHost().toLowerCase()); - String[] queryParams = url.getQuery().split("&"); + // Sort query params by key and value + List queryParams = builder.getQueryParams(); + queryParams.sort(Comparator.comparing(NameValuePair::getName) + .thenComparing(NameValuePair::getValue)); - List queryParamList = Arrays.asList(queryParams); - queryParamList.stream() - .sorted(String::compareToIgnoreCase) - .forEachOrdered(truncatedUrl::append); - } + builder.setParameters(queryParams); + + // Remove fragment + builder.setFragment(null); - return DigestUtils.sha256Hex(truncatedUrl.toString()); + return DigestUtils.sha256Hex(builder.toString()); } /** diff --git a/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java b/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java index 5f6bcf1..2f63098 100644 --- a/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java +++ b/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java @@ -35,8 +35,6 @@ /** * Test cases for {@link CrawlFrontier}. 
- * - * @author Peter Bencze */ public final class CrawlFrontierTest { @@ -50,7 +48,7 @@ public final class CrawlFrontierTest { private static final URI ROOT_URL_0 = URI.create("http://root-url-0.com/?param1=foo¶m2=bar#fragment"); private static final URI DUPLICATE_ROOT_URL_0 - = URI.create("https://root-url-0.com/?param2=bar¶m1=foo"); + = URI.create("HTTP://ROOT-URL-0.COM/?param2=bar¶m1=foo"); private static final URI ROOT_URL_1 = URI.create("http://root-url-1.com/"); // Root URL crawl depth From 988de5ff9ebbd00f718d689cf38c3dad2d393772 Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Sat, 16 Mar 2019 12:54:50 +0100 Subject: [PATCH 43/63] Fix SonarLint warnings --- .../serritor/internal/CrawlFrontier.java | 21 ++++++++++--------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java b/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java index ea1ead5..381eca1 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java @@ -181,25 +181,26 @@ private static String createFingerprintForUrl(final URI url) { */ @SuppressWarnings("checkstyle:MissingSwitchDefault") private PriorityQueue createPriorityQueue() { - Function crawlDepthGetter - = (Function & Serializable) CrawlCandidate::getCrawlDepth; - Function priorityGetter - = (Function & Serializable) CrawlCandidate::getPriority; + Function crawlDepthGetter = + (Function & Serializable) CrawlCandidate::getCrawlDepth; + Function priorityGetter = + (Function & Serializable) CrawlCandidate::getPriority; switch (config.getCrawlStrategy()) { case BREADTH_FIRST: - Comparator breadthFirstComparator = Comparator.comparing(crawlDepthGetter) - .thenComparing(priorityGetter, Comparator.reverseOrder()); + Comparator breadthFirstComparator = + Comparator.comparing(crawlDepthGetter) + .thenComparing(priorityGetter, 
Comparator.reverseOrder()); return new PriorityQueue<>(breadthFirstComparator); case DEPTH_FIRST: - Comparator depthFirstComparator - = Comparator.comparing(crawlDepthGetter, Comparator.reverseOrder()) - .thenComparing(priorityGetter, Comparator.reverseOrder()); + Comparator depthFirstComparator = + Comparator.comparing(crawlDepthGetter, Comparator.reverseOrder()) + .thenComparing(priorityGetter, Comparator.reverseOrder()); return new PriorityQueue<>(depthFirstComparator); } - throw new IllegalArgumentException("Unsupported crawl strategy."); + throw new IllegalArgumentException("Unsupported crawl strategy"); } } From 64733110f3837504c6fa8bfaadf4d3c26821da51 Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Sat, 16 Mar 2019 23:00:06 +0100 Subject: [PATCH 44/63] Add string representation for crawl candidates --- .../serritor/api/CrawlCandidate.java | 20 +++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/src/main/java/com/github/peterbencze/serritor/api/CrawlCandidate.java b/src/main/java/com/github/peterbencze/serritor/api/CrawlCandidate.java index 9b238d7..11f74ca 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/CrawlCandidate.java +++ b/src/main/java/com/github/peterbencze/serritor/api/CrawlCandidate.java @@ -20,11 +20,11 @@ import java.io.Serializable; import java.net.URI; import java.util.Optional; +import org.apache.commons.lang3.builder.ToStringBuilder; +import org.apache.commons.lang3.builder.ToStringStyle; /** * Represents a candidate for crawling. - * - * @author Peter Bencze */ public final class CrawlCandidate implements Serializable { @@ -92,6 +92,22 @@ public Optional getMetadata() { return crawlRequest.getMetadata(); } + /** + * Returns a string representation of this crawl candidate. 
+ * + * @return a string representation of this crawl candidate + */ + @Override + public String toString() { + return new ToStringBuilder(this, ToStringStyle.SHORT_PREFIX_STYLE) + .append("refererUrl", refererUrl) + .append("requestUrl", getRequestUrl()) + .append("domain", getDomain()) + .append("crawlDepth", crawlDepth) + .append("priority", getPriority()) + .toString(); + } + /** * Builds {@link CrawlCandidate} instances. */ From fe1f1459005aa4872a528ac272571b4bf13a743a Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Sat, 16 Mar 2019 23:43:21 +0100 Subject: [PATCH 45/63] Add string representation for crawl requests --- .../serritor/api/CrawlRequest.java | 22 +++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/src/main/java/com/github/peterbencze/serritor/api/CrawlRequest.java b/src/main/java/com/github/peterbencze/serritor/api/CrawlRequest.java index 5ec8ae0..7d2694c 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/CrawlRequest.java +++ b/src/main/java/com/github/peterbencze/serritor/api/CrawlRequest.java @@ -26,13 +26,13 @@ import java.util.Optional; import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.Validate; +import org.apache.commons.lang3.builder.ToStringBuilder; +import org.apache.commons.lang3.builder.ToStringStyle; import org.apache.http.client.utils.URIBuilder; /** - * Represents a crawl request that may be completed by the crawler. If request filtering is enabled, - * it could get filtered out. - * - * @author Peter Bencze + * Represents a crawl request that may be completed by the crawler in the future. If request + * filtering is enabled, it could be filtered out. */ public final class CrawlRequest implements Serializable { @@ -110,6 +110,20 @@ public Optional getMetadata() { return Optional.ofNullable(metadata); } + /** + * Returns a string representation of this crawl request. 
+ * + * @return a string representation of this crawl request + */ + @Override + public String toString() { + return new ToStringBuilder(this, ToStringStyle.SHORT_PREFIX_STYLE) + .append("requestUrl", requestUrl) + .append("domain", domain) + .append("priority", priority) + .toString(); + } + /** * Builds {@link CrawlRequest} instances. */ From b9ae2dc95c17a531face1549f94c7397e3f00845 Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Sat, 16 Mar 2019 23:49:50 +0100 Subject: [PATCH 46/63] Add string representation for crawl domains --- .../peterbencze/serritor/internal/CrawlDomain.java | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CrawlDomain.java b/src/main/java/com/github/peterbencze/serritor/internal/CrawlDomain.java index 3138f07..fe220af 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/CrawlDomain.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/CrawlDomain.java @@ -22,8 +22,6 @@ /** * Represents an internet domain in which crawling is allowed. - * - * @author Peter Bencze */ public final class CrawlDomain implements Serializable { @@ -101,4 +99,14 @@ public boolean contains(final InternetDomainName domain) { return parts.reverse() .equals(otherDomainParts); } + + /** + * Returns the string representation of this crawl domain. 
+ * + * @return the string representation of this crawl domain + */ + @Override + public String toString() { + return domain; + } } From f63d544f5425f98fd16c28f8e836275b1d510566 Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Sun, 17 Mar 2019 00:26:38 +0100 Subject: [PATCH 47/63] Add string representation for crawler configuration --- .../serritor/api/CrawlerConfiguration.java | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/src/main/java/com/github/peterbencze/serritor/api/CrawlerConfiguration.java b/src/main/java/com/github/peterbencze/serritor/api/CrawlerConfiguration.java index 1c26a72..8473041 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/CrawlerConfiguration.java +++ b/src/main/java/com/github/peterbencze/serritor/api/CrawlerConfiguration.java @@ -24,6 +24,8 @@ import java.util.List; import java.util.Set; import org.apache.commons.lang3.Validate; +import org.apache.commons.lang3.builder.ToStringBuilder; +import org.apache.commons.lang3.builder.ToStringStyle; /** * Contains the settings of the crawler. @@ -148,6 +150,27 @@ public long getMaximumCrawlDelayDurationInMillis() { return maxCrawlDelayDurationInMillis; } + /** + * Returns the string representation of this crawler configuration. 
+ * + * @return the string representation of this crawler configuration + */ + @Override + public String toString() { + return new ToStringBuilder(this, ToStringStyle.SHORT_PREFIX_STYLE) + .append("crawlSeeds", crawlSeeds) + .append("crawlStrategy", crawlStrategy) + .append("maxCrawlDepth", maxCrawlDepth) + .append("isDuplicateRequestFilterEnabled", isDuplicateRequestFilterEnabled) + .append("isOffsiteRequestFilterEnabled", isOffsiteRequestFilterEnabled) + .append("allowedCrawlDomains", allowedCrawlDomains) + .append("crawlDelayStrategy", crawlDelayStrategy) + .append("fixedCrawlDelayDurationInMillis", fixedCrawlDelayDurationInMillis) + .append("minCrawlDelayDurationInMillis", minCrawlDelayDurationInMillis) + .append("maxCrawlDelayDurationInMillis", maxCrawlDelayDurationInMillis) + .toString(); + } + /** * Builds {@link CrawlerConfiguration} instances. */ From b4204d118587b478baaf0e61dd4c392ce8b6a5c7 Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Sun, 17 Mar 2019 01:11:42 +0100 Subject: [PATCH 48/63] Specify JSON property order of crawler config --- .../serritor/api/CrawlerConfiguration.java | 21 ++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/src/main/java/com/github/peterbencze/serritor/api/CrawlerConfiguration.java b/src/main/java/com/github/peterbencze/serritor/api/CrawlerConfiguration.java index 8473041..86e4113 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/CrawlerConfiguration.java +++ b/src/main/java/com/github/peterbencze/serritor/api/CrawlerConfiguration.java @@ -16,6 +16,7 @@ package com.github.peterbencze.serritor.api; +import com.fasterxml.jackson.annotation.JsonPropertyOrder; import com.github.peterbencze.serritor.internal.CrawlDomain; import com.google.common.net.InternetDomainName; import java.io.Serializable; @@ -29,9 +30,19 @@ /** * Contains the settings of the crawler. 
- * - * @author Peter Bencze */ +@JsonPropertyOrder({ + "crawlSeeds", + "crawlStrategy", + "maximumCrawlDepth", + "duplicateRequestFilterEnabled", + "offsiteRequestFilterEnabled", + "allowedCrawlDomains", + "crawlDelayStrategy", + "fixedCrawlDelayDurationInMillis", + "minimumCrawlDelayDurationInMillis", + "maximumCrawlDelayDurationInMillis" +}) public final class CrawlerConfiguration implements Serializable { public static final long DEFAULT_PAGE_LOAD_TIMEOUT_IN_MILLIS = Duration.ofMinutes(3).toMillis(); @@ -160,14 +171,14 @@ public String toString() { return new ToStringBuilder(this, ToStringStyle.SHORT_PREFIX_STYLE) .append("crawlSeeds", crawlSeeds) .append("crawlStrategy", crawlStrategy) - .append("maxCrawlDepth", maxCrawlDepth) + .append("maximumCrawlDepth", maxCrawlDepth) .append("isDuplicateRequestFilterEnabled", isDuplicateRequestFilterEnabled) .append("isOffsiteRequestFilterEnabled", isOffsiteRequestFilterEnabled) .append("allowedCrawlDomains", allowedCrawlDomains) .append("crawlDelayStrategy", crawlDelayStrategy) .append("fixedCrawlDelayDurationInMillis", fixedCrawlDelayDurationInMillis) - .append("minCrawlDelayDurationInMillis", minCrawlDelayDurationInMillis) - .append("maxCrawlDelayDurationInMillis", maxCrawlDelayDurationInMillis) + .append("minimumCrawlDelayDurationInMillis", minCrawlDelayDurationInMillis) + .append("maximumCrawlDelayDurationInMillis", maxCrawlDelayDurationInMillis) .toString(); } From 16915974bbc13bad9dbf7cb675894767d8a458f3 Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Sun, 31 Mar 2019 13:52:11 +0200 Subject: [PATCH 49/63] Implement web API feature --- pom.xml | 10 + .../serritor/api/RestServerConfiguration.java | 87 -------- .../serritor/api/RestfulBaseCrawler.java | 191 +++++++++++++----- .../api/web/AccessControlConfiguration.java | 156 ++++++++++++++ .../api/web/SslContextConfiguration.java | 87 ++++++++ .../peterbencze/serritor/api/web/User.java | 73 +++++++ .../serritor/api/web/WebApiConfiguration.java | 171 ++++++++++++++++ 
.../serritor/internal/web/ApiEndpoint.java | 69 +++++++ .../serritor/internal/web/HttpMethod.java | 32 +++ .../serritor/internal/web/UserRole.java | 28 +++ .../web/accessmanager/JwtAccessManager.java | 76 +++++++ .../web/accessmanager/NoopAccessManager.java | 44 ++++ .../serritor/internal/web/dto/JwtDto.java | 69 +++++++ .../serritor/internal/web/dto/LoginDto.java | 59 ++++++ .../internal/web/handler/JwtHandler.java | 75 +++++++ .../internal/web/handler/LoginHandler.java | 102 ++++++++++ .../web/handler/XsrfTokenHandler.java | 57 ++++++ .../accessmanager/JwtAccessManagerTest.java | 82 ++++++++ .../internal/web/handler/JwtHandlerTest.java | 59 ++++++ .../web/handler/LoginHandlerTest.java | 118 +++++++++++ .../web/handler/XsrfTokenHandlerTest.java | 54 +++++ 21 files changed, 1565 insertions(+), 134 deletions(-) delete mode 100644 src/main/java/com/github/peterbencze/serritor/api/RestServerConfiguration.java create mode 100644 src/main/java/com/github/peterbencze/serritor/api/web/AccessControlConfiguration.java create mode 100644 src/main/java/com/github/peterbencze/serritor/api/web/SslContextConfiguration.java create mode 100644 src/main/java/com/github/peterbencze/serritor/api/web/User.java create mode 100644 src/main/java/com/github/peterbencze/serritor/api/web/WebApiConfiguration.java create mode 100644 src/main/java/com/github/peterbencze/serritor/internal/web/ApiEndpoint.java create mode 100644 src/main/java/com/github/peterbencze/serritor/internal/web/HttpMethod.java create mode 100644 src/main/java/com/github/peterbencze/serritor/internal/web/UserRole.java create mode 100644 src/main/java/com/github/peterbencze/serritor/internal/web/accessmanager/JwtAccessManager.java create mode 100644 src/main/java/com/github/peterbencze/serritor/internal/web/accessmanager/NoopAccessManager.java create mode 100644 src/main/java/com/github/peterbencze/serritor/internal/web/dto/JwtDto.java create mode 100644 
src/main/java/com/github/peterbencze/serritor/internal/web/dto/LoginDto.java create mode 100644 src/main/java/com/github/peterbencze/serritor/internal/web/handler/JwtHandler.java create mode 100644 src/main/java/com/github/peterbencze/serritor/internal/web/handler/LoginHandler.java create mode 100644 src/main/java/com/github/peterbencze/serritor/internal/web/handler/XsrfTokenHandler.java create mode 100644 src/test/java/com/github/peterbencze/serritor/internal/web/accessmanager/JwtAccessManagerTest.java create mode 100644 src/test/java/com/github/peterbencze/serritor/internal/web/handler/JwtHandlerTest.java create mode 100644 src/test/java/com/github/peterbencze/serritor/internal/web/handler/LoginHandlerTest.java create mode 100644 src/test/java/com/github/peterbencze/serritor/internal/web/handler/XsrfTokenHandlerTest.java diff --git a/pom.xml b/pom.xml index 878508b..071e5f5 100644 --- a/pom.xml +++ b/pom.xml @@ -86,6 +86,16 @@ slf4j-simple 1.7.25 + + com.auth0 + java-jwt + 3.8.0 + + + org.mindrot + jbcrypt + 0.4 + junit junit diff --git a/src/main/java/com/github/peterbencze/serritor/api/RestServerConfiguration.java b/src/main/java/com/github/peterbencze/serritor/api/RestServerConfiguration.java deleted file mode 100644 index 7486858..0000000 --- a/src/main/java/com/github/peterbencze/serritor/api/RestServerConfiguration.java +++ /dev/null @@ -1,87 +0,0 @@ -/* - * Copyright 2019 Peter Bencze. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package com.github.peterbencze.serritor.api; - -/** - * Contains the settings of the REST server. - * - * @author Peter Bencze - */ -public final class RestServerConfiguration { - - private final int port; - - private RestServerConfiguration(final RestServerConfigurationBuilder builder) { - port = builder.port; - } - - /** - * Creates a default configuration for the REST server. - * - * @return a default configuration for the REST server - */ - public static RestServerConfiguration createDefault() { - return new RestServerConfigurationBuilder().build(); - } - - /** - * Returns the port of the REST server. - * - * @return the port of the REST server - */ - public int getPort() { - return port; - } - - /** - * Builds {@link RestServerConfiguration} instances. - */ - public static final class RestServerConfigurationBuilder { - - private static final int DEFAULT_PORT = 8080; - - private int port; - - /** - * Creates a {@link RestServerConfigurationBuilder} instance. - */ - public RestServerConfigurationBuilder() { - port = DEFAULT_PORT; - } - - /** - * Sets the port of the REST server. - * - * @param port the port number - * - * @return the RestServerConfigurationBuilder instance - */ - public RestServerConfigurationBuilder setPort(final int port) { - this.port = port; - return this; - } - - /** - * Builds the configured RestServerConfiguration instance. 
- * - * @return the configured RestServerConfiguration instance - */ - public RestServerConfiguration build() { - return new RestServerConfiguration(this); - } - } -} diff --git a/src/main/java/com/github/peterbencze/serritor/api/RestfulBaseCrawler.java b/src/main/java/com/github/peterbencze/serritor/api/RestfulBaseCrawler.java index 95be1fc..5a76d1a 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/RestfulBaseCrawler.java +++ b/src/main/java/com/github/peterbencze/serritor/api/RestfulBaseCrawler.java @@ -16,81 +16,136 @@ package com.github.peterbencze.serritor.api; +import com.auth0.jwt.algorithms.Algorithm; import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.datatype.jdk8.Jdk8Module; +import com.github.peterbencze.serritor.api.web.AccessControlConfiguration; +import com.github.peterbencze.serritor.api.web.WebApiConfiguration; +import com.github.peterbencze.serritor.internal.web.ApiEndpoint; +import com.github.peterbencze.serritor.internal.web.accessmanager.JwtAccessManager; +import com.github.peterbencze.serritor.internal.web.accessmanager.NoopAccessManager; +import com.github.peterbencze.serritor.internal.web.handler.JwtHandler; +import com.github.peterbencze.serritor.internal.web.handler.LoginHandler; +import com.github.peterbencze.serritor.internal.web.handler.XsrfTokenHandler; +import io.javalin.Handler; import io.javalin.Javalin; import io.javalin.apibuilder.ApiBuilder; import io.javalin.json.JavalinJackson; +import java.security.NoSuchAlgorithmException; +import java.util.Optional; +import javax.crypto.KeyGenerator; +import org.eclipse.jetty.server.Server; +import org.eclipse.jetty.server.ServerConnector; +import org.eclipse.jetty.util.ssl.SslContextFactory; +import org.eclipse.jetty.util.thread.QueuedThreadPool; /** * Provides a skeletal implementation of a crawler to minimize the effort for users to implement - * their own. 
It also exposes a REST API that can be used to interact with the crawler while it is - * running. - * - * @author Peter Bencze + * their own. It also exposes a RESTful web API that can be used to interact with the crawler while + * it is running. */ public abstract class RestfulBaseCrawler extends BaseCrawler { - private final RestServerConfiguration config; - private final Javalin restServer; + private final WebApiConfiguration webApiConfig; + private final Javalin webServer; /** - * Base constructor which sets up the crawler with the provided configuration. The REST server - * is initialized with the default settings. + * Base constructor which sets up the crawler with the provided configuration. The web API is + * initialized with the default settings. * - * @param config the configuration of the crawler + * @param crawlerConfig the configuration of the crawler */ - protected RestfulBaseCrawler(final CrawlerConfiguration config) { - this(RestServerConfiguration.createDefault(), config); + protected RestfulBaseCrawler(final CrawlerConfiguration crawlerConfig) { + this(WebApiConfiguration.createDefault(), crawlerConfig); } /** - * Base constructor that sets up the REST server and the crawler with the provided - * configurations. + * Base constructor which sets up the web API and the crawler with the provided configurations. 
* - * @param restServerConfig the configuration of the REST server - * @param crawlerConfig the configuration of the crawler + * @param webApiConfig the configuration of the web API + * @param crawlerConfig the configuration of the crawler */ - protected RestfulBaseCrawler(final RestServerConfiguration restServerConfig, - final CrawlerConfiguration crawlerConfig) { - this(restServerConfig, new CrawlerState(crawlerConfig)); + protected RestfulBaseCrawler( + final WebApiConfiguration webApiConfig, + final CrawlerConfiguration crawlerConfig) { + this(webApiConfig, new CrawlerState(crawlerConfig)); } /** - * Base constructor which restores the crawler to the provided state. The REST server is - * initialized with the default settings. + * Base constructor which restores the crawler to the provided state. The web API is initialized + * with the default settings. * * @param state the state to restore the crawler to */ protected RestfulBaseCrawler(final CrawlerState state) { - this(RestServerConfiguration.createDefault(), state); + this(WebApiConfiguration.createDefault(), state); } /** - * Base constructor that sets up the REST server with the provided configuration and restores - * the crawler to the provided state. + * Base constructor which sets up the web API with the provided configuration and restores the + * crawler to the provided state. 
* - * @param config the configuration of the REST server - * @param state the state to restore the crawler to + * @param webApiConfig the configuration of the web API + * @param state the state to restore the crawler to */ - protected RestfulBaseCrawler(final RestServerConfiguration config, final CrawlerState state) { + protected RestfulBaseCrawler(final WebApiConfiguration webApiConfig, final CrawlerState state) { super(state); - this.config = config; - restServer = Javalin.create(); + this.webApiConfig = webApiConfig; - JavalinJackson.configure(new ObjectMapper().registerModule(new Jdk8Module())); + webServer = Javalin.create() + .disableStartupBanner() + .server(() -> createServer(webApiConfig)) + .routes(() -> { + registerEndpoint(ApiEndpoint.STOP_CRAWLER, ctx -> stop()); + registerEndpoint(ApiEndpoint.GET_CONFIG, + ctx -> ctx.json(getCrawlerConfiguration())); + registerEndpoint(ApiEndpoint.GET_STATS, ctx -> ctx.json(getCrawlStats())); + }); + + webApiConfig.getCorsOrigins().forEach(webServer::enableCorsForOrigin); + + Optional accessControlConfigOpt + = webApiConfig.getAccessControlConfiguration(); + if (accessControlConfigOpt.isPresent()) { + AccessControlConfiguration accessControlConfig = accessControlConfigOpt.get(); + + webServer.before(new JwtHandler()); + + if (accessControlConfig.isCookieAuthenticationEnabled()) { + webServer.before(new XsrfTokenHandler()); + } + + byte[] secretKey = accessControlConfig.getSecretKey() + .orElseGet(() -> { + try { + return KeyGenerator.getInstance("HmacSHA256") + .generateKey() + .getEncoded(); + } catch (NoSuchAlgorithmException e) { + throw new IllegalStateException(e); + } + }); + Algorithm signerAlgorithm = Algorithm.HMAC256(secretKey); + + webServer.accessManager(new JwtAccessManager(signerAlgorithm)); + + webServer.routes(() -> registerEndpoint(ApiEndpoint.LOGIN, + new LoginHandler(accessControlConfig, signerAlgorithm))); + } else { + webServer.accessManager(new NoopAccessManager()); + } - configureRoutes(); + 
JavalinJackson.configure(new ObjectMapper().registerModule(new Jdk8Module())); } /** - * Returns the configuration of the REST server. + * Returns the configuration of the web API. * - * @return the configuration of the REST server + * @return the configuration of the web API */ - public final RestServerConfiguration getRestServerConfiguration() { - return config; + public WebApiConfiguration getWebApiConfiguration() { + return webApiConfig; } /** @@ -100,7 +155,7 @@ public final RestServerConfiguration getRestServerConfiguration() { protected void onStart() { super.onStart(); - restServer.start(config.getPort()); + webServer.start(); } /** @@ -110,23 +165,65 @@ protected void onStart() { protected void onStop() { super.onStop(); - restServer.stop(); + webServer.stop(); } /** - * Sets up the routes. + * Creates and configures a Jetty HTTP servlet server. + * + * @param webApiConfig the configuration of the web API + * + * @return the configured Jetty HTTP servlet server */ - private void configureRoutes() { - restServer.routes(() -> { - ApiBuilder.path("api", () -> { - ApiBuilder.path("crawler", () -> { - ApiBuilder.delete(ctx -> stop()); + private static Server createServer(final WebApiConfiguration webApiConfig) { + Server server = new Server(new QueuedThreadPool(250, 8, 60_000)); - ApiBuilder.get("config", ctx -> ctx.json(getCrawlerConfiguration())); + ServerConnector serverConnector = webApiConfig.getSslContextConfiguration() + .map(sslContextConfig -> { + SslContextFactory sslContextFactory = new SslContextFactory(); + sslContextFactory.setKeyStorePath(sslContextConfig.getKeyStorePath()); + sslContextFactory.setKeyStorePassword(sslContextConfig.getKeyStorePassword()); + sslContextConfig.getKeyManagerPassword() + .ifPresent(sslContextFactory::setKeyManagerPassword); - ApiBuilder.get("stats", ctx -> ctx.json(getCrawlStats())); - }); - }); - }); + return new ServerConnector(server, sslContextFactory); + }) + .orElseGet(() -> new ServerConnector(server)); + 
serverConnector.setPort(webApiConfig.getPort()); + + server.addConnector(serverConnector); + + return server; + } + + /** + * Adds an endpoint to the web API. + * + * @param apiEndpoint the endpoint + * @param handler the handler of the endpoint + */ + private static void registerEndpoint(final ApiEndpoint apiEndpoint, final Handler handler) { + switch (apiEndpoint.getHttpMethod()) { + case HEAD: + ApiBuilder.head(apiEndpoint.getPath(), handler, apiEndpoint.getUserRoles()); + break; + case GET: + ApiBuilder.get(apiEndpoint.getPath(), handler, apiEndpoint.getUserRoles()); + break; + case POST: + ApiBuilder.post(apiEndpoint.getPath(), handler, apiEndpoint.getUserRoles()); + break; + case PUT: + ApiBuilder.put(apiEndpoint.getPath(), handler, apiEndpoint.getUserRoles()); + break; + case PATCH: + ApiBuilder.patch(apiEndpoint.getPath(), handler, apiEndpoint.getUserRoles()); + break; + case DELETE: + ApiBuilder.delete(apiEndpoint.getPath(), handler, apiEndpoint.getUserRoles()); + break; + default: + throw new IllegalArgumentException("Unsupported HTTP method"); + } } } diff --git a/src/main/java/com/github/peterbencze/serritor/api/web/AccessControlConfiguration.java b/src/main/java/com/github/peterbencze/serritor/api/web/AccessControlConfiguration.java new file mode 100644 index 0000000..df49d4d --- /dev/null +++ b/src/main/java/com/github/peterbencze/serritor/api/web/AccessControlConfiguration.java @@ -0,0 +1,156 @@ +/* + * Copyright 2019 Peter Bencze. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.github.peterbencze.serritor.api.web; + +import java.util.ArrayList; +import java.util.List; +import java.util.Optional; +import org.apache.commons.lang3.ArrayUtils; +import org.apache.commons.lang3.StringUtils; +import org.apache.commons.lang3.Validate; + +/** + * Configuration of the access management. + */ +public final class AccessControlConfiguration { + + private final List users; + private final byte[] secretKey; + private final boolean isCookieAuthenticationEnabled; + + private AccessControlConfiguration(final AccessControlConfigurationBuilder builder) { + users = builder.users; + secretKey = builder.secretKey; + isCookieAuthenticationEnabled = builder.isCookieAuthenticationEnabled; + } + + /** + * Returns the user with the given username. + * + * @param username the username of the user + * + * @return the user with the given username + */ + public Optional getUser(final String username) { + return users.stream() + .filter(user -> StringUtils.equalsIgnoreCase(user.getUsername(), username)) + .findFirst(); + } + + /** + * Returns the secret key to be used for the JWT signing algorithm. + * + * @return the secret key to be used for the JWT signing algorithm + */ + public Optional getSecretKey() { + return Optional.ofNullable(secretKey); + } + + /** + * Indicates if the JWT can be stored in a cookie. + * + * @return true if the JWT can be stored in a cookie, false otherwise + */ + public boolean isCookieAuthenticationEnabled() { + return isCookieAuthenticationEnabled; + } + + /** + * Builder for {@link AccessControlConfiguration}. + */ + public static final class AccessControlConfigurationBuilder { + + private final List users; + + private byte[] secretKey; + private boolean isCookieAuthenticationEnabled; + + /** + * Creates a {@link AccessControlConfigurationBuilder} instance. 
+ * + * @param rootUser the root user + */ + public AccessControlConfigurationBuilder(final User rootUser) { + Validate.notNull(rootUser, "The rootUser parameter cannot be null"); + + users = new ArrayList<>(); + users.add(rootUser); + } + + /** + * Adds a user to the list of users who have access to the web API. + * + * @param newUser the user to add + * + * @return the AccessControlConfigurationBuilder instance + */ + public AccessControlConfigurationBuilder addUser(final User newUser) { + Validate.notNull(newUser, "The newUser parameter cannot be null"); + Validate.isTrue(isUniqueUsername(newUser.getUsername()), "Username must be unique"); + + users.add(newUser); + return this; + } + + /** + * Sets the secret key to be used for the JWT signing algorithm. If it is not specified, a + * default will be generated. + * + * @param secretKey the secret key + * + * @return the AccessControlConfigurationBuilder instance + */ + public AccessControlConfigurationBuilder setSecretKey(final Byte[] secretKey) { + Validate.notEmpty(secretKey, "The secretKey parameter cannot be empty"); + this.secretKey = ArrayUtils.toPrimitive(secretKey); + + return this; + } + + /** + * If enabled, the JWT will be stored in a cookie. + * + * @return the AccessControlConfigurationBuilder instance + */ + public AccessControlConfigurationBuilder enableCookieAuthentication() { + isCookieAuthenticationEnabled = true; + return this; + } + + /** + * Builds the configured AccessControlConfiguration instance. + * + * @return the configured AccessControlConfiguration instance + */ + public AccessControlConfiguration build() { + return new AccessControlConfiguration(this); + } + + /** + * Indicates if the provided username has not yet been associated with a previously added + * user. 
+ * + * @param username the username to check + * + * @return true if the username is free, false otherwise + */ + private boolean isUniqueUsername(final String username) { + return users.stream() + .noneMatch(user -> StringUtils.equalsIgnoreCase(user.getUsername(), username)); + } + } +} diff --git a/src/main/java/com/github/peterbencze/serritor/api/web/SslContextConfiguration.java b/src/main/java/com/github/peterbencze/serritor/api/web/SslContextConfiguration.java new file mode 100644 index 0000000..c24f902 --- /dev/null +++ b/src/main/java/com/github/peterbencze/serritor/api/web/SslContextConfiguration.java @@ -0,0 +1,87 @@ +/* + * Copyright 2019 Peter Bencze. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.github.peterbencze.serritor.api.web; + +import java.util.Optional; +import org.apache.commons.lang3.Validate; + +/** + * Configuration of the SSL context. + */ +public final class SslContextConfiguration { + + private final String keyStorePath; + private final String keyStorePassword; + private final String keyManagerPassword; + + /** + * Creates a {@link SslContextConfiguration} instance. 
+ * + * @param keyStorePath the path to the keystore file + * @param keyStorePassword the password for the keystore + * @param keyManagerPassword the password for the key manager + */ + public SslContextConfiguration( + final String keyStorePath, + final String keyStorePassword, + final String keyManagerPassword) { + Validate.notBlank(keyStorePath, "The keyStorePath parameter cannot be null or blank"); + Validate.notBlank(keyStorePassword, + "The keyStorePassword parameter cannot be null or blank"); + + this.keyStorePath = keyStorePath; + this.keyStorePassword = keyStorePassword; + this.keyManagerPassword = keyManagerPassword; + } + + /** + * Creates a {@link SslContextConfiguration} instance. + * + * @param keyStorePath the path to the keystore file + * @param keyStorePassword the password for the keystore + */ + public SslContextConfiguration(final String keyStorePath, final String keyStorePassword) { + this(keyStorePath, keyStorePassword, null); + } + + /** + * Returns the path to the keystore file. + * + * @return the path to the keystore file + */ + public String getKeyStorePath() { + return keyStorePath; + } + + /** + * Returns the password for the keystore. + * + * @return the password for the keystore + */ + public String getKeyStorePassword() { + return keyStorePassword; + } + + /** + * Returns the password for the key manager. + * + * @return the password for the key manager + */ + public Optional getKeyManagerPassword() { + return Optional.ofNullable(keyManagerPassword); + } +} diff --git a/src/main/java/com/github/peterbencze/serritor/api/web/User.java b/src/main/java/com/github/peterbencze/serritor/api/web/User.java new file mode 100644 index 0000000..5609e1a --- /dev/null +++ b/src/main/java/com/github/peterbencze/serritor/api/web/User.java @@ -0,0 +1,73 @@ +/* + * Copyright 2019 Peter Bencze. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.github.peterbencze.serritor.api.web; + +import org.apache.commons.lang3.Validate; + +/** + * Represents a user of the web API. + */ +public final class User { + + private final String username; + private final String passwordHash; + + /** + * Creates a {@link User} instance. + * + * @param username the unique username of the user + * @param passwordHash the BCrypt hash of the user's password + */ + public User(final String username, final String passwordHash) { + Validate.notBlank(username, "The username parameter cannot be null or blank"); + Validate.notBlank(passwordHash, "The passwordHash parameter cannot be null or blank"); + Validate.isTrue(isSupportedSaltVersion(passwordHash), + "Unsupported BCrypt salt version (only $2$ or $2a$ are supported)"); + + this.username = username; + this.passwordHash = passwordHash; + } + + /** + * Returns the username of the user. + * + * @return the username of the user + */ + public String getUsername() { + return username; + } + + /** + * Returns the BCrypt hash of the user's password. + * + * @return the BCrypt hash of the user's password + */ + public String getPasswordHash() { + return passwordHash; + } + + /** + * Determines if the version of the BCrypt algorithm used to create the hash is supported. 
+ * + * @param passwordHash the BCrypt hash + * + * @return true if the version is supported, false otherwise + */ + private static boolean isSupportedSaltVersion(final String passwordHash) { + return passwordHash.startsWith("$2$") || passwordHash.startsWith("$2a$"); + } +} diff --git a/src/main/java/com/github/peterbencze/serritor/api/web/WebApiConfiguration.java b/src/main/java/com/github/peterbencze/serritor/api/web/WebApiConfiguration.java new file mode 100644 index 0000000..17b1598 --- /dev/null +++ b/src/main/java/com/github/peterbencze/serritor/api/web/WebApiConfiguration.java @@ -0,0 +1,171 @@ +/* + * Copyright 2019 Peter Bencze. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.github.peterbencze.serritor.api.web; + +import java.util.ArrayList; +import java.util.List; +import java.util.Optional; +import org.apache.commons.lang3.Validate; + +/** + * Configuration for the web API. + */ +public final class WebApiConfiguration { + + private final int port; + private final List corsOrigins; + private final SslContextConfiguration sslContextConfig; + private final AccessControlConfiguration accessControlConfig; + + private WebApiConfiguration(final WebApiConfigurationBuilder builder) { + port = builder.port; + corsOrigins = builder.corsOrigins; + sslContextConfig = builder.sslContextConfig; + accessControlConfig = builder.accessControlConfig; + } + + /** + * Returns the default configuration of the web API. 
+ * + * @return the default configuration of the web API + */ + public static WebApiConfiguration createDefault() { + return new WebApiConfigurationBuilder().build(); + } + + /** + * Returns the port number used by the web server. + * + * @return the port number used by the web server + */ + public int getPort() { + return port; + } + + /** + * Returns the list of allowed CORS origins. + * + * @return the list of allowed CORS origins + */ + public List getCorsOrigins() { + return corsOrigins; + } + + /** + * Returns the SSL context configuration. + * + * @return the SSL context configuration + */ + public Optional getSslContextConfiguration() { + return Optional.ofNullable(sslContextConfig); + } + + /** + * Returns the access control configuration. + * + * @return the access control configuration + */ + public Optional getAccessControlConfiguration() { + return Optional.ofNullable(accessControlConfig); + } + + /** + * Builder for {@link WebApiConfiguration}. + */ + public static final class WebApiConfigurationBuilder { + + private static final int DEFAULT_PORT = 8080; + + private final List corsOrigins; + + private int port; + private SslContextConfiguration sslContextConfig; + private AccessControlConfiguration accessControlConfig; + + /** + * Creates a {@link WebApiConfigurationBuilder} instance. + */ + public WebApiConfigurationBuilder() { + corsOrigins = new ArrayList<>(); + port = DEFAULT_PORT; + } + + /** + * Sets the port number to be used by the web server. + * + * @param port the port number to use + * + * @return the WebApiConfigurationBuilder instance + */ + public WebApiConfigurationBuilder setPort(final int port) { + this.port = port; + return this; + } + + /** + * Configures the web server to accept cross origin requests for the specific origin. The + * wildcard symbol "*" can be used to enable CORS for all origins. 
+ * + * @param origin the origin from which the server should accept cross origin requests + * + * @return the WebApiConfigurationBuilder instance + */ + public WebApiConfigurationBuilder enableCorsForOrigin(final String origin) { + corsOrigins.add(origin); + return this; + } + + /** + * Enables the use of SSL. + * + * @param sslContextConfig the SSL context configuration + * + * @return the WebApiConfigurationBuilder instance + */ + public WebApiConfigurationBuilder withSsl(final SslContextConfiguration sslContextConfig) { + Validate.notNull(sslContextConfig, "The sslContextConfig parameter cannot be null"); + + this.sslContextConfig = sslContextConfig; + return this; + } + + /** + * Enables access control. + * + * @param accessControlConfig the access control configuration + * + * @return the WebApiConfigurationBuilder instance + */ + public WebApiConfigurationBuilder withAccessControl( + final AccessControlConfiguration accessControlConfig) { + Validate.notNull(accessControlConfig, + "The accessControlConfig parameter cannot be null"); + + this.accessControlConfig = accessControlConfig; + return this; + } + + /** + * Builds the configured WebApiConfiguration instance. + * + * @return the configured WebApiConfiguration instance + */ + public WebApiConfiguration build() { + return new WebApiConfiguration(this); + } + } +} diff --git a/src/main/java/com/github/peterbencze/serritor/internal/web/ApiEndpoint.java b/src/main/java/com/github/peterbencze/serritor/internal/web/ApiEndpoint.java new file mode 100644 index 0000000..dd0d5e4 --- /dev/null +++ b/src/main/java/com/github/peterbencze/serritor/internal/web/ApiEndpoint.java @@ -0,0 +1,69 @@ +/* + * Copyright 2019 Peter Bencze. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.github.peterbencze.serritor.internal.web; + +import io.javalin.security.Role; +import java.util.Collections; +import java.util.Set; + +/** + * Represents an endpoint of the web API. + */ +public enum ApiEndpoint { + + LOGIN(HttpMethod.POST, "api/auth", Collections.singleton(UserRole.UNAUTHENTICATED)), + STOP_CRAWLER(HttpMethod.DELETE, "api/crawler", Collections.singleton(UserRole.AUTHENTICATED)), + GET_CONFIG(HttpMethod.GET, "api/crawler/config", Collections.singleton(UserRole.AUTHENTICATED)), + GET_STATS(HttpMethod.GET, "api/crawler/stats", Collections.singleton(UserRole.AUTHENTICATED)); + + private final HttpMethod httpMethod; + private final String path; + private final Set userRoles; + + ApiEndpoint(final HttpMethod httpMethod, final String path, final Set userRoles) { + this.httpMethod = httpMethod; + this.path = path; + this.userRoles = userRoles; + } + + /** + * Returns the HTTP method associated with the endpoint. + * + * @return the HTTP method associated with the endpoint + */ + public HttpMethod getHttpMethod() { + return httpMethod; + } + + /** + * Returns the path of the endpoint. + * + * @return the path of the endpoint + */ + public String getPath() { + return path; + } + + /** + * Returns the user roles associated with the endpoint. 
+ * + * @return the user roles associated with the endpoint + */ + public Set getUserRoles() { + return userRoles; + } +} diff --git a/src/main/java/com/github/peterbencze/serritor/internal/web/HttpMethod.java b/src/main/java/com/github/peterbencze/serritor/internal/web/HttpMethod.java new file mode 100644 index 0000000..bffeea3 --- /dev/null +++ b/src/main/java/com/github/peterbencze/serritor/internal/web/HttpMethod.java @@ -0,0 +1,32 @@ +/* + * Copyright 2019 Peter Bencze. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.github.peterbencze.serritor.internal.web; + +/** + * Specifies the possible HTTP methods. + */ +public enum HttpMethod { + HEAD, + GET, + POST, + PUT, + PATCH, + DELETE, + CONNECT, + OPTIONS, + TRACE +} diff --git a/src/main/java/com/github/peterbencze/serritor/internal/web/UserRole.java b/src/main/java/com/github/peterbencze/serritor/internal/web/UserRole.java new file mode 100644 index 0000000..a376204 --- /dev/null +++ b/src/main/java/com/github/peterbencze/serritor/internal/web/UserRole.java @@ -0,0 +1,28 @@ +/* + * Copyright 2019 Peter Bencze. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.github.peterbencze.serritor.internal.web; + +import io.javalin.security.Role; + +/** + * Represents a user's role. + */ +public enum UserRole implements Role { + + UNAUTHENTICATED, + AUTHENTICATED +} diff --git a/src/main/java/com/github/peterbencze/serritor/internal/web/accessmanager/JwtAccessManager.java b/src/main/java/com/github/peterbencze/serritor/internal/web/accessmanager/JwtAccessManager.java new file mode 100644 index 0000000..e8d0baf --- /dev/null +++ b/src/main/java/com/github/peterbencze/serritor/internal/web/accessmanager/JwtAccessManager.java @@ -0,0 +1,76 @@ +/* + * Copyright 2019 Peter Bencze. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.github.peterbencze.serritor.internal.web.accessmanager; + +import com.auth0.jwt.JWT; +import com.auth0.jwt.algorithms.Algorithm; +import com.auth0.jwt.exceptions.JWTVerificationException; +import com.auth0.jwt.interfaces.JWTVerifier; +import com.github.peterbencze.serritor.internal.web.UserRole; +import com.github.peterbencze.serritor.internal.web.handler.JwtHandler; +import io.javalin.Context; +import io.javalin.Handler; +import io.javalin.UnauthorizedResponse; +import io.javalin.security.AccessManager; +import io.javalin.security.Role; +import java.util.Set; + +/** + * A JWT-based access manager. + */ +public final class JwtAccessManager implements AccessManager { + + private final Algorithm signerAlgorithm; + + /** + * Creates a {@link JwtAccessManager} instance. + * + * @param signerAlgorithm the algorithm used for signing JWTs + */ + public JwtAccessManager(final Algorithm signerAlgorithm) { + this.signerAlgorithm = signerAlgorithm; + } + + /** + * Checks if the user is allowed to access the specific endpoint. 
+ * + * @param handler the request handler + * @param ctx the context object + * @param permittedRoles the set of permitted roles + */ + @Override + public void manage( + final Handler handler, + final Context ctx, + final Set permittedRoles) throws Exception { + if (!permittedRoles.contains(UserRole.UNAUTHENTICATED)) { + String jwt = ctx.attribute(JwtHandler.CONTEXT_ATTRIBUTE_NAME); + if (jwt == null) { + throw new UnauthorizedResponse(); + } + + JWTVerifier verifier = JWT.require(signerAlgorithm).build(); + try { + verifier.verify(jwt); + } catch (JWTVerificationException e) { + throw new UnauthorizedResponse(); + } + } + + handler.handle(ctx); + } +} diff --git a/src/main/java/com/github/peterbencze/serritor/internal/web/accessmanager/NoopAccessManager.java b/src/main/java/com/github/peterbencze/serritor/internal/web/accessmanager/NoopAccessManager.java new file mode 100644 index 0000000..3e51aba --- /dev/null +++ b/src/main/java/com/github/peterbencze/serritor/internal/web/accessmanager/NoopAccessManager.java @@ -0,0 +1,44 @@ +/* + * Copyright 2019 Peter Bencze. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.github.peterbencze.serritor.internal.web.accessmanager; + +import io.javalin.Context; +import io.javalin.Handler; +import io.javalin.security.AccessManager; +import io.javalin.security.Role; +import java.util.Set; + +/** + * A no-operation access manager that is used when access control is disabled. 
+ */ +public final class NoopAccessManager implements AccessManager { + + /** + * Simply lets the request pass through without credential checking. + * + * @param handler the request handler + * @param ctx the context object + * @param permittedRoles a set of permitted roles + */ + @Override + public void manage( + final Handler handler, + final Context ctx, + final Set permittedRoles) throws Exception { + handler.handle(ctx); + } +} diff --git a/src/main/java/com/github/peterbencze/serritor/internal/web/dto/JwtDto.java b/src/main/java/com/github/peterbencze/serritor/internal/web/dto/JwtDto.java new file mode 100644 index 0000000..63aa09e --- /dev/null +++ b/src/main/java/com/github/peterbencze/serritor/internal/web/dto/JwtDto.java @@ -0,0 +1,69 @@ +/* + * Copyright 2019 Peter Bencze. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.github.peterbencze.serritor.internal.web.dto; + +import java.util.Date; + +/** + * A DTO that is used to send back the generated JWT to the user. + */ +public final class JwtDto { + + private final String username; + private final Date expiryDate; + private final String jwt; + + /** + * Creates a {@link JwtDto} instance. 
+ * + * @param username the username of the authenticated user + * @param expiryDate the expiry date of the JWT + * @param jwt the generated JWT + */ + public JwtDto(final String username, final Date expiryDate, final String jwt) { + this.username = username; + this.expiryDate = expiryDate; + this.jwt = jwt; + } + + /** + * Returns the username of the authenticated user. + * + * @return the username of the authenticated user + */ + public String getUsername() { + return username; + } + + /** + * Returns the expiry date of the JWT. + * + * @return the expiry date of the JWT + */ + public Date getExpiryDate() { + return expiryDate; + } + + /** + * Returns the generated JWT. + * + * @return the generated JWT + */ + public String getJwt() { + return jwt; + } +} diff --git a/src/main/java/com/github/peterbencze/serritor/internal/web/dto/LoginDto.java b/src/main/java/com/github/peterbencze/serritor/internal/web/dto/LoginDto.java new file mode 100644 index 0000000..e13cfa2 --- /dev/null +++ b/src/main/java/com/github/peterbencze/serritor/internal/web/dto/LoginDto.java @@ -0,0 +1,59 @@ +/* + * Copyright 2019 Peter Bencze. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.github.peterbencze.serritor.internal.web.dto; + +import com.fasterxml.jackson.annotation.JsonProperty; + +/** + * A DTO that is used to send the user authentication credentials to the web server. 
+ */ +public final class LoginDto { + + private final String username; + private final String password; + + /** + * Creates a {@link LoginDto} instance. + * + * @param username the username of the user + * @param password the password of the user + */ + public LoginDto( + @JsonProperty("username") final String username, + @JsonProperty("password") final String password) { + this.username = username; + this.password = password; + } + + /** + * Returns the username of the user. + * + * @return the username of the user + */ + public String getUsername() { + return username; + } + + /** + * Returns the password of the user. + * + * @return the password of the user + */ + public String getPassword() { + return password; + } +} diff --git a/src/main/java/com/github/peterbencze/serritor/internal/web/handler/JwtHandler.java b/src/main/java/com/github/peterbencze/serritor/internal/web/handler/JwtHandler.java new file mode 100644 index 0000000..4eca563 --- /dev/null +++ b/src/main/java/com/github/peterbencze/serritor/internal/web/handler/JwtHandler.java @@ -0,0 +1,75 @@ +/* + * Copyright 2019 Peter Bencze. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.github.peterbencze.serritor.internal.web.handler; + +import io.javalin.Context; +import io.javalin.Handler; +import java.util.Optional; + +/** + * A before-handler which extracts the JWT from the Authorization header or the cookie. 
+ */ +public final class JwtHandler implements Handler { + + public static final String CONTEXT_ATTRIBUTE_NAME = "JWT"; + static final String COOKIE_NAME = "JWT"; + + /** + * Extracts JWT from the Authorization header or the cookie. + * + * @param ctx the context object + */ + @Override + public void handle(final Context ctx) throws Exception { + Optional jwtFromHeaderOpt = extractJwtFromHeader(ctx); + if (jwtFromHeaderOpt.isPresent()) { + ctx.attribute(CONTEXT_ATTRIBUTE_NAME, jwtFromHeaderOpt.get()); + } else { + extractJwtFromCookie(ctx).ifPresent(jwt -> ctx.attribute(CONTEXT_ATTRIBUTE_NAME, jwt)); + } + } + + /** + * Returns the JWT from the Authorization header. + * + * @param ctx the context object + * + * @return the JWT from the Authorization header + */ + private static Optional extractJwtFromHeader(final Context ctx) { + return Optional.ofNullable(ctx.header("Authorization")) + .flatMap(header -> { + String[] headerValueParts = header.split(" "); + if (headerValueParts.length != 2 || !"Bearer".equals(headerValueParts[0])) { + return Optional.empty(); + } + + return Optional.of(headerValueParts[1]); + }); + } + + /** + * Returns the JWT from the cookie. + * + * @param ctx the context object + * + * @return the JWT from the cookie + */ + private static Optional extractJwtFromCookie(final Context ctx) { + return Optional.ofNullable(ctx.cookie(COOKIE_NAME)); + } +} diff --git a/src/main/java/com/github/peterbencze/serritor/internal/web/handler/LoginHandler.java b/src/main/java/com/github/peterbencze/serritor/internal/web/handler/LoginHandler.java new file mode 100644 index 0000000..04ff7b6 --- /dev/null +++ b/src/main/java/com/github/peterbencze/serritor/internal/web/handler/LoginHandler.java @@ -0,0 +1,102 @@ +/* + * Copyright 2019 Peter Bencze. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.github.peterbencze.serritor.internal.web.handler; + +import com.auth0.jwt.JWT; +import com.auth0.jwt.algorithms.Algorithm; +import com.github.peterbencze.serritor.api.web.AccessControlConfiguration; +import com.github.peterbencze.serritor.api.web.User; +import com.github.peterbencze.serritor.internal.web.dto.JwtDto; +import com.github.peterbencze.serritor.internal.web.dto.LoginDto; +import io.javalin.Context; +import io.javalin.Handler; +import io.javalin.UnauthorizedResponse; +import java.security.SecureRandom; +import java.time.Duration; +import java.time.Instant; +import java.util.Base64; +import java.util.Date; +import org.mindrot.jbcrypt.BCrypt; + +/** + * A handler that is used to verify the authentication credentials of the user. + */ +public final class LoginHandler implements Handler { + + private final AccessControlConfiguration accessControlConfig; + private final Algorithm signerAlgorithm; + + /** + * Creates a {@link LoginHandler} instance. + * + * @param accessControlConfig the access control configuration + * @param signerAlgorithm the algorithm used for signing JWTs + */ + public LoginHandler( + final AccessControlConfiguration accessControlConfig, + final Algorithm signerAlgorithm) { + this.accessControlConfig = accessControlConfig; + this.signerAlgorithm = signerAlgorithm; + } + + /** + * Verifies the authentication credentials of the user. 
+ * + * @param ctx the context object + */ + @Override + public void handle(final Context ctx) throws Exception { + LoginDto loginDto = ctx.bodyAsClass(LoginDto.class); + + User user = accessControlConfig.getUser(loginDto.getUsername()) + .orElseThrow(UnauthorizedResponse::new); + + if (!BCrypt.checkpw(loginDto.getPassword(), user.getPasswordHash())) { + throw new UnauthorizedResponse(); + } + + Duration tokenValidDuration = Duration.ofHours(1); + Date expiryDate = Date.from(Instant.now().plus(tokenValidDuration)); + String jwt = JWT.create() + .withExpiresAt(expiryDate) + .withClaim("username", user.getUsername()) + .sign(signerAlgorithm); + + if (accessControlConfig.isCookieAuthenticationEnabled()) { + int cookieAgeInSeconds = Math.toIntExact(tokenValidDuration.getSeconds()); + + ctx.cookie(JwtHandler.COOKIE_NAME, jwt, cookieAgeInSeconds); + ctx.cookie(XsrfTokenHandler.COOKIE_NAME, generateXsrfToken(), cookieAgeInSeconds); + } else { + ctx.json(new JwtDto(user.getUsername(), expiryDate, jwt)); + } + } + + /** + * Generates a random 128-bit XSRF token. + * + * @return the generated XSRF token + */ + private static String generateXsrfToken() { + SecureRandom secureRandom = new SecureRandom(); + byte[] randomBytes = new byte[16]; + + secureRandom.nextBytes(randomBytes); + + return Base64.getUrlEncoder().withoutPadding().encodeToString(randomBytes); + } +} diff --git a/src/main/java/com/github/peterbencze/serritor/internal/web/handler/XsrfTokenHandler.java b/src/main/java/com/github/peterbencze/serritor/internal/web/handler/XsrfTokenHandler.java new file mode 100644 index 0000000..a30a369 --- /dev/null +++ b/src/main/java/com/github/peterbencze/serritor/internal/web/handler/XsrfTokenHandler.java @@ -0,0 +1,57 @@ +/* + * Copyright 2019 Peter Bencze. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.github.peterbencze.serritor.internal.web.handler; + +import com.github.peterbencze.serritor.internal.web.HttpMethod; +import io.javalin.Context; +import io.javalin.Handler; +import io.javalin.UnauthorizedResponse; +import java.util.Arrays; +import java.util.List; +import java.util.Optional; + +/** + * A before-handler that is responsible for the validation of the XSRF token header if an XSRF + * cookie is present in the request. + */ +public final class XsrfTokenHandler implements Handler { + + static final String COOKIE_NAME = "XSRF-TOKEN"; + static final String HEADER_NAME = "X-XSRF-TOKEN"; + + private static final List XSRF_SAFE_HTTP_METHODS + = Arrays.asList(HttpMethod.HEAD, HttpMethod.GET, HttpMethod.OPTIONS, HttpMethod.TRACE); + + /** + * Verifies that the XSRF token present in the cookie matches the one present in the header. 
+ * + * @param ctx the context object + */ + @Override + public void handle(final Context ctx) throws Exception { + HttpMethod requestMethod = HttpMethod.valueOf(ctx.method()); + if (XSRF_SAFE_HTTP_METHODS.contains(requestMethod)) { + return; + } + + Optional.ofNullable(ctx.cookie(COOKIE_NAME)).ifPresent(xsrfTokenInCookie -> { + if (!xsrfTokenInCookie.equals(ctx.header(HEADER_NAME))) { + throw new UnauthorizedResponse("XSRF token missing or incorrect"); + } + }); + } +} diff --git a/src/test/java/com/github/peterbencze/serritor/internal/web/accessmanager/JwtAccessManagerTest.java b/src/test/java/com/github/peterbencze/serritor/internal/web/accessmanager/JwtAccessManagerTest.java new file mode 100644 index 0000000..70c8048 --- /dev/null +++ b/src/test/java/com/github/peterbencze/serritor/internal/web/accessmanager/JwtAccessManagerTest.java @@ -0,0 +1,82 @@ +/* + * Copyright 2019 Peter Bencze. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.github.peterbencze.serritor.internal.web.accessmanager; + +import com.auth0.jwt.algorithms.Algorithm; +import com.auth0.jwt.exceptions.SignatureVerificationException; +import com.auth0.jwt.interfaces.DecodedJWT; +import com.github.peterbencze.serritor.internal.web.UserRole; +import com.github.peterbencze.serritor.internal.web.handler.JwtHandler; +import io.javalin.Context; +import io.javalin.Handler; +import io.javalin.UnauthorizedResponse; +import io.javalin.security.Role; +import java.util.Set; +import org.junit.Before; +import org.junit.Test; +import org.mockito.Mockito; + +/** + * Test cases for {@link JwtAccessManager}. + */ +public final class JwtAccessManagerTest { + + private Handler handlerMock; + private Context contextMock; + private Set permittedRolesMock; + + private Algorithm signerAlgorithm; + private JwtAccessManager jwtAccessManager; + + @Before + public void before() { + handlerMock = Mockito.mock(Handler.class); + contextMock = Mockito.mock(Context.class); + permittedRolesMock = Mockito.mock(Set.class); + + signerAlgorithm = Mockito.spy(Algorithm.HMAC256("secret")); + jwtAccessManager = new JwtAccessManager(signerAlgorithm); + } + + @Test(expected = UnauthorizedResponse.class) + public void testManageWhenEndpointIsRestrictedAndJwtIsNotPresent() throws Exception { + Mockito.when(permittedRolesMock.contains(Mockito.any(UserRole.class))).thenReturn(false); + Mockito.when(contextMock.attribute(JwtHandler.CONTEXT_ATTRIBUTE_NAME)).thenReturn(null); + + jwtAccessManager.manage(handlerMock, contextMock, permittedRolesMock); + } + + @Test(expected = UnauthorizedResponse.class) + public void testManageWhenEndpointIsRestrictedAndJwtIsInvalid() throws Exception { + Mockito.when(permittedRolesMock.contains(Mockito.any(UserRole.class))).thenReturn(false); + Mockito.when(contextMock.attribute(JwtHandler.CONTEXT_ATTRIBUTE_NAME)) + .thenReturn("eyJhbGciOiJIUzI1NiJ9.e30.XmNK3GpH3Ys_7wsYBfq4C3M6goz71I7dTgUkuIa5lyQ"); + 
Mockito.doThrow(SignatureVerificationException.class).when(signerAlgorithm) + .verify(Mockito.any(DecodedJWT.class)); + + jwtAccessManager.manage(handlerMock, contextMock, permittedRolesMock); + } + + @Test + public void testManageWhenEndpointIsNotRestricted() throws Exception { + Mockito.when(permittedRolesMock.contains(Mockito.any(UserRole.class))).thenReturn(true); + + jwtAccessManager.manage(handlerMock, contextMock, permittedRolesMock); + + Mockito.verify(handlerMock).handle(Mockito.eq(contextMock)); + } +} diff --git a/src/test/java/com/github/peterbencze/serritor/internal/web/handler/JwtHandlerTest.java b/src/test/java/com/github/peterbencze/serritor/internal/web/handler/JwtHandlerTest.java new file mode 100644 index 0000000..6b5a017 --- /dev/null +++ b/src/test/java/com/github/peterbencze/serritor/internal/web/handler/JwtHandlerTest.java @@ -0,0 +1,59 @@ +/* + * Copyright 2019 Peter Bencze. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.github.peterbencze.serritor.internal.web.handler; + +import io.javalin.Context; +import org.junit.Before; +import org.junit.Test; +import org.mockito.Mockito; + +/** + * Test cases for {@link JwtHandler}. 
+ */ +public final class JwtHandlerTest { + + private static final String JWT = "foo.bar.baz"; + + private Context contextMock; + + private JwtHandler jwtHandler; + + @Before + public void before() { + contextMock = Mockito.mock(Context.class); + + jwtHandler = new JwtHandler(); + } + + @Test + public void testHandleWhenJwtIsPresentInHeader() throws Exception { + Mockito.when(contextMock.header(Mockito.eq("Authorization"))).thenReturn("Bearer " + JWT); + + jwtHandler.handle(contextMock); + + Mockito.verify(contextMock).attribute(JwtHandler.CONTEXT_ATTRIBUTE_NAME, JWT); + } + + @Test + public void testHandleWhenJwtIsPresentInCookie() throws Exception { + Mockito.when(contextMock.cookie(JwtHandler.COOKIE_NAME)).thenReturn(JWT); + + jwtHandler.handle(contextMock); + + Mockito.verify(contextMock).attribute(JwtHandler.CONTEXT_ATTRIBUTE_NAME, JWT); + } +} diff --git a/src/test/java/com/github/peterbencze/serritor/internal/web/handler/LoginHandlerTest.java b/src/test/java/com/github/peterbencze/serritor/internal/web/handler/LoginHandlerTest.java new file mode 100644 index 0000000..79c4d23 --- /dev/null +++ b/src/test/java/com/github/peterbencze/serritor/internal/web/handler/LoginHandlerTest.java @@ -0,0 +1,118 @@ +/* + * Copyright 2019 Peter Bencze. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.github.peterbencze.serritor.internal.web.handler; + +import com.auth0.jwt.algorithms.Algorithm; +import com.github.peterbencze.serritor.api.web.AccessControlConfiguration; +import com.github.peterbencze.serritor.api.web.User; +import com.github.peterbencze.serritor.internal.web.dto.JwtDto; +import com.github.peterbencze.serritor.internal.web.dto.LoginDto; +import io.javalin.Context; +import io.javalin.UnauthorizedResponse; +import java.util.Optional; +import org.junit.Before; +import org.junit.Test; +import org.mockito.Mockito; + +/** + * Test cases for {@link LoginHandler}. + */ +public final class LoginHandlerTest { + + private static final String INCORRECT_PASSWORD_HASH + = "$2a$10$Jh4rXRRgeI6WDsb8X7XXpuOJlF1ntM6OJ4ObdNiEaI0AH6d4Lcmky"; + private static final String CORRECT_PASSWORD_HASH + = "$2a$10$baEfqZy/tI3RoKlxQk6jGe9L5nf3NMTEOSWKasVArYH3Ki44pNSU2"; + + private AccessControlConfiguration accessControlConfigMock; + private Context contextMock; + private User userMock; + + private LoginHandler loginHandler; + + @Before + public void before() { + accessControlConfigMock = Mockito.mock(AccessControlConfiguration.class); + + LoginDto loginDtoMock = Mockito.mock(LoginDto.class); + Mockito.when(loginDtoMock.getUsername()).thenReturn("foo"); + Mockito.when(loginDtoMock.getPassword()).thenReturn("bar"); + + contextMock = Mockito.mock(Context.class); + Mockito.when(contextMock.bodyAsClass(LoginDto.class)).thenReturn(loginDtoMock); + + userMock = Mockito.mock(User.class); + + Algorithm signerAlgorithm = Mockito.spy(Algorithm.HMAC256("secret")); + loginHandler = new LoginHandler(accessControlConfigMock, signerAlgorithm); + } + + @Test(expected = UnauthorizedResponse.class) + public void testHandleWhenUserDoesNotExist() throws Exception { + Mockito.when(accessControlConfigMock.getUser(Mockito.anyString())) + .thenReturn(Optional.empty()); + + loginHandler.handle(contextMock); + } + + @Test(expected = UnauthorizedResponse.class) + public void 
testHandleWhenPasswordIsIncorrect() throws Exception { + Mockito.when(userMock.getPasswordHash()).thenReturn(INCORRECT_PASSWORD_HASH); + + Mockito.when(accessControlConfigMock.getUser(Mockito.anyString())) + .thenReturn(Optional.of(userMock)); + + loginHandler.handle(contextMock); + } + + @Test + public void testHandleWhenPasswordIsCorrectAndCookieAuthenticationIsDisabled() + throws Exception { + Mockito.when(userMock.getPasswordHash()).thenReturn(CORRECT_PASSWORD_HASH); + + Mockito.when(accessControlConfigMock.getUser(Mockito.anyString())) + .thenReturn(Optional.of(userMock)); + Mockito.when(accessControlConfigMock.isCookieAuthenticationEnabled()).thenReturn(false); + + loginHandler.handle(contextMock); + + Mockito.verify(contextMock).json(Mockito.any(JwtDto.class)); + Mockito.verify(contextMock, Mockito.never()) + .cookie(Mockito.eq(JwtHandler.COOKIE_NAME), Mockito.anyString(), Mockito.anyInt()); + Mockito.verify(contextMock, Mockito.never()) + .cookie(Mockito.eq(XsrfTokenHandler.COOKIE_NAME), Mockito.anyString(), + Mockito.anyInt()); + } + + @Test + public void testHandleWhenPasswordIsCorrectAndCookieAuthenticationIsEnabled() throws Exception { + Mockito.when(userMock.getPasswordHash()).thenReturn(CORRECT_PASSWORD_HASH); + + Mockito.when(accessControlConfigMock.getUser(Mockito.anyString())) + .thenReturn(Optional.of(userMock)); + Mockito.when(accessControlConfigMock.isCookieAuthenticationEnabled()).thenReturn(true); + + loginHandler.handle(contextMock); + + Mockito.verify(contextMock, Mockito.never()).json(Mockito.any(JwtDto.class)); + Mockito.verify(contextMock) + .cookie(Mockito.eq(JwtHandler.COOKIE_NAME), Mockito.anyString(), Mockito.anyInt()); + Mockito.verify(contextMock) + .cookie(Mockito.eq(XsrfTokenHandler.COOKIE_NAME), Mockito.anyString(), + Mockito.anyInt()); + } +} diff --git a/src/test/java/com/github/peterbencze/serritor/internal/web/handler/XsrfTokenHandlerTest.java 
b/src/test/java/com/github/peterbencze/serritor/internal/web/handler/XsrfTokenHandlerTest.java new file mode 100644 index 0000000..cc08c27 --- /dev/null +++ b/src/test/java/com/github/peterbencze/serritor/internal/web/handler/XsrfTokenHandlerTest.java @@ -0,0 +1,54 @@ +/* + * Copyright 2019 Peter Bencze. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.github.peterbencze.serritor.internal.web.handler; + +import io.javalin.Context; +import io.javalin.UnauthorizedResponse; +import org.junit.Before; +import org.junit.Test; +import org.mockito.Mockito; + +/** + * Test cases for {@link XsrfTokenHandler}. 
+ */ +public final class XsrfTokenHandlerTest { + + private Context contextMock; + + private XsrfTokenHandler xsrfTokenHandler; + + @Before + public void before() { + contextMock = Mockito.mock(Context.class); + Mockito.when(contextMock.method()).thenReturn("POST"); + Mockito.when(contextMock.cookie(XsrfTokenHandler.COOKIE_NAME)).thenReturn("foo"); + + xsrfTokenHandler = new XsrfTokenHandler(); + } + + @Test(expected = UnauthorizedResponse.class) + public void testHandleWhenHeaderIsNotPresent() throws Exception { + xsrfTokenHandler.handle(contextMock); + } + + @Test(expected = UnauthorizedResponse.class) + public void testHandleWhenHeaderContainsInvalidToken() throws Exception { + Mockito.when(contextMock.header(XsrfTokenHandler.HEADER_NAME)).thenReturn("bar"); + + xsrfTokenHandler.handle(contextMock); + } +} From 11548bd53ed0455c856b840c50bd2426a95bf5f9 Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Tue, 2 Apr 2019 15:28:25 +0200 Subject: [PATCH 50/63] Fix SonarLint warnings --- .../peterbencze/serritor/api/BaseCrawler.java | 7 ++--- .../serritor/api/helper/UrlFinder.java | 28 ++++++++----------- .../serritor/internal/CrawlFrontier.java | 5 ++-- .../serritor/internal/WebDriverFactory.java | 13 +++++---- 4 files changed, 24 insertions(+), 29 deletions(-) diff --git a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java index 35c9c08..c23eee1 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java +++ b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java @@ -76,8 +76,6 @@ /** * Provides a skeletal implementation of a crawler to minimize the effort for users to implement * their own. 
- * - * @author Peter Bencze */ public abstract class BaseCrawler { @@ -489,7 +487,6 @@ private void run() { * * @return the created crawl delay mechanism */ - @SuppressWarnings("checkstyle:MissingSwitchDefault") private CrawlDelayMechanism createCrawlDelayMechanism() { switch (config.getCrawlDelayStrategy()) { case FIXED: @@ -498,9 +495,9 @@ private CrawlDelayMechanism createCrawlDelayMechanism() { return new RandomCrawlDelayMechanism(config); case ADAPTIVE: return new AdaptiveCrawlDelayMechanism(config, (JavascriptExecutor) webDriver); + default: + throw new IllegalArgumentException("Unsupported crawl delay strategy"); } - - throw new IllegalArgumentException("Unsupported crawl delay strategy."); } /** diff --git a/src/main/java/com/github/peterbencze/serritor/api/helper/UrlFinder.java b/src/main/java/com/github/peterbencze/serritor/api/helper/UrlFinder.java index 6e2e62b..910ff5b 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/helper/UrlFinder.java +++ b/src/main/java/com/github/peterbencze/serritor/api/helper/UrlFinder.java @@ -21,7 +21,7 @@ import com.google.common.net.InternetDomainName; import java.net.URI; import java.util.ArrayList; -import java.util.Arrays; +import java.util.Collections; import java.util.HashSet; import java.util.List; import java.util.Set; @@ -36,8 +36,6 @@ /** * Finds URLs in HTML page sources using regular expressions. 
- * - * @author Peter Bencze */ public final class UrlFinder { @@ -77,20 +75,18 @@ public List findUrlsInPage(final CompleteCrawlResponse completeCrawlResp Set foundUrls = new HashSet<>(); // Find elements using the specified locating mechanisms - Set extractedElements = locatingMechanisms.stream() + List extractedElements = locatingMechanisms.stream() .map(completeCrawlResponse.getWebDriver()::findElements) .flatMap(List::stream) - .collect(Collectors.toSet()); + .collect(Collectors.toList()); // Find URLs in the attribute values of the found elements - extractedElements.forEach((WebElement element) -> { - attributes.stream() - .map(element::getAttribute) - .filter(StringUtils::isNotBlank) - .map(this::findUrlsInAttributeValue) - .flatMap(List::stream) - .forEach(foundUrls::add); - }); + extractedElements.forEach((WebElement element) -> attributes.stream() + .map(element::getAttribute) + .filter(StringUtils::isNotBlank) + .map(this::findUrlsInAttributeValue) + .flatMap(List::stream) + .forEach(foundUrls::add)); return new ArrayList<>(foundUrls); } @@ -141,7 +137,7 @@ public static final class UrlFinderBuilder { * @param urlPattern the pattern to use to find URLs */ public UrlFinderBuilder(final Pattern urlPattern) { - this(Arrays.asList(urlPattern)); + this(Collections.singletonList(urlPattern)); } /** @@ -168,7 +164,7 @@ public UrlFinderBuilder(final List urlPatterns) { * @return the UrlFinderBuilder instance */ public UrlFinderBuilder setLocatingMechanism(final By locatingMechanism) { - return setLocatingMechanisms(Arrays.asList(locatingMechanism)); + return setLocatingMechanisms(Collections.singletonList(locatingMechanism)); } /** @@ -210,7 +206,7 @@ public UrlFinderBuilder setAttributes(final List attributes) { * @return the UrlFinderBuilder instance */ public UrlFinderBuilder setAttribute(final String attribute) { - return setAttributes(Arrays.asList(attribute)); + return setAttributes(Collections.singletonList(attribute)); } /** diff --git 
a/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java b/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java index 381eca1..3cdc44c 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java @@ -179,7 +179,6 @@ private static String createFingerprintForUrl(final URI url) { * * @return the priority queue using the strategy specified in the configuration */ - @SuppressWarnings("checkstyle:MissingSwitchDefault") private PriorityQueue createPriorityQueue() { Function crawlDepthGetter = (Function & Serializable) CrawlCandidate::getCrawlDepth; @@ -199,8 +198,8 @@ private PriorityQueue createPriorityQueue() { .thenComparing(priorityGetter, Comparator.reverseOrder()); return new PriorityQueue<>(depthFirstComparator); + default: + throw new IllegalArgumentException("Unsupported crawl strategy"); } - - throw new IllegalArgumentException("Unsupported crawl strategy"); } } diff --git a/src/main/java/com/github/peterbencze/serritor/internal/WebDriverFactory.java b/src/main/java/com/github/peterbencze/serritor/internal/WebDriverFactory.java index 7ea162f..9ed152d 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/WebDriverFactory.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/WebDriverFactory.java @@ -30,11 +30,15 @@ /** * Provides preconfigured {@link WebDriver} instances. - * - * @author Peter Bencze */ public final class WebDriverFactory { + /** + * Private constructor to hide the implicit public one. + */ + private WebDriverFactory() { + } + /** * Creates the specific WebDriver instance with the provided properties. 
* @@ -43,7 +47,6 @@ public final class WebDriverFactory { * * @return the preconfigured WebDriver instance */ - @SuppressWarnings("checkstyle:MissingSwitchDefault") public static WebDriver createWebDriver(final Browser browser, final Capabilities capabilities) { switch (browser) { @@ -53,9 +56,9 @@ public static WebDriver createWebDriver(final Browser browser, return createChromeDriver(capabilities); case FIREFOX: return createFirefoxDriver(capabilities); + default: + throw new IllegalArgumentException("Unsupported browser"); } - - throw new IllegalArgumentException("Unsupported browser."); } /** From 2af9f3b457550c23692c52e41e0b3b597eee0f06 Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Tue, 2 Apr 2019 15:37:08 +0200 Subject: [PATCH 51/63] Remove author Javadoc tags --- .../java/com/github/peterbencze/serritor/api/Browser.java | 2 -- .../peterbencze/serritor/api/CompleteCrawlResponse.java | 2 -- .../github/peterbencze/serritor/api/CrawlDelayStrategy.java | 2 -- .../java/com/github/peterbencze/serritor/api/CrawlStats.java | 2 -- .../com/github/peterbencze/serritor/api/CrawlStrategy.java | 2 -- .../com/github/peterbencze/serritor/api/CrawlerState.java | 2 -- .../github/peterbencze/serritor/api/PartialCrawlResponse.java | 2 -- .../peterbencze/serritor/api/PatternMatchingCallback.java | 2 -- .../peterbencze/serritor/api/event/NetworkErrorEvent.java | 2 -- .../peterbencze/serritor/api/event/NonHtmlContentEvent.java | 2 -- .../github/peterbencze/serritor/api/event/PageLoadEvent.java | 2 -- .../peterbencze/serritor/api/event/PageLoadTimeoutEvent.java | 2 -- .../peterbencze/serritor/api/event/RequestErrorEvent.java | 2 -- .../peterbencze/serritor/api/event/RequestRedirectEvent.java | 2 -- .../com/github/peterbencze/serritor/internal/EventObject.java | 2 -- .../crawldelaymechanism/AdaptiveCrawlDelayMechanism.java | 2 -- .../internal/crawldelaymechanism/CrawlDelayMechanism.java | 2 -- .../crawldelaymechanism/FixedCrawlDelayMechanism.java | 2 -- 
.../crawldelaymechanism/RandomCrawlDelayMechanism.java | 2 -- .../peterbencze/serritor/internal/util/CookieConverter.java | 2 -- .../serritor/internal/util/stopwatch/Stopwatch.java | 2 -- .../serritor/internal/util/stopwatch/TimeSource.java | 2 -- .../serritor/internal/util/stopwatch/UtcTimeSource.java | 2 -- .../github/peterbencze/serritor/api/helper/UrlFinderTest.java | 2 -- .../github/peterbencze/serritor/internal/CrawlDomainTest.java | 2 -- .../crawldelaymechanism/AdaptiveCrawlDelayMechanismTest.java | 2 -- .../crawldelaymechanism/FixedCrawlDelayMechanismTest.java | 2 -- .../serritor/internal/util/CookieConverterTest.java | 2 -- .../serritor/internal/util/stopwatch/StopwatchTest.java | 4 +--- .../java/com/github/peterbencze/serritor/it/SerritorIT.java | 2 -- 30 files changed, 1 insertion(+), 61 deletions(-) diff --git a/src/main/java/com/github/peterbencze/serritor/api/Browser.java b/src/main/java/com/github/peterbencze/serritor/api/Browser.java index 912586f..0fffb9a 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/Browser.java +++ b/src/main/java/com/github/peterbencze/serritor/api/Browser.java @@ -18,8 +18,6 @@ /** * Supported browsers that can be used for crawling. - * - * @author Peter Bencze */ public enum Browser { diff --git a/src/main/java/com/github/peterbencze/serritor/api/CompleteCrawlResponse.java b/src/main/java/com/github/peterbencze/serritor/api/CompleteCrawlResponse.java index 9f0fc34..307a9d3 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/CompleteCrawlResponse.java +++ b/src/main/java/com/github/peterbencze/serritor/api/CompleteCrawlResponse.java @@ -22,8 +22,6 @@ /** * Represents a complete crawl response that provides access to the HTTP header information and the * {@link WebDriver} instance to interact with the browser. 
- * - * @author Peter Bencze */ public final class CompleteCrawlResponse extends PartialCrawlResponse { diff --git a/src/main/java/com/github/peterbencze/serritor/api/CrawlDelayStrategy.java b/src/main/java/com/github/peterbencze/serritor/api/CrawlDelayStrategy.java index 4a80d8b..f7282d8 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/CrawlDelayStrategy.java +++ b/src/main/java/com/github/peterbencze/serritor/api/CrawlDelayStrategy.java @@ -18,8 +18,6 @@ /** * Available crawl delay strategies which define how the delay between each request is determined. - * - * @author Peter Bencze */ public enum CrawlDelayStrategy { diff --git a/src/main/java/com/github/peterbencze/serritor/api/CrawlStats.java b/src/main/java/com/github/peterbencze/serritor/api/CrawlStats.java index 34e1f9d..e1fb682 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/CrawlStats.java +++ b/src/main/java/com/github/peterbencze/serritor/api/CrawlStats.java @@ -29,8 +29,6 @@ /** * Summary statistics about the crawl progress. - * - * @author Peter Bencze */ @JsonPropertyOrder({ "runDuration", diff --git a/src/main/java/com/github/peterbencze/serritor/api/CrawlStrategy.java b/src/main/java/com/github/peterbencze/serritor/api/CrawlStrategy.java index e449892..2b17d92 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/CrawlStrategy.java +++ b/src/main/java/com/github/peterbencze/serritor/api/CrawlStrategy.java @@ -18,8 +18,6 @@ /** * Available crawl strategies that define the order in which crawl requests are processed. 
- * - * @author Peter Bencze */ public enum CrawlStrategy { diff --git a/src/main/java/com/github/peterbencze/serritor/api/CrawlerState.java b/src/main/java/com/github/peterbencze/serritor/api/CrawlerState.java index b50ceff..c0e5386 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/CrawlerState.java +++ b/src/main/java/com/github/peterbencze/serritor/api/CrawlerState.java @@ -26,8 +26,6 @@ /** * Represents the current state of the crawling session. More specifically, it contains a set of * state objects that can be later reused to resume that particular session. - * - * @author Peter Bencze */ public final class CrawlerState implements Serializable { diff --git a/src/main/java/com/github/peterbencze/serritor/api/PartialCrawlResponse.java b/src/main/java/com/github/peterbencze/serritor/api/PartialCrawlResponse.java index f53e245..4f8799d 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/PartialCrawlResponse.java +++ b/src/main/java/com/github/peterbencze/serritor/api/PartialCrawlResponse.java @@ -28,8 +28,6 @@ /** * Represents a partial response that only contains HTTP header information. - * - * @author Peter Bencze */ public class PartialCrawlResponse { diff --git a/src/main/java/com/github/peterbencze/serritor/api/PatternMatchingCallback.java b/src/main/java/com/github/peterbencze/serritor/api/PatternMatchingCallback.java index 063e4e9..35b3a1d 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/PatternMatchingCallback.java +++ b/src/main/java/com/github/peterbencze/serritor/api/PatternMatchingCallback.java @@ -26,8 +26,6 @@ * URL. 
* * @param the type of the input to the operation - * - * @author Peter Bencze */ public final class PatternMatchingCallback { diff --git a/src/main/java/com/github/peterbencze/serritor/api/event/NetworkErrorEvent.java b/src/main/java/com/github/peterbencze/serritor/api/event/NetworkErrorEvent.java index 0c248d6..41338ef 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/event/NetworkErrorEvent.java +++ b/src/main/java/com/github/peterbencze/serritor/api/event/NetworkErrorEvent.java @@ -21,8 +21,6 @@ /** * Event which gets delivered when a network error occurs. - * - * @author Peter Bencze */ public final class NetworkErrorEvent extends EventObject { diff --git a/src/main/java/com/github/peterbencze/serritor/api/event/NonHtmlContentEvent.java b/src/main/java/com/github/peterbencze/serritor/api/event/NonHtmlContentEvent.java index 95259a9..823b6bf 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/event/NonHtmlContentEvent.java +++ b/src/main/java/com/github/peterbencze/serritor/api/event/NonHtmlContentEvent.java @@ -22,8 +22,6 @@ /** * Event which gets delivered when the MIME type of the response is not "text/html". - * - * @author Peter Bencze */ public final class NonHtmlContentEvent extends EventObject { diff --git a/src/main/java/com/github/peterbencze/serritor/api/event/PageLoadEvent.java b/src/main/java/com/github/peterbencze/serritor/api/event/PageLoadEvent.java index 4ccf8e3..1243b36 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/event/PageLoadEvent.java +++ b/src/main/java/com/github/peterbencze/serritor/api/event/PageLoadEvent.java @@ -22,8 +22,6 @@ /** * Event which gets delivered when the browser loads the page. 
- * - * @author Peter Bencze */ public final class PageLoadEvent extends EventObject { diff --git a/src/main/java/com/github/peterbencze/serritor/api/event/PageLoadTimeoutEvent.java b/src/main/java/com/github/peterbencze/serritor/api/event/PageLoadTimeoutEvent.java index e50b068..1d8e003 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/event/PageLoadTimeoutEvent.java +++ b/src/main/java/com/github/peterbencze/serritor/api/event/PageLoadTimeoutEvent.java @@ -22,8 +22,6 @@ /** * Event which gets delivered when a page does not load in the browser within the timeout period. - * - * @author Peter Bencze */ public final class PageLoadTimeoutEvent extends EventObject { diff --git a/src/main/java/com/github/peterbencze/serritor/api/event/RequestErrorEvent.java b/src/main/java/com/github/peterbencze/serritor/api/event/RequestErrorEvent.java index 8c576f0..d2736d5 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/event/RequestErrorEvent.java +++ b/src/main/java/com/github/peterbencze/serritor/api/event/RequestErrorEvent.java @@ -23,8 +23,6 @@ /** * Event which gets delivered when a request error (an error with HTTP status code 4xx or 5xx) * occurs. - * - * @author Peter Bencze */ public final class RequestErrorEvent extends EventObject { diff --git a/src/main/java/com/github/peterbencze/serritor/api/event/RequestRedirectEvent.java b/src/main/java/com/github/peterbencze/serritor/api/event/RequestRedirectEvent.java index a9f4154..771cc77 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/event/RequestRedirectEvent.java +++ b/src/main/java/com/github/peterbencze/serritor/api/event/RequestRedirectEvent.java @@ -23,8 +23,6 @@ /** * Event which gets delivered when a request is redirected. 
- * - * @author Peter Bencze */ public final class RequestRedirectEvent extends EventObject { diff --git a/src/main/java/com/github/peterbencze/serritor/internal/EventObject.java b/src/main/java/com/github/peterbencze/serritor/internal/EventObject.java index ab17594..8d317f3 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/EventObject.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/EventObject.java @@ -20,8 +20,6 @@ /** * Base class from which all event objects shall be derived. - * - * @author Peter Bencze */ public abstract class EventObject { diff --git a/src/main/java/com/github/peterbencze/serritor/internal/crawldelaymechanism/AdaptiveCrawlDelayMechanism.java b/src/main/java/com/github/peterbencze/serritor/internal/crawldelaymechanism/AdaptiveCrawlDelayMechanism.java index ea724bc..9104517 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/crawldelaymechanism/AdaptiveCrawlDelayMechanism.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/crawldelaymechanism/AdaptiveCrawlDelayMechanism.java @@ -23,8 +23,6 @@ /** * A crawl delay mechanism, in which case the delay corresponds to the page loading time, if it is * between the specified range, otherwise the minimum or maximum duration is used. - * - * @author Peter Bencze */ public final class AdaptiveCrawlDelayMechanism implements CrawlDelayMechanism { diff --git a/src/main/java/com/github/peterbencze/serritor/internal/crawldelaymechanism/CrawlDelayMechanism.java b/src/main/java/com/github/peterbencze/serritor/internal/crawldelaymechanism/CrawlDelayMechanism.java index 4f1d34d..9a16db5 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/crawldelaymechanism/CrawlDelayMechanism.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/crawldelaymechanism/CrawlDelayMechanism.java @@ -18,8 +18,6 @@ /** * An interface which should be implemented by every crawl delay mechanism. 
- * - * @author Peter Bencze */ public interface CrawlDelayMechanism { diff --git a/src/main/java/com/github/peterbencze/serritor/internal/crawldelaymechanism/FixedCrawlDelayMechanism.java b/src/main/java/com/github/peterbencze/serritor/internal/crawldelaymechanism/FixedCrawlDelayMechanism.java index 9713f8b..0547043 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/crawldelaymechanism/FixedCrawlDelayMechanism.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/crawldelaymechanism/FixedCrawlDelayMechanism.java @@ -21,8 +21,6 @@ /** * A crawl delay mechanism, in which case the delay is constant and equals to the duration specified * in the configuration. - * - * @author Peter Bencze */ public final class FixedCrawlDelayMechanism implements CrawlDelayMechanism { diff --git a/src/main/java/com/github/peterbencze/serritor/internal/crawldelaymechanism/RandomCrawlDelayMechanism.java b/src/main/java/com/github/peterbencze/serritor/internal/crawldelaymechanism/RandomCrawlDelayMechanism.java index a457da3..915ab8b 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/crawldelaymechanism/RandomCrawlDelayMechanism.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/crawldelaymechanism/RandomCrawlDelayMechanism.java @@ -22,8 +22,6 @@ /** * A crawl delay mechanism in which case the duration is randomized between the specified minimum * and maximum range. - * - * @author Peter Bencze */ public final class RandomCrawlDelayMechanism implements CrawlDelayMechanism { diff --git a/src/main/java/com/github/peterbencze/serritor/internal/util/CookieConverter.java b/src/main/java/com/github/peterbencze/serritor/internal/util/CookieConverter.java index 4b104e1..571e3ca 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/util/CookieConverter.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/util/CookieConverter.java @@ -22,8 +22,6 @@ /** * Converts Selenium cookies to HTTP client ones. 
- * - * @author Peter Bencze */ public final class CookieConverter { diff --git a/src/main/java/com/github/peterbencze/serritor/internal/util/stopwatch/Stopwatch.java b/src/main/java/com/github/peterbencze/serritor/internal/util/stopwatch/Stopwatch.java index c78efd1..cfffe86 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/util/stopwatch/Stopwatch.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/util/stopwatch/Stopwatch.java @@ -25,8 +25,6 @@ /** * A serializable and thread-safe stopwatch implementation that can be used to measure elapsed * time. - * - * @author Peter Bencze */ public final class Stopwatch implements Serializable { diff --git a/src/main/java/com/github/peterbencze/serritor/internal/util/stopwatch/TimeSource.java b/src/main/java/com/github/peterbencze/serritor/internal/util/stopwatch/TimeSource.java index 3f74564..d365f7c 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/util/stopwatch/TimeSource.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/util/stopwatch/TimeSource.java @@ -21,8 +21,6 @@ /** * A source providing access to the current instant. All implementations should be serializable. - * - * @author Peter Bencze */ public interface TimeSource extends Serializable { diff --git a/src/main/java/com/github/peterbencze/serritor/internal/util/stopwatch/UtcTimeSource.java b/src/main/java/com/github/peterbencze/serritor/internal/util/stopwatch/UtcTimeSource.java index 263ca35..24a98ba 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/util/stopwatch/UtcTimeSource.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/util/stopwatch/UtcTimeSource.java @@ -20,8 +20,6 @@ /** * A source providing access to the current UTC instant. 
- * - * @author Peter Bencze */ public final class UtcTimeSource implements TimeSource { diff --git a/src/test/java/com/github/peterbencze/serritor/api/helper/UrlFinderTest.java b/src/test/java/com/github/peterbencze/serritor/api/helper/UrlFinderTest.java index 3bc144b..d805288 100644 --- a/src/test/java/com/github/peterbencze/serritor/api/helper/UrlFinderTest.java +++ b/src/test/java/com/github/peterbencze/serritor/api/helper/UrlFinderTest.java @@ -31,8 +31,6 @@ /** * Test cases for {@link UrlFinder}. - * - * @author Peter Bencze */ public final class UrlFinderTest { diff --git a/src/test/java/com/github/peterbencze/serritor/internal/CrawlDomainTest.java b/src/test/java/com/github/peterbencze/serritor/internal/CrawlDomainTest.java index 4bdb829..128115e 100644 --- a/src/test/java/com/github/peterbencze/serritor/internal/CrawlDomainTest.java +++ b/src/test/java/com/github/peterbencze/serritor/internal/CrawlDomainTest.java @@ -22,8 +22,6 @@ /** * Test cases for {@link CrawlDomain}. - * - * @author Peter Bencze */ public final class CrawlDomainTest { diff --git a/src/test/java/com/github/peterbencze/serritor/internal/crawldelaymechanism/AdaptiveCrawlDelayMechanismTest.java b/src/test/java/com/github/peterbencze/serritor/internal/crawldelaymechanism/AdaptiveCrawlDelayMechanismTest.java index e48137a..f6d85d3 100644 --- a/src/test/java/com/github/peterbencze/serritor/internal/crawldelaymechanism/AdaptiveCrawlDelayMechanismTest.java +++ b/src/test/java/com/github/peterbencze/serritor/internal/crawldelaymechanism/AdaptiveCrawlDelayMechanismTest.java @@ -26,8 +26,6 @@ /** * Test cases for {@link AdaptiveCrawlDelayMechanism}. 
- * - * @author Peter Bencze */ public final class AdaptiveCrawlDelayMechanismTest { diff --git a/src/test/java/com/github/peterbencze/serritor/internal/crawldelaymechanism/FixedCrawlDelayMechanismTest.java b/src/test/java/com/github/peterbencze/serritor/internal/crawldelaymechanism/FixedCrawlDelayMechanismTest.java index 535f5f4..69bad9d 100644 --- a/src/test/java/com/github/peterbencze/serritor/internal/crawldelaymechanism/FixedCrawlDelayMechanismTest.java +++ b/src/test/java/com/github/peterbencze/serritor/internal/crawldelaymechanism/FixedCrawlDelayMechanismTest.java @@ -25,8 +25,6 @@ /** * Test cases for {@link FixedCrawlDelayMechanism}. - * - * @author Peter Bencze */ public class FixedCrawlDelayMechanismTest { diff --git a/src/test/java/com/github/peterbencze/serritor/internal/util/CookieConverterTest.java b/src/test/java/com/github/peterbencze/serritor/internal/util/CookieConverterTest.java index e651673..856ff35 100644 --- a/src/test/java/com/github/peterbencze/serritor/internal/util/CookieConverterTest.java +++ b/src/test/java/com/github/peterbencze/serritor/internal/util/CookieConverterTest.java @@ -25,8 +25,6 @@ /** * Test cases for {@link CookieConverter}. - * - * @author Peter Bencze */ public final class CookieConverterTest { diff --git a/src/test/java/com/github/peterbencze/serritor/internal/util/stopwatch/StopwatchTest.java b/src/test/java/com/github/peterbencze/serritor/internal/util/stopwatch/StopwatchTest.java index dd302f6..85e10d3 100644 --- a/src/test/java/com/github/peterbencze/serritor/internal/util/stopwatch/StopwatchTest.java +++ b/src/test/java/com/github/peterbencze/serritor/internal/util/stopwatch/StopwatchTest.java @@ -24,9 +24,7 @@ import org.mockito.Mockito; /** - * Test cases for {@link Stopwatch} - * - * @author Peter Bencze + * Test cases for {@link Stopwatch}. 
*/ public final class StopwatchTest { diff --git a/src/test/java/com/github/peterbencze/serritor/it/SerritorIT.java b/src/test/java/com/github/peterbencze/serritor/it/SerritorIT.java index ac20b35..ece5d25 100644 --- a/src/test/java/com/github/peterbencze/serritor/it/SerritorIT.java +++ b/src/test/java/com/github/peterbencze/serritor/it/SerritorIT.java @@ -46,8 +46,6 @@ /** * Integration test cases for Serritor. - * - * @author Peter Bencze */ public class SerritorIT { From bfbcd49450ec764325002976728497ba3dfd87bb Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Thu, 4 Apr 2019 22:23:14 +0200 Subject: [PATCH 52/63] Rename callbacks --- .../peterbencze/serritor/api/BaseCrawler.java | 81 +++++++++--------- .../peterbencze/serritor/api/CrawlStats.java | 40 +++++---- ...ntEvent.java => NonHtmlResponseEvent.java} | 8 +- ...LoadEvent.java => ResponseErrorEvent.java} | 9 +- ...orEvent.java => ResponseSuccessEvent.java} | 10 +-- .../serritor/internal/stats/StatsCounter.java | 62 +++++++------- .../internal/stats/StatsCounterSnapshot.java | 40 +++++---- .../internal/CustomCallbackManagerTest.java | 85 ++++++++++--------- .../internal/stats/StatsCounterTest.java | 36 ++++---- .../peterbencze/serritor/it/SerritorIT.java | 12 +-- 10 files changed, 201 insertions(+), 182 deletions(-) rename src/main/java/com/github/peterbencze/serritor/api/event/{NonHtmlContentEvent.java => NonHtmlResponseEvent.java} (86%) rename src/main/java/com/github/peterbencze/serritor/api/event/{PageLoadEvent.java => ResponseErrorEvent.java} (84%) rename src/main/java/com/github/peterbencze/serritor/api/event/{RequestErrorEvent.java => ResponseSuccessEvent.java} (84%) diff --git a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java index c23eee1..1b44893 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java +++ b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java @@ -19,11 +19,11 
@@ import com.gargoylesoftware.htmlunit.WebClient; import com.github.peterbencze.serritor.api.CrawlRequest.CrawlRequestBuilder; import com.github.peterbencze.serritor.api.event.NetworkErrorEvent; -import com.github.peterbencze.serritor.api.event.NonHtmlContentEvent; -import com.github.peterbencze.serritor.api.event.PageLoadEvent; +import com.github.peterbencze.serritor.api.event.NonHtmlResponseEvent; import com.github.peterbencze.serritor.api.event.PageLoadTimeoutEvent; -import com.github.peterbencze.serritor.api.event.RequestErrorEvent; import com.github.peterbencze.serritor.api.event.RequestRedirectEvent; +import com.github.peterbencze.serritor.api.event.ResponseErrorEvent; +import com.github.peterbencze.serritor.api.event.ResponseSuccessEvent; import com.github.peterbencze.serritor.internal.CrawlFrontier; import com.github.peterbencze.serritor.internal.CustomCallbackManager; import com.github.peterbencze.serritor.internal.EventObject; @@ -418,7 +418,7 @@ private void run() { String mimeType = getResponseMimeType(httpHeadResponse); if (!mimeType.equals(ContentType.TEXT_HTML.getMimeType())) { // URLs that point to non-HTML content should not be opened in the browser - handleNonHtmlContent(new NonHtmlContentEvent(currentCandidate, + handleNonHtmlResponse(new NonHtmlResponseEvent(currentCandidate, new PartialCrawlResponse(httpHeadResponse))); continue; @@ -471,13 +471,13 @@ private void run() { int statusCode = harResponse.getStatus(); if (HttpStatus.isClientError(statusCode) || HttpStatus.isServerError(statusCode)) { - handleRequestError(new RequestErrorEvent(currentCandidate, + handleResponseError(new ResponseErrorEvent(currentCandidate, new CompleteCrawlResponse(harResponse, webDriver))); continue; } - handlePageLoad(new PageLoadEvent(currentCandidate, + handleResponseSuccess(new ResponseSuccessEvent(currentCandidate, new CompleteCrawlResponse(harResponse, webDriver))); } } @@ -525,7 +525,7 @@ private static String getResponseMimeType(final HttpResponse 
httpHeadResponse) { } /** - * Handles network errors that occur during the crawl. + * Handles network errors. * * @param event the event which gets delivered when a network error occurs */ @@ -536,7 +536,7 @@ private void handleNetworkError(final NetworkErrorEvent event) { } /** - * Handles request redirects that occur during the crawl. + * Handles request redirects. * * @param event the event which gets delivered when a request is redirected */ @@ -550,20 +550,20 @@ private void handleRequestRedirect(final RequestRedirectEvent event) { } /** - * Handles responses with non-HTML content that occur during the crawl. + * Handles responses with non-HTML content. * - * @param event the event which gets delivered when the MIME type of the response is not + * @param event the event which gets delivered when the content type of the response is not * text/html */ - private void handleNonHtmlContent(final NonHtmlContentEvent event) { - callbackManager.callCustomOrDefault(NonHtmlContentEvent.class, event, - this::onNonHtmlContent); + private void handleNonHtmlResponse(final NonHtmlResponseEvent event) { + callbackManager.callCustomOrDefault(NonHtmlResponseEvent.class, event, + this::onNonHtmlResponse); - statsCounter.recordNonHtmlContent(); + statsCounter.recordNonHtmlResponse(); } /** - * Handles page load timeout that occur during the crawl. + * Handles page load timeouts. * * @param event the event which gets delivered when a page does not load in the browser within * the timeout period @@ -576,26 +576,28 @@ private void handlePageLoadTimeout(final PageLoadTimeoutEvent event) { } /** - * Handles request errors that occur during the crawl. + * Handles responses whose HTTP status code indicates an error. 
* * @param event the event which gets delivered when a request error (an error with HTTP status * code 4xx or 5xx) occurs */ - private void handleRequestError(final RequestErrorEvent event) { - callbackManager.callCustomOrDefault(RequestErrorEvent.class, event, this::onRequestError); + private void handleResponseError(final ResponseErrorEvent event) { + callbackManager.callCustomOrDefault(ResponseErrorEvent.class, event, this::onResponseError); - statsCounter.recordRequestError(); + statsCounter.recordResponseError(); } /** - * Handles successful page loads that occur during the crawl. + * Handles responses whose HTTP status code indicates success. * - * @param event the event which gets delivered when the browser loads the page + * @param event the event which gets delivered when the browser loads the page and the HTTP + * status code indicates success (2xx) */ - private void handlePageLoad(final PageLoadEvent event) { - callbackManager.callCustomOrDefault(PageLoadEvent.class, event, this::onPageLoad); + private void handleResponseSuccess(final ResponseSuccessEvent event) { + callbackManager.callCustomOrDefault(ResponseSuccessEvent.class, event, + this::onResponseSuccess); - statsCounter.recordPageLoad(); + statsCounter.recordResponseSuccess(); } /** @@ -659,28 +661,29 @@ protected void onBrowserInit(final Options options) { } /** - * Callback which gets called when the crawler is started. + * Callback which gets called when the crawler starts. */ protected void onStart() { LOGGER.info("onStart"); } /** - * Callback which gets called when the browser loads the page. + * Callback which gets called when the browser loads the page and the HTTP status code of the + * response indicates success (2xx). 
* - * @param event the PageLoadEvent instance + * @param event the ResponseSuccessEvent instance */ - protected void onPageLoad(final PageLoadEvent event) { - LOGGER.info("onPageLoad: {}", event.getCrawlCandidate().getRequestUrl()); + protected void onResponseSuccess(final ResponseSuccessEvent event) { + LOGGER.info("onResponseSuccess: {}", event.getCrawlCandidate().getRequestUrl()); } /** - * Callback which gets called when the content type is not HTML. + * Callback which gets called when the content type of the response is not text/html. * - * @param event the NonHtmlContentEvent instance + * @param event the NonHtmlResponseEvent instance */ - protected void onNonHtmlContent(final NonHtmlContentEvent event) { - LOGGER.info("onNonHtmlContent: {}", event.getCrawlCandidate().getRequestUrl()); + protected void onNonHtmlResponse(final NonHtmlResponseEvent event) { + LOGGER.info("onNonHtmlResponse: {}", event.getCrawlCandidate().getRequestUrl()); } /** @@ -693,13 +696,13 @@ protected void onNetworkError(final NetworkErrorEvent event) { } /** - * Callback which gets called when a request error (an error with HTTP status code 4xx or 5xx) - * occurs. + * Callback which gets called when the browser loads the page and the HTTP status code of the + * response indicates error (4xx or 5xx). * - * @param event the RequestErrorEvent instance + * @param event the ResponseErrorEvent instance */ - protected void onRequestError(final RequestErrorEvent event) { - LOGGER.info("onRequestError: {}", event.getCrawlCandidate().getRequestUrl()); + protected void onResponseError(final ResponseErrorEvent event) { + LOGGER.info("onResponseError: {}", event.getCrawlCandidate().getRequestUrl()); } /** @@ -723,7 +726,7 @@ protected void onPageLoadTimeout(final PageLoadTimeoutEvent event) { } /** - * Callback which gets called when the crawler is stopped. + * Callback which gets called when the crawler stops. 
*/ protected void onStop() { LOGGER.info("onStop"); diff --git a/src/main/java/com/github/peterbencze/serritor/api/CrawlStats.java b/src/main/java/com/github/peterbencze/serritor/api/CrawlStats.java index e1fb682..b07080a 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/CrawlStats.java +++ b/src/main/java/com/github/peterbencze/serritor/api/CrawlStats.java @@ -36,11 +36,11 @@ "remainingDurationEstimate", "remainingCrawlCandidateCount", "processedCrawlCandidateCount", - "pageLoadCount", + "responseSuccessCount", "pageLoadTimeoutCount", "requestRedirectCount", - "nonHtmlContentCount", - "requestErrorCount", + "nonHtmlResponseCount", + "responseErrorCount", "networkErrorCount", "filteredDuplicateRequestCount", "filteredOffsiteRequestCount", @@ -129,12 +129,14 @@ public int getProcessedCrawlCandidateCount() { } /** - * Returns the number of successful page loads that occurred during the crawl. + * Returns the number of responses received during the crawl, whose HTTP status code indicated + * success (2xx). * - * @return the number of successful page loads that occurred during the crawl + * @return the number of responses received during the crawl, whose HTTP status code indicated + * success (2xx) */ - public int getPageLoadCount() { - return statsCounterSnapshot.getPageLoadCount(); + public int getResponseSuccessCount() { + return statsCounterSnapshot.getResponseSuccessCount(); } /** @@ -156,21 +158,23 @@ public int getRequestRedirectCount() { } /** - * Returns the number of responses with non-HTML content that occurred during the crawl. + * Returns the number of responses received with non-HTML content. 
* - * @return the number of responses with non-HTML content that occurred during the crawl + * @return the number of responses received with non-HTML content */ - public int getNonHtmlContentCount() { - return statsCounterSnapshot.getNonHtmlContentCount(); + public int getNonHtmlResponseCount() { + return statsCounterSnapshot.getNonHtmlResponseCount(); } /** - * Returns the number of request errors that occurred during the crawl. + * Returns the number of responses received during the crawl, whose HTTP status code indicated + * error (4xx or 5xx). * - * @return the number of request errors that occurred during the crawl + * @return the number of responses received during the crawl, whose HTTP status code indicated + * error (4xx or 5xx) */ - public int getRequestErrorCount() { - return statsCounterSnapshot.getRequestErrorCount(); + public int getResponseErrorCount() { + return statsCounterSnapshot.getResponseErrorCount(); } /** @@ -225,11 +229,11 @@ public String toString() { remainingDurationEstimate.toMillis(), true, true)) .append("remainingCrawlCandidateCount", getRemainingCrawlCandidateCount()) .append("processedCrawlCandidateCount", getProcessedCrawlCandidateCount()) - .append("pageLoadCount", getPageLoadCount()) + .append("responseSuccessCount", getResponseSuccessCount()) .append("pageLoadTimeoutCount", getPageLoadTimeoutCount()) .append("requestRedirectCount", getRequestRedirectCount()) - .append("nonHtmlContentCount", getNonHtmlContentCount()) - .append("requestErrorCount", getRequestErrorCount()) + .append("nonHtmlResponseCount", getNonHtmlResponseCount()) + .append("responseErrorCount", getResponseErrorCount()) .append("networkErrorCount", getNetworkErrorCount()) .append("filteredDuplicateRequestCount", getFilteredDuplicateRequestCount()) .append("filteredOffsiteRequestCount", getFilteredOffsiteRequestCount()) diff --git a/src/main/java/com/github/peterbencze/serritor/api/event/NonHtmlContentEvent.java 
b/src/main/java/com/github/peterbencze/serritor/api/event/NonHtmlResponseEvent.java similarity index 86% rename from src/main/java/com/github/peterbencze/serritor/api/event/NonHtmlContentEvent.java rename to src/main/java/com/github/peterbencze/serritor/api/event/NonHtmlResponseEvent.java index 823b6bf..3ebff0a 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/event/NonHtmlContentEvent.java +++ b/src/main/java/com/github/peterbencze/serritor/api/event/NonHtmlResponseEvent.java @@ -21,19 +21,19 @@ import com.github.peterbencze.serritor.internal.EventObject; /** - * Event which gets delivered when the MIME type of the response is not "text/html". + * Event which gets delivered when the content type of the response is not text/html. */ -public final class NonHtmlContentEvent extends EventObject { +public final class NonHtmlResponseEvent extends EventObject { private final PartialCrawlResponse partialCrawlResponse; /** - * Creates a {@link NonHtmlContentEvent} instance. + * Creates a {@link NonHtmlResponseEvent} instance. 
* * @param crawlCandidate the current crawl candidate * @param partialCrawlResponse the partial crawl response */ - public NonHtmlContentEvent( + public NonHtmlResponseEvent( final CrawlCandidate crawlCandidate, final PartialCrawlResponse partialCrawlResponse) { super(crawlCandidate); diff --git a/src/main/java/com/github/peterbencze/serritor/api/event/PageLoadEvent.java b/src/main/java/com/github/peterbencze/serritor/api/event/ResponseErrorEvent.java similarity index 84% rename from src/main/java/com/github/peterbencze/serritor/api/event/PageLoadEvent.java rename to src/main/java/com/github/peterbencze/serritor/api/event/ResponseErrorEvent.java index 1243b36..464e6fa 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/event/PageLoadEvent.java +++ b/src/main/java/com/github/peterbencze/serritor/api/event/ResponseErrorEvent.java @@ -21,19 +21,20 @@ import com.github.peterbencze.serritor.internal.EventObject; /** - * Event which gets delivered when the browser loads the page. + * Event which gets delivered when the browser loads the page and the HTTP status code indicates + * error (4xx or 5xx). */ -public final class PageLoadEvent extends EventObject { +public final class ResponseErrorEvent extends EventObject { private final CompleteCrawlResponse completeCrawlResponse; /** - * Creates a {@link PageLoadEvent} instance. + * Creates a {@link ResponseErrorEvent} instance. 
* * @param crawlCandidate the current crawl candidate * @param completeCrawlResponse the complete crawl response */ - public PageLoadEvent( + public ResponseErrorEvent( final CrawlCandidate crawlCandidate, final CompleteCrawlResponse completeCrawlResponse) { super(crawlCandidate); diff --git a/src/main/java/com/github/peterbencze/serritor/api/event/RequestErrorEvent.java b/src/main/java/com/github/peterbencze/serritor/api/event/ResponseSuccessEvent.java similarity index 84% rename from src/main/java/com/github/peterbencze/serritor/api/event/RequestErrorEvent.java rename to src/main/java/com/github/peterbencze/serritor/api/event/ResponseSuccessEvent.java index d2736d5..827d05e 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/event/RequestErrorEvent.java +++ b/src/main/java/com/github/peterbencze/serritor/api/event/ResponseSuccessEvent.java @@ -21,20 +21,20 @@ import com.github.peterbencze.serritor.internal.EventObject; /** - * Event which gets delivered when a request error (an error with HTTP status code 4xx or 5xx) - * occurs. + * Event which gets delivered when the browser loads the page and the HTTP status code indicates + * success (2xx). */ -public final class RequestErrorEvent extends EventObject { +public final class ResponseSuccessEvent extends EventObject { private final CompleteCrawlResponse completeCrawlResponse; /** - * Creates a {@link RequestErrorEvent} instance. + * Creates a {@link ResponseSuccessEvent} instance. 
* * @param crawlCandidate the current crawl candidate * @param completeCrawlResponse the complete crawl response */ - public RequestErrorEvent( + public ResponseSuccessEvent( final CrawlCandidate crawlCandidate, final CompleteCrawlResponse completeCrawlResponse) { super(crawlCandidate); diff --git a/src/main/java/com/github/peterbencze/serritor/internal/stats/StatsCounter.java b/src/main/java/com/github/peterbencze/serritor/internal/stats/StatsCounter.java index 64183dd..6c657fd 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/stats/StatsCounter.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/stats/StatsCounter.java @@ -29,11 +29,11 @@ public final class StatsCounter implements Serializable { private int remainingCrawlCandidateCount; private int processedCrawlCandidateCount; - private int pageLoadCount; + private int responseSuccessCount; private int pageLoadTimeoutCount; private int requestRedirectCount; - private int nonHtmlContentCount; - private int requestErrorCount; + private int nonHtmlResponseCount; + private int responseErrorCount; private int networkErrorCount; private int filteredDuplicateRequestCount; private int filteredOffsiteRequestCount; @@ -73,23 +73,24 @@ public int getProcessedCrawlCandidateCount() { } /** - * Returns the number of successful page loads that occurred during the crawl. + * Returns the number of responses received during the crawl, whose HTTP status code indicated + * success (2xx). * - * @return the number of successful page loads that occurred during the crawl + * @return the number of responses received during the crawl, whose HTTP status code indicated + * success (2xx) */ - public int getPageLoadCount() { - return lock.readWithLock(() -> pageLoadCount); + public int getResponseSuccessCount() { + return lock.readWithLock(() -> responseSuccessCount); } /** - * Records a successful page load. This should be called when the status code of the response is - * successful. 
+ * Records the receipt of a response whose HTTP status code indicates success (2xx). */ - public void recordPageLoad() { + public void recordResponseSuccess() { lock.writeWithLock(() -> { decrementRemainingCrawlCandidateCount(); - ++pageLoadCount; + ++responseSuccessCount; incrementProcessedCrawlCandidateCount(); }); } @@ -104,8 +105,7 @@ public int getPageLoadTimeoutCount() { } /** - * Records a page load timeout. This should be called when a page does not load in the browser - * within the timeout period. + * Records a page load timeout. */ public void recordPageLoadTimeout() { lock.writeWithLock(() -> { @@ -119,14 +119,14 @@ public void recordPageLoadTimeout() { /** * Returns the number of request redirects that occurred during the crawl. * - * @return the number of request redirects that occurred during the crawl. + * @return the number of request redirects that occurred during the crawl */ public int getRequestRedirectCount() { return lock.readWithLock(() -> requestRedirectCount); } /** - * Records a request redirect. This should be called when a request is redirected. + * Records a request redirect. */ public void recordRequestRedirect() { lock.writeWithLock(() -> { @@ -138,45 +138,45 @@ public void recordRequestRedirect() { } /** - * Returns the number of responses with non-HTML content that occurred during the crawl. + * Returns the number of responses received with non-HTML content. * - * @return the number of responses with non-HTML content that occurred during the crawl + * @return the number of responses received with non-HTML content */ - public int getNonHtmlContentCount() { - return lock.readWithLock(() -> nonHtmlContentCount); + public int getNonHtmlResponseCount() { + return lock.readWithLock(() -> nonHtmlResponseCount); } /** - * Records a response with non-HTML content. This should be called when the MIME type of a - * response is not text/html. + * Records the receipt of a response with non-HTML content. 
*/ - public void recordNonHtmlContent() { + public void recordNonHtmlResponse() { lock.writeWithLock(() -> { decrementRemainingCrawlCandidateCount(); - ++nonHtmlContentCount; + ++nonHtmlResponseCount; incrementProcessedCrawlCandidateCount(); }); } /** - * Returns the number of request errors that occurred during the crawl. + * Returns the number of responses received during the crawl, whose HTTP status code indicated + * error (4xx or 5xx). * - * @return the number of request errors that occurred during the crawl + * @return the number of responses received during the crawl, whose HTTP status code indicated + * error (4xx or 5xx) */ - public int getRequestErrorCount() { - return lock.readWithLock(() -> requestErrorCount); + public int getResponseErrorCount() { + return lock.readWithLock(() -> responseErrorCount); } /** - * Records an error response. This should be called when the status code of the response is 4xx - * or 5xx. + * Records the receipt of a response whose HTTP status code indicates error (4xx or 5xx). 
*/ - public void recordRequestError() { + public void recordResponseError() { lock.writeWithLock(() -> { decrementRemainingCrawlCandidateCount(); - ++requestErrorCount; + ++responseErrorCount; incrementProcessedCrawlCandidateCount(); }); } diff --git a/src/main/java/com/github/peterbencze/serritor/internal/stats/StatsCounterSnapshot.java b/src/main/java/com/github/peterbencze/serritor/internal/stats/StatsCounterSnapshot.java index c92dc5b..09dc5b0 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/stats/StatsCounterSnapshot.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/stats/StatsCounterSnapshot.java @@ -23,11 +23,11 @@ public final class StatsCounterSnapshot { private final int remainingCrawlCandidateCount; private final int processedCrawlCandidateCount; - private final int pageLoadCount; + private final int responseSuccessCount; private final int pageLoadTimeoutCount; private final int requestRedirectCount; - private final int nonHtmlContentCount; - private final int requestErrorCount; + private final int nonHtmlResponseCount; + private final int responseErrorCount; private final int networkErrorCount; private final int filteredDuplicateRequestCount; private final int filteredOffsiteRequestCount; @@ -41,11 +41,11 @@ public final class StatsCounterSnapshot { public StatsCounterSnapshot(final StatsCounter statsCounter) { remainingCrawlCandidateCount = statsCounter.getRemainingCrawlCandidateCount(); processedCrawlCandidateCount = statsCounter.getProcessedCrawlCandidateCount(); - pageLoadCount = statsCounter.getPageLoadCount(); + responseSuccessCount = statsCounter.getResponseSuccessCount(); pageLoadTimeoutCount = statsCounter.getPageLoadTimeoutCount(); requestRedirectCount = statsCounter.getRequestRedirectCount(); - nonHtmlContentCount = statsCounter.getNonHtmlContentCount(); - requestErrorCount = statsCounter.getRequestErrorCount(); + nonHtmlResponseCount = statsCounter.getNonHtmlResponseCount(); + responseErrorCount = 
statsCounter.getResponseErrorCount(); networkErrorCount = statsCounter.getNetworkErrorCount(); filteredDuplicateRequestCount = statsCounter.getFilteredDuplicateRequestCount(); filteredOffsiteRequestCount = statsCounter.getFilteredOffsiteRequestCount(); @@ -72,12 +72,14 @@ public int getProcessedCrawlCandidateCount() { } /** - * Returns the number of successful page loads that occurred during the crawl. + * Returns the number of responses received during the crawl, whose HTTP status code indicated + * success (2xx). * - * @return the number of successful page loads that occurred during the crawl + * @return the number of responses received during the crawl, whose HTTP status code indicated + * success (2xx) */ - public int getPageLoadCount() { - return pageLoadCount; + public int getResponseSuccessCount() { + return responseSuccessCount; } /** @@ -99,21 +101,23 @@ public int getRequestRedirectCount() { } /** - * Returns the number of responses with non-HTML content that occurred during the crawl. + * Returns the number of responses received with non-HTML content. * - * @return the number of responses with non-HTML content that occurred during the crawl + * @return the number of responses received with non-HTML content */ - public int getNonHtmlContentCount() { - return nonHtmlContentCount; + public int getNonHtmlResponseCount() { + return nonHtmlResponseCount; } /** - * Returns the number of request errors that occurred during the crawl. + * Returns the number of responses received during the crawl, whose HTTP status code indicated + * error (4xx or 5xx). 
* - * @return the number of request errors that occurred during the crawl + * @return the number of responses received during the crawl, whose HTTP status code indicated + * error (4xx or 5xx) */ - public int getRequestErrorCount() { - return requestErrorCount; + public int getResponseErrorCount() { + return responseErrorCount; } /** diff --git a/src/test/java/com/github/peterbencze/serritor/internal/CustomCallbackManagerTest.java b/src/test/java/com/github/peterbencze/serritor/internal/CustomCallbackManagerTest.java index c867f9a..be0cab6 100644 --- a/src/test/java/com/github/peterbencze/serritor/internal/CustomCallbackManagerTest.java +++ b/src/test/java/com/github/peterbencze/serritor/internal/CustomCallbackManagerTest.java @@ -18,7 +18,7 @@ import com.github.peterbencze.serritor.api.CrawlCandidate; import com.github.peterbencze.serritor.api.PatternMatchingCallback; -import com.github.peterbencze.serritor.api.event.PageLoadEvent; +import com.github.peterbencze.serritor.api.event.ResponseSuccessEvent; import java.net.URI; import java.util.function.Consumer; import java.util.regex.Matcher; @@ -33,97 +33,104 @@ public final class CustomCallbackManagerTest { private CustomCallbackManager callbackManager; - private Consumer defaultPageLoadCallbackMock; - private PageLoadEvent pageLoadEventMock; + private Consumer defaultResponseSuccessCallbackMock; + private ResponseSuccessEvent responseSuccessEventMock; @Before public void before() { callbackManager = new CustomCallbackManager(); - defaultPageLoadCallbackMock = Mockito.mock(Consumer.class); - pageLoadEventMock = Mockito.mock(PageLoadEvent.class); + defaultResponseSuccessCallbackMock = Mockito.mock(Consumer.class); + responseSuccessEventMock = Mockito.mock(ResponseSuccessEvent.class); } @Test public void testCallWithNoCustomCallback() { CrawlCandidate crawlCandidateMock = createCrawlCandidateMock(); - Mockito.when(pageLoadEventMock.getCrawlCandidate()).thenReturn(crawlCandidateMock); + 
Mockito.when(responseSuccessEventMock.getCrawlCandidate()).thenReturn(crawlCandidateMock); - callbackManager.callCustomOrDefault(PageLoadEvent.class, pageLoadEventMock, - defaultPageLoadCallbackMock); + callbackManager.callCustomOrDefault(ResponseSuccessEvent.class, responseSuccessEventMock, + defaultResponseSuccessCallbackMock); - Mockito.verify(defaultPageLoadCallbackMock, Mockito.times(1)).accept(pageLoadEventMock); + Mockito.verify(defaultResponseSuccessCallbackMock).accept(responseSuccessEventMock); } @Test public void testCallWithNoApplicableCustomCallback() { - PatternMatchingCallback patternMatchingCallbackMock + PatternMatchingCallback patternMatchingCallbackMock = Mockito.mock(PatternMatchingCallback.class); Pattern patternMock = createPatternMock(false); - Consumer customPageLoadCallbackMock = Mockito.mock(Consumer.class); + Consumer customResponseSuccessCallbackMock + = Mockito.mock(Consumer.class); Mockito.when(patternMatchingCallbackMock.getUrlPattern()).thenReturn(patternMock); Mockito.when(patternMatchingCallbackMock.getCallback()) - .thenReturn(customPageLoadCallbackMock); + .thenReturn(customResponseSuccessCallbackMock); CrawlCandidate crawlCandidateMock = createCrawlCandidateMock(); - Mockito.when(pageLoadEventMock.getCrawlCandidate()).thenReturn(crawlCandidateMock); + Mockito.when(responseSuccessEventMock.getCrawlCandidate()).thenReturn(crawlCandidateMock); - callbackManager.addCustomCallback(PageLoadEvent.class, patternMatchingCallbackMock); - callbackManager.callCustomOrDefault(PageLoadEvent.class, pageLoadEventMock, - defaultPageLoadCallbackMock); + callbackManager.addCustomCallback(ResponseSuccessEvent.class, patternMatchingCallbackMock); + callbackManager.callCustomOrDefault(ResponseSuccessEvent.class, responseSuccessEventMock, + defaultResponseSuccessCallbackMock); - Mockito.verify(defaultPageLoadCallbackMock, Mockito.times(1)).accept(pageLoadEventMock); - Mockito.verify(customPageLoadCallbackMock, 
Mockito.never()).accept(pageLoadEventMock); + Mockito.verify(defaultResponseSuccessCallbackMock).accept(responseSuccessEventMock); + Mockito.verify(customResponseSuccessCallbackMock, Mockito.never()) + .accept(responseSuccessEventMock); } @Test public void testCallWithSingleApplicableCustomCallback() { Pattern patternMock = createPatternMock(true); - Consumer customPageLoadCallbackMock = Mockito.mock(Consumer.class); - PatternMatchingCallback patternMatchingCallbackMock + Consumer customResponseSuccessCallbackMock + = Mockito.mock(Consumer.class); + PatternMatchingCallback patternMatchingCallbackMock = Mockito.mock(PatternMatchingCallback.class); Mockito.when(patternMatchingCallbackMock.getUrlPattern()).thenReturn(patternMock); Mockito.when(patternMatchingCallbackMock.getCallback()) - .thenReturn(customPageLoadCallbackMock); + .thenReturn(customResponseSuccessCallbackMock); CrawlCandidate crawlCandidateMock = createCrawlCandidateMock(); - Mockito.when(pageLoadEventMock.getCrawlCandidate()).thenReturn(crawlCandidateMock); + Mockito.when(responseSuccessEventMock.getCrawlCandidate()).thenReturn(crawlCandidateMock); - callbackManager.addCustomCallback(PageLoadEvent.class, patternMatchingCallbackMock); - callbackManager.callCustomOrDefault(PageLoadEvent.class, pageLoadEventMock, - defaultPageLoadCallbackMock); + callbackManager.addCustomCallback(ResponseSuccessEvent.class, patternMatchingCallbackMock); + callbackManager.callCustomOrDefault(ResponseSuccessEvent.class, responseSuccessEventMock, + defaultResponseSuccessCallbackMock); - Mockito.verify(defaultPageLoadCallbackMock, Mockito.never()).accept(pageLoadEventMock); - Mockito.verify(customPageLoadCallbackMock, Mockito.times(1)).accept(pageLoadEventMock); + Mockito.verify(defaultResponseSuccessCallbackMock, Mockito.never()) + .accept(responseSuccessEventMock); + Mockito.verify(customResponseSuccessCallbackMock).accept(responseSuccessEventMock); } @Test public void testCallWithMultipleApplicableCustomCallback() { 
Pattern patternMock = createPatternMock(true); - Consumer customPageLoadCallbackMock = Mockito.mock(Consumer.class); + Consumer customResponseSuccessCallbackMock + = Mockito.mock(Consumer.class); - PatternMatchingCallback patternMatchingCallbackMock1 + PatternMatchingCallback patternMatchingCallbackMock1 = Mockito.mock(PatternMatchingCallback.class); Mockito.when(patternMatchingCallbackMock1.getUrlPattern()).thenReturn(patternMock); Mockito.when(patternMatchingCallbackMock1.getCallback()) - .thenReturn(customPageLoadCallbackMock); + .thenReturn(customResponseSuccessCallbackMock); - PatternMatchingCallback patternMatchingCallbackMock2 + PatternMatchingCallback patternMatchingCallbackMock2 = Mockito.mock(PatternMatchingCallback.class); Mockito.when(patternMatchingCallbackMock2.getUrlPattern()).thenReturn(patternMock); Mockito.when(patternMatchingCallbackMock2.getCallback()) - .thenReturn(customPageLoadCallbackMock); + .thenReturn(customResponseSuccessCallbackMock); CrawlCandidate crawlCandidateMock = createCrawlCandidateMock(); - Mockito.when(pageLoadEventMock.getCrawlCandidate()).thenReturn(crawlCandidateMock); + Mockito.when(responseSuccessEventMock.getCrawlCandidate()).thenReturn(crawlCandidateMock); - callbackManager.addCustomCallback(PageLoadEvent.class, patternMatchingCallbackMock1); - callbackManager.addCustomCallback(PageLoadEvent.class, patternMatchingCallbackMock2); - callbackManager.callCustomOrDefault(PageLoadEvent.class, pageLoadEventMock, - defaultPageLoadCallbackMock); + callbackManager.addCustomCallback(ResponseSuccessEvent.class, patternMatchingCallbackMock1); + callbackManager.addCustomCallback(ResponseSuccessEvent.class, patternMatchingCallbackMock2); + callbackManager.callCustomOrDefault(ResponseSuccessEvent.class, responseSuccessEventMock, + defaultResponseSuccessCallbackMock); - Mockito.verify(defaultPageLoadCallbackMock, Mockito.never()).accept(pageLoadEventMock); - Mockito.verify(customPageLoadCallbackMock, 
Mockito.times(2)).accept(pageLoadEventMock); + Mockito.verify(defaultResponseSuccessCallbackMock, Mockito.never()) + .accept(responseSuccessEventMock); + Mockito.verify(customResponseSuccessCallbackMock, Mockito.times(2)) + .accept(responseSuccessEventMock); } private static Pattern createPatternMock(final boolean shouldMatch) { diff --git a/src/test/java/com/github/peterbencze/serritor/internal/stats/StatsCounterTest.java b/src/test/java/com/github/peterbencze/serritor/internal/stats/StatsCounterTest.java index e89e2fc..a853822 100644 --- a/src/test/java/com/github/peterbencze/serritor/internal/stats/StatsCounterTest.java +++ b/src/test/java/com/github/peterbencze/serritor/internal/stats/StatsCounterTest.java @@ -33,18 +33,18 @@ public void before() { } @Test - public void testRecordPageLoad() { + public void testRecordResponseSuccess() { statsCounter.recordRemainingCrawlCandidate(); int remainingCrawlCandidateCountBefore = statsCounter.getRemainingCrawlCandidateCount(); - int pageLoadCountBefore = statsCounter.getPageLoadCount(); + int responseSuccessCountBefore = statsCounter.getResponseSuccessCount(); int processedCrawlCandidateCountBefore = statsCounter.getProcessedCrawlCandidateCount(); - statsCounter.recordPageLoad(); + statsCounter.recordResponseSuccess(); Assert.assertEquals(remainingCrawlCandidateCountBefore - 1, statsCounter.getRemainingCrawlCandidateCount()); - Assert.assertEquals(pageLoadCountBefore + 1, statsCounter.getPageLoadCount()); + Assert.assertEquals(responseSuccessCountBefore + 1, statsCounter.getResponseSuccessCount()); Assert.assertEquals(processedCrawlCandidateCountBefore + 1, statsCounter.getProcessedCrawlCandidateCount()); } @@ -57,11 +57,11 @@ public void testRecordPageLoadTimeout() { int pageLoadTimeoutCountBefore = statsCounter.getPageLoadTimeoutCount(); int processedCrawlCandidateCountBefore = statsCounter.getProcessedCrawlCandidateCount(); - statsCounter.recordPageLoad(); + statsCounter.recordResponseSuccess(); 
Assert.assertEquals(remainingCrawlCandidateCountBefore - 1, statsCounter.getRemainingCrawlCandidateCount()); - Assert.assertEquals(pageLoadTimeoutCountBefore + 1, statsCounter.getPageLoadCount()); + Assert.assertEquals(pageLoadTimeoutCountBefore + 1, statsCounter.getResponseSuccessCount()); Assert.assertEquals(processedCrawlCandidateCountBefore + 1, statsCounter.getProcessedCrawlCandidateCount()); } @@ -74,45 +74,45 @@ public void testRecordRequestRedirect() { int requestRedirectCountBefore = statsCounter.getRequestRedirectCount(); int processedCrawlCandidateCountBefore = statsCounter.getProcessedCrawlCandidateCount(); - statsCounter.recordPageLoad(); + statsCounter.recordResponseSuccess(); Assert.assertEquals(remainingCrawlCandidateCountBefore - 1, statsCounter.getRemainingCrawlCandidateCount()); - Assert.assertEquals(requestRedirectCountBefore + 1, statsCounter.getPageLoadCount()); + Assert.assertEquals(requestRedirectCountBefore + 1, statsCounter.getResponseSuccessCount()); Assert.assertEquals(processedCrawlCandidateCountBefore + 1, statsCounter.getProcessedCrawlCandidateCount()); } @Test - public void testRecordNonHtmlContent() { + public void testRecordNonHtmlResponse() { statsCounter.recordRemainingCrawlCandidate(); int remainingCrawlCandidateCountBefore = statsCounter.getRemainingCrawlCandidateCount(); - int nonHtmlContentCount = statsCounter.getNonHtmlContentCount(); + int nonHtmlResponseCount = statsCounter.getNonHtmlResponseCount(); int processedCrawlCandidateCountBefore = statsCounter.getProcessedCrawlCandidateCount(); - statsCounter.recordPageLoad(); + statsCounter.recordResponseSuccess(); Assert.assertEquals(remainingCrawlCandidateCountBefore - 1, statsCounter.getRemainingCrawlCandidateCount()); - Assert.assertEquals(nonHtmlContentCount + 1, statsCounter.getPageLoadCount()); + Assert.assertEquals(nonHtmlResponseCount + 1, statsCounter.getResponseSuccessCount()); Assert.assertEquals(processedCrawlCandidateCountBefore + 1, 
statsCounter.getProcessedCrawlCandidateCount()); } @Test - public void testRecordRequestError() { + public void testRecordResponseError() { statsCounter.recordRemainingCrawlCandidate(); int remainingCrawlCandidateCountBefore = statsCounter.getRemainingCrawlCandidateCount(); - int requestErrorCount = statsCounter.getRequestErrorCount(); + int responseErrorCount = statsCounter.getResponseErrorCount(); int processedCrawlCandidateCountBefore = statsCounter.getProcessedCrawlCandidateCount(); - statsCounter.recordPageLoad(); + statsCounter.recordResponseSuccess(); Assert.assertEquals(remainingCrawlCandidateCountBefore - 1, statsCounter.getRemainingCrawlCandidateCount()); - Assert.assertEquals(requestErrorCount + 1, statsCounter.getPageLoadCount()); + Assert.assertEquals(responseErrorCount + 1, statsCounter.getResponseSuccessCount()); Assert.assertEquals(processedCrawlCandidateCountBefore + 1, statsCounter.getProcessedCrawlCandidateCount()); } @@ -125,11 +125,11 @@ public void testRecordNetworkError() { int networkErrorCount = statsCounter.getNetworkErrorCount(); int processedCrawlCandidateCountBefore = statsCounter.getProcessedCrawlCandidateCount(); - statsCounter.recordPageLoad(); + statsCounter.recordResponseSuccess(); Assert.assertEquals(remainingCrawlCandidateCountBefore - 1, statsCounter.getRemainingCrawlCandidateCount()); - Assert.assertEquals(networkErrorCount + 1, statsCounter.getPageLoadCount()); + Assert.assertEquals(networkErrorCount + 1, statsCounter.getResponseSuccessCount()); Assert.assertEquals(processedCrawlCandidateCountBefore + 1, statsCounter.getProcessedCrawlCandidateCount()); } diff --git a/src/test/java/com/github/peterbencze/serritor/it/SerritorIT.java b/src/test/java/com/github/peterbencze/serritor/it/SerritorIT.java index ece5d25..9a9a250 100644 --- a/src/test/java/com/github/peterbencze/serritor/it/SerritorIT.java +++ b/src/test/java/com/github/peterbencze/serritor/it/SerritorIT.java @@ -20,8 +20,8 @@ import 
com.github.peterbencze.serritor.api.Browser; import com.github.peterbencze.serritor.api.CrawlRequest; import com.github.peterbencze.serritor.api.CrawlerConfiguration; -import com.github.peterbencze.serritor.api.event.NonHtmlContentEvent; -import com.github.peterbencze.serritor.api.event.PageLoadEvent; +import com.github.peterbencze.serritor.api.event.NonHtmlResponseEvent; +import com.github.peterbencze.serritor.api.event.ResponseSuccessEvent; import com.github.tomakehurst.wiremock.WireMockServer; import com.github.tomakehurst.wiremock.client.WireMock; import com.github.tomakehurst.wiremock.core.WireMockConfiguration; @@ -82,8 +82,8 @@ public void testFileDownload() throws IOException { BaseCrawler crawler = new BaseCrawler(config) { @Override - protected void onNonHtmlContent(final NonHtmlContentEvent event) { - super.onNonHtmlContent(event); + protected void onNonHtmlResponse(final NonHtmlResponseEvent event) { + super.onNonHtmlResponse(event); try { downloadFile(event.getCrawlCandidate().getRequestUrl(), destinationFile); @@ -124,8 +124,8 @@ public void testResumeState() throws IOException { BaseCrawler crawler = new BaseCrawler(config) { @Override - protected void onPageLoad(final PageLoadEvent event) { - super.onPageLoad(event); + protected void onResponseSuccess(final ResponseSuccessEvent event) { + super.onResponseSuccess(event); stop(); } From 82e149deeedf0c1247d82e206b5a8ef2df214de6 Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Sun, 7 Apr 2019 11:38:03 +0200 Subject: [PATCH 53/63] Add detailed logging --- pom.xml | 2 +- .../peterbencze/serritor/api/BaseCrawler.java | 50 ++++++++++++++++++- .../peterbencze/serritor/api/Browser.java | 17 +++++-- .../serritor/api/RestfulBaseCrawler.java | 8 +++ .../serritor/internal/CrawlFrontier.java | 17 +++++++ .../internal/CustomCallbackManager.java | 19 +++++-- .../web/accessmanager/JwtAccessManager.java | 14 ++++++ .../web/accessmanager/NoopAccessManager.java | 6 +++ .../internal/web/handler/JwtHandler.java | 14 
+++++- .../internal/web/handler/LoginHandler.java | 23 +++++++-- .../web/handler/XsrfTokenHandler.java | 10 +++- 11 files changed, 163 insertions(+), 17 deletions(-) diff --git a/pom.xml b/pom.xml index 071e5f5..c1d422e 100644 --- a/pom.xml +++ b/pom.xml @@ -83,7 +83,7 @@ org.slf4j - slf4j-simple + slf4j-api 1.7.25 diff --git a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java index 1b44893..cdab6a4 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java +++ b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java @@ -185,6 +185,9 @@ private void start(final Browser browser, try { Validate.validState(isStopped.get(), "The crawler is already running."); + LOGGER.debug("Crawler is starting (resuming crawl: {})", isResuming); + LOGGER.debug("Using configuration: {}", config); + isStopped.set(false); runTimeStopwatch.start(); @@ -208,14 +211,21 @@ private void start(final Browser browser, String host = urlComponents[0]; int port = Integer.parseInt(urlComponents[1]); + LOGGER.debug("Using chained proxy on address {}:{}", host, port); + proxyServer.setChainedProxy(new InetSocketAddress(host, port)); } proxyServer.start(); + LOGGER.debug("Proxy server started on port {}", proxyServer.getPort()); + capabilitiesClone.setCapability(CapabilityType.PROXY, ClientUtil.createSeleniumProxy(proxyServer)); + LOGGER.debug("Starting {} browser", browser); webDriver = WebDriverFactory.createWebDriver(browser, capabilitiesClone); + + LOGGER.debug("Calling onBrowserInit callback"); onBrowserInit(webDriver.manage()); if (!isResuming) { @@ -233,20 +243,26 @@ private void start(final Browser browser, // Must be created here (the adaptive crawl delay strategy depends on the WebDriver) crawlDelayMechanism = createCrawlDelayMechanism(); + LOGGER.debug("Calling onStart callback"); onStart(); run(); } finally { + LOGGER.debug("Crawler is stopping"); + try { + 
LOGGER.debug("Calling onStop callback"); onStop(); } finally { HttpClientUtils.closeQuietly(httpClient); if (webDriver != null) { + LOGGER.debug("Closing browser"); webDriver.quit(); } if (proxyServer != null && proxyServer.isStarted()) { + LOGGER.debug("Stopping proxy server"); proxyServer.stop(); } @@ -320,6 +336,8 @@ protected final void registerCustomCallback( protected final void stop() { Validate.validState(!isStopped.get(), "The crawler is not started."); + LOGGER.debug("Initiating stop"); + // Indicate that the crawling should be stopped isStopInitiated.set(true); } @@ -362,6 +380,8 @@ protected final void downloadFile(final URI source, final File destination) thro Validate.notNull(source, "The source parameter cannot be null."); Validate.notNull(destination, "The destination parameter cannot be null."); + LOGGER.debug("Downloading file from {} to {}", source, destination); + HttpGet request = new HttpGet(source); try (CloseableHttpResponse response = httpClient.execute(request)) { HttpEntity entity = response.getEntity(); @@ -386,10 +406,14 @@ private void run() { } CrawlCandidate currentCandidate = crawlFrontier.getNextCandidate(); + LOGGER.debug("Next crawl candidate: {}", currentCandidate); + String candidateUrl = currentCandidate.getRequestUrl().toString(); CloseableHttpResponse httpHeadResponse = null; try { + LOGGER.debug("Sending HTTP head request to URL {}", candidateUrl); + try { httpHeadResponse = httpClient.execute(new HttpHead(candidateUrl)); } catch (IOException exception) { @@ -426,6 +450,7 @@ private void run() { proxyServer.newHar(); + LOGGER.debug("Opening URL {} in browser", candidateUrl); try { webDriver.get(candidateUrl); @@ -530,6 +555,8 @@ private static String getResponseMimeType(final HttpResponse httpHeadResponse) { * @param event the event which gets delivered when a network error occurs */ private void handleNetworkError(final NetworkErrorEvent event) { + LOGGER.debug("Network error occurred: {}", event.getErrorMessage()); + 
callbackManager.callCustomOrDefault(NetworkErrorEvent.class, event, this::onNetworkError); statsCounter.recordNetworkError(); @@ -541,6 +568,10 @@ private void handleNetworkError(final NetworkErrorEvent event) { * @param event the event which gets delivered when a request is redirected */ private void handleRequestRedirect(final RequestRedirectEvent event) { + LOGGER.debug("Request redirected from {} to {}", + event.getCrawlCandidate().getRequestUrl(), + event.getRedirectedCrawlRequest().getRequestUrl()); + crawl(event.getRedirectedCrawlRequest()); callbackManager.callCustomOrDefault(RequestRedirectEvent.class, event, @@ -556,6 +587,8 @@ private void handleRequestRedirect(final RequestRedirectEvent event) { * text/html */ private void handleNonHtmlResponse(final NonHtmlResponseEvent event) { + LOGGER.debug("Received response with non-HTML content"); + callbackManager.callCustomOrDefault(NonHtmlResponseEvent.class, event, this::onNonHtmlResponse); @@ -569,6 +602,8 @@ private void handleNonHtmlResponse(final NonHtmlResponseEvent event) { * the timeout period */ private void handlePageLoadTimeout(final PageLoadTimeoutEvent event) { + LOGGER.debug("Page did not load in the browser within the timeout period"); + callbackManager.callCustomOrDefault(PageLoadTimeoutEvent.class, event, this::onPageLoadTimeout); @@ -578,10 +613,13 @@ private void handlePageLoadTimeout(final PageLoadTimeoutEvent event) { /** * Handles responses whose HTTP status code indicates an error. 
* - * @param event the event which gets delivered when a request error (an error with HTTP status - * code 4xx or 5xx) occurs + * @param event the event which gets delivered when the browser loads the page and the HTTP + * status code indicates error (4xx or 5xx) */ private void handleResponseError(final ResponseErrorEvent event) { + LOGGER.debug("Received response whose status code ({}) indicates error", + event.getCompleteCrawlResponse().getStatusCode()); + callbackManager.callCustomOrDefault(ResponseErrorEvent.class, event, this::onResponseError); statsCounter.recordResponseError(); @@ -594,6 +632,9 @@ private void handleResponseError(final ResponseErrorEvent event) { * status code indicates success (2xx) */ private void handleResponseSuccess(final ResponseSuccessEvent event) { + LOGGER.debug("Received response whose status code ({}) indicates success", + event.getCompleteCrawlResponse().getStatusCode()); + callbackManager.callCustomOrDefault(ResponseSuccessEvent.class, event, this::onResponseSuccess); @@ -604,6 +645,8 @@ private void handleResponseSuccess(final ResponseSuccessEvent event) { * Copies all the Selenium cookies for the current domain to the HTTP client cookie store. */ private void syncHttpClientCookies() { + LOGGER.debug("Synchronizing HTTP client cookies"); + webDriver.manage() .getCookies() .stream() @@ -615,9 +658,12 @@ private void syncHttpClientCookies() { * Delays the next request. 
*/ private void performDelay() { + LOGGER.debug("Performing delay"); + try { TimeUnit.MILLISECONDS.sleep(crawlDelayMechanism.getDelay()); } catch (InterruptedException ex) { + LOGGER.debug("Delay interrupted, stopping crawler"); Thread.currentThread().interrupt(); isStopInitiated.set(true); } diff --git a/src/main/java/com/github/peterbencze/serritor/api/Browser.java b/src/main/java/com/github/peterbencze/serritor/api/Browser.java index 0fffb9a..bf39bfa 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/Browser.java +++ b/src/main/java/com/github/peterbencze/serritor/api/Browser.java @@ -21,7 +21,18 @@ */ public enum Browser { - HTML_UNIT, - CHROME, - FIREFOX + HTML_UNIT("HtmlUnit"), + CHROME("Chrome"), + FIREFOX("Firefox"); + + private final String textualRepresentation; + + Browser(final String textualRepresentation) { + this.textualRepresentation = textualRepresentation; + } + + @Override + public String toString() { + return textualRepresentation; + } } diff --git a/src/main/java/com/github/peterbencze/serritor/api/RestfulBaseCrawler.java b/src/main/java/com/github/peterbencze/serritor/api/RestfulBaseCrawler.java index 5a76d1a..e6ede80 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/RestfulBaseCrawler.java +++ b/src/main/java/com/github/peterbencze/serritor/api/RestfulBaseCrawler.java @@ -38,6 +38,8 @@ import org.eclipse.jetty.server.ServerConnector; import org.eclipse.jetty.util.ssl.SslContextFactory; import org.eclipse.jetty.util.thread.QueuedThreadPool; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * Provides a skeletal implementation of a crawler to minimize the effort for users to implement @@ -46,6 +48,8 @@ */ public abstract class RestfulBaseCrawler extends BaseCrawler { + private static final Logger LOGGER = LoggerFactory.getLogger(RestfulBaseCrawler.class); + private final WebApiConfiguration webApiConfig; private final Javalin webServer; @@ -118,6 +122,8 @@ protected RestfulBaseCrawler(final 
WebApiConfiguration webApiConfig, final Crawl byte[] secretKey = accessControlConfig.getSecretKey() .orElseGet(() -> { + LOGGER.debug("Generating secret key for signer algorithm"); + try { return KeyGenerator.getInstance("HmacSHA256") .generateKey() @@ -155,6 +161,7 @@ public WebApiConfiguration getWebApiConfiguration() { protected void onStart() { super.onStart(); + LOGGER.debug("Starting web server"); webServer.start(); } @@ -165,6 +172,7 @@ protected void onStart() { protected void onStop() { super.onStop(); + LOGGER.debug("Stopping web server"); webServer.stop(); } diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java b/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java index 3cdc44c..0550509 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java @@ -33,12 +33,16 @@ import org.apache.commons.codec.digest.DigestUtils; import org.apache.http.NameValuePair; import org.apache.http.client.utils.URIBuilder; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * Manages crawl requests and provides crawl candidates to the crawler. 
*/ public final class CrawlFrontier implements Serializable { + private static final Logger LOGGER = LoggerFactory.getLogger(CrawlFrontier.class); + private final CrawlerConfiguration config; private final StatsCounter statsCounter; private final Set urlFingerprints; @@ -69,12 +73,16 @@ public CrawlFrontier(final CrawlerConfiguration config, final StatsCounter stats * @param isCrawlSeed indicates if the request is a crawl seed */ public void feedRequest(final CrawlRequest request, final boolean isCrawlSeed) { + LOGGER.debug("Feeding request: {}", request); + if (config.isOffsiteRequestFilterEnabled()) { boolean inCrawlDomain = config.getAllowedCrawlDomains() .stream() .anyMatch(crawlDomain -> crawlDomain.contains(request.getDomain())); if (!inCrawlDomain) { + LOGGER.debug("Filtering offsite request"); + statsCounter.recordOffsiteRequest(); return; } @@ -83,6 +91,8 @@ public void feedRequest(final CrawlRequest request, final boolean isCrawlSeed) { if (config.isDuplicateRequestFilterEnabled()) { String urlFingerprint = createFingerprintForUrl(request.getRequestUrl()); if (urlFingerprints.contains(urlFingerprint)) { + LOGGER.debug("Filtering duplicate request"); + statsCounter.recordDuplicateRequest(); return; } @@ -97,6 +107,8 @@ public void feedRequest(final CrawlRequest request, final boolean isCrawlSeed) { int nextCrawlDepth = currentCandidate.getCrawlDepth() + 1; if (crawlDepthLimit != 0 && nextCrawlDepth > crawlDepthLimit) { + LOGGER.debug("Filtering crawl depth limit exceeding request"); + statsCounter.recordCrawlDepthLimitExceedingRequest(); return; } @@ -105,6 +117,7 @@ public void feedRequest(final CrawlRequest request, final boolean isCrawlSeed) { .setCrawlDepth(nextCrawlDepth); } + LOGGER.debug("Adding request to the list of crawl candidates"); candidates.add(builder.build()); statsCounter.recordRemainingCrawlCandidate(); } @@ -132,6 +145,8 @@ public CrawlCandidate getNextCandidate() { * Resets the crawl frontier to its initial state. 
*/ public void reset() { + LOGGER.debug("Setting crawl frontier to its initial state"); + urlFingerprints.clear(); candidates.clear(); @@ -142,6 +157,8 @@ public void reset() { * Feeds all the crawl seeds to the crawl frontier. */ private void feedCrawlSeeds() { + LOGGER.debug("Feeding crawl seeds"); + config.getCrawlSeeds().forEach((CrawlRequest request) -> feedRequest(request, true)); } diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CustomCallbackManager.java b/src/main/java/com/github/peterbencze/serritor/internal/CustomCallbackManager.java index 289dc22..676c71e 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/CustomCallbackManager.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/CustomCallbackManager.java @@ -24,12 +24,16 @@ import java.util.Map; import java.util.function.Consumer; import java.util.stream.Collectors; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * Manages custom callbacks associated with events. 
*/ public final class CustomCallbackManager { + private static final Logger LOGGER = LoggerFactory.getLogger(CustomCallbackManager.class); + private final Map, List>> customCallbacks; @@ -50,6 +54,9 @@ public CustomCallbackManager() { public void addCustomCallback( final Class eventClass, final PatternMatchingCallback callback) { + LOGGER.debug("Adding custom callback for event {} with URL pattern {}", + eventClass.getSimpleName(), callback.getUrlPattern()); + customCallbacks.computeIfAbsent(eventClass, key -> new ArrayList<>()).add(callback); } @@ -70,16 +77,22 @@ public void callCustomOrDefault( final T eventObject, final Consumer defaultCallback) { String requestUrl = eventObject.getCrawlCandidate().getRequestUrl().toString(); - List> applicableCustomCallbacks = + List> applicableCustomCallbacks = customCallbacks.getOrDefault(eventClass, Collections.emptyList()) .stream() .filter(callback -> callback.getUrlPattern().matcher(requestUrl).matches()) - .map(PatternMatchingCallback::getCallback) .collect(Collectors.toList()); if (!applicableCustomCallbacks.isEmpty()) { - applicableCustomCallbacks.forEach(op -> ((Consumer) op).accept(eventObject)); + applicableCustomCallbacks.forEach(callback -> { + LOGGER.debug("Calling custom callback for event {} with URL pattern {}", + eventClass.getSimpleName(), callback.getUrlPattern()); + + ((Consumer) callback.getCallback()).accept(eventObject); + }); } else { + LOGGER.debug("Calling default callback for event {}", eventClass.getSimpleName()); + defaultCallback.accept(eventObject); } } diff --git a/src/main/java/com/github/peterbencze/serritor/internal/web/accessmanager/JwtAccessManager.java b/src/main/java/com/github/peterbencze/serritor/internal/web/accessmanager/JwtAccessManager.java index e8d0baf..9a02784 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/web/accessmanager/JwtAccessManager.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/web/accessmanager/JwtAccessManager.java @@ -28,12 
+28,16 @@ import io.javalin.security.AccessManager; import io.javalin.security.Role; import java.util.Set; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * A JWT-based access manager. */ public final class JwtAccessManager implements AccessManager { + private static final Logger LOGGER = LoggerFactory.getLogger(JwtAccessManager.class); + private final Algorithm signerAlgorithm; /** @@ -57,20 +61,30 @@ public void manage( final Handler handler, final Context ctx, final Set permittedRoles) throws Exception { + LOGGER.debug("Incoming request from {} to path {}", ctx.ip(), ctx.path()); + if (!permittedRoles.contains(UserRole.UNAUTHENTICATED)) { + LOGGER.debug("Checking JWT"); + String jwt = ctx.attribute(JwtHandler.CONTEXT_ATTRIBUTE_NAME); if (jwt == null) { + LOGGER.debug("Returning unauthorized response: no JWT present in context"); + throw new UnauthorizedResponse(); } JWTVerifier verifier = JWT.require(signerAlgorithm).build(); try { verifier.verify(jwt); + LOGGER.debug("JWT verified"); } catch (JWTVerificationException e) { + LOGGER.debug("Returning unauthorized response: JWT verification failed"); + throw new UnauthorizedResponse(); } } + LOGGER.debug("Letting request through"); handler.handle(ctx); } } diff --git a/src/main/java/com/github/peterbencze/serritor/internal/web/accessmanager/NoopAccessManager.java b/src/main/java/com/github/peterbencze/serritor/internal/web/accessmanager/NoopAccessManager.java index 3e51aba..e3d8012 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/web/accessmanager/NoopAccessManager.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/web/accessmanager/NoopAccessManager.java @@ -21,12 +21,16 @@ import io.javalin.security.AccessManager; import io.javalin.security.Role; import java.util.Set; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * A no-operation access manager that is used when access control is disabled. 
*/ public final class NoopAccessManager implements AccessManager { + private static final Logger LOGGER = LoggerFactory.getLogger(NoopAccessManager.class); + /** * Simply lets the request pass through without credential checking. * @@ -39,6 +43,8 @@ public void manage( final Handler handler, final Context ctx, final Set permittedRoles) throws Exception { + LOGGER.debug("Incoming request from {} to path {}", ctx.ip(), ctx.path()); + LOGGER.debug("Letting request through"); handler.handle(ctx); } } diff --git a/src/main/java/com/github/peterbencze/serritor/internal/web/handler/JwtHandler.java b/src/main/java/com/github/peterbencze/serritor/internal/web/handler/JwtHandler.java index 4eca563..5d1c3a3 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/web/handler/JwtHandler.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/web/handler/JwtHandler.java @@ -19,12 +19,16 @@ import io.javalin.Context; import io.javalin.Handler; import java.util.Optional; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * A before-handler which extracts the JWT from the Authorization header or the cookie. 
*/ public final class JwtHandler implements Handler { + private static final Logger LOGGER = LoggerFactory.getLogger(JwtHandler.class); + public static final String CONTEXT_ATTRIBUTE_NAME = "JWT"; static final String COOKIE_NAME = "JWT"; @@ -34,12 +38,18 @@ public final class JwtHandler implements Handler { * @param ctx the context object */ @Override - public void handle(final Context ctx) throws Exception { + public void handle(final Context ctx) { Optional jwtFromHeaderOpt = extractJwtFromHeader(ctx); if (jwtFromHeaderOpt.isPresent()) { + LOGGER.debug("JWT found in headers"); + ctx.attribute(CONTEXT_ATTRIBUTE_NAME, jwtFromHeaderOpt.get()); } else { - extractJwtFromCookie(ctx).ifPresent(jwt -> ctx.attribute(CONTEXT_ATTRIBUTE_NAME, jwt)); + extractJwtFromCookie(ctx).ifPresent(jwt -> { + LOGGER.debug("JWT found in cookies"); + + ctx.attribute(CONTEXT_ATTRIBUTE_NAME, jwt); + }); } } diff --git a/src/main/java/com/github/peterbencze/serritor/internal/web/handler/LoginHandler.java b/src/main/java/com/github/peterbencze/serritor/internal/web/handler/LoginHandler.java index 04ff7b6..dbbe8cb 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/web/handler/LoginHandler.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/web/handler/LoginHandler.java @@ -31,12 +31,16 @@ import java.util.Base64; import java.util.Date; import org.mindrot.jbcrypt.BCrypt; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * A handler that is used to verify the authentication credentials of the user. 
*/ public final class LoginHandler implements Handler { + private static final Logger LOGGER = LoggerFactory.getLogger(LoginHandler.class); + private final AccessControlConfiguration accessControlConfig; private final Algorithm signerAlgorithm; @@ -59,13 +63,20 @@ public LoginHandler( * @param ctx the context object */ @Override - public void handle(final Context ctx) throws Exception { + public void handle(final Context ctx) { LoginDto loginDto = ctx.bodyAsClass(LoginDto.class); - User user = accessControlConfig.getUser(loginDto.getUsername()) - .orElseThrow(UnauthorizedResponse::new); + String username = loginDto.getUsername(); + User user = accessControlConfig.getUser(username) + .orElseThrow(() -> { // type inference bug, see JDK-8047338 + LOGGER.debug("Failed login for user {}: user does not exist", username); + + throw new UnauthorizedResponse(); + }); if (!BCrypt.checkpw(loginDto.getPassword(), user.getPasswordHash())) { + LOGGER.debug("Failed login for user {}: incorrect password", username); + throw new UnauthorizedResponse(); } @@ -73,7 +84,7 @@ public void handle(final Context ctx) throws Exception { Date expiryDate = Date.from(Instant.now().plus(tokenValidDuration)); String jwt = JWT.create() .withExpiresAt(expiryDate) - .withClaim("username", user.getUsername()) + .withClaim("username", username) .sign(signerAlgorithm); if (accessControlConfig.isCookieAuthenticationEnabled()) { @@ -82,8 +93,10 @@ public void handle(final Context ctx) throws Exception { ctx.cookie(JwtHandler.COOKIE_NAME, jwt, cookieAgeInSeconds); ctx.cookie(XsrfTokenHandler.COOKIE_NAME, generateXsrfToken(), cookieAgeInSeconds); } else { - ctx.json(new JwtDto(user.getUsername(), expiryDate, jwt)); + ctx.json(new JwtDto(username, expiryDate, jwt)); } + + LOGGER.debug("User {} logged in", username); } /** diff --git a/src/main/java/com/github/peterbencze/serritor/internal/web/handler/XsrfTokenHandler.java 
b/src/main/java/com/github/peterbencze/serritor/internal/web/handler/XsrfTokenHandler.java index a30a369..2af937c 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/web/handler/XsrfTokenHandler.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/web/handler/XsrfTokenHandler.java @@ -23,6 +23,8 @@ import java.util.Arrays; import java.util.List; import java.util.Optional; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * A before-handler that is responsible for the validation of the XSRF token header if an XSRF @@ -30,6 +32,8 @@ */ public final class XsrfTokenHandler implements Handler { + private static final Logger LOGGER = LoggerFactory.getLogger(XsrfTokenHandler.class); + static final String COOKIE_NAME = "XSRF-TOKEN"; static final String HEADER_NAME = "X-XSRF-TOKEN"; @@ -42,14 +46,18 @@ public final class XsrfTokenHandler implements Handler { * @param ctx the context object */ @Override - public void handle(final Context ctx) throws Exception { + public void handle(final Context ctx) { HttpMethod requestMethod = HttpMethod.valueOf(ctx.method()); if (XSRF_SAFE_HTTP_METHODS.contains(requestMethod)) { + LOGGER.debug("The request method is safe, not checking XSRF token"); return; } + LOGGER.debug("Checking XSRF token"); Optional.ofNullable(ctx.cookie(COOKIE_NAME)).ifPresent(xsrfTokenInCookie -> { if (!xsrfTokenInCookie.equals(ctx.header(HEADER_NAME))) { + LOGGER.debug("Returning unauthorized response: XSRF token missing or incorrect"); + throw new UnauthorizedResponse("XSRF token missing or incorrect"); } }); From a53bbad137e3982d63aa5cbfa9271640be986e55 Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Sun, 7 Apr 2019 12:44:34 +0200 Subject: [PATCH 54/63] Add string representations for web related entities --- .../serritor/api/RestfulBaseCrawler.java | 1 + .../api/web/AccessControlConfiguration.java | 15 +++++++++++++++ .../api/web/SslContextConfiguration.java | 14 ++++++++++++++ 
.../peterbencze/serritor/api/web/User.java | 14 ++++++++++++++ .../serritor/api/web/WebApiConfiguration.java | 17 +++++++++++++++++ 5 files changed, 61 insertions(+) diff --git a/src/main/java/com/github/peterbencze/serritor/api/RestfulBaseCrawler.java b/src/main/java/com/github/peterbencze/serritor/api/RestfulBaseCrawler.java index e6ede80..f29490c 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/RestfulBaseCrawler.java +++ b/src/main/java/com/github/peterbencze/serritor/api/RestfulBaseCrawler.java @@ -162,6 +162,7 @@ protected void onStart() { super.onStart(); LOGGER.debug("Starting web server"); + LOGGER.debug("Using configuration: {}", webApiConfig); webServer.start(); } diff --git a/src/main/java/com/github/peterbencze/serritor/api/web/AccessControlConfiguration.java b/src/main/java/com/github/peterbencze/serritor/api/web/AccessControlConfiguration.java index df49d4d..08ea4be 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/web/AccessControlConfiguration.java +++ b/src/main/java/com/github/peterbencze/serritor/api/web/AccessControlConfiguration.java @@ -22,6 +22,8 @@ import org.apache.commons.lang3.ArrayUtils; import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.Validate; +import org.apache.commons.lang3.builder.ToStringBuilder; +import org.apache.commons.lang3.builder.ToStringStyle; /** * Configuration of the access management. @@ -69,6 +71,19 @@ public boolean isCookieAuthenticationEnabled() { return isCookieAuthenticationEnabled; } + /** + * Returns a string representation of this access control configuration instance. + * + * @return a string representation of this access control configuration instance + */ + @Override + public String toString() { + return new ToStringBuilder(this, ToStringStyle.SHORT_PREFIX_STYLE) + .append("users", users) + .append("isCookieAuthenticationEnabled", isCookieAuthenticationEnabled) + .toString(); + } + /** * Builder for {@link AccessControlConfiguration}. 
*/ diff --git a/src/main/java/com/github/peterbencze/serritor/api/web/SslContextConfiguration.java b/src/main/java/com/github/peterbencze/serritor/api/web/SslContextConfiguration.java index c24f902..c44eeb9 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/web/SslContextConfiguration.java +++ b/src/main/java/com/github/peterbencze/serritor/api/web/SslContextConfiguration.java @@ -18,6 +18,8 @@ import java.util.Optional; import org.apache.commons.lang3.Validate; +import org.apache.commons.lang3.builder.ToStringBuilder; +import org.apache.commons.lang3.builder.ToStringStyle; /** * Configuration of the SSL context. @@ -84,4 +86,16 @@ public String getKeyStorePassword() { public Optional getKeyManagerPassword() { return Optional.ofNullable(keyManagerPassword); } + + /** + * Returns a string representation of this SSL context configuration instance. + * + * @return a string representation of this SSL context configuration instance + */ + @Override + public String toString() { + return new ToStringBuilder(this, ToStringStyle.SHORT_PREFIX_STYLE) + .append("keyStorePath", keyStorePath) + .toString(); + } } diff --git a/src/main/java/com/github/peterbencze/serritor/api/web/User.java b/src/main/java/com/github/peterbencze/serritor/api/web/User.java index 5609e1a..f61bbc9 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/web/User.java +++ b/src/main/java/com/github/peterbencze/serritor/api/web/User.java @@ -17,6 +17,8 @@ package com.github.peterbencze.serritor.api.web; import org.apache.commons.lang3.Validate; +import org.apache.commons.lang3.builder.ToStringBuilder; +import org.apache.commons.lang3.builder.ToStringStyle; /** * Represents a user of the web API. @@ -60,6 +62,18 @@ public String getPasswordHash() { return passwordHash; } + /** + * Returns a string representation of this user instance. 
+ * + * @return a string representation of this user instance + */ + @Override + public String toString() { + return new ToStringBuilder(this, ToStringStyle.SHORT_PREFIX_STYLE) + .append("username", username) + .toString(); + } + /** * Determines if the version of the BCrypt algorithm used to create the hash is supported. * diff --git a/src/main/java/com/github/peterbencze/serritor/api/web/WebApiConfiguration.java b/src/main/java/com/github/peterbencze/serritor/api/web/WebApiConfiguration.java index 17b1598..92d36a1 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/web/WebApiConfiguration.java +++ b/src/main/java/com/github/peterbencze/serritor/api/web/WebApiConfiguration.java @@ -20,6 +20,8 @@ import java.util.List; import java.util.Optional; import org.apache.commons.lang3.Validate; +import org.apache.commons.lang3.builder.ToStringBuilder; +import org.apache.commons.lang3.builder.ToStringStyle; /** * Configuration for the web API. @@ -83,6 +85,21 @@ public Optional getAccessControlConfiguration() { return Optional.ofNullable(accessControlConfig); } + /** + * Returns a string representation of this web API configuration. + * + * @return a string representation of this web API configuration + */ + @Override + public String toString() { + return new ToStringBuilder(this, ToStringStyle.SHORT_PREFIX_STYLE) + .append("port", port) + .append("corsOrigins", corsOrigins) + .append("sslContextConfiguration", sslContextConfig) + .append("accessControlConfiguration", accessControlConfig) + .toString(); + } + /** * Builder for {@link WebApiConfiguration}. 
*/ From 70ec169754c77aa4b517bbb1ca513ac2487f0dfa Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Sun, 26 May 2019 14:58:24 +0200 Subject: [PATCH 55/63] Add web API implementation --- pom.xml | 33 +- .../api/{BaseCrawler.java => Crawler.java} | 8 +- .../api/CrawlerWithSecuredWebApi.java | 209 ++++ .../serritor/api/CrawlerWithWebApi.java | 142 +++ .../serritor/api/RestfulBaseCrawler.java | 238 ----- .../api/web/AccessControlConfiguration.java | 100 +- .../serritor/api/web/ServerConfiguration.java | 273 +++++ .../peterbencze/serritor/api/web/User.java | 60 +- .../serritor/api/web/WebApiConfiguration.java | 188 ---- .../web/WebApiException.java} | 18 +- .../serritor/api/web/http/HttpHandler.java | 39 + .../web => api/web/http}/HttpMethod.java | 5 +- .../api/web/socket/WebSocketHandler.java | 87 ++ .../serritor/internal/util/KeyFactory.java | 65 ++ .../serritor/internal/web/ApiEndpoint.java | 69 -- .../internal/web/JsonErrorHandler.java | 76 ++ .../serritor/internal/web/SecuredWebApi.java | 211 ++++ .../serritor/internal/web/WebApi.java | 217 ++++ .../web/accessmanager/JwtAccessManager.java | 90 -- .../web/accessmanager/NoopAccessManager.java | 50 - .../internal/web/handler/JwtHandler.java | 85 -- .../internal/web/handler/LoginHandler.java | 115 --- .../web/handler/XsrfTokenHandler.java | 65 -- .../internal/web/http/CsrfFilter.java | 114 +++ .../internal/web/http/HttpServlet.java | 67 ++ .../web/http/auth/BCryptCredential.java | 48 + .../http/auth/BCryptCredentialProvider.java | 42 + .../web/http/auth/JwtAuthenticator.java | 361 +++++++ .../web/http/auth/JwtUserIdentity.java | 72 ++ .../web/http/auth/JwtUserPrincipal.java | 44 + .../internal/web/http/dto/ErrorDto.java | 59 ++ .../internal/web/{ => http}/dto/JwtDto.java | 24 +- .../internal/web/{ => http}/dto/LoginDto.java | 20 +- .../internal/web/socket/WebSocketFactory.java | 97 ++ .../web/socket/WebSocketHandlerWrapper.java | 128 +++ .../web/socket/WebSocketSessionManager.java | 84 ++ 
...pse.jetty.util.security.CredentialProvider | 1 + .../accessmanager/JwtAccessManagerTest.java | 82 -- .../internal/web/handler/JwtHandlerTest.java | 59 -- .../web/handler/LoginHandlerTest.java | 118 --- .../web/handler/XsrfTokenHandlerTest.java | 54 - .../it/{SerritorIT.java => CrawlingIT.java} | 14 +- .../it/web/TestCrawlerWithSecuredWebApi.java | 98 ++ .../it/web/TestCrawlerWithWebApi.java | 89 ++ .../peterbencze/serritor/it/web/WebApiIT.java | 940 ++++++++++++++++++ src/test/resources/keystore.jks | Bin 0 -> 2215 bytes 46 files changed, 3779 insertions(+), 1279 deletions(-) rename src/main/java/com/github/peterbencze/serritor/api/{BaseCrawler.java => Crawler.java} (99%) create mode 100644 src/main/java/com/github/peterbencze/serritor/api/CrawlerWithSecuredWebApi.java create mode 100644 src/main/java/com/github/peterbencze/serritor/api/CrawlerWithWebApi.java delete mode 100644 src/main/java/com/github/peterbencze/serritor/api/RestfulBaseCrawler.java create mode 100644 src/main/java/com/github/peterbencze/serritor/api/web/ServerConfiguration.java delete mode 100644 src/main/java/com/github/peterbencze/serritor/api/web/WebApiConfiguration.java rename src/main/java/com/github/peterbencze/serritor/{internal/web/UserRole.java => api/web/WebApiException.java} (59%) create mode 100644 src/main/java/com/github/peterbencze/serritor/api/web/http/HttpHandler.java rename src/main/java/com/github/peterbencze/serritor/{internal/web => api/web/http}/HttpMethod.java (87%) create mode 100644 src/main/java/com/github/peterbencze/serritor/api/web/socket/WebSocketHandler.java create mode 100644 src/main/java/com/github/peterbencze/serritor/internal/util/KeyFactory.java delete mode 100644 src/main/java/com/github/peterbencze/serritor/internal/web/ApiEndpoint.java create mode 100644 src/main/java/com/github/peterbencze/serritor/internal/web/JsonErrorHandler.java create mode 100644 src/main/java/com/github/peterbencze/serritor/internal/web/SecuredWebApi.java create mode 100644 
src/main/java/com/github/peterbencze/serritor/internal/web/WebApi.java delete mode 100644 src/main/java/com/github/peterbencze/serritor/internal/web/accessmanager/JwtAccessManager.java delete mode 100644 src/main/java/com/github/peterbencze/serritor/internal/web/accessmanager/NoopAccessManager.java delete mode 100644 src/main/java/com/github/peterbencze/serritor/internal/web/handler/JwtHandler.java delete mode 100644 src/main/java/com/github/peterbencze/serritor/internal/web/handler/LoginHandler.java delete mode 100644 src/main/java/com/github/peterbencze/serritor/internal/web/handler/XsrfTokenHandler.java create mode 100644 src/main/java/com/github/peterbencze/serritor/internal/web/http/CsrfFilter.java create mode 100644 src/main/java/com/github/peterbencze/serritor/internal/web/http/HttpServlet.java create mode 100644 src/main/java/com/github/peterbencze/serritor/internal/web/http/auth/BCryptCredential.java create mode 100644 src/main/java/com/github/peterbencze/serritor/internal/web/http/auth/BCryptCredentialProvider.java create mode 100644 src/main/java/com/github/peterbencze/serritor/internal/web/http/auth/JwtAuthenticator.java create mode 100644 src/main/java/com/github/peterbencze/serritor/internal/web/http/auth/JwtUserIdentity.java create mode 100644 src/main/java/com/github/peterbencze/serritor/internal/web/http/auth/JwtUserPrincipal.java create mode 100644 src/main/java/com/github/peterbencze/serritor/internal/web/http/dto/ErrorDto.java rename src/main/java/com/github/peterbencze/serritor/internal/web/{ => http}/dto/JwtDto.java (65%) rename src/main/java/com/github/peterbencze/serritor/internal/web/{ => http}/dto/LoginDto.java (63%) create mode 100644 src/main/java/com/github/peterbencze/serritor/internal/web/socket/WebSocketFactory.java create mode 100644 src/main/java/com/github/peterbencze/serritor/internal/web/socket/WebSocketHandlerWrapper.java create mode 100644 
src/main/java/com/github/peterbencze/serritor/internal/web/socket/WebSocketSessionManager.java create mode 100644 src/main/resources/META-INF/services/org.eclipse.jetty.util.security.CredentialProvider delete mode 100644 src/test/java/com/github/peterbencze/serritor/internal/web/accessmanager/JwtAccessManagerTest.java delete mode 100644 src/test/java/com/github/peterbencze/serritor/internal/web/handler/JwtHandlerTest.java delete mode 100644 src/test/java/com/github/peterbencze/serritor/internal/web/handler/LoginHandlerTest.java delete mode 100644 src/test/java/com/github/peterbencze/serritor/internal/web/handler/XsrfTokenHandlerTest.java rename src/test/java/com/github/peterbencze/serritor/it/{SerritorIT.java => CrawlingIT.java} (96%) create mode 100644 src/test/java/com/github/peterbencze/serritor/it/web/TestCrawlerWithSecuredWebApi.java create mode 100644 src/test/java/com/github/peterbencze/serritor/it/web/TestCrawlerWithWebApi.java create mode 100644 src/test/java/com/github/peterbencze/serritor/it/web/WebApiIT.java create mode 100644 src/test/resources/keystore.jks diff --git a/pom.xml b/pom.xml index c1d422e..4740415 100644 --- a/pom.xml +++ b/pom.xml @@ -72,9 +72,24 @@ 27.0.1-jre - io.javalin - javalin - 2.6.0 + org.eclipse.jetty + jetty-server + 9.4.18.v20190429 + + + org.eclipse.jetty + jetty-servlet + 9.4.18.v20190429 + + + org.eclipse.jetty + jetty-servlets + 9.4.18.v20190429 + + + org.eclipse.jetty.websocket + websocket-server + 9.4.18.v20190429 com.fasterxml.jackson.datatype @@ -114,6 +129,18 @@ 2.21.0 test + + org.awaitility + awaitility + 3.1.6 + test + + + net.jodah + failsafe + 2.0.1 + test + diff --git a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java b/src/main/java/com/github/peterbencze/serritor/api/Crawler.java similarity index 99% rename from src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java rename to src/main/java/com/github/peterbencze/serritor/api/Crawler.java index cdab6a4..0de2d0e 100644 --- 
a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java +++ b/src/main/java/com/github/peterbencze/serritor/api/Crawler.java @@ -77,9 +77,9 @@ * Provides a skeletal implementation of a crawler to minimize the effort for users to implement * their own. */ -public abstract class BaseCrawler { +public abstract class Crawler { - private static final Logger LOGGER = LoggerFactory.getLogger(BaseCrawler.class); + private static final Logger LOGGER = LoggerFactory.getLogger(Crawler.class); private final CrawlerConfiguration config; private final Stopwatch runTimeStopwatch; @@ -100,7 +100,7 @@ public abstract class BaseCrawler { * * @param config the configuration of the crawler */ - protected BaseCrawler(final CrawlerConfiguration config) { + protected Crawler(final CrawlerConfiguration config) { this(new CrawlerState(Validate.notNull(config, "The config parameter cannot be null"))); } @@ -109,7 +109,7 @@ protected BaseCrawler(final CrawlerConfiguration config) { * * @param state the state to restore the crawler to */ - protected BaseCrawler(final CrawlerState state) { + protected Crawler(final CrawlerState state) { Validate.notNull(state, "The state parameter cannot be null"); config = state.getStateObject(CrawlerConfiguration.class) diff --git a/src/main/java/com/github/peterbencze/serritor/api/CrawlerWithSecuredWebApi.java b/src/main/java/com/github/peterbencze/serritor/api/CrawlerWithSecuredWebApi.java new file mode 100644 index 0000000..8592770 --- /dev/null +++ b/src/main/java/com/github/peterbencze/serritor/api/CrawlerWithSecuredWebApi.java @@ -0,0 +1,209 @@ +/* + * Copyright 2019 Peter Bencze. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.github.peterbencze.serritor.api; + +import com.github.peterbencze.serritor.api.web.AccessControlConfiguration; +import com.github.peterbencze.serritor.api.web.ServerConfiguration; +import com.github.peterbencze.serritor.api.web.http.HttpHandler; +import com.github.peterbencze.serritor.api.web.http.HttpMethod; +import com.github.peterbencze.serritor.api.web.socket.WebSocketHandler; +import com.github.peterbencze.serritor.internal.web.SecuredWebApi; +import java.util.Set; +import org.apache.commons.lang3.Validate; +import org.eclipse.jetty.websocket.api.Session; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * A crawler implementation with secured web API support. It allows users to register HTTP and + * WebSocket endpoints that can be used to interact with the crawler while it is running. Users are + * required to authenticate before they can access restricted endpoints (if they are authorized to + * do so). + */ +public abstract class CrawlerWithSecuredWebApi extends Crawler { + + private static final Logger LOGGER = LoggerFactory.getLogger(CrawlerWithSecuredWebApi.class); + + private final SecuredWebApi webApi; + + /** + * Base constructor which sets up the web server and the crawler with the provided + * configurations. 
+ * + * @param serverConfig the configuration of the web server + * @param accessControlConfig the access control configuration + * @param crawlerConfig the configuration of the crawler + */ + protected CrawlerWithSecuredWebApi( + final ServerConfiguration serverConfig, + final AccessControlConfiguration accessControlConfig, + final CrawlerConfiguration crawlerConfig) { + this(serverConfig, accessControlConfig, new CrawlerState(crawlerConfig)); + } + + /** + * Base constructor which sets up the web server with the provided configuration and restores + * the crawler to the given state. + * + * @param serverConfig the configuration of the web server + * @param accessControlConfig the access control configuration + * @param state the state to restore the crawler to + */ + protected CrawlerWithSecuredWebApi( + final ServerConfiguration serverConfig, + final AccessControlConfiguration accessControlConfig, + final CrawlerState state) { + super(state); + + webApi = new SecuredWebApi(serverConfig, accessControlConfig); + } + + /** + * Returns the configuration of the web server. + * + * @return the configuration of the web server + */ + public final ServerConfiguration getServerConfiguration() { + return webApi.getServerConfiguration(); + } + + /** + * Returns the access control configuration. + * + * @return the access control configuration + */ + public final AccessControlConfiguration getAccessControlConfiguration() { + return webApi.getAccessControlConfiguration(); + } + + /** + * {@inheritDoc} + */ + @Override + protected void onStart() { + super.onStart(); + + LOGGER.info("Starting web server"); + webApi.start(); + } + + /** + * {@inheritDoc} + */ + @Override + protected void onStop() { + super.onStop(); + + LOGGER.info("Stopping web server"); + webApi.stop(); + } + + /** + * Returns a set of open WebSocket sessions that represent connections to the specific + * endpoint. 
+ * + * @param socketHandlerClass the class of the WebSocket endpoint handler + * + * @return a set of open WebSocket sessions that represent connections to the specific endpoint + */ + protected final Set getOpenWebSocketSessions( + final Class socketHandlerClass) { + Validate.notNull(socketHandlerClass, "The socketHandlerClass parameter cannot be null"); + + return webApi.getOpenWebSocketSessions(socketHandlerClass); + } + + /** + * Adds an HTTP endpoint to the web API that is accessible to anyone (regardless of if they are + * authenticated or not). + * + * @param httpMethod the HTTP method of the endpoint + * @param path the path of the endpoint + * @param handler the handler of the endpoint + */ + protected final void addHttpEndpoint( + final HttpMethod httpMethod, + final String path, + final HttpHandler handler) { + Validate.notNull(httpMethod, "The httpMethod parameter cannot be null"); + Validate.notBlank(path, "The path parameter cannot be null or blank"); + Validate.notNull(handler, "The handler parameter cannot be null"); + + webApi.addHttpEndpoint(httpMethod, path, handler); + } + + /** + * Adds an HTTP endpoint to the web API that is only accessible for users who are authenticated + * and have any of the roles specified. 
+ * + * @param httpMethod the HTTP method of the endpoint + * @param path the path of the endpoint + * @param allowedRoles the set of allowed roles + * @param handler the handler of the endpoint + */ + protected final void addHttpEndpoint( + final HttpMethod httpMethod, + final String path, + final Set allowedRoles, + final HttpHandler handler) { + Validate.notNull(httpMethod, "The httpMethod parameter cannot be null"); + Validate.notBlank(path, "The path parameter cannot be null or blank"); + Validate.notEmpty(allowedRoles, + "The allowedRoles parameter cannot be null or empty"); + Validate.noNullElements(allowedRoles, + "The allowedRoles parameter cannot contain null elements"); + Validate.notNull(handler, "The handler parameter cannot be null"); + + webApi.addHttpEndpoint(httpMethod, path, allowedRoles, handler); + } + + /** + * Adds a WebSocket endpoint to the web API that is accessible to anyone (regardless of if they + * are authenticated or not). + * + * @param path the path of the endpoint + * @param handler the handler of the endpoint + */ + protected final void addWebSocketEndpoint(final String path, final WebSocketHandler handler) { + Validate.notBlank(path, "The path parameter cannot be null or blank"); + Validate.notNull(handler, "The handler parameter cannot be null"); + + webApi.addWebSocketEndpoint(path, handler); + } + + /** + * Adds a WebSocket endpoint to the web API that is only accessible for users who are + * authenticated and have any of the roles specified. 
+ * + * @param path the path of the endpoint + * @param allowedRoles the set of allowed roles + * @param handler the handler of the endpoint + */ + protected final void addWebSocketEndpoint( + final String path, + final Set allowedRoles, + final WebSocketHandler handler) { + Validate.notBlank(path, "The path parameter cannot be null or blank"); + Validate.notEmpty(allowedRoles, + "The allowedRoles parameter cannot be null or empty"); + Validate.noNullElements(allowedRoles, + "The allowedRoles parameter cannot contain null elements"); + Validate.notNull(handler, "The handler parameter cannot be null"); + + webApi.addWebSocketEndpoint(path, allowedRoles, handler); + } +} diff --git a/src/main/java/com/github/peterbencze/serritor/api/CrawlerWithWebApi.java b/src/main/java/com/github/peterbencze/serritor/api/CrawlerWithWebApi.java new file mode 100644 index 0000000..195c2bb --- /dev/null +++ b/src/main/java/com/github/peterbencze/serritor/api/CrawlerWithWebApi.java @@ -0,0 +1,142 @@ +/* + * Copyright 2019 Peter Bencze. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.github.peterbencze.serritor.api; + +import com.github.peterbencze.serritor.api.web.ServerConfiguration; +import com.github.peterbencze.serritor.api.web.http.HttpHandler; +import com.github.peterbencze.serritor.api.web.http.HttpMethod; +import com.github.peterbencze.serritor.api.web.socket.WebSocketHandler; +import com.github.peterbencze.serritor.internal.web.WebApi; +import java.util.Set; +import org.apache.commons.lang3.Validate; +import org.eclipse.jetty.websocket.api.Session; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * A crawler implementation with web API support. It allows users to register HTTP and WebSocket + * endpoints that can be used to interact with the crawler while it is running. + */ +public abstract class CrawlerWithWebApi extends Crawler { + + private static final Logger LOGGER = LoggerFactory.getLogger(CrawlerWithWebApi.class); + + private final WebApi webApi; + + /** + * Base constructor which sets up the web server and the crawler with the provided + * configurations. + * + * @param serverConfig the configuration of the web server + * @param crawlerConfig the configuration of the crawler + */ + protected CrawlerWithWebApi( + final ServerConfiguration serverConfig, + final CrawlerConfiguration crawlerConfig) { + this(serverConfig, new CrawlerState(crawlerConfig)); + } + + /** + * Base constructor which sets up the web server with the provided configuration and restores + * the crawler to the given state. + * + * @param serverConfig the configuration of the web server + * @param state the state to restore the crawler to + */ + protected CrawlerWithWebApi(final ServerConfiguration serverConfig, final CrawlerState state) { + super(state); + + webApi = new WebApi(serverConfig); + } + + /** + * Returns the configuration of the web server. 
+ * + * @return the configuration of the web server + */ + public final ServerConfiguration getServerConfiguration() { + return webApi.getServerConfiguration(); + } + + /** + * {@inheritDoc} + */ + @Override + protected void onStart() { + super.onStart(); + + LOGGER.info("Starting web server"); + webApi.start(); + } + + /** + * {@inheritDoc} + */ + @Override + protected void onStop() { + super.onStop(); + + LOGGER.info("Stopping web server"); + webApi.stop(); + } + + /** + * Returns a set of open WebSocket sessions that represent connections to the specific + * endpoint. + * + * @param socketHandlerClass the class of the WebSocket endpoint handler + * + * @return a set of open WebSocket sessions that represent connections to the specific endpoint + */ + protected final Set getOpenWebSocketSessions( + final Class socketHandlerClass) { + Validate.notNull(socketHandlerClass, "The socketHandlerClass parameter cannot be null"); + + return webApi.getOpenWebSocketSessions(socketHandlerClass); + } + + /** + * Adds an HTTP endpoint to the web API. + * + * @param httpMethod the HTTP method of the endpoint + * @param path the path of the endpoint + * @param handler the handler of the endpoint + */ + protected final void addHttpEndpoint( + final HttpMethod httpMethod, + final String path, + final HttpHandler handler) { + Validate.notNull(httpMethod, "The httpMethod parameter cannot be null"); + Validate.notBlank(path, "The path parameter cannot be null or blank"); + Validate.notNull(handler, "The handler parameter cannot be null"); + + webApi.addHttpEndpoint(httpMethod, path, handler); + } + + /** + * Adds a WebSocket endpoint to the web API. 
+ * + * @param path the path of the endpoint + * @param handler the handler of the endpoint + */ + protected final void addWebSocketEndpoint(final String path, final WebSocketHandler handler) { + Validate.notBlank(path, "The path parameter cannot be null or blank"); + Validate.notNull(handler, "The handler parameter cannot be null"); + + webApi.addWebSocketEndpoint(path, handler); + } +} diff --git a/src/main/java/com/github/peterbencze/serritor/api/RestfulBaseCrawler.java b/src/main/java/com/github/peterbencze/serritor/api/RestfulBaseCrawler.java deleted file mode 100644 index f29490c..0000000 --- a/src/main/java/com/github/peterbencze/serritor/api/RestfulBaseCrawler.java +++ /dev/null @@ -1,238 +0,0 @@ -/* - * Copyright 2019 Peter Bencze. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package com.github.peterbencze.serritor.api; - -import com.auth0.jwt.algorithms.Algorithm; -import com.fasterxml.jackson.databind.ObjectMapper; -import com.fasterxml.jackson.datatype.jdk8.Jdk8Module; -import com.github.peterbencze.serritor.api.web.AccessControlConfiguration; -import com.github.peterbencze.serritor.api.web.WebApiConfiguration; -import com.github.peterbencze.serritor.internal.web.ApiEndpoint; -import com.github.peterbencze.serritor.internal.web.accessmanager.JwtAccessManager; -import com.github.peterbencze.serritor.internal.web.accessmanager.NoopAccessManager; -import com.github.peterbencze.serritor.internal.web.handler.JwtHandler; -import com.github.peterbencze.serritor.internal.web.handler.LoginHandler; -import com.github.peterbencze.serritor.internal.web.handler.XsrfTokenHandler; -import io.javalin.Handler; -import io.javalin.Javalin; -import io.javalin.apibuilder.ApiBuilder; -import io.javalin.json.JavalinJackson; -import java.security.NoSuchAlgorithmException; -import java.util.Optional; -import javax.crypto.KeyGenerator; -import org.eclipse.jetty.server.Server; -import org.eclipse.jetty.server.ServerConnector; -import org.eclipse.jetty.util.ssl.SslContextFactory; -import org.eclipse.jetty.util.thread.QueuedThreadPool; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * Provides a skeletal implementation of a crawler to minimize the effort for users to implement - * their own. It also exposes a RESTful web API that can be used to interact with the crawler while - * it is running. - */ -public abstract class RestfulBaseCrawler extends BaseCrawler { - - private static final Logger LOGGER = LoggerFactory.getLogger(RestfulBaseCrawler.class); - - private final WebApiConfiguration webApiConfig; - private final Javalin webServer; - - /** - * Base constructor which sets up the crawler with the provided configuration. The web API is - * initialized with the default settings. 
- * - * @param crawlerConfig the configuration of the crawler - */ - protected RestfulBaseCrawler(final CrawlerConfiguration crawlerConfig) { - this(WebApiConfiguration.createDefault(), crawlerConfig); - } - - /** - * Base constructor which sets up the web API and the crawler with the provided configurations. - * - * @param webApiConfig the configuration of the web API - * @param crawlerConfig the configuration of the crawler - */ - protected RestfulBaseCrawler( - final WebApiConfiguration webApiConfig, - final CrawlerConfiguration crawlerConfig) { - this(webApiConfig, new CrawlerState(crawlerConfig)); - } - - /** - * Base constructor which restores the crawler to the provided state. The web API is initialized - * with the default settings. - * - * @param state the state to restore the crawler to - */ - protected RestfulBaseCrawler(final CrawlerState state) { - this(WebApiConfiguration.createDefault(), state); - } - - /** - * Base constructor which sets up the web API with the provided configuration and restores the - * crawler to the provided state. 
- * - * @param webApiConfig the configuration of the web API - * @param state the state to restore the crawler to - */ - protected RestfulBaseCrawler(final WebApiConfiguration webApiConfig, final CrawlerState state) { - super(state); - - this.webApiConfig = webApiConfig; - - webServer = Javalin.create() - .disableStartupBanner() - .server(() -> createServer(webApiConfig)) - .routes(() -> { - registerEndpoint(ApiEndpoint.STOP_CRAWLER, ctx -> stop()); - registerEndpoint(ApiEndpoint.GET_CONFIG, - ctx -> ctx.json(getCrawlerConfiguration())); - registerEndpoint(ApiEndpoint.GET_STATS, ctx -> ctx.json(getCrawlStats())); - }); - - webApiConfig.getCorsOrigins().forEach(webServer::enableCorsForOrigin); - - Optional accessControlConfigOpt - = webApiConfig.getAccessControlConfiguration(); - if (accessControlConfigOpt.isPresent()) { - AccessControlConfiguration accessControlConfig = accessControlConfigOpt.get(); - - webServer.before(new JwtHandler()); - - if (accessControlConfig.isCookieAuthenticationEnabled()) { - webServer.before(new XsrfTokenHandler()); - } - - byte[] secretKey = accessControlConfig.getSecretKey() - .orElseGet(() -> { - LOGGER.debug("Generating secret key for signer algorithm"); - - try { - return KeyGenerator.getInstance("HmacSHA256") - .generateKey() - .getEncoded(); - } catch (NoSuchAlgorithmException e) { - throw new IllegalStateException(e); - } - }); - Algorithm signerAlgorithm = Algorithm.HMAC256(secretKey); - - webServer.accessManager(new JwtAccessManager(signerAlgorithm)); - - webServer.routes(() -> registerEndpoint(ApiEndpoint.LOGIN, - new LoginHandler(accessControlConfig, signerAlgorithm))); - } else { - webServer.accessManager(new NoopAccessManager()); - } - - JavalinJackson.configure(new ObjectMapper().registerModule(new Jdk8Module())); - } - - /** - * Returns the configuration of the web API. 
- * - * @return the configuration of the web API - */ - public WebApiConfiguration getWebApiConfiguration() { - return webApiConfig; - } - - /** - * {@inheritDoc} - */ - @Override - protected void onStart() { - super.onStart(); - - LOGGER.debug("Starting web server"); - LOGGER.debug("Using configuration: {}", webApiConfig); - webServer.start(); - } - - /** - * {@inheritDoc} - */ - @Override - protected void onStop() { - super.onStop(); - - LOGGER.debug("Stopping web server"); - webServer.stop(); - } - - /** - * Creates and configures a Jetty HTTP servlet server. - * - * @param webApiConfig the configuration of the web API - * - * @return the configured Jetty HTTP servlet server - */ - private static Server createServer(final WebApiConfiguration webApiConfig) { - Server server = new Server(new QueuedThreadPool(250, 8, 60_000)); - - ServerConnector serverConnector = webApiConfig.getSslContextConfiguration() - .map(sslContextConfig -> { - SslContextFactory sslContextFactory = new SslContextFactory(); - sslContextFactory.setKeyStorePath(sslContextConfig.getKeyStorePath()); - sslContextFactory.setKeyStorePassword(sslContextConfig.getKeyStorePassword()); - sslContextConfig.getKeyManagerPassword() - .ifPresent(sslContextFactory::setKeyManagerPassword); - - return new ServerConnector(server, sslContextFactory); - }) - .orElseGet(() -> new ServerConnector(server)); - serverConnector.setPort(webApiConfig.getPort()); - - server.addConnector(serverConnector); - - return server; - } - - /** - * Adds an endpoint to the web API. 
- * - * @param apiEndpoint the endpoint - * @param handler the handler of the endpoint - */ - private static void registerEndpoint(final ApiEndpoint apiEndpoint, final Handler handler) { - switch (apiEndpoint.getHttpMethod()) { - case HEAD: - ApiBuilder.head(apiEndpoint.getPath(), handler, apiEndpoint.getUserRoles()); - break; - case GET: - ApiBuilder.get(apiEndpoint.getPath(), handler, apiEndpoint.getUserRoles()); - break; - case POST: - ApiBuilder.post(apiEndpoint.getPath(), handler, apiEndpoint.getUserRoles()); - break; - case PUT: - ApiBuilder.put(apiEndpoint.getPath(), handler, apiEndpoint.getUserRoles()); - break; - case PATCH: - ApiBuilder.patch(apiEndpoint.getPath(), handler, apiEndpoint.getUserRoles()); - break; - case DELETE: - ApiBuilder.delete(apiEndpoint.getPath(), handler, apiEndpoint.getUserRoles()); - break; - default: - throw new IllegalArgumentException("Unsupported HTTP method"); - } - } -} diff --git a/src/main/java/com/github/peterbencze/serritor/api/web/AccessControlConfiguration.java b/src/main/java/com/github/peterbencze/serritor/api/web/AccessControlConfiguration.java index 08ea4be..e28310c 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/web/AccessControlConfiguration.java +++ b/src/main/java/com/github/peterbencze/serritor/api/web/AccessControlConfiguration.java @@ -16,10 +16,10 @@ package com.github.peterbencze.serritor.api.web; +import java.time.Duration; import java.util.ArrayList; import java.util.List; import java.util.Optional; -import org.apache.commons.lang3.ArrayUtils; import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.Validate; import org.apache.commons.lang3.builder.ToStringBuilder; @@ -30,27 +30,36 @@ */ public final class AccessControlConfiguration { + private final String authenticationPath; private final List users; - private final byte[] secretKey; + private final String secretKey; private final boolean isCookieAuthenticationEnabled; + private final Duration jwtExpirationDuration; 
private AccessControlConfiguration(final AccessControlConfigurationBuilder builder) { + authenticationPath = builder.authenticationPath; users = builder.users; secretKey = builder.secretKey; isCookieAuthenticationEnabled = builder.isCookieAuthenticationEnabled; + jwtExpirationDuration = builder.jwtExpirationDuration; } /** - * Returns the user with the given username. + * Returns the authentication path. * - * @param username the username of the user + * @return the authentication path + */ + public String getAuthenticationPath() { + return authenticationPath; + } + + /** + * Returns the list of users who have access to the web API. * - * @return the user with the given username + * @return the list of users who have access to the web API */ - public Optional getUser(final String username) { - return users.stream() - .filter(user -> StringUtils.equalsIgnoreCase(user.getUsername(), username)) - .findFirst(); + public List getUsers() { + return users; } /** @@ -58,7 +67,7 @@ public Optional getUser(final String username) { * * @return the secret key to be used for the JWT signing algorithm */ - public Optional getSecretKey() { + public Optional getSecretKey() { return Optional.ofNullable(secretKey); } @@ -71,6 +80,15 @@ public boolean isCookieAuthenticationEnabled() { return isCookieAuthenticationEnabled; } + /** + * Returns the expiration duration of the JWT which is used to authenticate. + * + * @return the expiration duration of the JWT which is used to authenticate + */ + public Duration getJwtExpirationDuration() { + return jwtExpirationDuration; + } + /** * Returns a string representation of this access control configuration instance. 
* @@ -79,8 +97,10 @@ public boolean isCookieAuthenticationEnabled() { @Override public String toString() { return new ToStringBuilder(this, ToStringStyle.SHORT_PREFIX_STYLE) + .append("authenticationPath", authenticationPath) .append("users", users) .append("isCookieAuthenticationEnabled", isCookieAuthenticationEnabled) + .append("jwtExpirationDuration", jwtExpirationDuration) .toString(); } @@ -89,10 +109,15 @@ public String toString() { */ public static final class AccessControlConfigurationBuilder { + private static final String DEFAULT_AUTHENTICATION_PATH = "/api/auth"; + private static final Duration DEFAULT_JWT_EXPIRATION_DURATION = Duration.ofHours(1); + private final List users; - private byte[] secretKey; + private String authenticationPath; + private String secretKey; private boolean isCookieAuthenticationEnabled; + private Duration jwtExpirationDuration; /** * Creates a {@link AccessControlConfigurationBuilder} instance. @@ -104,6 +129,25 @@ public AccessControlConfigurationBuilder(final User rootUser) { users = new ArrayList<>(); users.add(rootUser); + + authenticationPath = DEFAULT_AUTHENTICATION_PATH; + jwtExpirationDuration = DEFAULT_JWT_EXPIRATION_DURATION; + } + + /** + * Sets the authentication path. The default path is /api/auth. 
+ * + * @param authenticationPath the authentication path + * + * @return the AccessControlConfigurationBuilder instance + */ + public AccessControlConfigurationBuilder setAuthenticationPath( + final String authenticationPath) { + Validate.notBlank(authenticationPath, + "The authenticationPath parameter cannot be null or blank"); + + this.authenticationPath = authenticationPath; + return this; } /** @@ -129,20 +173,44 @@ public AccessControlConfigurationBuilder addUser(final User newUser) { * * @return the AccessControlConfigurationBuilder instance */ - public AccessControlConfigurationBuilder setSecretKey(final Byte[] secretKey) { - Validate.notEmpty(secretKey, "The secretKey parameter cannot be empty"); - this.secretKey = ArrayUtils.toPrimitive(secretKey); + public AccessControlConfigurationBuilder setSecretKey(final String secretKey) { + Validate.notBlank(secretKey, "The secretKey parameter cannot be null or blank"); + this.secretKey = secretKey; return this; } /** * If enabled, the JWT will be stored in a cookie. * + * @param isCookieAuthenticationEnabled true enables, false + * disables cookie authentication + * + * @return the AccessControlConfigurationBuilder instance + */ + public AccessControlConfigurationBuilder setCookieAuthenticationEnabled( + final boolean isCookieAuthenticationEnabled) { + this.isCookieAuthenticationEnabled = isCookieAuthenticationEnabled; + return this; + } + + /** + * Sets the expiration duration of the JWT which is used to authenticate. 
+ * + * @param jwtExpirationDuration the expiration duration of the JWT + * * @return the AccessControlConfigurationBuilder instance */ - public AccessControlConfigurationBuilder enableCookieAuthentication() { - isCookieAuthenticationEnabled = true; + public AccessControlConfigurationBuilder setJwtExpirationDuration( + final Duration jwtExpirationDuration) { + Validate.notNull(jwtExpirationDuration, + "The jwtExpirationDuration parameter cannot be null"); + Validate.isTrue(!jwtExpirationDuration.isZero(), + "The JWT expiration duration cannot be zero"); + Validate.isTrue(!jwtExpirationDuration.isNegative(), + "The JWT expiration duration cannot be negative"); + + this.jwtExpirationDuration = jwtExpirationDuration; return this; } diff --git a/src/main/java/com/github/peterbencze/serritor/api/web/ServerConfiguration.java b/src/main/java/com/github/peterbencze/serritor/api/web/ServerConfiguration.java new file mode 100644 index 0000000..544b8f7 --- /dev/null +++ b/src/main/java/com/github/peterbencze/serritor/api/web/ServerConfiguration.java @@ -0,0 +1,273 @@ +/* + * Copyright 2019 Peter Bencze. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.github.peterbencze.serritor.api.web; + +import com.github.peterbencze.serritor.api.web.http.HttpMethod; +import com.google.common.collect.ImmutableSet; +import com.google.common.net.HttpHeaders; +import java.util.Collections; +import java.util.Optional; +import java.util.Set; +import org.apache.commons.lang3.Validate; +import org.apache.commons.lang3.builder.ToStringBuilder; +import org.apache.commons.lang3.builder.ToStringStyle; + +/** + * Configuration for the web server. + */ +public final class ServerConfiguration { + + private final int port; + private final Set corsAllowedOrigins; + private final Set corsAllowedMethods; + private final Set corsAllowedHeaders; + private final Set corsExposedHeaders; + private final SslContextConfiguration sslContextConfig; + + private ServerConfiguration(final ServerConfigurationBuilder builder) { + port = builder.port; + corsAllowedOrigins = builder.corsAllowedOrigins; + corsAllowedMethods = builder.corsAllowedMethods; + corsAllowedHeaders = builder.corsAllowedHeaders; + corsExposedHeaders = builder.corsExposedHeaders; + sslContextConfig = builder.sslContextConfig; + } + + /** + * Creates the default configuration of the web server. + * + * @return the default configuration of the web server + */ + public static ServerConfiguration createDefault() { + return new ServerConfigurationBuilder().build(); + } + + /** + * Returns the port number used by the web server. + * + * @return the port number used by the web server + */ + public int getPort() { + return port; + } + + /** + * Returns the set of origins that are allowed to access the resources. + * + * @return the set of origins that are allowed to access the resources + */ + public Set getCorsAllowedOrigins() { + return corsAllowedOrigins; + } + + /** + * Returns the set of HTTP methods that are allowed to be used when accessing the resources. 
+ * + * @return the set of HTTP methods that are allowed to be used when accessing the resources + */ + public Set getCorsAllowedMethods() { + return corsAllowedMethods; + } + + /** + * Returns the set of HTTP headers that are allowed to be specified when accessing the + * resources. + * + * @return the set of HTTP headers that are allowed to be specified when accessing the + * resources + */ + public Set getCorsAllowedHeaders() { + return corsAllowedHeaders; + } + + /** + * Returns the set of HTTP headers that are allowed to be exposed on the client. + * + * @return the set of HTTP headers that are allowed to be exposed on the client + */ + public Set getCorsExposedHeaders() { + return corsExposedHeaders; + } + + /** + * Returns the SSL context configuration. + * + * @return the SSL context configuration + */ + public Optional getSslContextConfiguration() { + return Optional.ofNullable(sslContextConfig); + } + + /** + * Returns a string representation of this web server configuration. + * + * @return a string representation of this web server configuration + */ + @Override + public String toString() { + return new ToStringBuilder(this, ToStringStyle.SHORT_PREFIX_STYLE) + .append("port", port) + .append("corsAllowedOrigins", corsAllowedOrigins) + .append("corsAllowedMethods", corsAllowedMethods) + .append("corsAllowedHeaders", corsAllowedHeaders) + .append("corsExposedHeaders", corsExposedHeaders) + .append("sslContextConfiguration", sslContextConfig) + .toString(); + } + + /** + * Builder for {@link ServerConfiguration}. 
+ */ + public static final class ServerConfigurationBuilder { + + private static final int DEFAULT_PORT = 8080; + + private static final Set DEFAULT_CORS_ALLOWED_ORIGINS = ImmutableSet.of("*"); + private static final Set DEFAULT_CORS_ALLOWED_METHODS = + ImmutableSet.of(HttpMethod.GET.toString(), HttpMethod.POST.toString(), + HttpMethod.HEAD.toString()); + private static final Set DEFAULT_CORS_ALLOWED_HEADERS = + ImmutableSet.of(HttpHeaders.X_REQUESTED_WITH, HttpHeaders.CONTENT_TYPE, + HttpHeaders.ACCEPT, HttpHeaders.ORIGIN); + private static final Set DEFAULT_CORS_EXPOSED_HEADERS = Collections.emptySet(); + + private int port; + private Set corsAllowedOrigins; + private Set corsAllowedMethods; + private Set corsAllowedHeaders; + private Set corsExposedHeaders; + private SslContextConfiguration sslContextConfig; + + /** + * Creates a {@link ServerConfigurationBuilder} instance. + */ + public ServerConfigurationBuilder() { + port = DEFAULT_PORT; + corsAllowedOrigins = DEFAULT_CORS_ALLOWED_ORIGINS; + corsAllowedMethods = DEFAULT_CORS_ALLOWED_METHODS; + corsAllowedHeaders = DEFAULT_CORS_ALLOWED_HEADERS; + corsExposedHeaders = DEFAULT_CORS_EXPOSED_HEADERS; + } + + /** + * Sets the port number to be used by the web server. + * + * @param port the port number to use + * + * @return the ServerConfigurationBuilder instance + */ + public ServerConfigurationBuilder setPort(final int port) { + this.port = port; + return this; + } + + /** + * Sets the set of origins that are allowed to access the resources. All origins (*) are + * allowed by default. 
+ * + * @param allowedOrigins the set of allowed origins + * + * @return the ServerConfigurationBuilder instance + */ + public ServerConfigurationBuilder setCorsAllowedOrigins(final Set allowedOrigins) { + Validate.notEmpty(allowedOrigins, + "The allowedOrigins parameter cannot be null or empty"); + Validate.noNullElements(allowedOrigins, + "The allowedOrigins parameter cannot contain null elements"); + + corsAllowedOrigins = allowedOrigins; + return this; + } + + /** + * Sets the set of HTTP methods that are allowed to be used when accessing the resources. + * The default methods are: GET, POST and HEAD. + * + * @param allowedMethods the set of allowed HTTP methods + * + * @return the ServerConfigurationBuilder instance + */ + public ServerConfigurationBuilder setCorsAllowedMethods(final Set allowedMethods) { + Validate.notEmpty(allowedMethods, + "The allowedMethods parameter cannot be null or empty"); + Validate.noNullElements(allowedMethods, + "The allowedMethods parameter cannot contain null elements"); + + corsAllowedMethods = allowedMethods; + return this; + } + + /** + * Sets the set of HTTP headers that are allowed to be specified when accessing the + * resources. The default headers are X-Requested-With, Content-Type, Accept and Origin. + * + * @param allowedHeaders the set of allowed HTTP headers + * + * @return the ServerConfigurationBuilder instance + */ + public ServerConfigurationBuilder setCorsAllowedHeaders(final Set allowedHeaders) { + Validate.notNull(allowedHeaders, + "The allowedHeaders parameter cannot be null or empty"); + Validate.noNullElements(allowedHeaders, + "The allowedHeaders parameter cannot contain null elements"); + + corsAllowedHeaders = allowedHeaders; + return this; + } + + /** + * Sets the set of HTTP headers that are allowed to be exposed on the client. No headers are + * exposed by default. 
+ * + * @param exposedHeaders the set of exposed HTTP headers + * + * @return the ServerConfigurationBuilder instance + */ + public ServerConfigurationBuilder setCorsExposedHeaders(final Set exposedHeaders) { + Validate.notEmpty(exposedHeaders, + "The exposedHeaders parameter cannot be null or empty"); + Validate.noNullElements(exposedHeaders, + "The exposedHeaders parameter cannot contain null elements"); + + corsExposedHeaders = exposedHeaders; + return this; + } + + /** + * Enables the use of SSL. + * + * @param sslContextConfig the SSL context configuration + * + * @return the ServerConfigurationBuilder instance + */ + public ServerConfigurationBuilder withSsl(final SslContextConfiguration sslContextConfig) { + Validate.notNull(sslContextConfig, "The sslContextConfig parameter cannot be null"); + + this.sslContextConfig = sslContextConfig; + return this; + } + + /** + * Builds the configured ServerConfiguration instance. + * + * @return the configured ServerConfiguration instance + */ + public ServerConfiguration build() { + return new ServerConfiguration(this); + } + } +} diff --git a/src/main/java/com/github/peterbencze/serritor/api/web/User.java b/src/main/java/com/github/peterbencze/serritor/api/web/User.java index f61bbc9..411cb4b 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/web/User.java +++ b/src/main/java/com/github/peterbencze/serritor/api/web/User.java @@ -16,6 +16,10 @@ package com.github.peterbencze.serritor.api.web; +import com.github.peterbencze.serritor.internal.web.http.auth.BCryptCredential; +import java.util.Collections; +import java.util.Set; +import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.Validate; import org.apache.commons.lang3.builder.ToStringBuilder; import org.apache.commons.lang3.builder.ToStringStyle; @@ -26,22 +30,41 @@ public final class User { private final String username; - private final String passwordHash; + private final String password; + private final Set roles; /** * Creates a 
{@link User} instance. * - * @param username the unique username of the user - * @param passwordHash the BCrypt hash of the user's password + * @param username the unique username of the user + * @param password the BCrypt hash of the user's password + * @param roles the roles associated with the user */ - public User(final String username, final String passwordHash) { + public User(final String username, final String password, final Set roles) { Validate.notBlank(username, "The username parameter cannot be null or blank"); - Validate.notBlank(passwordHash, "The passwordHash parameter cannot be null or blank"); - Validate.isTrue(isSupportedSaltVersion(passwordHash), - "Unsupported BCrypt salt version (only $2$ or $2a$ are supported)"); + Validate.notBlank(password, "The password parameter cannot be null or blank"); + + if (password.startsWith(BCryptCredential.PREFIX)) { + Validate.isTrue(isSupportedSaltVersion(password), + "Unsupported BCrypt salt version (only $2$ or $2a$ are supported)"); + } + + Validate.noNullElements(roles, + "The roles parameter cannot be null or contain null elements"); this.username = username; - this.passwordHash = passwordHash; + this.password = password; + this.roles = roles; + } + + /** + * Creates a {@link User} instance. + * + * @param username the unique username of the user + * @param password the BCrypt hash of the user's password + */ + public User(final String username, final String password) { + this(username, password, Collections.emptySet()); } /** @@ -58,8 +81,17 @@ public String getUsername() { * * @return the BCrypt hash of the user's password */ - public String getPasswordHash() { - return passwordHash; + public String getPassword() { + return password; + } + + /** + * Returns the roles of the user. 
+ * + * @return the roles of the user + */ + public Set getRoles() { + return roles; } /** @@ -71,17 +103,19 @@ public String getPasswordHash() { public String toString() { return new ToStringBuilder(this, ToStringStyle.SHORT_PREFIX_STYLE) .append("username", username) + .append("roles", roles) .toString(); } /** * Determines if the version of the BCrypt algorithm used to create the hash is supported. * - * @param passwordHash the BCrypt hash + * @param password the BCrypt hash * * @return true if the version is supported, false otherwise */ - private static boolean isSupportedSaltVersion(final String passwordHash) { - return passwordHash.startsWith("$2$") || passwordHash.startsWith("$2a$"); + private static boolean isSupportedSaltVersion(final String password) { + String passwordWithoutPrefix = StringUtils.removeStart(password, BCryptCredential.PREFIX); + return StringUtils.startsWithAny(passwordWithoutPrefix, "$2$", "$2a$"); } } diff --git a/src/main/java/com/github/peterbencze/serritor/api/web/WebApiConfiguration.java b/src/main/java/com/github/peterbencze/serritor/api/web/WebApiConfiguration.java deleted file mode 100644 index 92d36a1..0000000 --- a/src/main/java/com/github/peterbencze/serritor/api/web/WebApiConfiguration.java +++ /dev/null @@ -1,188 +0,0 @@ -/* - * Copyright 2019 Peter Bencze. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package com.github.peterbencze.serritor.api.web; - -import java.util.ArrayList; -import java.util.List; -import java.util.Optional; -import org.apache.commons.lang3.Validate; -import org.apache.commons.lang3.builder.ToStringBuilder; -import org.apache.commons.lang3.builder.ToStringStyle; - -/** - * Configuration for the web API. - */ -public final class WebApiConfiguration { - - private final int port; - private final List corsOrigins; - private final SslContextConfiguration sslContextConfig; - private final AccessControlConfiguration accessControlConfig; - - private WebApiConfiguration(final WebApiConfigurationBuilder builder) { - port = builder.port; - corsOrigins = builder.corsOrigins; - sslContextConfig = builder.sslContextConfig; - accessControlConfig = builder.accessControlConfig; - } - - /** - * Returns the default configuration of the web API. - * - * @return the default configuration of the web API - */ - public static WebApiConfiguration createDefault() { - return new WebApiConfigurationBuilder().build(); - } - - /** - * Returns the port number used by the web server. - * - * @return the port number used by the web server - */ - public int getPort() { - return port; - } - - /** - * Returns the list of allowed CORS origins. - * - * @return the list of allowed CORS origins - */ - public List getCorsOrigins() { - return corsOrigins; - } - - /** - * Returns the SSL context configuration. - * - * @return the SSL context configuration - */ - public Optional getSslContextConfiguration() { - return Optional.ofNullable(sslContextConfig); - } - - /** - * Returns the access control configuration. - * - * @return the access control configuration - */ - public Optional getAccessControlConfiguration() { - return Optional.ofNullable(accessControlConfig); - } - - /** - * Returns a string representation of this web API configuration. 
- * - * @return a string representation of this web API configuration - */ - @Override - public String toString() { - return new ToStringBuilder(this, ToStringStyle.SHORT_PREFIX_STYLE) - .append("port", port) - .append("corsOrigins", corsOrigins) - .append("sslContextConfiguration", sslContextConfig) - .append("accessControlConfiguration", accessControlConfig) - .toString(); - } - - /** - * Builder for {@link WebApiConfiguration}. - */ - public static final class WebApiConfigurationBuilder { - - private static final int DEFAULT_PORT = 8080; - - private final List corsOrigins; - - private int port; - private SslContextConfiguration sslContextConfig; - private AccessControlConfiguration accessControlConfig; - - /** - * Creates a {@link WebApiConfigurationBuilder} instance. - */ - public WebApiConfigurationBuilder() { - corsOrigins = new ArrayList<>(); - port = DEFAULT_PORT; - } - - /** - * Sets the port number to be used by the web server. - * - * @param port the port number to use - * - * @return the WebApiConfigurationBuilder instance - */ - public WebApiConfigurationBuilder setPort(final int port) { - this.port = port; - return this; - } - - /** - * Configures the web server to accept cross origin requests for the specific origin. The - * wildcard symbol "*" can be used to enable CORS for all origins. - * - * @param origin the origin from which the server should accept cross origin requests - * - * @return the WebApiConfigurationBuilder instance - */ - public WebApiConfigurationBuilder enableCorsForOrigin(final String origin) { - corsOrigins.add(origin); - return this; - } - - /** - * Enables the use of SSL. 
- * - * @param sslContextConfig the SSL context configuration - * - * @return the WebApiConfigurationBuilder instance - */ - public WebApiConfigurationBuilder withSsl(final SslContextConfiguration sslContextConfig) { - Validate.notNull(sslContextConfig, "The sslContextConfig parameter cannot be null"); - - this.sslContextConfig = sslContextConfig; - return this; - } - - /** - * Enables access control. - * - * @param accessControlConfig the access control configuration - * - * @return the WebApiConfigurationBuilder instance - */ - public WebApiConfigurationBuilder withAccessControl( - final AccessControlConfiguration accessControlConfig) { - Validate.notNull(accessControlConfig, - "The accessControlConfig parameter cannot be null"); - - this.accessControlConfig = accessControlConfig; - return this; - } - - /** - * Builds the configured WebApiConfiguration instance. - * - * @return the configured WebApiConfiguration instance - */ - public WebApiConfiguration build() { - return new WebApiConfiguration(this); - } - } -} diff --git a/src/main/java/com/github/peterbencze/serritor/internal/web/UserRole.java b/src/main/java/com/github/peterbencze/serritor/api/web/WebApiException.java similarity index 59% rename from src/main/java/com/github/peterbencze/serritor/internal/web/UserRole.java rename to src/main/java/com/github/peterbencze/serritor/api/web/WebApiException.java index a376204..f1ab346 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/web/UserRole.java +++ b/src/main/java/com/github/peterbencze/serritor/api/web/WebApiException.java @@ -14,15 +14,19 @@ * limitations under the License. */ -package com.github.peterbencze.serritor.internal.web; - -import io.javalin.security.Role; +package com.github.peterbencze.serritor.api.web; /** - * Represents a user's role. + * An exception that is thrown when an unexpected error occurs during the lifecycle of the web API. 
*/ -public enum UserRole implements Role { +public final class WebApiException extends RuntimeException { - UNAUTHENTICATED, - AUTHENTICATED + /** + * Creates a {@link WebApiException} instance. + * + * @param cause the cause of the exception + */ + public WebApiException(final Throwable cause) { + super(cause); + } } diff --git a/src/main/java/com/github/peterbencze/serritor/api/web/http/HttpHandler.java b/src/main/java/com/github/peterbencze/serritor/api/web/http/HttpHandler.java new file mode 100644 index 0000000..ff7556d --- /dev/null +++ b/src/main/java/com/github/peterbencze/serritor/api/web/http/HttpHandler.java @@ -0,0 +1,39 @@ +/* + * Copyright 2019 Peter Bencze. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.github.peterbencze.serritor.api.web.http; + +import java.io.IOException; +import java.io.Serializable; +import javax.servlet.http.HttpServletRequest; +import javax.servlet.http.HttpServletResponse; + +/** + * Common interface of the HTTP endpoint handlers. + */ +@FunctionalInterface +public interface HttpHandler extends Serializable { + + /** + * Handles the incoming HTTP request. 
+ * + * @param request an object to provide client request information to the handler + * @param response an object to assist the handler in sending a response to the client + * + * @throws IOException if an error occurs in the handler + */ + void handle(HttpServletRequest request, HttpServletResponse response) throws IOException; +} diff --git a/src/main/java/com/github/peterbencze/serritor/internal/web/HttpMethod.java b/src/main/java/com/github/peterbencze/serritor/api/web/http/HttpMethod.java similarity index 87% rename from src/main/java/com/github/peterbencze/serritor/internal/web/HttpMethod.java rename to src/main/java/com/github/peterbencze/serritor/api/web/http/HttpMethod.java index bffeea3..effef61 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/web/HttpMethod.java +++ b/src/main/java/com/github/peterbencze/serritor/api/web/http/HttpMethod.java @@ -14,10 +14,10 @@ * limitations under the License. */ -package com.github.peterbencze.serritor.internal.web; +package com.github.peterbencze.serritor.api.web.http; /** - * Specifies the possible HTTP methods. + * The supported HTTP methods. */ public enum HttpMethod { HEAD, @@ -26,7 +26,6 @@ public enum HttpMethod { PUT, PATCH, DELETE, - CONNECT, OPTIONS, TRACE } diff --git a/src/main/java/com/github/peterbencze/serritor/api/web/socket/WebSocketHandler.java b/src/main/java/com/github/peterbencze/serritor/api/web/socket/WebSocketHandler.java new file mode 100644 index 0000000..306f6bb --- /dev/null +++ b/src/main/java/com/github/peterbencze/serritor/api/web/socket/WebSocketHandler.java @@ -0,0 +1,87 @@ +/* + * Copyright 2019 Peter Bencze. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.github.peterbencze.serritor.api.web.socket; + +import java.io.IOException; +import org.eclipse.jetty.websocket.api.Session; + +/** + * Common interface of the WebSocket endpoint handlers. + */ +public interface WebSocketHandler { + + /** + * Called when a client connects. + * + * @param session the WebSocket session + * + * @throws IOException if an I/O error occurs in the handler + */ + default void onConnect(Session session) throws IOException { + } + + /** + * Called when a client sends a text message. + * + * @param session the WebSocket session + * @param message the message + * + * @throws IOException if an I/O error occurs in the handler + */ + default void onMessage(Session session, String message) throws IOException { + } + + /** + * Called when a client sends a binary message. + * + * @param session the WebSocket session + * @param payload the raw payload array + * @param offset the offset in the payload array where the data starts + * @param length the length of bytes in the payload + * + * @throws IOException if an I/O error occurs in the handler + */ + default void onMessage( + Session session, + byte[] payload, + int offset, + int length) throws IOException { + } + + /** + * Called when a client disconnects. 
+     *
+     * @param session    the WebSocket session
+     * @param statusCode the close status code
+     * @param reason     the optional reason for the close
+     *
+     * @throws IOException if an I/O error occurs in the handler
+     */
+    default void onClose(Session session, int statusCode, String reason) throws IOException {
+    }
+
+    /**
+     * Called when a WebSocket error occurs.
+     *
+     * @param session the WebSocket session
+     * @param cause   the cause of the error
+     *
+     * @throws IOException if an I/O error occurs in the handler
+     */
+    default void onError(Session session, Throwable cause) throws IOException {
+    }
+}
diff --git a/src/main/java/com/github/peterbencze/serritor/internal/util/KeyFactory.java b/src/main/java/com/github/peterbencze/serritor/internal/util/KeyFactory.java
new file mode 100644
index 0000000..c62a2c6
--- /dev/null
+++ b/src/main/java/com/github/peterbencze/serritor/internal/util/KeyFactory.java
@@ -0,0 +1,65 @@
+/*
+ * Copyright 2019 Peter Bencze.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.github.peterbencze.serritor.internal.util;
+
+import java.security.NoSuchAlgorithmException;
+import java.security.SecureRandom;
+import java.util.Base64;
+import javax.crypto.KeyGenerator;
+
+/**
+ * A helper class used for creating secure random keys.
+ */
+public final class KeyFactory {
+
+    /**
+     * Private constructor to hide the implicit public one.
+     */
+    private KeyFactory() {
+    }
+
+    /**
+     * Creates a key using the specified algorithm.
+ * + * @param algorithm the algorithm to use for key generation + * + * @return the generated key + */ + public static String createKey(final String algorithm) { + try { + return new String(KeyGenerator.getInstance(algorithm).generateKey().getEncoded()); + } catch (NoSuchAlgorithmException e) { + throw new IllegalStateException(e); + } + } + + /** + * Creates a secure random key. + * + * @param length the key size in bytes + * + * @return the generated key + */ + public static String createKey(final int length) { + SecureRandom secureRandom = new SecureRandom(); + byte[] randomBytes = new byte[length]; + + secureRandom.nextBytes(randomBytes); + + return Base64.getUrlEncoder().withoutPadding().encodeToString(randomBytes); + } +} diff --git a/src/main/java/com/github/peterbencze/serritor/internal/web/ApiEndpoint.java b/src/main/java/com/github/peterbencze/serritor/internal/web/ApiEndpoint.java deleted file mode 100644 index dd0d5e4..0000000 --- a/src/main/java/com/github/peterbencze/serritor/internal/web/ApiEndpoint.java +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Copyright 2019 Peter Bencze. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.github.peterbencze.serritor.internal.web; - -import io.javalin.security.Role; -import java.util.Collections; -import java.util.Set; - -/** - * Represents an endpoint of the web API. 
- */ -public enum ApiEndpoint { - - LOGIN(HttpMethod.POST, "api/auth", Collections.singleton(UserRole.UNAUTHENTICATED)), - STOP_CRAWLER(HttpMethod.DELETE, "api/crawler", Collections.singleton(UserRole.AUTHENTICATED)), - GET_CONFIG(HttpMethod.GET, "api/crawler/config", Collections.singleton(UserRole.AUTHENTICATED)), - GET_STATS(HttpMethod.GET, "api/crawler/stats", Collections.singleton(UserRole.AUTHENTICATED)); - - private final HttpMethod httpMethod; - private final String path; - private final Set userRoles; - - ApiEndpoint(final HttpMethod httpMethod, final String path, final Set userRoles) { - this.httpMethod = httpMethod; - this.path = path; - this.userRoles = userRoles; - } - - /** - * Returns the HTTP method associated with the endpoint. - * - * @return the HTTP method associated with the endpoint - */ - public HttpMethod getHttpMethod() { - return httpMethod; - } - - /** - * Returns the path of the endpoint. - * - * @return the path of the endpoint - */ - public String getPath() { - return path; - } - - /** - * Returns the user roles associated with the endpoint. - * - * @return the user roles associated with the endpoint - */ - public Set getUserRoles() { - return userRoles; - } -} diff --git a/src/main/java/com/github/peterbencze/serritor/internal/web/JsonErrorHandler.java b/src/main/java/com/github/peterbencze/serritor/internal/web/JsonErrorHandler.java new file mode 100644 index 0000000..e0d925d --- /dev/null +++ b/src/main/java/com/github/peterbencze/serritor/internal/web/JsonErrorHandler.java @@ -0,0 +1,76 @@ +/* + * Copyright 2019 Peter Bencze. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.github.peterbencze.serritor.internal.web; + +import com.fasterxml.jackson.databind.ObjectMapper; +import com.github.peterbencze.serritor.internal.web.http.dto.ErrorDto; +import java.io.IOException; +import java.io.Writer; +import java.util.Optional; +import javax.servlet.http.HttpServletRequest; +import javax.servlet.http.HttpServletResponse; +import org.eclipse.jetty.http.HttpStatus; +import org.eclipse.jetty.http.MimeTypes.Type; +import org.eclipse.jetty.server.Request; +import org.eclipse.jetty.server.handler.ErrorHandler; + +/** + * An error handler that formats the response as JSON. 
+ */ +public final class JsonErrorHandler extends ErrorHandler { + + /** + * {@inheritDoc} + */ + @Override + protected void generateAcceptableResponse( + final Request baseRequest, + final HttpServletRequest request, + final HttpServletResponse response, + final int code, + final String message, + final String mimeType) throws IOException { + baseRequest.setHandled(true); + + String errorMessage = Optional.ofNullable(message) + // Ignore Jetty's default error message when the user is unauthorized + .filter(msg -> !("!role".equals(msg) && code == HttpStatus.FORBIDDEN_403)) + .orElseGet(() -> HttpStatus.getMessage(code)); + + Writer writer = getAcceptableWriter(baseRequest, request, response); + if (writer != null) { + response.setContentType(Type.APPLICATION_JSON.asString()); + handleErrorPage(request, writer, code, errorMessage); + } + } + + /** + * {@inheritDoc} + */ + @Override + protected void writeErrorPage( + final HttpServletRequest request, + final Writer writer, + final int code, + final String message, + final boolean showStacks) throws IOException { + ObjectMapper mapper = new ObjectMapper(); + ErrorDto errorDto = new ErrorDto(code, message); + + mapper.writeValue(writer, errorDto); + } +} diff --git a/src/main/java/com/github/peterbencze/serritor/internal/web/SecuredWebApi.java b/src/main/java/com/github/peterbencze/serritor/internal/web/SecuredWebApi.java new file mode 100644 index 0000000..9b29b3e --- /dev/null +++ b/src/main/java/com/github/peterbencze/serritor/internal/web/SecuredWebApi.java @@ -0,0 +1,211 @@ +/* + * Copyright 2019 Peter Bencze. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.github.peterbencze.serritor.internal.web; + +import com.github.peterbencze.serritor.api.web.AccessControlConfiguration; +import com.github.peterbencze.serritor.api.web.ServerConfiguration; +import com.github.peterbencze.serritor.api.web.User; +import com.github.peterbencze.serritor.api.web.http.HttpHandler; +import com.github.peterbencze.serritor.api.web.http.HttpMethod; +import com.github.peterbencze.serritor.api.web.socket.WebSocketHandler; +import com.github.peterbencze.serritor.internal.web.http.CsrfFilter; +import com.github.peterbencze.serritor.internal.web.http.auth.JwtAuthenticator; +import java.util.Collections; +import java.util.EnumSet; +import java.util.List; +import java.util.Set; +import javax.servlet.DispatcherType; +import org.eclipse.jetty.security.Authenticator; +import org.eclipse.jetty.security.ConstraintMapping; +import org.eclipse.jetty.security.ConstraintSecurityHandler; +import org.eclipse.jetty.security.HashLoginService; +import org.eclipse.jetty.security.LoginService; +import org.eclipse.jetty.security.UserStore; +import org.eclipse.jetty.util.security.Constraint; +import org.eclipse.jetty.util.security.Credential; + +/** + * A secured web API implementation that allows users to register HTTP and WebSocket endpoints that + * can be used to interact with the crawler while it is running. Users are required to authenticate + * before they can access restricted endpoints (if they are authorized to do so). 
+ */ +public final class SecuredWebApi extends WebApi { + + private final AccessControlConfiguration accessControlConfig; + private final ConstraintSecurityHandler securityHandler; + + /** + * Creates a {@link SecuredWebApi} instance. + * + * @param serverConfig the configuration of the web server + * @param accessControlConfig the access control configuration + */ + public SecuredWebApi( + final ServerConfiguration serverConfig, + final AccessControlConfiguration accessControlConfig) { + super(serverConfig); + + this.accessControlConfig = accessControlConfig; + + Authenticator jwtAuthenticator = new JwtAuthenticator(accessControlConfig); + + LoginService loginService = createLoginService(accessControlConfig.getUsers()); + getServer().addBean(loginService); + + securityHandler = createSecurityHandler(jwtAuthenticator, loginService); + securityHandler.setHandler(getContextHandler()); + getServer().setHandler(securityHandler); + + getContextHandler().addFilter(CsrfFilter.class, "/*", EnumSet.of(DispatcherType.REQUEST)); + } + + /** + * Returns the access control configuration. + * + * @return the access control configuration + */ + public AccessControlConfiguration getAccessControlConfiguration() { + return accessControlConfig; + } + + /** + * Adds an HTTP endpoint to the web API that is accessible to anyone (regardless of if they are + * authenticated or not). + * + * @param httpMethod the HTTP method of the endpoint + * @param path the path of the endpoint + * @param handler the handler of the endpoint + */ + @Override + public void addHttpEndpoint( + final HttpMethod httpMethod, + final String path, + final HttpHandler handler) { + addHttpEndpoint(httpMethod, path, Collections.singleton(Constraint.ANY_AUTH), handler); + } + + /** + * Adds an HTTP endpoint to the web API that is only accessible for users who are authenticated + * and have any of the roles specified. 
+ * + * @param httpMethod the HTTP method of the endpoint + * @param path the path of the endpoint + * @param allowedRoles the set of allowed roles + * @param handler the handler of the endpoint + */ + public void addHttpEndpoint( + final HttpMethod httpMethod, + final String path, + final Set allowedRoles, + final HttpHandler handler) { + super.addHttpEndpoint(httpMethod, path, handler); + + securityHandler.addConstraintMapping(createConstraintMapping(path, allowedRoles)); + } + + /** + * Adds a WebSocket endpoint to the web API that is accessible to anyone (regardless of if they + * are authenticated or not). + * + * @param path the path of the endpoint + * @param handler the handler of the endpoint + */ + @Override + public void addWebSocketEndpoint(final String path, final WebSocketHandler handler) { + addWebSocketEndpoint(path, Collections.singleton(Constraint.ANY_AUTH), handler); + } + + /** + * Adds a WebSocket endpoint to the web API that is only accessible for users who are + * authenticated and have any of the roles specified. + * + * @param path the path of the endpoint + * @param allowedRoles the set of allowed roles + * @param handler the handler of the endpoint + */ + public void addWebSocketEndpoint( + final String path, + final Set allowedRoles, + final WebSocketHandler handler) { + super.addWebSocketEndpoint(path, handler); + + securityHandler.addConstraintMapping(createConstraintMapping(path, allowedRoles)); + } + + /** + * Creates and configures a login service. 
+ * + * @param users the list of users + * + * @return the configured login service + */ + private static LoginService createLoginService(List users) { + UserStore userStore = new UserStore(); + users.forEach(user -> { + Credential userCredential = Credential.getCredential(user.getPassword()); + String[] userRoles = user.getRoles().toArray(new String[0]); + + userStore.addUser(user.getUsername(), userCredential, userRoles); + }); + + HashLoginService loginService = new HashLoginService(); + loginService.setUserStore(userStore); + + return loginService; + } + + /** + * Creates a handler that enforces security constraints. + * + * @param authenticator the authenticator + * @param loginService the login service + * + * @return the configured security handler + */ + private static ConstraintSecurityHandler createSecurityHandler( + final Authenticator authenticator, + final LoginService loginService) { + ConstraintSecurityHandler handler = new ConstraintSecurityHandler(); + handler.setAuthenticator(authenticator); + handler.setLoginService(loginService); + + return handler; + } + + /** + * Creates a constraint mapping. 
+ * + * @param path the path + * @param roles the allowed roles + * + * @return the constraint mapping + */ + private static ConstraintMapping createConstraintMapping( + final String path, + final Set roles) { + Constraint constraint = new Constraint(); + constraint.setName("auth"); + constraint.setAuthenticate(true); + constraint.setRoles(roles.toArray(new String[0])); + + ConstraintMapping mapping = new ConstraintMapping(); + mapping.setPathSpec(path); + mapping.setConstraint(constraint); + + return mapping; + } +} diff --git a/src/main/java/com/github/peterbencze/serritor/internal/web/WebApi.java b/src/main/java/com/github/peterbencze/serritor/internal/web/WebApi.java new file mode 100644 index 0000000..78de5e2 --- /dev/null +++ b/src/main/java/com/github/peterbencze/serritor/internal/web/WebApi.java @@ -0,0 +1,217 @@ +/* + * Copyright 2019 Peter Bencze. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.github.peterbencze.serritor.internal.web; + +import com.github.peterbencze.serritor.api.web.ServerConfiguration; +import com.github.peterbencze.serritor.api.web.WebApiException; +import com.github.peterbencze.serritor.api.web.http.HttpHandler; +import com.github.peterbencze.serritor.api.web.http.HttpMethod; +import com.github.peterbencze.serritor.api.web.socket.WebSocketHandler; +import com.github.peterbencze.serritor.internal.web.http.HttpServlet; +import com.github.peterbencze.serritor.internal.web.socket.WebSocketFactory; +import com.github.peterbencze.serritor.internal.web.socket.WebSocketHandlerWrapper; +import com.github.peterbencze.serritor.internal.web.socket.WebSocketSessionManager; +import java.util.EnumSet; +import java.util.Set; +import javax.servlet.DispatcherType; +import javax.servlet.ServletException; +import org.eclipse.jetty.server.Server; +import org.eclipse.jetty.server.ServerConnector; +import org.eclipse.jetty.servlet.FilterHolder; +import org.eclipse.jetty.servlet.ServletContextHandler; +import org.eclipse.jetty.servlet.ServletHolder; +import org.eclipse.jetty.servlets.CrossOriginFilter; +import org.eclipse.jetty.util.ssl.SslContextFactory; +import org.eclipse.jetty.util.thread.QueuedThreadPool; +import org.eclipse.jetty.websocket.api.Session; +import org.eclipse.jetty.websocket.server.WebSocketUpgradeFilter; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * A web API implementation that allows users to register HTTP and WebSocket endpoints that can be + * used to interact with the crawler while it is running. 
+ */ +public class WebApi { + + private static final Logger LOGGER = LoggerFactory.getLogger(WebApi.class); + + private final ServerConfiguration serverConfig; + private final Server server; + private final ServletContextHandler contextHandler; + private final WebSocketUpgradeFilter wsUpgradeFilter; + private final WebSocketSessionManager webSocketSessionManager; + + /** + * Creates a {@link WebApi} instance. + * + * @param serverConfig the configuration of the web server + */ + public WebApi(final ServerConfiguration serverConfig) { + this.serverConfig = serverConfig; + + contextHandler = new ServletContextHandler(); + contextHandler.setContextPath("/"); + + FilterHolder crossOriginFilter = + contextHandler.addFilter(CrossOriginFilter.class, "/*", + EnumSet.of(DispatcherType.REQUEST)); + crossOriginFilter.setInitParameter(CrossOriginFilter.ALLOWED_ORIGINS_PARAM, + String.join(",", serverConfig.getCorsAllowedOrigins())); + crossOriginFilter.setInitParameter(CrossOriginFilter.ALLOWED_METHODS_PARAM, + String.join(",", serverConfig.getCorsAllowedMethods())); + crossOriginFilter.setInitParameter(CrossOriginFilter.ALLOWED_HEADERS_PARAM, + String.join(",", serverConfig.getCorsAllowedMethods())); + crossOriginFilter.setInitParameter(CrossOriginFilter.EXPOSED_HEADERS_PARAM, + String.join(",", serverConfig.getCorsExposedHeaders())); + + try { + wsUpgradeFilter = WebSocketUpgradeFilter.configureContext(contextHandler); + } catch (ServletException e) { + LOGGER.error("Error while adding WebSocket upgrade filter", e); + throw new WebApiException(e); + } + + webSocketSessionManager = new WebSocketSessionManager(); + + server = createServer(serverConfig); + server.setHandler(contextHandler); + server.setErrorHandler(new JsonErrorHandler()); + } + + /** + * Returns the configuration of the web server. + * + * @return the configuration of the web server + */ + public ServerConfiguration getServerConfiguration() { + return serverConfig; + } + + /** + * Starts the web server. 
+ */ + public void start() { + try { + server.start(); + } catch (Exception e) { + LOGGER.error("Error while running web server", e); + throw new WebApiException(e); + } + } + + /** + * Stops the web server. + */ + public void stop() { + try { + server.stop(); + } catch (Exception e) { + LOGGER.error("Error while stopping web server", e); + throw new WebApiException(e); + } + } + + /** + * Returns a set of open WebSocket sessions that represent connections to the specific + * endpoint. + * + * @param socketHandlerClass the class of the WebSocket endpoint handler + * + * @return a set of open WebSocket sessions that represent connections to the specific endpoint + */ + public Set getOpenWebSocketSessions( + final Class socketHandlerClass) { + return webSocketSessionManager.getOpenSessions(socketHandlerClass); + } + + /** + * Adds an HTTP endpoint to the web API. + * + * @param httpMethod the HTTP method of the endpoint + * @param path the path of the endpoint + * @param handler the handler of the endpoint + */ + public void addHttpEndpoint( + final HttpMethod httpMethod, + final String path, + final HttpHandler handler) { + HttpServlet servlet = new HttpServlet(httpMethod, handler); + contextHandler.addServlet(new ServletHolder(servlet), path); + } + + /** + * Adds a WebSocket endpoint to the web API. + * + * @param path the path of the endpoint + * @param handler the handler of the endpoint + */ + public void addWebSocketEndpoint(final String path, final WebSocketHandler handler) { + WebSocketHandlerWrapper handlerWrapper = + new WebSocketHandlerWrapper(webSocketSessionManager, handler); + WebSocketFactory socketFactory = + new WebSocketFactory(serverConfig.getCorsAllowedOrigins(), handlerWrapper); + + wsUpgradeFilter.addMapping(path, socketFactory); + } + + /** + * Returns the Jetty HTTP servlet server. + * + * @return the Jetty HTTP servlet server + */ + protected Server getServer() { + return server; + } + + /** + * Returns the servlet context. 
+ * + * @return the servlet context + */ + protected ServletContextHandler getContextHandler() { + return contextHandler; + } + + /** + * Creates and configures a Jetty HTTP servlet server. + * + * @param serverConfig the configuration of the web server + * + * @return the configured Jetty HTTP servlet server + */ + private static Server createServer(final ServerConfiguration serverConfig) { + Server server = new Server(new QueuedThreadPool(250, 8, 60_000)); + + ServerConnector serverConnector = serverConfig.getSslContextConfiguration() + .map(sslContextConfig -> { + SslContextFactory sslContextFactory = new SslContextFactory.Server(); + sslContextFactory.setKeyStorePath(sslContextConfig.getKeyStorePath()); + sslContextFactory.setKeyStorePassword(sslContextConfig.getKeyStorePassword()); + sslContextConfig.getKeyManagerPassword() + .ifPresent(sslContextFactory::setKeyManagerPassword); + + return new ServerConnector(server, sslContextFactory); + }) + .orElseGet(() -> new ServerConnector(server)); + serverConnector.setPort(serverConfig.getPort()); + + server.addConnector(serverConnector); + + return server; + } +} diff --git a/src/main/java/com/github/peterbencze/serritor/internal/web/accessmanager/JwtAccessManager.java b/src/main/java/com/github/peterbencze/serritor/internal/web/accessmanager/JwtAccessManager.java deleted file mode 100644 index 9a02784..0000000 --- a/src/main/java/com/github/peterbencze/serritor/internal/web/accessmanager/JwtAccessManager.java +++ /dev/null @@ -1,90 +0,0 @@ -/* - * Copyright 2019 Peter Bencze. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.github.peterbencze.serritor.internal.web.accessmanager; - -import com.auth0.jwt.JWT; -import com.auth0.jwt.algorithms.Algorithm; -import com.auth0.jwt.exceptions.JWTVerificationException; -import com.auth0.jwt.interfaces.JWTVerifier; -import com.github.peterbencze.serritor.internal.web.UserRole; -import com.github.peterbencze.serritor.internal.web.handler.JwtHandler; -import io.javalin.Context; -import io.javalin.Handler; -import io.javalin.UnauthorizedResponse; -import io.javalin.security.AccessManager; -import io.javalin.security.Role; -import java.util.Set; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * A JWT-based access manager. - */ -public final class JwtAccessManager implements AccessManager { - - private static final Logger LOGGER = LoggerFactory.getLogger(JwtAccessManager.class); - - private final Algorithm signerAlgorithm; - - /** - * Creates a {@link JwtAccessManager} instance. - * - * @param signerAlgorithm the algorithm used for signing JWTs - */ - public JwtAccessManager(final Algorithm signerAlgorithm) { - this.signerAlgorithm = signerAlgorithm; - } - - /** - * Checks if the user is allowed to access the specific endpoint. 
- * - * @param handler the request handler - * @param ctx the context object - * @param permittedRoles the set of permitted roles - */ - @Override - public void manage( - final Handler handler, - final Context ctx, - final Set permittedRoles) throws Exception { - LOGGER.debug("Incoming request from {} to path {}", ctx.ip(), ctx.path()); - - if (!permittedRoles.contains(UserRole.UNAUTHENTICATED)) { - LOGGER.debug("Checking JWT"); - - String jwt = ctx.attribute(JwtHandler.CONTEXT_ATTRIBUTE_NAME); - if (jwt == null) { - LOGGER.debug("Returning unauthorized response: no JWT present in context"); - - throw new UnauthorizedResponse(); - } - - JWTVerifier verifier = JWT.require(signerAlgorithm).build(); - try { - verifier.verify(jwt); - LOGGER.debug("JWT verified"); - } catch (JWTVerificationException e) { - LOGGER.debug("Returning unauthorized response: JWT verification failed"); - - throw new UnauthorizedResponse(); - } - } - - LOGGER.debug("Letting request through"); - handler.handle(ctx); - } -} diff --git a/src/main/java/com/github/peterbencze/serritor/internal/web/accessmanager/NoopAccessManager.java b/src/main/java/com/github/peterbencze/serritor/internal/web/accessmanager/NoopAccessManager.java deleted file mode 100644 index e3d8012..0000000 --- a/src/main/java/com/github/peterbencze/serritor/internal/web/accessmanager/NoopAccessManager.java +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Copyright 2019 Peter Bencze. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package com.github.peterbencze.serritor.internal.web.accessmanager; - -import io.javalin.Context; -import io.javalin.Handler; -import io.javalin.security.AccessManager; -import io.javalin.security.Role; -import java.util.Set; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * A no-operation access manager that is used when access control is disabled. - */ -public final class NoopAccessManager implements AccessManager { - - private static final Logger LOGGER = LoggerFactory.getLogger(NoopAccessManager.class); - - /** - * Simply lets the request pass through without credential checking. - * - * @param handler the request handler - * @param ctx the context object - * @param permittedRoles a set of permitted roles - */ - @Override - public void manage( - final Handler handler, - final Context ctx, - final Set permittedRoles) throws Exception { - LOGGER.debug("Incoming request from {} to path {}", ctx.ip(), ctx.path()); - LOGGER.debug("Letting request through"); - handler.handle(ctx); - } -} diff --git a/src/main/java/com/github/peterbencze/serritor/internal/web/handler/JwtHandler.java b/src/main/java/com/github/peterbencze/serritor/internal/web/handler/JwtHandler.java deleted file mode 100644 index 5d1c3a3..0000000 --- a/src/main/java/com/github/peterbencze/serritor/internal/web/handler/JwtHandler.java +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Copyright 2019 Peter Bencze. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package com.github.peterbencze.serritor.internal.web.handler; - -import io.javalin.Context; -import io.javalin.Handler; -import java.util.Optional; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * A before-handler which extracts the JWT from the Authorization header or the cookie. - */ -public final class JwtHandler implements Handler { - - private static final Logger LOGGER = LoggerFactory.getLogger(JwtHandler.class); - - public static final String CONTEXT_ATTRIBUTE_NAME = "JWT"; - static final String COOKIE_NAME = "JWT"; - - /** - * Extracts JWT from the Authorization header or the cookie. - * - * @param ctx the context object - */ - @Override - public void handle(final Context ctx) { - Optional jwtFromHeaderOpt = extractJwtFromHeader(ctx); - if (jwtFromHeaderOpt.isPresent()) { - LOGGER.debug("JWT found in headers"); - - ctx.attribute(CONTEXT_ATTRIBUTE_NAME, jwtFromHeaderOpt.get()); - } else { - extractJwtFromCookie(ctx).ifPresent(jwt -> { - LOGGER.debug("JWT found in cookies"); - - ctx.attribute(CONTEXT_ATTRIBUTE_NAME, jwt); - }); - } - } - - /** - * Returns the JWT from the Authorization header. - * - * @param ctx the context object - * - * @return the JWT from the Authorization header - */ - private static Optional extractJwtFromHeader(final Context ctx) { - return Optional.ofNullable(ctx.header("Authorization")) - .flatMap(header -> { - String[] headerValueParts = header.split(" "); - if (headerValueParts.length != 2 || !"Bearer".equals(headerValueParts[0])) { - return Optional.empty(); - } - - return Optional.of(headerValueParts[1]); - }); - } - - /** - * Returns the JWT from the cookie. 
- * - * @param ctx the context object - * - * @return the JWT from the cookie - */ - private static Optional extractJwtFromCookie(final Context ctx) { - return Optional.ofNullable(ctx.cookie(COOKIE_NAME)); - } -} diff --git a/src/main/java/com/github/peterbencze/serritor/internal/web/handler/LoginHandler.java b/src/main/java/com/github/peterbencze/serritor/internal/web/handler/LoginHandler.java deleted file mode 100644 index dbbe8cb..0000000 --- a/src/main/java/com/github/peterbencze/serritor/internal/web/handler/LoginHandler.java +++ /dev/null @@ -1,115 +0,0 @@ -/* - * Copyright 2019 Peter Bencze. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.github.peterbencze.serritor.internal.web.handler; - -import com.auth0.jwt.JWT; -import com.auth0.jwt.algorithms.Algorithm; -import com.github.peterbencze.serritor.api.web.AccessControlConfiguration; -import com.github.peterbencze.serritor.api.web.User; -import com.github.peterbencze.serritor.internal.web.dto.JwtDto; -import com.github.peterbencze.serritor.internal.web.dto.LoginDto; -import io.javalin.Context; -import io.javalin.Handler; -import io.javalin.UnauthorizedResponse; -import java.security.SecureRandom; -import java.time.Duration; -import java.time.Instant; -import java.util.Base64; -import java.util.Date; -import org.mindrot.jbcrypt.BCrypt; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * A handler that is used to verify the authentication credentials of the user. 
- */ -public final class LoginHandler implements Handler { - - private static final Logger LOGGER = LoggerFactory.getLogger(LoginHandler.class); - - private final AccessControlConfiguration accessControlConfig; - private final Algorithm signerAlgorithm; - - /** - * Creates a {@link LoginHandler} instance. - * - * @param accessControlConfig the access control configuration - * @param signerAlgorithm the algorithm used for signing JWTs - */ - public LoginHandler( - final AccessControlConfiguration accessControlConfig, - final Algorithm signerAlgorithm) { - this.accessControlConfig = accessControlConfig; - this.signerAlgorithm = signerAlgorithm; - } - - /** - * Verifies the authentication credentials of the user. - * - * @param ctx the context object - */ - @Override - public void handle(final Context ctx) { - LoginDto loginDto = ctx.bodyAsClass(LoginDto.class); - - String username = loginDto.getUsername(); - User user = accessControlConfig.getUser(username) - .orElseThrow(() -> { // type inference bug, see JDK-8047338 - LOGGER.debug("Failed login for user {}: user does not exist", username); - - throw new UnauthorizedResponse(); - }); - - if (!BCrypt.checkpw(loginDto.getPassword(), user.getPasswordHash())) { - LOGGER.debug("Failed login for user {}: incorrect password", username); - - throw new UnauthorizedResponse(); - } - - Duration tokenValidDuration = Duration.ofHours(1); - Date expiryDate = Date.from(Instant.now().plus(tokenValidDuration)); - String jwt = JWT.create() - .withExpiresAt(expiryDate) - .withClaim("username", username) - .sign(signerAlgorithm); - - if (accessControlConfig.isCookieAuthenticationEnabled()) { - int cookieAgeInSeconds = Math.toIntExact(tokenValidDuration.getSeconds()); - - ctx.cookie(JwtHandler.COOKIE_NAME, jwt, cookieAgeInSeconds); - ctx.cookie(XsrfTokenHandler.COOKIE_NAME, generateXsrfToken(), cookieAgeInSeconds); - } else { - ctx.json(new JwtDto(username, expiryDate, jwt)); - } - - LOGGER.debug("User {} logged in", username); - } - - 
/** - * Generates a random 128-bit XSRF token. - * - * @return the generated XSRF token - */ - private static String generateXsrfToken() { - SecureRandom secureRandom = new SecureRandom(); - byte[] randomBytes = new byte[16]; - - secureRandom.nextBytes(randomBytes); - - return Base64.getUrlEncoder().withoutPadding().encodeToString(randomBytes); - } -} diff --git a/src/main/java/com/github/peterbencze/serritor/internal/web/handler/XsrfTokenHandler.java b/src/main/java/com/github/peterbencze/serritor/internal/web/handler/XsrfTokenHandler.java deleted file mode 100644 index 2af937c..0000000 --- a/src/main/java/com/github/peterbencze/serritor/internal/web/handler/XsrfTokenHandler.java +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Copyright 2019 Peter Bencze. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.github.peterbencze.serritor.internal.web.handler; - -import com.github.peterbencze.serritor.internal.web.HttpMethod; -import io.javalin.Context; -import io.javalin.Handler; -import io.javalin.UnauthorizedResponse; -import java.util.Arrays; -import java.util.List; -import java.util.Optional; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * A before-handler that is responsible for the validation of the XSRF token header if an XSRF - * cookie is present in the request. 
- */ -public final class XsrfTokenHandler implements Handler { - - private static final Logger LOGGER = LoggerFactory.getLogger(XsrfTokenHandler.class); - - static final String COOKIE_NAME = "XSRF-TOKEN"; - static final String HEADER_NAME = "X-XSRF-TOKEN"; - - private static final List XSRF_SAFE_HTTP_METHODS - = Arrays.asList(HttpMethod.HEAD, HttpMethod.GET, HttpMethod.OPTIONS, HttpMethod.TRACE); - - /** - * Verifies that the XSRF token present in the cookie matches the one present in the header. - * - * @param ctx the context object - */ - @Override - public void handle(final Context ctx) { - HttpMethod requestMethod = HttpMethod.valueOf(ctx.method()); - if (XSRF_SAFE_HTTP_METHODS.contains(requestMethod)) { - LOGGER.debug("The request method is safe, not checking XSRF token"); - return; - } - - LOGGER.debug("Checking XSRF token"); - Optional.ofNullable(ctx.cookie(COOKIE_NAME)).ifPresent(xsrfTokenInCookie -> { - if (!xsrfTokenInCookie.equals(ctx.header(HEADER_NAME))) { - LOGGER.debug("Returning unauthorized response: XSRF token missing or incorrect"); - - throw new UnauthorizedResponse("XSRF token missing or incorrect"); - } - }); - } -} diff --git a/src/main/java/com/github/peterbencze/serritor/internal/web/http/CsrfFilter.java b/src/main/java/com/github/peterbencze/serritor/internal/web/http/CsrfFilter.java new file mode 100644 index 0000000..900a1db --- /dev/null +++ b/src/main/java/com/github/peterbencze/serritor/internal/web/http/CsrfFilter.java @@ -0,0 +1,114 @@ +/* + * Copyright 2019 Peter Bencze. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.github.peterbencze.serritor.internal.web.http; + +import com.github.peterbencze.serritor.api.web.http.HttpMethod; +import java.io.IOException; +import java.util.Arrays; +import java.util.List; +import java.util.Optional; +import java.util.stream.Stream; +import javax.servlet.Filter; +import javax.servlet.FilterChain; +import javax.servlet.FilterConfig; +import javax.servlet.ServletException; +import javax.servlet.ServletRequest; +import javax.servlet.ServletResponse; +import javax.servlet.http.Cookie; +import javax.servlet.http.HttpServletRequest; +import javax.servlet.http.HttpServletResponse; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * A filter that is used to prevent CSRF. If a cookie with the name of XSRF-TOKEN is present in the + * request, it will look for a header with the name of X-XSRF-TOKEN and check if these values match + * each other. 
+ */ +public final class CsrfFilter implements Filter { + + private static final Logger LOGGER = LoggerFactory.getLogger(CsrfFilter.class); + + public static final String CSRF_COOKIE_NAME = "XSRF-TOKEN"; + public static final String CSRF_HEADER_NAME = "X-XSRF-TOKEN"; + + private static final List SAFE_HTTP_METHODS = + Arrays.asList(HttpMethod.HEAD, HttpMethod.GET, HttpMethod.OPTIONS, HttpMethod.TRACE); + + /** + * {@inheritDoc} + */ + @Override + public void init(FilterConfig filterConfig) { + // Do nothing + } + + /** + * {@inheritDoc} + */ + @Override + public void doFilter( + final ServletRequest servletRequest, + final ServletResponse servletResponse, + final FilterChain chain) throws IOException, ServletException { + HttpServletRequest httpServletRequest = (HttpServletRequest) servletRequest; + HttpServletResponse httpServletResponse = (HttpServletResponse) servletResponse; + + if (!SAFE_HTTP_METHODS.contains(HttpMethod.valueOf(httpServletRequest.getMethod()))) { + Optional tokenInCookieOpt = extractTokenFromCookie(httpServletRequest); + if (tokenInCookieOpt.isPresent()) { + LOGGER.info("CSRF cookie is present in request, checking token in header"); + + // Cannot use ifPresent because sendError throws IOException + String tokenInCookie = tokenInCookieOpt.get(); + + if (!tokenInCookie.equals(httpServletRequest.getHeader(CSRF_HEADER_NAME))) { + LOGGER.info("Missing or incorrect CSRF token"); + httpServletResponse.sendError(HttpServletResponse.SC_UNAUTHORIZED, + "Missing or incorrect CSRF token"); + return; + } + } + } + + chain.doFilter(httpServletRequest, httpServletResponse); + } + + /** + * {@inheritDoc} + */ + @Override + public void destroy() { + // Do nothing + } + + /** + * Extracts the CSRF token from the CSRF cookie. 
+ * + * @param request the HTTP request + * + * @return the CSRF token in the CSRF cookie + */ + private static Optional extractTokenFromCookie(final HttpServletRequest request) { + return Optional.ofNullable(request.getCookies()) + .map(Stream::of) + .orElseGet(Stream::empty) + .filter(cookie -> CSRF_COOKIE_NAME.equals(cookie.getName())) + .map(Cookie::getValue) + .findFirst(); + } +} diff --git a/src/main/java/com/github/peterbencze/serritor/internal/web/http/HttpServlet.java b/src/main/java/com/github/peterbencze/serritor/internal/web/http/HttpServlet.java new file mode 100644 index 0000000..73503f2 --- /dev/null +++ b/src/main/java/com/github/peterbencze/serritor/internal/web/http/HttpServlet.java @@ -0,0 +1,67 @@ +/* + * Copyright 2019 Peter Bencze. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.github.peterbencze.serritor.internal.web.http; + +import com.github.peterbencze.serritor.api.web.http.HttpHandler; +import com.github.peterbencze.serritor.api.web.http.HttpMethod; +import java.io.IOException; +import javax.servlet.GenericServlet; +import javax.servlet.ServletException; +import javax.servlet.ServletRequest; +import javax.servlet.ServletResponse; +import javax.servlet.http.HttpServletRequest; +import javax.servlet.http.HttpServletResponse; + +public final class HttpServlet extends GenericServlet { + + private final HttpMethod httpMethod; + private final HttpHandler handler; + + public HttpServlet(final HttpMethod httpMethod, final HttpHandler handler) { + this.httpMethod = httpMethod; + this.handler = handler; + } + + @Override + public void service(final ServletRequest servletRequest, final ServletResponse servletResponse) + throws ServletException, IOException { + if (!(servletRequest instanceof HttpServletRequest + && servletResponse instanceof HttpServletResponse)) { + throw new ServletException("Non-HTTP request or response"); + } + + HttpServletRequest httpServletRequest = (HttpServletRequest) servletRequest; + HttpServletResponse httpServletResponse = (HttpServletResponse) servletResponse; + + service(httpServletRequest, httpServletResponse); + } + + private void service(final HttpServletRequest request, final HttpServletResponse response) + throws IOException { + if (httpMethod.toString().equals(request.getMethod())) { + handler.handle(request, response); + } else { + String protocol = request.getProtocol(); + + if (protocol.endsWith("1.1")) { + response.sendError(HttpServletResponse.SC_METHOD_NOT_ALLOWED); + } else { + response.sendError(HttpServletResponse.SC_BAD_REQUEST); + } + } + } +} diff --git a/src/main/java/com/github/peterbencze/serritor/internal/web/http/auth/BCryptCredential.java b/src/main/java/com/github/peterbencze/serritor/internal/web/http/auth/BCryptCredential.java new file mode 100644 index 
0000000..7723048 --- /dev/null +++ b/src/main/java/com/github/peterbencze/serritor/internal/web/http/auth/BCryptCredential.java @@ -0,0 +1,48 @@ +/* + * Copyright 2019 Peter Bencze. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.github.peterbencze.serritor.internal.web.http.auth; + +import org.apache.commons.lang3.StringUtils; +import org.eclipse.jetty.util.security.Credential; +import org.mindrot.jbcrypt.BCrypt; + +/** + * A BCrypt credential implementation. + */ +public final class BCryptCredential extends Credential { + + public static final String PREFIX = "BCRYPT:"; + + private final String passwordHash; + + /** + * Creates a {@link BCryptCredential} instance. 
+ * + * @param credential the BCrypt hash of the user's password (with the prefix) + */ + public BCryptCredential(final String credential) { + this.passwordHash = StringUtils.removeStart(credential, PREFIX); + } + + /** + * {@inheritDoc} + */ + @Override + public boolean check(final Object credentials) { + return BCrypt.checkpw(credentials.toString(), passwordHash); + } +} diff --git a/src/main/java/com/github/peterbencze/serritor/internal/web/http/auth/BCryptCredentialProvider.java b/src/main/java/com/github/peterbencze/serritor/internal/web/http/auth/BCryptCredentialProvider.java new file mode 100644 index 0000000..02d9efd --- /dev/null +++ b/src/main/java/com/github/peterbencze/serritor/internal/web/http/auth/BCryptCredentialProvider.java @@ -0,0 +1,42 @@ +/* + * Copyright 2019 Peter Bencze. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.github.peterbencze.serritor.internal.web.http.auth; + +import org.eclipse.jetty.util.security.Credential; +import org.eclipse.jetty.util.security.CredentialProvider; + +/** + * Provider of BCrypt credentials. 
+ */ +public final class BCryptCredentialProvider implements CredentialProvider { + + /** + * {@inheritDoc} + */ + @Override + public Credential getCredential(final String credential) { + return new BCryptCredential(credential); + } + + /** + * {@inheritDoc} + */ + @Override + public String getPrefix() { + return BCryptCredential.PREFIX; + } +} diff --git a/src/main/java/com/github/peterbencze/serritor/internal/web/http/auth/JwtAuthenticator.java b/src/main/java/com/github/peterbencze/serritor/internal/web/http/auth/JwtAuthenticator.java new file mode 100644 index 0000000..ba808fa --- /dev/null +++ b/src/main/java/com/github/peterbencze/serritor/internal/web/http/auth/JwtAuthenticator.java @@ -0,0 +1,361 @@ +/* + * Copyright 2019 Peter Bencze. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.github.peterbencze.serritor.internal.web.http.auth; + +import com.auth0.jwt.JWT; +import com.auth0.jwt.JWTVerifier; +import com.auth0.jwt.algorithms.Algorithm; +import com.auth0.jwt.exceptions.JWTVerificationException; +import com.auth0.jwt.interfaces.DecodedJWT; +import com.fasterxml.jackson.core.JsonParseException; +import com.fasterxml.jackson.databind.JsonMappingException; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.github.peterbencze.serritor.api.web.AccessControlConfiguration; +import com.github.peterbencze.serritor.internal.util.KeyFactory; +import com.github.peterbencze.serritor.internal.web.http.CsrfFilter; +import com.github.peterbencze.serritor.internal.web.http.dto.JwtDto; +import com.github.peterbencze.serritor.internal.web.http.dto.LoginDto; +import java.io.IOException; +import java.time.Instant; +import java.time.temporal.ChronoUnit; +import java.util.Date; +import java.util.HashSet; +import java.util.Optional; +import java.util.Set; +import java.util.regex.Pattern; +import java.util.stream.Collectors; +import java.util.stream.Stream; +import javax.security.auth.Subject; +import javax.servlet.ServletRequest; +import javax.servlet.ServletResponse; +import javax.servlet.http.Cookie; +import javax.servlet.http.HttpServletRequest; +import javax.servlet.http.HttpServletResponse; +import org.eclipse.jetty.http.HttpHeader; +import org.eclipse.jetty.http.MimeTypes.Type; +import org.eclipse.jetty.security.AbstractLoginService.RolePrincipal; +import org.eclipse.jetty.security.Authenticator; +import org.eclipse.jetty.security.LoginService; +import org.eclipse.jetty.security.ServerAuthException; +import org.eclipse.jetty.security.UserAuthentication; +import org.eclipse.jetty.server.Authentication; +import org.eclipse.jetty.server.Authentication.User; +import org.eclipse.jetty.server.UserIdentity; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * An authentication implementation that authenticate users 
by JWTs. + */ +public final class JwtAuthenticator implements Authenticator { + + public static final String AUTH_COOKIE_NAME = "JWT"; + public static final int CSRF_TOKEN_BYTE_SIZE = 16; // 128-bit token + + private static final Logger LOGGER = LoggerFactory.getLogger(JwtAuthenticator.class); + + private static final String AUTH_METHOD = "JWT"; + + private static final String KEY_GENERATOR_ALGORITHM = "HmacSHA256"; + + private static final String AUTH_QUERY_PARAM_NAME = "access_token"; + + private static final String AUTH_HEADER_VALUE_PREFIX = "Bearer "; + + private final AccessControlConfiguration accessControlConfig; + private final Pattern authPathPattern; + private final Algorithm signerAlgorithm; + + private LoginService loginService; + + /** + * Creates a {@link JwtAuthenticator} instance. + * + * @param accessControlConfig the access control configuration + */ + public JwtAuthenticator(final AccessControlConfiguration accessControlConfig) { + this.accessControlConfig = accessControlConfig; + + String regex = String.format("^%s[/?#]?.*$", accessControlConfig.getAuthenticationPath()); + this.authPathPattern = Pattern.compile(regex); + + String secretKey = accessControlConfig.getSecretKey() + .orElseGet(() -> { + LOGGER.debug("Generating secret key for JWT signer algorithm"); + + return KeyFactory.createKey(KEY_GENERATOR_ALGORITHM); + }); + signerAlgorithm = Algorithm.HMAC256(secretKey); + } + + /** + * {@inheritDoc} + */ + @Override + public void setConfiguration(final AuthConfiguration authConfig) { + loginService = Optional.ofNullable(authConfig.getLoginService()) + .orElseThrow(() -> new IllegalStateException("Login service is not set")); + } + + /** + * {@inheritDoc} + */ + @Override + public String getAuthMethod() { + return AUTH_METHOD; + } + + /** + * {@inheritDoc} + */ + @Override + public void prepareRequest(final ServletRequest request) { + // Nothing to do + } + + /** + * {@inheritDoc} + */ + @Override + public Authentication validateRequest( + 
final ServletRequest servletRequest, + final ServletResponse servletResponse, + final boolean mandatory) throws ServerAuthException { + HttpServletRequest httpServletRequest = (HttpServletRequest) servletRequest; + HttpServletResponse httpServletResponse = (HttpServletResponse) servletResponse; + + LOGGER.info("Incoming request from IP {} to access path {}", + httpServletRequest.getRemoteAddr(), httpServletRequest.getRequestURI()); + + try { + if (isAuthenticationRequest(httpServletRequest.getRequestURI())) { + return authenticateWithCredentials(httpServletRequest, httpServletResponse); + } + + Optional jwtOpt = extractJwtFromRequest(httpServletRequest); + if (jwtOpt.isPresent()) { + try { + return authenticateWithJwt(jwtOpt.get()); + } catch (JWTVerificationException e) { + LOGGER.info("Failed authentication: JWT verification error"); + + httpServletResponse.sendError(HttpServletResponse.SC_UNAUTHORIZED); + return Authentication.SEND_FAILURE; + } + } else { + LOGGER.info("Failed authentication: no JWT found in request"); + + httpServletResponse.sendError(HttpServletResponse.SC_UNAUTHORIZED); + return Authentication.SEND_FAILURE; + } + } catch (IOException e) { + LOGGER.error("Error occurred during authentication: {}", e.getMessage()); + + throw new ServerAuthException(e); + } + } + + /** + * {@inheritDoc} + */ + @Override + public boolean secureResponse( + final ServletRequest request, + final ServletResponse response, + final boolean mandatory, + final User validatedUser) { + return true; + } + + /** + * Authenticates the user by credentials (username and password). 
+ * + * @param request the request + * @param response the response + * + * @return an authentication + * @throws IOException if an I/O error occurs during the authentication + */ + private Authentication authenticateWithCredentials( + final HttpServletRequest request, + final HttpServletResponse response) throws IOException { + ObjectMapper mapper = new ObjectMapper(); + + try { + LoginDto loginDto = mapper.readValue(request.getInputStream(), LoginDto.class); + + String username = loginDto.getUsername(); + + UserIdentity userIdentity = loginService.login(username, loginDto.getPassword(), + request); + if (userIdentity == null) { + LOGGER.info("Failed authentication for user {}: wrong credentials", username); + + response.sendError(HttpServletResponse.SC_UNAUTHORIZED); + return Authentication.SEND_FAILURE; + } + + LOGGER.info("User {} successfully authenticated (with credentials)", username); + + Set userRoles = userIdentity.getSubject() + .getPrincipals(RolePrincipal.class) + .stream() + .map(RolePrincipal::getName) + .collect(Collectors.toSet()); + + Instant expiryInstant = Instant.now() + .plus(accessControlConfig.getJwtExpirationDuration()) + .truncatedTo(ChronoUnit.SECONDS); + Date expiryDate = Date.from(expiryInstant); + + String jwt = JWT.create() + .withExpiresAt(expiryDate) + .withClaim("name", username) + .withArrayClaim("roles", userRoles.toArray(new String[0])) + .sign(signerAlgorithm); + + if (accessControlConfig.isCookieAuthenticationEnabled()) { + int maxAgeInSeconds = + Math.toIntExact(accessControlConfig.getJwtExpirationDuration() + .getSeconds()); + + Cookie authCookie = new Cookie(AUTH_COOKIE_NAME, jwt); + authCookie.setPath("/"); + authCookie.setMaxAge(maxAgeInSeconds); + response.addCookie(authCookie); + + String csrfToken = KeyFactory.createKey(CSRF_TOKEN_BYTE_SIZE); + Cookie csrfCookie = new Cookie(CsrfFilter.CSRF_COOKIE_NAME, csrfToken); + csrfCookie.setPath("/"); + csrfCookie.setMaxAge(maxAgeInSeconds); + response.addCookie(csrfCookie); + } 
+ + response.setContentType(Type.APPLICATION_JSON.asString()); + + JwtDto jwtDto = new JwtDto(username, userRoles, expiryDate, jwt); + mapper.writeValue(response.getOutputStream(), jwtDto); + + return new UserAuthentication(getAuthMethod(), userIdentity); + } catch (JsonParseException | JsonMappingException e) { + response.sendError(HttpServletResponse.SC_BAD_REQUEST); + return Authentication.SEND_FAILURE; + } + } + + /** + * Authenticates the user by JWT. + * + * @param jwt the JWT to authenticate with + * + * @return an authentication + */ + private Authentication authenticateWithJwt(final String jwt) { + JWTVerifier verifier = JWT.require(signerAlgorithm).build(); + DecodedJWT decodedJwt = verifier.verify(jwt); + + String username = decodedJwt.getClaim("name").asString(); + + LOGGER.info("User {} successfully authenticated (with JWT)", username); + + JwtUserPrincipal principal = new JwtUserPrincipal(username); + + Subject subject = new Subject(); + subject.getPrincipals().add(principal); + + Set roles = new HashSet<>(decodedJwt.getClaim("roles").asList(String.class)); + + JwtUserIdentity userIdentity = new JwtUserIdentity(subject, principal, roles); + + return new UserAuthentication(getAuthMethod(), userIdentity); + } + + /** + * Indicates if the request is an an authentication request. + * + * @param requestUri the request URL + * + * @return true if the request is authentication request, false + * otherwise + */ + private boolean isAuthenticationRequest(final String requestUri) { + return authPathPattern.matcher(requestUri).matches(); + } + + /** + * Extracts the JWT from the request, if present. If the request is a WebSocket upgrade request, + * it extracts the token from the query parameters. Otherwise, it looks for the token in the + * Authorization header and if not found it also checks the cookies. 
+ * + * @param request the request + * + * @return the JWT from the request + */ + private static Optional extractJwtFromRequest(final HttpServletRequest request) { + Optional jwtOpt; + if (isWebSocketUpgradeRequest(request)) { + jwtOpt = Optional.ofNullable(request.getParameter(AUTH_QUERY_PARAM_NAME)); + } else { + jwtOpt = extractJwtFromHeader(request); + } + + return jwtOpt.map(Optional::of).orElseGet(() -> extractJwtFromCookie(request)); + } + + /** + * Indicates if the request is a WebSocket upgrade request. + * + * @param request the request + * + * @return true if the request is a WebSocket upgrade request, + * false otherwise + */ + private static boolean isWebSocketUpgradeRequest(final HttpServletRequest request) { + return Optional.ofNullable(request.getHeader(HttpHeader.UPGRADE.toString())) + .filter("websocket"::equals) + .isPresent(); + } + + /** + * Extracts the JWT from the Authorization header, if present. + * + * @param request the request + * + * @return the JWT from the Authorization header + */ + private static Optional extractJwtFromHeader(final HttpServletRequest request) { + return Optional.ofNullable(request.getHeader(HttpHeader.AUTHORIZATION.toString())) + .filter(headerValue -> headerValue.startsWith(AUTH_HEADER_VALUE_PREFIX)) + .map(headerValue -> headerValue.substring(AUTH_HEADER_VALUE_PREFIX.length())); + } + + /** + * Extracts the JWT from the cookies, if present. 
+ * + * @param request the request + * + * @return the JWT from the cookies + */ + private static Optional extractJwtFromCookie(final HttpServletRequest request) { + return Optional.ofNullable(request.getCookies()) + .map(Stream::of) + .orElseGet(Stream::empty) + .filter(cookie -> AUTH_COOKIE_NAME.equals(cookie.getName())) + .map(Cookie::getValue) + .findFirst(); + } +} diff --git a/src/main/java/com/github/peterbencze/serritor/internal/web/http/auth/JwtUserIdentity.java b/src/main/java/com/github/peterbencze/serritor/internal/web/http/auth/JwtUserIdentity.java new file mode 100644 index 0000000..5b5d554 --- /dev/null +++ b/src/main/java/com/github/peterbencze/serritor/internal/web/http/auth/JwtUserIdentity.java @@ -0,0 +1,72 @@ +/* + * Copyright 2019 Peter Bencze. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.github.peterbencze.serritor.internal.web.http.auth; + +import java.security.Principal; +import java.util.Set; +import javax.security.auth.Subject; +import org.eclipse.jetty.server.UserIdentity; + +/** + * A user object that encapsulates user identity. + */ +public final class JwtUserIdentity implements UserIdentity { + + private final Subject subject; + private final Principal userPrincipal; + private final Set roles; + + /** + * Creates a {@link JwtUserIdentity} instance. 
+ * + * @param subject the user subject + * @param userPrincipal the user principal + * @param roles the roles of the user + */ + public JwtUserIdentity( + final Subject subject, + final Principal userPrincipal, + final Set roles) { + this.subject = subject; + this.userPrincipal = userPrincipal; + this.roles = roles; + } + + /** + * {@inheritDoc} + */ + @Override + public Subject getSubject() { + return subject; + } + + /** + * {@inheritDoc} + */ + @Override + public Principal getUserPrincipal() { + return userPrincipal; + } + + /** + * {@inheritDoc} + */ + @Override + public boolean isUserInRole(final String role, final Scope scope) { + return roles.contains(role); + } +} diff --git a/src/main/java/com/github/peterbencze/serritor/internal/web/http/auth/JwtUserPrincipal.java b/src/main/java/com/github/peterbencze/serritor/internal/web/http/auth/JwtUserPrincipal.java new file mode 100644 index 0000000..3dce954 --- /dev/null +++ b/src/main/java/com/github/peterbencze/serritor/internal/web/http/auth/JwtUserPrincipal.java @@ -0,0 +1,44 @@ +/* + * Copyright 2019 Peter Bencze. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.github.peterbencze.serritor.internal.web.http.auth; + +import java.security.Principal; + +/** + * Represents a user principal. + */ +public final class JwtUserPrincipal implements Principal { + + private final String name; + + /** + * Creates a {@link JwtUserPrincipal}. 
+ * + * @param name the name of the user + */ + public JwtUserPrincipal(final String name) { + this.name = name; + } + + /** + * {@inheritDoc} + */ + @Override + public String getName() { + return name; + } +} diff --git a/src/main/java/com/github/peterbencze/serritor/internal/web/http/dto/ErrorDto.java b/src/main/java/com/github/peterbencze/serritor/internal/web/http/dto/ErrorDto.java new file mode 100644 index 0000000..166c1d8 --- /dev/null +++ b/src/main/java/com/github/peterbencze/serritor/internal/web/http/dto/ErrorDto.java @@ -0,0 +1,59 @@ +/* + * Copyright 2019 Peter Bencze. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.github.peterbencze.serritor.internal.web.http.dto; + +import com.fasterxml.jackson.annotation.JsonProperty; + +/** + * A DTO that is used by the web server to transfer error details to the client. + */ +public final class ErrorDto { + + private final int code; + private final String message; + + /** + * Creates a {@link ErrorDto} instance. + * + * @param code the HTTP status code + * @param message the HTTP status message + */ + public ErrorDto( + @JsonProperty(value = "code", required = true) final int code, + @JsonProperty(value = "message", required = true) final String message) { + this.code = code; + this.message = message; + } + + /** + * Returns the HTTP status code. + * + * @return the HTTP status code + */ + public int getCode() { + return code; + } + + /** + * Returns the HTTP status message. 
+ * + * @return the HTTP status message + */ + public String getMessage() { + return message; + } +} diff --git a/src/main/java/com/github/peterbencze/serritor/internal/web/dto/JwtDto.java b/src/main/java/com/github/peterbencze/serritor/internal/web/http/dto/JwtDto.java similarity index 65% rename from src/main/java/com/github/peterbencze/serritor/internal/web/dto/JwtDto.java rename to src/main/java/com/github/peterbencze/serritor/internal/web/http/dto/JwtDto.java index 63aa09e..d2cb921 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/web/dto/JwtDto.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/web/http/dto/JwtDto.java @@ -14,16 +14,19 @@ * limitations under the License. */ -package com.github.peterbencze.serritor.internal.web.dto; +package com.github.peterbencze.serritor.internal.web.http.dto; +import com.fasterxml.jackson.annotation.JsonProperty; import java.util.Date; +import java.util.Set; /** - * A DTO that is used to send back the generated JWT to the user. + * A DTO that is used to transfer the JWT between the client and the web server. */ public final class JwtDto { private final String username; + private final Set roles; private final Date expiryDate; private final String jwt; @@ -31,11 +34,17 @@ public final class JwtDto { * Creates a {@link JwtDto} instance. 
* * @param username the username of the authenticated user + * @param roles the roles of the user * @param expiryDate the expiry date of the JWT * @param jwt the generated JWT */ - public JwtDto(final String username, final Date expiryDate, final String jwt) { + public JwtDto( + @JsonProperty(value = "username", required = true) final String username, + @JsonProperty(value = "roles", required = true) final Set roles, + @JsonProperty(value = "expiryDate", required = true) final Date expiryDate, + @JsonProperty(value = "jwt", required = true) final String jwt) { this.username = username; + this.roles = roles; this.expiryDate = expiryDate; this.jwt = jwt; } @@ -49,6 +58,15 @@ public String getUsername() { return username; } + /** + * Returns the roles of the user. + * + * @return the roles of the user + */ + public Set getRoles() { + return roles; + } + /** * Returns the expiry date of the JWT. * diff --git a/src/main/java/com/github/peterbencze/serritor/internal/web/dto/LoginDto.java b/src/main/java/com/github/peterbencze/serritor/internal/web/http/dto/LoginDto.java similarity index 63% rename from src/main/java/com/github/peterbencze/serritor/internal/web/dto/LoginDto.java rename to src/main/java/com/github/peterbencze/serritor/internal/web/http/dto/LoginDto.java index e13cfa2..4183bd7 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/web/dto/LoginDto.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/web/http/dto/LoginDto.java @@ -14,12 +14,12 @@ * limitations under the License. */ -package com.github.peterbencze.serritor.internal.web.dto; +package com.github.peterbencze.serritor.internal.web.http.dto; import com.fasterxml.jackson.annotation.JsonProperty; /** - * A DTO that is used to send the user authentication credentials to the web server. + * A DTO that is used by the client to transfer user authentication credentials to the web server. 
*/ public final class LoginDto { @@ -29,29 +29,29 @@ public final class LoginDto { /** * Creates a {@link LoginDto} instance. * - * @param username the username of the user - * @param password the password of the user + * @param username the username provided by the user + * @param password the password provided by the user */ public LoginDto( - @JsonProperty("username") final String username, - @JsonProperty("password") final String password) { + @JsonProperty(value = "username", required = true) final String username, + @JsonProperty(value = "password", required = true) final String password) { this.username = username; this.password = password; } /** - * Returns the username of the user. + * Returns the username provided by the user. * - * @return the username of the user + * @return the username provided by the user */ public String getUsername() { return username; } /** - * Returns the password of the user. + * Returns the password provided by the user. * - * @return the password of the user + * @return the password provided by the user */ public String getPassword() { return password; diff --git a/src/main/java/com/github/peterbencze/serritor/internal/web/socket/WebSocketFactory.java b/src/main/java/com/github/peterbencze/serritor/internal/web/socket/WebSocketFactory.java new file mode 100644 index 0000000..b565b36 --- /dev/null +++ b/src/main/java/com/github/peterbencze/serritor/internal/web/socket/WebSocketFactory.java @@ -0,0 +1,97 @@ +/* + * Copyright 2019 Peter Bencze. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.github.peterbencze.serritor.internal.web.socket; + +import java.io.IOException; +import java.util.Set; +import javax.servlet.http.HttpServletResponse; +import org.eclipse.jetty.http.HttpHeader; +import org.eclipse.jetty.http.HttpStatus; +import org.eclipse.jetty.websocket.servlet.ServletUpgradeRequest; +import org.eclipse.jetty.websocket.servlet.ServletUpgradeResponse; +import org.eclipse.jetty.websocket.servlet.WebSocketCreator; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * A factory that is used to create WebSockets. + */ +public final class WebSocketFactory implements WebSocketCreator { + + private static final Logger LOGGER = LoggerFactory.getLogger(WebSocketFactory.class); + + private final Set allowedOrigins; + private final WebSocketHandlerWrapper handlerWrapper; + + /** + * Creates a {@link WebSocketFactory} instance. + * + * @param allowedOrigins the set of allowed origins + * @param handlerWrapper the WebSocket handler wrapper + */ + public WebSocketFactory( + final Set allowedOrigins, + final WebSocketHandlerWrapper handlerWrapper) { + this.allowedOrigins = allowedOrigins; + this.handlerWrapper = handlerWrapper; + } + + /** + * {@inheritDoc} + */ + @Override + public Object createWebSocket( + final ServletUpgradeRequest request, + final ServletUpgradeResponse response) { + String origin = request.getHeader(HttpHeader.ORIGIN.toString()); + if (origin != null) { + LOGGER.info("WebSocket upgrade request from origin {}", origin); + + if (isAllowedOrigin(origin)) { + LOGGER.info("Origin is allowed"); + return handlerWrapper; + } + + LOGGER.info("Origin is not allowed"); + + try { + response.sendError(HttpServletResponse.SC_UNAUTHORIZED, + HttpStatus.getMessage(HttpServletResponse.SC_UNAUTHORIZED)); + } catch (IOException e) { + LOGGER.error("Error while sending response", e); + } + + return null; + } + + 
LOGGER.info("No Origin header present in request"); + return null; + } + + /** + * Indicates whether the specific origin is allowed to access the resource or not. + * + * @param origin the request origin + * + * @return true if the origin is allowed, false otherwise + */ + private boolean isAllowedOrigin(final String origin) { + return allowedOrigins.stream() + .anyMatch(allowedOrigin -> + "*".equals(allowedOrigin) || origin.startsWith(allowedOrigin)); + } +} diff --git a/src/main/java/com/github/peterbencze/serritor/internal/web/socket/WebSocketHandlerWrapper.java b/src/main/java/com/github/peterbencze/serritor/internal/web/socket/WebSocketHandlerWrapper.java new file mode 100644 index 0000000..0e5bce3 --- /dev/null +++ b/src/main/java/com/github/peterbencze/serritor/internal/web/socket/WebSocketHandlerWrapper.java @@ -0,0 +1,128 @@ +/* + * Copyright 2019 Peter Bencze. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.github.peterbencze.serritor.internal.web.socket; + +import com.github.peterbencze.serritor.api.web.socket.WebSocketHandler; +import java.io.IOException; +import org.eclipse.jetty.websocket.api.Session; +import org.eclipse.jetty.websocket.api.annotations.OnWebSocketClose; +import org.eclipse.jetty.websocket.api.annotations.OnWebSocketConnect; +import org.eclipse.jetty.websocket.api.annotations.OnWebSocketError; +import org.eclipse.jetty.websocket.api.annotations.OnWebSocketMessage; +import org.eclipse.jetty.websocket.api.annotations.WebSocket; + +/** + * A WebSocket handler wrapper that notifies the WebSocket session manager when a client connects or + * disconnects and also delegates the handling of the event to the user defined handler. + */ +@WebSocket +public final class WebSocketHandlerWrapper { + + private final WebSocketSessionManager sessionManager; + private final WebSocketHandler delegateHandler; + + /** + * Creates a {@link WebSocketHandlerWrapper} instance. + * + * @param sessionManager the session manager which tracks the currently open WebSocket + * sessions + * @param delegateHandler the user defined handler which actually handles the event + */ + public WebSocketHandlerWrapper( + final WebSocketSessionManager sessionManager, + final WebSocketHandler delegateHandler) { + this.sessionManager = sessionManager; + this.delegateHandler = delegateHandler; + } + + /** + * Called when a client connects. + * + * @param session the WebSocket session + * + * @throws IOException if an I/O error occurs in the handler + */ + @OnWebSocketConnect + public void onWebSocketConnect(final Session session) throws IOException { + sessionManager.addSession(delegateHandler.getClass(), session); + delegateHandler.onConnect(session); + } + + /** + * Called when a client sends a text message. 
+ * + * @param session the WebSocket session + * @param message the message + * + * @throws IOException if an I/O error occurs in the handler + */ + @OnWebSocketMessage + public void onWebSocketText(final Session session, final String message) throws IOException { + delegateHandler.onMessage(session, message); + } + + /** + * Called when a client sends a binary message. + * + * @param session the WebSocket session + * @param payload the raw payload array + * @param offset the offset in the payload array where the data starts + * @param length the length of bytes in the payload + * + * @throws IOException if an I/O error occurs in the handler + */ + @OnWebSocketMessage + public void onWebSocketBinary( + final Session session, + final byte[] payload, + final int offset, + final int length) throws IOException { + delegateHandler.onMessage(session, payload, offset, length); + } + + /** + * Called when a client disconnects. + * + * @param session the WebSocket session + * @param statusCode the close status code + * @param reason the optional reason for the close + * + * @throws IOException if an I/O error occurs in the handler + */ + @OnWebSocketClose + public void onWebSocketClose( + final Session session, + final int statusCode, + final String reason) throws IOException { + sessionManager.removeSession(delegateHandler.getClass(), session); + delegateHandler.onClose(session, statusCode, reason); + } + + /** + * Called when a WebSocket error occurs. 
+ * + * @param session the WebSocket session + * @param cause the cause of the error + * + * @throws IOException if an I/O error occurs in the handler + */ + @OnWebSocketError + public void onWebSocketError(final Session session, final Throwable cause) throws IOException { + sessionManager.removeSession(delegateHandler.getClass(), session); + delegateHandler.onError(session, cause); + } +} diff --git a/src/main/java/com/github/peterbencze/serritor/internal/web/socket/WebSocketSessionManager.java b/src/main/java/com/github/peterbencze/serritor/internal/web/socket/WebSocketSessionManager.java new file mode 100644 index 0000000..d59d60a --- /dev/null +++ b/src/main/java/com/github/peterbencze/serritor/internal/web/socket/WebSocketSessionManager.java @@ -0,0 +1,84 @@ +/* + * Copyright 2019 Peter Bencze. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.github.peterbencze.serritor.internal.web.socket; + +import com.github.peterbencze.serritor.api.web.socket.WebSocketHandler; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.ConcurrentHashMap; +import java.util.stream.Collectors; +import org.eclipse.jetty.websocket.api.Session; + +/** + * A WebSocket session manager that tracks open sessions. + */ +public final class WebSocketSessionManager { + + private final Map, Set> sessionsBySocket; + + /** + * Creates a {@link WebSocketSessionManager} instance. 
+ */ + public WebSocketSessionManager() { + sessionsBySocket = new ConcurrentHashMap<>(); + } + + /** + * Returns a set of open WebSocket sessions that represent connections to the specific + * endpoint. + * + * @param socketHandlerClass the class of the WebSocket endpoint handler + * + * @return a set of open WebSocket sessions that represent connections to the specific endpoint + */ + public Set getOpenSessions( + final Class socketHandlerClass) { + return sessionsBySocket.getOrDefault(socketHandlerClass, ConcurrentHashMap.newKeySet()) + .stream() + .filter(Session::isOpen) + .collect(Collectors.toSet()); + } + + /** + * Adds a WebSocket session to the set of open sessions. This method is called when a client + * connects to a WebSocket endpoint. + * + * @param socketHandlerClass the class of the WebSocket endpoint handler + * @param session the open WebSocket session + */ + public void addSession( + final Class socketHandlerClass, + final Session session) { + sessionsBySocket.computeIfAbsent(socketHandlerClass, key -> ConcurrentHashMap.newKeySet()) + .add(session); + } + + /** + * Removes a WebSocket session from the set of open sessions. This method is called when a + * client disconnects from a WebSocket endpoint. 
+ * + * @param socketHandlerClass the class of the WebSocket endpoint handler + * @param session the no longer open WebSocket session + */ + public void removeSession( + final Class socketHandlerClass, + final Session session) { + if (sessionsBySocket.containsKey(socketHandlerClass)) { + sessionsBySocket.get(socketHandlerClass).remove(session); + } + } +} diff --git a/src/main/resources/META-INF/services/org.eclipse.jetty.util.security.CredentialProvider b/src/main/resources/META-INF/services/org.eclipse.jetty.util.security.CredentialProvider new file mode 100644 index 0000000..15c1e8e --- /dev/null +++ b/src/main/resources/META-INF/services/org.eclipse.jetty.util.security.CredentialProvider @@ -0,0 +1 @@ +com.github.peterbencze.serritor.internal.web.http.auth.BCryptCredentialProvider diff --git a/src/test/java/com/github/peterbencze/serritor/internal/web/accessmanager/JwtAccessManagerTest.java b/src/test/java/com/github/peterbencze/serritor/internal/web/accessmanager/JwtAccessManagerTest.java deleted file mode 100644 index 70c8048..0000000 --- a/src/test/java/com/github/peterbencze/serritor/internal/web/accessmanager/JwtAccessManagerTest.java +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Copyright 2019 Peter Bencze. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package com.github.peterbencze.serritor.internal.web.accessmanager; - -import com.auth0.jwt.algorithms.Algorithm; -import com.auth0.jwt.exceptions.SignatureVerificationException; -import com.auth0.jwt.interfaces.DecodedJWT; -import com.github.peterbencze.serritor.internal.web.UserRole; -import com.github.peterbencze.serritor.internal.web.handler.JwtHandler; -import io.javalin.Context; -import io.javalin.Handler; -import io.javalin.UnauthorizedResponse; -import io.javalin.security.Role; -import java.util.Set; -import org.junit.Before; -import org.junit.Test; -import org.mockito.Mockito; - -/** - * Test cases for {@link JwtAccessManager}. - */ -public final class JwtAccessManagerTest { - - private Handler handlerMock; - private Context contextMock; - private Set permittedRolesMock; - - private Algorithm signerAlgorithm; - private JwtAccessManager jwtAccessManager; - - @Before - public void before() { - handlerMock = Mockito.mock(Handler.class); - contextMock = Mockito.mock(Context.class); - permittedRolesMock = Mockito.mock(Set.class); - - signerAlgorithm = Mockito.spy(Algorithm.HMAC256("secret")); - jwtAccessManager = new JwtAccessManager(signerAlgorithm); - } - - @Test(expected = UnauthorizedResponse.class) - public void testManageWhenEndpointIsRestrictedAndJwtIsNotPresent() throws Exception { - Mockito.when(permittedRolesMock.contains(Mockito.any(UserRole.class))).thenReturn(false); - Mockito.when(contextMock.attribute(JwtHandler.CONTEXT_ATTRIBUTE_NAME)).thenReturn(null); - - jwtAccessManager.manage(handlerMock, contextMock, permittedRolesMock); - } - - @Test(expected = UnauthorizedResponse.class) - public void testManageWhenEndpointIsRestrictedAndJwtIsInvalid() throws Exception { - Mockito.when(permittedRolesMock.contains(Mockito.any(UserRole.class))).thenReturn(false); - Mockito.when(contextMock.attribute(JwtHandler.CONTEXT_ATTRIBUTE_NAME)) - .thenReturn("eyJhbGciOiJIUzI1NiJ9.e30.XmNK3GpH3Ys_7wsYBfq4C3M6goz71I7dTgUkuIa5lyQ"); - 
Mockito.doThrow(SignatureVerificationException.class).when(signerAlgorithm) - .verify(Mockito.any(DecodedJWT.class)); - - jwtAccessManager.manage(handlerMock, contextMock, permittedRolesMock); - } - - @Test - public void testManageWhenEndpointIsNotRestricted() throws Exception { - Mockito.when(permittedRolesMock.contains(Mockito.any(UserRole.class))).thenReturn(true); - - jwtAccessManager.manage(handlerMock, contextMock, permittedRolesMock); - - Mockito.verify(handlerMock).handle(Mockito.eq(contextMock)); - } -} diff --git a/src/test/java/com/github/peterbencze/serritor/internal/web/handler/JwtHandlerTest.java b/src/test/java/com/github/peterbencze/serritor/internal/web/handler/JwtHandlerTest.java deleted file mode 100644 index 6b5a017..0000000 --- a/src/test/java/com/github/peterbencze/serritor/internal/web/handler/JwtHandlerTest.java +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Copyright 2019 Peter Bencze. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.github.peterbencze.serritor.internal.web.handler; - -import io.javalin.Context; -import org.junit.Before; -import org.junit.Test; -import org.mockito.Mockito; - -/** - * Test cases for {@link JwtHandler}. 
- */ -public final class JwtHandlerTest { - - private static final String JWT = "foo.bar.baz"; - - private Context contextMock; - - private JwtHandler jwtHandler; - - @Before - public void before() { - contextMock = Mockito.mock(Context.class); - - jwtHandler = new JwtHandler(); - } - - @Test - public void testHandleWhenJwtIsPresentInHeader() throws Exception { - Mockito.when(contextMock.header(Mockito.eq("Authorization"))).thenReturn("Bearer " + JWT); - - jwtHandler.handle(contextMock); - - Mockito.verify(contextMock).attribute(JwtHandler.CONTEXT_ATTRIBUTE_NAME, JWT); - } - - @Test - public void testHandleWhenJwtIsPresentInCookie() throws Exception { - Mockito.when(contextMock.cookie(JwtHandler.COOKIE_NAME)).thenReturn(JWT); - - jwtHandler.handle(contextMock); - - Mockito.verify(contextMock).attribute(JwtHandler.CONTEXT_ATTRIBUTE_NAME, JWT); - } -} diff --git a/src/test/java/com/github/peterbencze/serritor/internal/web/handler/LoginHandlerTest.java b/src/test/java/com/github/peterbencze/serritor/internal/web/handler/LoginHandlerTest.java deleted file mode 100644 index 79c4d23..0000000 --- a/src/test/java/com/github/peterbencze/serritor/internal/web/handler/LoginHandlerTest.java +++ /dev/null @@ -1,118 +0,0 @@ -/* - * Copyright 2019 Peter Bencze. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package com.github.peterbencze.serritor.internal.web.handler; - -import com.auth0.jwt.algorithms.Algorithm; -import com.github.peterbencze.serritor.api.web.AccessControlConfiguration; -import com.github.peterbencze.serritor.api.web.User; -import com.github.peterbencze.serritor.internal.web.dto.JwtDto; -import com.github.peterbencze.serritor.internal.web.dto.LoginDto; -import io.javalin.Context; -import io.javalin.UnauthorizedResponse; -import java.util.Optional; -import org.junit.Before; -import org.junit.Test; -import org.mockito.Mockito; - -/** - * Test cases for {@link LoginHandler}. - */ -public final class LoginHandlerTest { - - private static final String INCORRECT_PASSWORD_HASH - = "$2a$10$Jh4rXRRgeI6WDsb8X7XXpuOJlF1ntM6OJ4ObdNiEaI0AH6d4Lcmky"; - private static final String CORRECT_PASSWORD_HASH - = "$2a$10$baEfqZy/tI3RoKlxQk6jGe9L5nf3NMTEOSWKasVArYH3Ki44pNSU2"; - - private AccessControlConfiguration accessControlConfigMock; - private Context contextMock; - private User userMock; - - private LoginHandler loginHandler; - - @Before - public void before() { - accessControlConfigMock = Mockito.mock(AccessControlConfiguration.class); - - LoginDto loginDtoMock = Mockito.mock(LoginDto.class); - Mockito.when(loginDtoMock.getUsername()).thenReturn("foo"); - Mockito.when(loginDtoMock.getPassword()).thenReturn("bar"); - - contextMock = Mockito.mock(Context.class); - Mockito.when(contextMock.bodyAsClass(LoginDto.class)).thenReturn(loginDtoMock); - - userMock = Mockito.mock(User.class); - - Algorithm signerAlgorithm = Mockito.spy(Algorithm.HMAC256("secret")); - loginHandler = new LoginHandler(accessControlConfigMock, signerAlgorithm); - } - - @Test(expected = UnauthorizedResponse.class) - public void testHandleWhenUserDoesNotExist() throws Exception { - Mockito.when(accessControlConfigMock.getUser(Mockito.anyString())) - .thenReturn(Optional.empty()); - - loginHandler.handle(contextMock); - } - - @Test(expected = UnauthorizedResponse.class) - public void 
testHandleWhenPasswordIsIncorrect() throws Exception { - Mockito.when(userMock.getPasswordHash()).thenReturn(INCORRECT_PASSWORD_HASH); - - Mockito.when(accessControlConfigMock.getUser(Mockito.anyString())) - .thenReturn(Optional.of(userMock)); - - loginHandler.handle(contextMock); - } - - @Test - public void testHandleWhenPasswordIsCorrectAndCookieAuthenticationIsDisabled() - throws Exception { - Mockito.when(userMock.getPasswordHash()).thenReturn(CORRECT_PASSWORD_HASH); - - Mockito.when(accessControlConfigMock.getUser(Mockito.anyString())) - .thenReturn(Optional.of(userMock)); - Mockito.when(accessControlConfigMock.isCookieAuthenticationEnabled()).thenReturn(false); - - loginHandler.handle(contextMock); - - Mockito.verify(contextMock).json(Mockito.any(JwtDto.class)); - Mockito.verify(contextMock, Mockito.never()) - .cookie(Mockito.eq(JwtHandler.COOKIE_NAME), Mockito.anyString(), Mockito.anyInt()); - Mockito.verify(contextMock, Mockito.never()) - .cookie(Mockito.eq(XsrfTokenHandler.COOKIE_NAME), Mockito.anyString(), - Mockito.anyInt()); - } - - @Test - public void testHandleWhenPasswordIsCorrectAndCookieAuthenticationIsEnabled() throws Exception { - Mockito.when(userMock.getPasswordHash()).thenReturn(CORRECT_PASSWORD_HASH); - - Mockito.when(accessControlConfigMock.getUser(Mockito.anyString())) - .thenReturn(Optional.of(userMock)); - Mockito.when(accessControlConfigMock.isCookieAuthenticationEnabled()).thenReturn(true); - - loginHandler.handle(contextMock); - - Mockito.verify(contextMock, Mockito.never()).json(Mockito.any(JwtDto.class)); - Mockito.verify(contextMock) - .cookie(Mockito.eq(JwtHandler.COOKIE_NAME), Mockito.anyString(), Mockito.anyInt()); - Mockito.verify(contextMock) - .cookie(Mockito.eq(XsrfTokenHandler.COOKIE_NAME), Mockito.anyString(), - Mockito.anyInt()); - } -} diff --git a/src/test/java/com/github/peterbencze/serritor/internal/web/handler/XsrfTokenHandlerTest.java 
b/src/test/java/com/github/peterbencze/serritor/internal/web/handler/XsrfTokenHandlerTest.java deleted file mode 100644 index cc08c27..0000000 --- a/src/test/java/com/github/peterbencze/serritor/internal/web/handler/XsrfTokenHandlerTest.java +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Copyright 2019 Peter Bencze. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.github.peterbencze.serritor.internal.web.handler; - -import io.javalin.Context; -import io.javalin.UnauthorizedResponse; -import org.junit.Before; -import org.junit.Test; -import org.mockito.Mockito; - -/** - * Test cases for {@link XsrfTokenHandler}. 
- */ -public final class XsrfTokenHandlerTest { - - private Context contextMock; - - private XsrfTokenHandler xsrfTokenHandler; - - @Before - public void before() { - contextMock = Mockito.mock(Context.class); - Mockito.when(contextMock.method()).thenReturn("POST"); - Mockito.when(contextMock.cookie(XsrfTokenHandler.COOKIE_NAME)).thenReturn("foo"); - - xsrfTokenHandler = new XsrfTokenHandler(); - } - - @Test(expected = UnauthorizedResponse.class) - public void testHandleWhenHeaderIsNotPresent() throws Exception { - xsrfTokenHandler.handle(contextMock); - } - - @Test(expected = UnauthorizedResponse.class) - public void testHandleWhenHeaderContainsInvalidToken() throws Exception { - Mockito.when(contextMock.header(XsrfTokenHandler.HEADER_NAME)).thenReturn("bar"); - - xsrfTokenHandler.handle(contextMock); - } -} diff --git a/src/test/java/com/github/peterbencze/serritor/it/SerritorIT.java b/src/test/java/com/github/peterbencze/serritor/it/CrawlingIT.java similarity index 96% rename from src/test/java/com/github/peterbencze/serritor/it/SerritorIT.java rename to src/test/java/com/github/peterbencze/serritor/it/CrawlingIT.java index 9a9a250..d3a9871 100644 --- a/src/test/java/com/github/peterbencze/serritor/it/SerritorIT.java +++ b/src/test/java/com/github/peterbencze/serritor/it/CrawlingIT.java @@ -16,7 +16,7 @@ package com.github.peterbencze.serritor.it; -import com.github.peterbencze.serritor.api.BaseCrawler; +import com.github.peterbencze.serritor.api.Crawler; import com.github.peterbencze.serritor.api.Browser; import com.github.peterbencze.serritor.api.CrawlRequest; import com.github.peterbencze.serritor.api.CrawlerConfiguration; @@ -47,7 +47,7 @@ /** * Integration test cases for Serritor. 
*/ -public class SerritorIT { +public final class CrawlingIT { private static WireMockServer mockServer; private static BrowserMobProxyServer proxyServer; @@ -80,7 +80,7 @@ public void testFileDownload() throws IOException { .addCrawlSeed(CrawlRequest.createDefault("http://te.st/foo")) .build(); - BaseCrawler crawler = new BaseCrawler(config) { + Crawler crawler = new Crawler(config) { @Override protected void onNonHtmlResponse(final NonHtmlResponseEvent event) { super.onNonHtmlResponse(event); @@ -122,7 +122,7 @@ public void testResumeState() throws IOException { .addCrawlSeed(CrawlRequest.createDefault("http://te.st/bar")) .build(); - BaseCrawler crawler = new BaseCrawler(config) { + Crawler crawler = new Crawler(config) { @Override protected void onResponseSuccess(final ResponseSuccessEvent event) { super.onResponseSuccess(event); @@ -133,7 +133,7 @@ protected void onResponseSuccess(final ResponseSuccessEvent event) { }; crawler.start(Browser.HTML_UNIT, capabilities); - crawler = new BaseCrawler(crawler.getState()) { + crawler = new Crawler(crawler.getState()) { }; crawler.resume(Browser.HTML_UNIT, capabilities); @@ -164,7 +164,7 @@ public void testHttpClientCookieSynchronization() { .addCrawlSeed(CrawlRequest.createDefault("http://te.st/bar")) .build(); - BaseCrawler crawler = new BaseCrawler(config) { + Crawler crawler = new Crawler(config) { }; crawler.start(Browser.HTML_UNIT, capabilities); @@ -190,7 +190,7 @@ public void testRedirectHandling() { .addCrawlSeed(CrawlRequest.createDefault("http://te.st/foo")) .build(); - BaseCrawler crawler = new BaseCrawler(config) { + Crawler crawler = new Crawler(config) { }; crawler.start(Browser.HTML_UNIT, capabilities); diff --git a/src/test/java/com/github/peterbencze/serritor/it/web/TestCrawlerWithSecuredWebApi.java b/src/test/java/com/github/peterbencze/serritor/it/web/TestCrawlerWithSecuredWebApi.java new file mode 100644 index 0000000..d029207 --- /dev/null +++ 
b/src/test/java/com/github/peterbencze/serritor/it/web/TestCrawlerWithSecuredWebApi.java @@ -0,0 +1,98 @@ +/* + * Copyright 2019 Peter Bencze. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.github.peterbencze.serritor.it.web; + +import com.github.peterbencze.serritor.api.CrawlerConfiguration; +import com.github.peterbencze.serritor.api.CrawlerConfiguration.CrawlerConfigurationBuilder; +import com.github.peterbencze.serritor.api.CrawlerWithSecuredWebApi; +import com.github.peterbencze.serritor.api.web.AccessControlConfiguration; +import com.github.peterbencze.serritor.api.web.ServerConfiguration; +import com.github.peterbencze.serritor.api.web.WebApiException; +import com.github.peterbencze.serritor.api.web.http.HttpMethod; +import com.github.peterbencze.serritor.api.web.socket.WebSocketHandler; +import java.time.Duration; +import java.util.Collections; +import java.util.concurrent.atomic.AtomicBoolean; +import net.jodah.failsafe.Failsafe; +import net.jodah.failsafe.RetryPolicy; +import org.apache.commons.io.IOUtils; +import org.awaitility.Awaitility; + +/** + * A crawler implementation with secured web API support that is used by the tests. 
+ */ +public final class TestCrawlerWithSecuredWebApi extends CrawlerWithSecuredWebApi { + + private static final CrawlerConfiguration CRAWLER_CONFIG = + new CrawlerConfigurationBuilder().build(); + + private static final RetryPolicy RETRY_POLICY = new RetryPolicy<>() + .handle(WebApiException.class) + .withDelay(Duration.ofSeconds(1)) + .withMaxRetries(10); + + private AtomicBoolean isServerStarted; + private AtomicBoolean isStopAllowed; + + public TestCrawlerWithSecuredWebApi( + final ServerConfiguration serverConfig, + final AccessControlConfiguration accessControlConfig) { + super(serverConfig, accessControlConfig, CRAWLER_CONFIG); + + isServerStarted = new AtomicBoolean(); + isStopAllowed = new AtomicBoolean(); + + addHttpEndpoint(HttpMethod.GET, "/api/http/test", + (request, response) -> IOUtils.write("It works!", response.getWriter())); + + addHttpEndpoint(HttpMethod.GET, "/api/http/test-with-role", + Collections.singleton("test-role"), + (request, response) -> IOUtils.write("It works!", response.getWriter())); + + addHttpEndpoint(HttpMethod.POST, "/api/http/test-csrf", + (request, response) -> IOUtils.write("It works!", response.getWriter())); + + addWebSocketEndpoint("/api/ws/test", new WebSocketHandler() { + }); + + addWebSocketEndpoint("/api/ws/test-with-role", Collections.singleton("test-role"), + new WebSocketHandler() { + }); + } + + public AtomicBoolean isServerStarted() { + return isServerStarted; + } + + public void allowStop() { + isStopAllowed.set(true); + } + + @Override + protected void onStart() { + Failsafe.with(RETRY_POLICY).run(() -> { + super.onStart(); + isServerStarted.set(true); + }); + } + + @Override + protected void onStop() { + Awaitility.await().untilTrue(isStopAllowed); + super.onStop(); + } +} diff --git a/src/test/java/com/github/peterbencze/serritor/it/web/TestCrawlerWithWebApi.java b/src/test/java/com/github/peterbencze/serritor/it/web/TestCrawlerWithWebApi.java new file mode 100644 index 0000000..35763db --- /dev/null +++ 
b/src/test/java/com/github/peterbencze/serritor/it/web/TestCrawlerWithWebApi.java @@ -0,0 +1,89 @@ +/* + * Copyright 2019 Peter Bencze. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.github.peterbencze.serritor.it.web; + +import com.github.peterbencze.serritor.api.CrawlerConfiguration; +import com.github.peterbencze.serritor.api.CrawlerConfiguration.CrawlerConfigurationBuilder; +import com.github.peterbencze.serritor.api.CrawlerWithWebApi; +import com.github.peterbencze.serritor.api.web.ServerConfiguration; +import com.github.peterbencze.serritor.api.web.WebApiException; +import com.github.peterbencze.serritor.api.web.http.HttpMethod; +import com.github.peterbencze.serritor.api.web.socket.WebSocketHandler; +import java.time.Duration; +import java.util.concurrent.atomic.AtomicBoolean; +import net.jodah.failsafe.Failsafe; +import net.jodah.failsafe.RetryPolicy; +import org.apache.commons.io.IOUtils; +import org.awaitility.Awaitility; + +/** + * A crawler implementation with web API support that is used by the tests. 
+ */ +public final class TestCrawlerWithWebApi extends CrawlerWithWebApi { + + private static final CrawlerConfiguration CRAWLER_CONFIG = + new CrawlerConfigurationBuilder().build(); + + private static final RetryPolicy RETRY_POLICY = new RetryPolicy<>() + .handle(WebApiException.class) + .withDelay(Duration.ofSeconds(1)) + .withMaxRetries(10); + + private final AtomicBoolean isServerStarted; + private final AtomicBoolean isStopAllowed; + private final WebSocketHandler webSocketHandler; + + public TestCrawlerWithWebApi(final ServerConfiguration serverConfig) { + super(serverConfig, CRAWLER_CONFIG); + + isServerStarted = new AtomicBoolean(); + isStopAllowed = new AtomicBoolean(); + + addHttpEndpoint(HttpMethod.GET, "/api/http/test", (request, response) -> + IOUtils.write("It works!", response.getWriter())); + + webSocketHandler = new WebSocketHandler() { + }; + addWebSocketEndpoint("/api/ws/test", webSocketHandler); + } + + public int getOpenWebSocketSessionCount() { + return super.getOpenWebSocketSessions(webSocketHandler.getClass()).size(); + } + + public AtomicBoolean isServerStarted() { + return isServerStarted; + } + + public void allowStop() { + isStopAllowed.set(true); + } + + @Override + protected void onStart() { + Failsafe.with(RETRY_POLICY).run(() -> { + super.onStart(); + isServerStarted.set(true); + }); + } + + @Override + protected void onStop() { + Awaitility.await().untilTrue(isStopAllowed); + super.onStop(); + } +} diff --git a/src/test/java/com/github/peterbencze/serritor/it/web/WebApiIT.java b/src/test/java/com/github/peterbencze/serritor/it/web/WebApiIT.java new file mode 100644 index 0000000..335086e --- /dev/null +++ b/src/test/java/com/github/peterbencze/serritor/it/web/WebApiIT.java @@ -0,0 +1,940 @@ +/* + * Copyright 2019 Peter Bencze. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.github.peterbencze.serritor.it.web; + +import com.auth0.jwt.JWT; +import com.auth0.jwt.JWTVerifier; +import com.auth0.jwt.algorithms.Algorithm; +import com.auth0.jwt.interfaces.DecodedJWT; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.github.peterbencze.serritor.api.web.AccessControlConfiguration; +import com.github.peterbencze.serritor.api.web.AccessControlConfiguration.AccessControlConfigurationBuilder; +import com.github.peterbencze.serritor.api.web.ServerConfiguration; +import com.github.peterbencze.serritor.api.web.ServerConfiguration.ServerConfigurationBuilder; +import com.github.peterbencze.serritor.api.web.SslContextConfiguration; +import com.github.peterbencze.serritor.api.web.User; +import com.github.peterbencze.serritor.internal.util.KeyFactory; +import com.github.peterbencze.serritor.internal.web.http.CsrfFilter; +import com.github.peterbencze.serritor.internal.web.http.auth.JwtAuthenticator; +import com.github.peterbencze.serritor.internal.web.http.dto.ErrorDto; +import com.github.peterbencze.serritor.internal.web.http.dto.JwtDto; +import com.github.peterbencze.serritor.internal.web.http.dto.LoginDto; +import java.io.IOException; +import java.net.HttpCookie; +import java.net.URI; +import java.nio.charset.StandardCharsets; +import java.security.KeyManagementException; +import java.security.KeyStoreException; +import java.security.NoSuchAlgorithmException; +import java.util.Base64; +import java.util.Collections; +import java.util.HashSet; +import java.util.List; +import java.util.Optional; +import 
java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.TimeUnit; +import java.util.stream.Collectors; +import javax.net.ssl.SSLContext; +import org.apache.http.client.CookieStore; +import org.apache.http.client.methods.CloseableHttpResponse; +import org.apache.http.client.methods.HttpGet; +import org.apache.http.client.methods.HttpPost; +import org.apache.http.conn.ssl.SSLConnectionSocketFactory; +import org.apache.http.conn.ssl.TrustSelfSignedStrategy; +import org.apache.http.cookie.Cookie; +import org.apache.http.entity.StringEntity; +import org.apache.http.impl.client.BasicCookieStore; +import org.apache.http.impl.client.CloseableHttpClient; +import org.apache.http.impl.client.DefaultHttpRequestRetryHandler; +import org.apache.http.impl.client.HttpClients; +import org.apache.http.impl.cookie.BasicClientCookie; +import org.apache.http.ssl.SSLContextBuilder; +import org.apache.http.util.EntityUtils; +import org.awaitility.Awaitility; +import org.eclipse.jetty.http.HttpHeader; +import org.eclipse.jetty.http.HttpStatus; +import org.eclipse.jetty.http.MimeTypes.Type; +import org.eclipse.jetty.util.ssl.SslContextFactory; +import org.eclipse.jetty.websocket.api.WebSocketAdapter; +import org.eclipse.jetty.websocket.client.ClientUpgradeRequest; +import org.eclipse.jetty.websocket.client.WebSocketClient; +import org.junit.AfterClass; +import org.junit.Assert; +import org.junit.BeforeClass; +import org.junit.Test; + +/** + * Integration test cases for the crawler's web API. 
+ */ +public final class WebApiIT { + + private static final String USERNAME = "test-username"; + private static final String PASSWORD = "test-password"; + + private static final String KEY_STORE_PATH = + WebApiIT.class.getClassLoader().getResource("keystore.jks").getFile(); + private static final SslContextConfiguration SSL_CONTEXT_CONFIG = + new SslContextConfiguration(KEY_STORE_PATH, PASSWORD); + + private static final User ROOT_USER = new User(USERNAME, PASSWORD); + + private static final String SECRET_KEY = KeyFactory.createKey("HmacSHA256"); + private static final JWTVerifier JWT_VERIFIER = + JWT.require(Algorithm.HMAC256(SECRET_KEY)).build(); + + private static final CookieStore COOKIE_STORE = new BasicCookieStore(); + + private static final ObjectMapper MAPPER = new ObjectMapper(); + + private static CloseableHttpClient HTTP_CLIENT; + + @BeforeClass + public static void beforeClass() + throws KeyStoreException, NoSuchAlgorithmException, KeyManagementException { + SSLContext sslContext = new SSLContextBuilder() + .loadTrustMaterial(null, new TrustSelfSignedStrategy()) + .build(); + SSLConnectionSocketFactory socketFactory = new SSLConnectionSocketFactory(sslContext); + HTTP_CLIENT = HttpClients.custom() + .setDefaultCookieStore(COOKIE_STORE) + .setSSLSocketFactory(socketFactory) + .setRetryHandler(new DefaultHttpRequestRetryHandler(3, true)) + .build(); + } + + @AfterClass + public static void afterClass() throws IOException { + HTTP_CLIENT.close(); + } + + @Test + public void testHttpEndpointWhenEndpointExists() throws IOException { + ServerConfiguration serverConfig = ServerConfiguration.createDefault(); + TestCrawlerWithWebApi crawler = new TestCrawlerWithWebApi(serverConfig); + + try { + Executors.newSingleThreadExecutor().execute(crawler::start); + Awaitility.await().atMost(30, TimeUnit.SECONDS).untilTrue(crawler.isServerStarted()); + + HttpGet request = new HttpGet("http://localhost:8080/api/http/test"); + try (CloseableHttpResponse response = 
HTTP_CLIENT.execute(request)) { + String content = EntityUtils.toString(response.getEntity(), StandardCharsets.UTF_8); + + Assert.assertEquals(HttpStatus.OK_200, response.getStatusLine().getStatusCode()); + Assert.assertEquals("It works!", content); + } + } finally { + crawler.allowStop(); + } + } + + @Test + public void testHttpEndpointWhenEndpointDoesNotExist() throws IOException { + ServerConfiguration serverConfig = ServerConfiguration.createDefault(); + TestCrawlerWithWebApi crawler = new TestCrawlerWithWebApi(serverConfig); + + ExecutorService executor = Executors.newSingleThreadExecutor(); + try { + executor.execute(crawler::start); + Awaitility.await().atMost(30, TimeUnit.SECONDS).untilTrue(crawler.isServerStarted()); + + HttpGet request = new HttpGet("http://localhost:8080/http/nonexistent"); + try (CloseableHttpResponse response = HTTP_CLIENT.execute(request)) { + String content = EntityUtils.toString(response.getEntity(), StandardCharsets.UTF_8); + ErrorDto errorDto = MAPPER.readValue(content, ErrorDto.class); + + Assert.assertEquals(HttpStatus.NOT_FOUND_404, + response.getStatusLine().getStatusCode()); + Assert.assertEquals(HttpStatus.NOT_FOUND_404, errorDto.getCode()); + Assert.assertEquals(HttpStatus.getMessage(HttpStatus.NOT_FOUND_404), + errorDto.getMessage()); + } + } finally { + crawler.allowStop(); + executor.shutdown(); + } + } + + @Test + public void testHttpEndpointWhenUsingSsl() throws IOException { + ServerConfiguration serverConfig = new ServerConfigurationBuilder() + .withSsl(SSL_CONTEXT_CONFIG) + .build(); + TestCrawlerWithWebApi crawler = new TestCrawlerWithWebApi(serverConfig); + + ExecutorService executor = Executors.newSingleThreadExecutor(); + try { + executor.execute(crawler::start); + Awaitility.await().atMost(30, TimeUnit.SECONDS).untilTrue(crawler.isServerStarted()); + + HttpGet request = new HttpGet("https://localhost:8080/api/http/test"); + try (CloseableHttpResponse response = HTTP_CLIENT.execute(request)) { + String content = 
EntityUtils.toString(response.getEntity(), StandardCharsets.UTF_8); + + Assert.assertEquals(HttpStatus.OK_200, response.getStatusLine().getStatusCode()); + Assert.assertEquals("It works!", content); + } + } finally { + crawler.allowStop(); + executor.shutdown(); + } + } + + @Test + public void testHttpEndpointWhenNoJwtPresentInRequest() throws IOException { + ServerConfiguration serverConfig = ServerConfiguration.createDefault(); + AccessControlConfiguration accessControlConfig = + new AccessControlConfigurationBuilder(ROOT_USER).build(); + TestCrawlerWithSecuredWebApi crawler = + new TestCrawlerWithSecuredWebApi(serverConfig, accessControlConfig); + + ExecutorService executor = Executors.newSingleThreadExecutor(); + try { + executor.execute(crawler::start); + Awaitility.await().atMost(30, TimeUnit.SECONDS).untilTrue(crawler.isServerStarted()); + + HttpGet request = new HttpGet("http://localhost:8080/api/http/test"); + try (CloseableHttpResponse response = HTTP_CLIENT.execute(request)) { + String content = EntityUtils.toString(response.getEntity(), StandardCharsets.UTF_8); + ErrorDto errorDto = MAPPER.readValue(content, ErrorDto.class); + + Assert.assertEquals(HttpStatus.UNAUTHORIZED_401, + response.getStatusLine().getStatusCode()); + Assert.assertEquals(HttpStatus.UNAUTHORIZED_401, errorDto.getCode()); + Assert.assertEquals(HttpStatus.getMessage(HttpStatus.UNAUTHORIZED_401), + errorDto.getMessage()); + } + } finally { + crawler.allowStop(); + executor.shutdown(); + } + } + + @Test + public void testHttpEndpointWhenInvalidJwtProvidedInHeader() throws IOException { + ServerConfiguration serverConfig = ServerConfiguration.createDefault(); + AccessControlConfiguration accessControlConfig = + new AccessControlConfigurationBuilder(ROOT_USER).build(); + TestCrawlerWithSecuredWebApi crawler = + new TestCrawlerWithSecuredWebApi(serverConfig, accessControlConfig); + + ExecutorService executor = Executors.newSingleThreadExecutor(); + try { + executor.execute(crawler::start); 
+ Awaitility.await().atMost(30, TimeUnit.SECONDS).untilTrue(crawler.isServerStarted()); + + HttpGet request = new HttpGet("http://localhost:8080/api/http/test"); + request.setHeader(HttpHeader.AUTHORIZATION.asString(), "Bearer invalid"); + try (CloseableHttpResponse response = HTTP_CLIENT.execute(request)) { + String content = EntityUtils.toString(response.getEntity(), StandardCharsets.UTF_8); + ErrorDto errorDto = MAPPER.readValue(content, ErrorDto.class); + + Assert.assertEquals(HttpStatus.UNAUTHORIZED_401, + response.getStatusLine().getStatusCode()); + Assert.assertEquals(HttpStatus.UNAUTHORIZED_401, errorDto.getCode()); + Assert.assertEquals(HttpStatus.getMessage(HttpStatus.UNAUTHORIZED_401), + errorDto.getMessage()); + } + } finally { + crawler.allowStop(); + executor.shutdown(); + } + } + + @Test + public void testHttpEndpointWhenInvalidJwtProvidedInCookie() throws IOException { + ServerConfiguration serverConfig = ServerConfiguration.createDefault(); + AccessControlConfiguration accessControlConfig = + new AccessControlConfigurationBuilder(ROOT_USER).build(); + TestCrawlerWithSecuredWebApi crawler = + new TestCrawlerWithSecuredWebApi(serverConfig, accessControlConfig); + + ExecutorService executor = Executors.newSingleThreadExecutor(); + try { + executor.execute(crawler::start); + Awaitility.await().atMost(30, TimeUnit.SECONDS).untilTrue(crawler.isServerStarted()); + + HttpGet request = new HttpGet("http://localhost:8080/api/http/test"); + BasicClientCookie authCookie = + new BasicClientCookie(JwtAuthenticator.AUTH_COOKIE_NAME, "invalid"); + authCookie.setDomain("localhost"); + authCookie.setPath("/"); + COOKIE_STORE.addCookie(authCookie); + + try (CloseableHttpResponse response = HTTP_CLIENT.execute(request)) { + String content = EntityUtils.toString(response.getEntity(), StandardCharsets.UTF_8); + ErrorDto errorDto = MAPPER.readValue(content, ErrorDto.class); + + Assert.assertEquals(HttpStatus.UNAUTHORIZED_401, + response.getStatusLine().getStatusCode()); 
+ Assert.assertEquals(HttpStatus.UNAUTHORIZED_401, errorDto.getCode()); + Assert.assertEquals(HttpStatus.getMessage(HttpStatus.UNAUTHORIZED_401), + errorDto.getMessage()); + } + } finally { + crawler.allowStop(); + executor.shutdown(); + COOKIE_STORE.clear(); + } + } + + @Test + public void testLoginWhenCredentialsAreCorrect() throws IOException { + ServerConfiguration serverConfig = ServerConfiguration.createDefault(); + AccessControlConfiguration accessControlConfig = + new AccessControlConfigurationBuilder(ROOT_USER) + .setSecretKey(SECRET_KEY) + .build(); + TestCrawlerWithSecuredWebApi crawler = + new TestCrawlerWithSecuredWebApi(serverConfig, accessControlConfig); + + ExecutorService executor = Executors.newSingleThreadExecutor(); + try { + executor.execute(crawler::start); + Awaitility.await().atMost(30, TimeUnit.SECONDS).untilTrue(crawler.isServerStarted()); + + LoginDto loginDto = new LoginDto(USERNAME, PASSWORD); + HttpPost request = new HttpPost("http://localhost:8080/api/auth"); + request.addHeader(HttpHeader.CONTENT_TYPE.asString(), + Type.APPLICATION_JSON_UTF_8.asString()); + request.setEntity(new StringEntity(MAPPER.writeValueAsString(loginDto))); + + try (CloseableHttpResponse response = HTTP_CLIENT.execute(request)) { + String content = EntityUtils.toString(response.getEntity(), StandardCharsets.UTF_8); + JwtDto jwtDto = MAPPER.readValue(content, JwtDto.class); + DecodedJWT decodedJwt = JWT_VERIFIER.verify(jwtDto.getJwt()); + + Assert.assertEquals(HttpStatus.OK_200, response.getStatusLine().getStatusCode()); + Assert.assertEquals(USERNAME, jwtDto.getUsername()); + Assert.assertEquals(USERNAME, decodedJwt.getClaim("name").asString()); + Assert.assertEquals(jwtDto.getExpiryDate(), decodedJwt.getExpiresAt()); + Assert.assertEquals(jwtDto.getRoles(), + new HashSet<>(decodedJwt.getClaim("roles").asList(String.class))); + } + } finally { + crawler.allowStop(); + executor.shutdown(); + } + } + + @Test + public void 
testLoginWhenCredentialsAreCorrectAndCookieAuthenticationIsEnabled() + throws IOException { + ServerConfiguration serverConfig = ServerConfiguration.createDefault(); + AccessControlConfiguration accessControlConfig = + new AccessControlConfigurationBuilder(ROOT_USER) + .setSecretKey(SECRET_KEY) + .setCookieAuthenticationEnabled(true) + .build(); + TestCrawlerWithSecuredWebApi crawler = + new TestCrawlerWithSecuredWebApi(serverConfig, accessControlConfig); + + ExecutorService executor = Executors.newSingleThreadExecutor(); + try { + executor.execute(crawler::start); + Awaitility.await().atMost(30, TimeUnit.SECONDS).untilTrue(crawler.isServerStarted()); + + LoginDto loginDto = new LoginDto(USERNAME, PASSWORD); + HttpPost request = new HttpPost("http://localhost:8080/api/auth"); + request.addHeader(HttpHeader.CONTENT_TYPE.asString(), + Type.APPLICATION_JSON_UTF_8.asString()); + request.setEntity(new StringEntity(MAPPER.writeValueAsString(loginDto))); + + try (CloseableHttpResponse response = HTTP_CLIENT.execute(request)) { + String content = EntityUtils.toString(response.getEntity(), StandardCharsets.UTF_8); + JwtDto jwtDto = MAPPER.readValue(content, JwtDto.class); + DecodedJWT decodedJwt = JWT_VERIFIER.verify(jwtDto.getJwt()); + + Optional jwtCookieOpt = COOKIE_STORE.getCookies() + .stream() + .filter(cookie -> JwtAuthenticator.AUTH_COOKIE_NAME.equals(cookie.getName())) + .findFirst(); + + Optional csrfCookieOpt = COOKIE_STORE.getCookies() + .stream() + .filter(cookie -> CsrfFilter.CSRF_COOKIE_NAME.equals(cookie.getName())) + .findFirst(); + + Assert.assertEquals(HttpStatus.OK_200, response.getStatusLine().getStatusCode()); + Assert.assertEquals(USERNAME, jwtDto.getUsername()); + Assert.assertEquals(USERNAME, decodedJwt.getClaim("name").asString()); + Assert.assertEquals(jwtDto.getExpiryDate(), decodedJwt.getExpiresAt()); + Assert.assertEquals(jwtDto.getRoles(), + new HashSet<>(decodedJwt.getClaim("roles").asList(String.class))); + + 
Assert.assertTrue(jwtCookieOpt.isPresent()); + Cookie jwtCookie = jwtCookieOpt.get(); + Assert.assertEquals(jwtDto.getJwt(), jwtCookie.getValue()); + Assert.assertEquals("/", jwtCookie.getPath()); + Assert.assertEquals(jwtDto.getExpiryDate(), jwtCookie.getExpiryDate()); + + Assert.assertTrue(csrfCookieOpt.isPresent()); + Cookie csrfCookie = csrfCookieOpt.get(); + Assert.assertEquals(JwtAuthenticator.CSRF_TOKEN_BYTE_SIZE, + Base64.getUrlDecoder().decode(csrfCookie.getValue()).length); + Assert.assertEquals("/", csrfCookie.getPath()); + Assert.assertEquals(jwtDto.getExpiryDate(), csrfCookie.getExpiryDate()); + } + } finally { + crawler.allowStop(); + executor.shutdown(); + COOKIE_STORE.clear(); + } + } + + @Test + public void testLoginWhenCredentialsAreIncorrect() throws IOException { + ServerConfiguration serverConfig = ServerConfiguration.createDefault(); + AccessControlConfiguration accessControlConfig = + new AccessControlConfigurationBuilder(ROOT_USER).build(); + TestCrawlerWithSecuredWebApi crawler = + new TestCrawlerWithSecuredWebApi(serverConfig, accessControlConfig); + + ExecutorService executor = Executors.newSingleThreadExecutor(); + try { + executor.execute(crawler::start); + Awaitility.await().atMost(30, TimeUnit.SECONDS).untilTrue(crawler.isServerStarted()); + + LoginDto loginDto = new LoginDto(USERNAME, "wrong-password"); + HttpPost request = new HttpPost("http://localhost:8080/api/auth"); + request.addHeader(HttpHeader.CONTENT_TYPE.asString(), + Type.APPLICATION_JSON_UTF_8.asString()); + request.setEntity(new StringEntity(MAPPER.writeValueAsString(loginDto))); + + try (CloseableHttpResponse response = HTTP_CLIENT.execute(request)) { + String content = EntityUtils.toString(response.getEntity(), StandardCharsets.UTF_8); + ErrorDto errorDto = MAPPER.readValue(content, ErrorDto.class); + + Assert.assertEquals(HttpStatus.UNAUTHORIZED_401, + response.getStatusLine().getStatusCode()); + Assert.assertEquals(HttpStatus.UNAUTHORIZED_401, errorDto.getCode()); + 
Assert.assertEquals(HttpStatus.getMessage(HttpStatus.UNAUTHORIZED_401), + errorDto.getMessage()); + } + } finally { + crawler.allowStop(); + executor.shutdown(); + } + } + + @Test + public void testHttpEndpointWhenUserIsNotAuthorized() throws IOException { + ServerConfiguration serverConfig = ServerConfiguration.createDefault(); + AccessControlConfiguration accessControlConfig = + new AccessControlConfigurationBuilder(ROOT_USER) + .setCookieAuthenticationEnabled(true) + .build(); + TestCrawlerWithSecuredWebApi crawler = + new TestCrawlerWithSecuredWebApi(serverConfig, accessControlConfig); + + ExecutorService executor = Executors.newSingleThreadExecutor(); + try { + executor.execute(crawler::start); + Awaitility.await().atMost(30, TimeUnit.SECONDS).untilTrue(crawler.isServerStarted()); + + // Log in + LoginDto loginDto = new LoginDto(USERNAME, PASSWORD); + HttpPost loginRequest = new HttpPost("http://localhost:8080/api/auth"); + loginRequest.addHeader(HttpHeader.CONTENT_TYPE.asString(), + Type.APPLICATION_JSON_UTF_8.asString()); + loginRequest.setEntity(new StringEntity(MAPPER.writeValueAsString(loginDto))); + try (CloseableHttpResponse response = HTTP_CLIENT.execute(loginRequest)) { + EntityUtils.consumeQuietly(response.getEntity()); + + Assert.assertEquals(HttpStatus.OK_200, response.getStatusLine().getStatusCode()); + } + + // Try accessing endpoint + HttpGet endpointRequest = new HttpGet("http://localhost:8080/api/http/test-with-role"); + try (CloseableHttpResponse response = HTTP_CLIENT.execute(endpointRequest)) { + String content = EntityUtils.toString(response.getEntity(), StandardCharsets.UTF_8); + ErrorDto errorDto = MAPPER.readValue(content, ErrorDto.class); + + Assert.assertEquals(HttpStatus.FORBIDDEN_403, + response.getStatusLine().getStatusCode()); + Assert.assertEquals(HttpStatus.FORBIDDEN_403, errorDto.getCode()); + Assert.assertEquals(HttpStatus.getMessage(HttpStatus.FORBIDDEN_403), + errorDto.getMessage()); + } + } finally { + crawler.allowStop(); 
+ executor.shutdown(); + COOKIE_STORE.clear(); + } + } + + @Test + public void testHttpEndpointWhenUserIsAuthorized() throws IOException { + ServerConfiguration serverConfig = ServerConfiguration.createDefault(); + User rootUser = new User(USERNAME, PASSWORD, Collections.singleton("test-role")); + AccessControlConfiguration accessControlConfig = + new AccessControlConfigurationBuilder(rootUser) + .setCookieAuthenticationEnabled(true) + .build(); + TestCrawlerWithSecuredWebApi crawler = + new TestCrawlerWithSecuredWebApi(serverConfig, accessControlConfig); + + ExecutorService executor = Executors.newSingleThreadExecutor(); + try { + executor.execute(crawler::start); + Awaitility.await().atMost(30, TimeUnit.SECONDS).untilTrue(crawler.isServerStarted()); + + // Log in + LoginDto loginDto = new LoginDto(USERNAME, PASSWORD); + HttpPost loginRequest = new HttpPost("http://localhost:8080/api/auth"); + loginRequest.addHeader(HttpHeader.CONTENT_TYPE.asString(), + Type.APPLICATION_JSON_UTF_8.asString()); + loginRequest.setEntity(new StringEntity(MAPPER.writeValueAsString(loginDto))); + try (CloseableHttpResponse response = HTTP_CLIENT.execute(loginRequest)) { + EntityUtils.consumeQuietly(response.getEntity()); + + Assert.assertEquals(HttpStatus.OK_200, response.getStatusLine().getStatusCode()); + } + + // Try accessing endpoint + HttpGet endpointRequest = new HttpGet("http://localhost:8080/api/http/test-with-role"); + try (CloseableHttpResponse response = HTTP_CLIENT.execute(endpointRequest)) { + String content = EntityUtils.toString(response.getEntity(), StandardCharsets.UTF_8); + + Assert.assertEquals(HttpStatus.OK_200, response.getStatusLine().getStatusCode()); + Assert.assertEquals("It works!", content); + } + } finally { + crawler.allowStop(); + executor.shutdown(); + COOKIE_STORE.clear(); + } + } + + @Test + public void testHttpEndpointCsrfProtectionWhenHeaderIsNotPresentInRequest() throws IOException { + ServerConfiguration serverConfig = 
ServerConfiguration.createDefault(); + AccessControlConfiguration accessControlConfig = + new AccessControlConfigurationBuilder(ROOT_USER) + .setCookieAuthenticationEnabled(true) + .build(); + TestCrawlerWithSecuredWebApi crawler = + new TestCrawlerWithSecuredWebApi(serverConfig, accessControlConfig); + + ExecutorService executor = Executors.newSingleThreadExecutor(); + try { + executor.execute(crawler::start); + Awaitility.await().atMost(30, TimeUnit.SECONDS).untilTrue(crawler.isServerStarted()); + + // Log in + LoginDto loginDto = new LoginDto(USERNAME, PASSWORD); + HttpPost loginRequest = new HttpPost("http://localhost:8080/api/auth"); + loginRequest.addHeader(HttpHeader.CONTENT_TYPE.asString(), + Type.APPLICATION_JSON_UTF_8.asString()); + loginRequest.setEntity(new StringEntity(MAPPER.writeValueAsString(loginDto))); + try (CloseableHttpResponse response = HTTP_CLIENT.execute(loginRequest)) { + EntityUtils.consumeQuietly(response.getEntity()); + + Assert.assertEquals(HttpStatus.OK_200, response.getStatusLine().getStatusCode()); + } + + // Try accessing endpoint + HttpPost endpointRequest = new HttpPost("http://localhost:8080/api/http/test-csrf"); + try (CloseableHttpResponse response = HTTP_CLIENT.execute(endpointRequest)) { + String content = EntityUtils.toString(response.getEntity(), StandardCharsets.UTF_8); + ErrorDto errorDto = MAPPER.readValue(content, ErrorDto.class); + + Assert.assertEquals(HttpStatus.UNAUTHORIZED_401, + response.getStatusLine().getStatusCode()); + Assert.assertEquals(HttpStatus.UNAUTHORIZED_401, errorDto.getCode()); + Assert.assertEquals("Missing or incorrect CSRF token", errorDto.getMessage()); + } + } finally { + crawler.allowStop(); + executor.shutdown(); + COOKIE_STORE.clear(); + } + } + + @Test + public void testHttpEndpointCsrfProtectionWhenHeaderIsPresentInRequest() throws IOException { + ServerConfiguration serverConfig = ServerConfiguration.createDefault(); + AccessControlConfiguration accessControlConfig = + new 
AccessControlConfigurationBuilder(ROOT_USER) + .setCookieAuthenticationEnabled(true) + .build(); + TestCrawlerWithSecuredWebApi crawler = + new TestCrawlerWithSecuredWebApi(serverConfig, accessControlConfig); + + ExecutorService executor = Executors.newSingleThreadExecutor(); + try { + executor.execute(crawler::start); + Awaitility.await().atMost(30, TimeUnit.SECONDS).untilTrue(crawler.isServerStarted()); + + // Log in + LoginDto loginDto = new LoginDto(USERNAME, PASSWORD); + HttpPost loginRequest = new HttpPost("http://localhost:8080/api/auth"); + loginRequest.addHeader(HttpHeader.CONTENT_TYPE.asString(), + Type.APPLICATION_JSON_UTF_8.asString()); + loginRequest.setEntity(new StringEntity(MAPPER.writeValueAsString(loginDto))); + try (CloseableHttpResponse response = HTTP_CLIENT.execute(loginRequest)) { + EntityUtils.consumeQuietly(response.getEntity()); + + Assert.assertEquals(HttpStatus.OK_200, response.getStatusLine().getStatusCode()); + } + + // Try accessing endpoint + HttpPost endpointRequest = new HttpPost("http://localhost:8080/api/http/test-csrf"); + COOKIE_STORE.getCookies() + .stream() + .filter(cookie -> CsrfFilter.CSRF_COOKIE_NAME.equals(cookie.getName())) + .findFirst() + .ifPresent(cookie -> + endpointRequest.setHeader(CsrfFilter.CSRF_HEADER_NAME, + cookie.getValue())); + + try (CloseableHttpResponse response = HTTP_CLIENT.execute(endpointRequest)) { + String content = EntityUtils.toString(response.getEntity(), StandardCharsets.UTF_8); + + Assert.assertEquals(HttpStatus.OK_200, response.getStatusLine().getStatusCode()); + Assert.assertEquals("It works!", content); + } + } finally { + crawler.allowStop(); + executor.shutdown(); + COOKIE_STORE.clear(); + } + } + + @Test + public void testWebSocketEndpointWhenEndpointExists() throws Exception { + ServerConfiguration serverConfig = ServerConfiguration.createDefault(); + TestCrawlerWithWebApi crawler = new TestCrawlerWithWebApi(serverConfig); + + WebSocketClient wsClient = new WebSocketClient(); + 
WebSocketAdapter clientSocket = new WebSocketAdapter(); + ClientUpgradeRequest upgradeRequest = new ClientUpgradeRequest(); + // Must set the Origin header, otherwise the socket endpoint creation will fail + upgradeRequest.setHeader(HttpHeader.ORIGIN.toString(), "localhost"); + + ExecutorService executor = Executors.newSingleThreadExecutor(); + try { + executor.execute(crawler::start); + Awaitility.await().atMost(30, TimeUnit.SECONDS).untilTrue(crawler.isServerStarted()); + + wsClient.start(); + wsClient.connect(clientSocket, URI.create("ws://localhost:8080/api/ws/test"), + upgradeRequest); + Awaitility.await().untilAsserted(() -> Assert.assertTrue(clientSocket.isConnected())); + } finally { + crawler.allowStop(); + executor.shutdown(); + wsClient.stop(); + } + } + + @Test + public void testWebSocketEndpointWhenEndpointDoesNotExist() throws Exception { + ServerConfiguration serverConfig = ServerConfiguration.createDefault(); + TestCrawlerWithWebApi crawler = new TestCrawlerWithWebApi(serverConfig); + + WebSocketClient wsClient = new WebSocketClient(); + WebSocketAdapter clientSocket = new WebSocketAdapter(); + ClientUpgradeRequest upgradeRequest = new ClientUpgradeRequest(); + // Must set the Origin header, otherwise the socket endpoint creation will fail + upgradeRequest.setHeader(HttpHeader.ORIGIN.toString(), "localhost"); + + ExecutorService executor = Executors.newSingleThreadExecutor(); + try { + executor.execute(crawler::start); + Awaitility.await().atMost(30, TimeUnit.SECONDS).untilTrue(crawler.isServerStarted()); + + wsClient.start(); + wsClient.connect(clientSocket, URI.create("ws://localhost:8080/api/ws/nonexistent"), + upgradeRequest); + Awaitility.await().pollDelay(1, TimeUnit.SECONDS).untilAsserted(() -> + Assert.assertTrue(clientSocket.isNotConnected())); + } finally { + crawler.allowStop(); + executor.shutdown(); + wsClient.stop(); + } + } + + @Test + public void testWebSocketEndpointWhenUsingSsl() throws Exception { + ServerConfiguration 
serverConfig = new ServerConfigurationBuilder() + .withSsl(SSL_CONTEXT_CONFIG) + .build(); + TestCrawlerWithWebApi crawler = new TestCrawlerWithWebApi(serverConfig); + + SslContextFactory sslContextFactory = new SslContextFactory.Client(true); + org.eclipse.jetty.client.HttpClient httpClient = + new org.eclipse.jetty.client.HttpClient(sslContextFactory); + httpClient.start(); + WebSocketClient wsClient = new WebSocketClient(httpClient); + WebSocketAdapter clientSocket = new WebSocketAdapter(); + ClientUpgradeRequest upgradeRequest = new ClientUpgradeRequest(); + // Must set the Origin header, otherwise the socket endpoint creation will fail + upgradeRequest.setHeader(HttpHeader.ORIGIN.toString(), "localhost"); + + ExecutorService executor = Executors.newSingleThreadExecutor(); + try { + executor.execute(crawler::start); + Awaitility.await().atMost(30, TimeUnit.SECONDS).untilTrue(crawler.isServerStarted()); + + wsClient.start(); + wsClient.connect(clientSocket, URI.create("wss://localhost:8080/api/ws/test"), + upgradeRequest); + Awaitility.await().untilAsserted(() -> Assert.assertTrue(clientSocket.isConnected())); + } finally { + crawler.allowStop(); + executor.shutdown(); + httpClient.stop(); + wsClient.stop(); + } + } + + @Test + public void testWebSocketEndpointWhenNoJwtIsPresentInRequest() throws Exception { + ServerConfiguration serverConfig = ServerConfiguration.createDefault(); + AccessControlConfiguration accessControlConfig = + new AccessControlConfigurationBuilder(ROOT_USER).build(); + TestCrawlerWithSecuredWebApi crawler = + new TestCrawlerWithSecuredWebApi(serverConfig, accessControlConfig); + + WebSocketClient wsClient = new WebSocketClient(); + WebSocketAdapter clientSocket = new WebSocketAdapter(); + ClientUpgradeRequest upgradeRequest = new ClientUpgradeRequest(); + // Must set the Origin header, otherwise the socket endpoint creation will fail + upgradeRequest.setHeader(HttpHeader.ORIGIN.toString(), "localhost"); + + ExecutorService executor = 
Executors.newSingleThreadExecutor(); + try { + executor.execute(crawler::start); + Awaitility.await().atMost(30, TimeUnit.SECONDS).untilTrue(crawler.isServerStarted()); + + wsClient.start(); + wsClient.connect(clientSocket, URI.create("ws://localhost:8080/api/ws/test"), + upgradeRequest); + Awaitility.await().pollDelay(1, TimeUnit.SECONDS).untilAsserted(() -> + Assert.assertTrue(clientSocket.isNotConnected())); + } finally { + crawler.allowStop(); + executor.shutdown(); + wsClient.stop(); + } + } + + @Test + public void testWebSocketEndpointWhenInvalidJwtIsProvidedInQueryParameter() throws Exception { + ServerConfiguration serverConfig = ServerConfiguration.createDefault(); + AccessControlConfiguration accessControlConfig = + new AccessControlConfigurationBuilder(ROOT_USER).build(); + TestCrawlerWithSecuredWebApi crawler = + new TestCrawlerWithSecuredWebApi(serverConfig, accessControlConfig); + + WebSocketClient wsClient = new WebSocketClient(); + WebSocketAdapter clientSocket = new WebSocketAdapter(); + ClientUpgradeRequest upgradeRequest = new ClientUpgradeRequest(); + // Must set the Origin header, otherwise the socket endpoint creation will fail + upgradeRequest.setHeader(HttpHeader.ORIGIN.toString(), "localhost"); + + ExecutorService executor = Executors.newSingleThreadExecutor(); + try { + executor.execute(crawler::start); + Awaitility.await().atMost(30, TimeUnit.SECONDS).untilTrue(crawler.isServerStarted()); + + wsClient.start(); + wsClient.connect(clientSocket, + URI.create("ws://localhost:8080/test?access_token=invalid"), upgradeRequest); + Awaitility.await().pollDelay(1, TimeUnit.SECONDS).untilAsserted(() -> + Assert.assertTrue(clientSocket.isNotConnected())); + } finally { + crawler.allowStop(); + executor.shutdown(); + wsClient.stop(); + } + } + + @Test + public void testWebSocketEndpointWhenUserIsNotAuthorized() throws Exception { + ServerConfiguration serverConfig = ServerConfiguration.createDefault(); + AccessControlConfiguration accessControlConfig 
= + new AccessControlConfigurationBuilder(ROOT_USER) + .setCookieAuthenticationEnabled(true) + .build(); + TestCrawlerWithSecuredWebApi crawler = + new TestCrawlerWithSecuredWebApi(serverConfig, accessControlConfig); + + WebSocketClient wsClient = new WebSocketClient(); + WebSocketAdapter clientSocket = new WebSocketAdapter(); + ClientUpgradeRequest upgradeRequest = new ClientUpgradeRequest(); + // Must set the Origin header, otherwise the socket endpoint creation will fail + upgradeRequest.setHeader(HttpHeader.ORIGIN.toString(), "localhost"); + + ExecutorService executor = Executors.newSingleThreadExecutor(); + try { + executor.execute(crawler::start); + Awaitility.await().atMost(30, TimeUnit.SECONDS).untilTrue(crawler.isServerStarted()); + + // Log in + LoginDto loginDto = new LoginDto(USERNAME, PASSWORD); + HttpPost loginRequest = new HttpPost("http://localhost:8080/api/auth"); + loginRequest.addHeader(HttpHeader.CONTENT_TYPE.asString(), + Type.APPLICATION_JSON_UTF_8.asString()); + loginRequest.setEntity(new StringEntity(MAPPER.writeValueAsString(loginDto))); + try (CloseableHttpResponse response = HTTP_CLIENT.execute(loginRequest)) { + EntityUtils.consumeQuietly(response.getEntity()); + + Assert.assertEquals(HttpStatus.OK_200, response.getStatusLine().getStatusCode()); + } + + // Try accessing endpoint + upgradeRequest.setCookies(convertCookies(COOKIE_STORE.getCookies())); + wsClient.start(); + wsClient.connect(clientSocket, URI.create("ws://localhost:8080/test-with-role"), + upgradeRequest); + Awaitility.await().pollDelay(1, TimeUnit.SECONDS).untilAsserted(() -> + Assert.assertTrue(clientSocket.isNotConnected())); + } finally { + crawler.allowStop(); + executor.shutdown(); + wsClient.stop(); + COOKIE_STORE.clear(); + } + } + + @Test + public void testWebSocketEndpointWhenUserIsAuthorized() throws Exception { + ServerConfiguration serverConfig = ServerConfiguration.createDefault(); + User rootUser = new User(USERNAME, PASSWORD, 
Collections.singleton("test-role")); + AccessControlConfiguration accessControlConfig = + new AccessControlConfigurationBuilder(rootUser) + .setCookieAuthenticationEnabled(true) + .build(); + TestCrawlerWithSecuredWebApi crawler = + new TestCrawlerWithSecuredWebApi(serverConfig, accessControlConfig); + + WebSocketClient wsClient = new WebSocketClient(); + WebSocketAdapter clientSocket = new WebSocketAdapter(); + ClientUpgradeRequest upgradeRequest = new ClientUpgradeRequest(); + // Must set the Origin header, otherwise the socket endpoint creation will fail + upgradeRequest.setHeader(HttpHeader.ORIGIN.toString(), "localhost"); + + ExecutorService executor = Executors.newSingleThreadExecutor(); + try { + executor.execute(crawler::start); + Awaitility.await().atMost(30, TimeUnit.SECONDS).untilTrue(crawler.isServerStarted()); + + // Log in + LoginDto loginDto = new LoginDto(USERNAME, PASSWORD); + HttpPost loginRequest = new HttpPost("http://localhost:8080/api/auth"); + loginRequest.addHeader(HttpHeader.CONTENT_TYPE.asString(), + Type.APPLICATION_JSON_UTF_8.asString()); + loginRequest.setEntity(new StringEntity(MAPPER.writeValueAsString(loginDto))); + try (CloseableHttpResponse response = HTTP_CLIENT.execute(loginRequest)) { + Assert.assertEquals(HttpStatus.OK_200, response.getStatusLine().getStatusCode()); + + // Try accessing endpoint + upgradeRequest.setCookies(convertCookies(COOKIE_STORE.getCookies())); + wsClient.start(); + wsClient.connect(clientSocket, + URI.create("ws://localhost:8080/api/ws/test-with-role"), upgradeRequest); + Awaitility.await().untilAsserted(() -> + Assert.assertTrue(clientSocket.isConnected())); + } + } finally { + crawler.allowStop(); + executor.shutdown(); + wsClient.stop(); + COOKIE_STORE.clear(); + } + } + + @Test + public void testWebSocketEndpointWhenOriginIsNotAllowed() throws Exception { + ServerConfiguration serverConfig = new ServerConfigurationBuilder() + .setCorsAllowedOrigins(Collections.singleton("http://example.com")) + 
.build(); + TestCrawlerWithWebApi crawler = new TestCrawlerWithWebApi(serverConfig); + + WebSocketClient wsClient = new WebSocketClient(); + WebSocketAdapter clientSocket = new WebSocketAdapter(); + ClientUpgradeRequest upgradeRequest = new ClientUpgradeRequest(); + // Must set the Origin header, otherwise the socket endpoint creation will fail + upgradeRequest.setHeader(HttpHeader.ORIGIN.toString(), "localhost"); + + ExecutorService executor = Executors.newSingleThreadExecutor(); + try { + executor.execute(crawler::start); + Awaitility.await().atMost(30, TimeUnit.SECONDS).untilTrue(crawler.isServerStarted()); + + wsClient.start(); + wsClient.connect(clientSocket, URI.create("ws://localhost:8080/api/ws/test"), + upgradeRequest); + Awaitility.await().pollDelay(1, TimeUnit.SECONDS).untilAsserted(() -> + Assert.assertTrue(clientSocket.isNotConnected())); + } finally { + crawler.allowStop(); + executor.shutdown(); + wsClient.stop(); + } + } + + @Test + public void testWebSocketSessionManagement() throws Exception { + ServerConfiguration serverConfig = ServerConfiguration.createDefault(); + TestCrawlerWithWebApi crawler = new TestCrawlerWithWebApi(serverConfig); + + WebSocketClient wsClient = new WebSocketClient(); + WebSocketAdapter clientSocket = new WebSocketAdapter(); + ClientUpgradeRequest upgradeRequest = new ClientUpgradeRequest(); + // Must set the Origin header, otherwise the socket endpoint creation will fail + upgradeRequest.setHeader(HttpHeader.ORIGIN.toString(), "localhost"); + + ExecutorService executor = Executors.newSingleThreadExecutor(); + try { + executor.execute(crawler::start); + Awaitility.await().atMost(30, TimeUnit.SECONDS).untilTrue(crawler.isServerStarted()); + + wsClient.start(); + wsClient.connect(clientSocket, URI.create("ws://localhost:8080/api/ws/test"), + upgradeRequest); + Awaitility.await().untilAsserted(() -> Assert.assertTrue(clientSocket.isConnected())); + Assert.assertEquals(1, crawler.getOpenWebSocketSessionCount()); + + 
wsClient.stop(); + Awaitility.await().untilAsserted(() -> + Assert.assertTrue(clientSocket.isNotConnected())); + Assert.assertEquals(0, crawler.getOpenWebSocketSessionCount()); + } finally { + crawler.allowStop(); + executor.shutdown(); + wsClient.stop(); + } + } + + private static List convertCookies(final List cookiesToConvert) { + return cookiesToConvert.stream() + .map(cookieToConvert -> { + HttpCookie convertedCookie = + new HttpCookie(cookieToConvert.getName(), cookieToConvert.getValue()); + convertedCookie.setDomain(cookieToConvert.getDomain()); + convertedCookie.setPath(cookieToConvert.getPath()); + + return convertedCookie; + }) + .collect(Collectors.toList()); + } +} diff --git a/src/test/resources/keystore.jks b/src/test/resources/keystore.jks new file mode 100644 index 0000000000000000000000000000000000000000..69847fb568df2d56be9e30d31df0a26c15573dbe GIT binary patch literal 2215 zcmcJQ`8N~{7sqEama!EjW+)Qb#*8gw$-WHX$u@*6V-RM_l%*6ymPlqsh!|tC6!Yq} zCZ&)iOTCz~)v+|n&Lr6~y`IkVp7T6^!284f;q${i_uTKj=lkU?^OgYs0BFB}zY7|L zyL=UQc^|_Dw^+RZ03R4dhWtW_LWR_zKnPG3bN~qC13<`-MYGXU=yz}OH_L=86w_uh zB{S&hE*_PM?AH@f&OzNho*#F6#*!NK`I}d0LzM3@mFjNmyG8f!i^#+tkZ>g5D>wD( zFubpJs5l)m)^e-1@QxxXO~(e$?Scz)01;A2^0;P2>DI<^YZ}qk^*#19QDszo=cg28o6q=3N?p5cw#X$@gkEL`BXADL zT)gcwiipj2mRUiHRG*b_Io=Ovt-H~FI@uUML_8P}F!UZG>yVrjY`%79x+^m_yN6m} zbL%c=smzOIanH`SYU5$3;p?eAF)w8O1U2ufR!rT7;W`NhcYidgL(8UBm4BsZTVznr z&KmaTQXlJbd#WS-is`d9kmzT^S@H0kOOAy!zvD1&Q>U$V z#q!nj+JP5X^FA4kt$>m1*}M;mx|2^=8B#E9ZH;4X%XVZ+u3`T=sn6f9rOja2ezYob zK)A^@B@Df{z5D689tkkVGs@4-nmQ=zn1OgZK70PvsHj-0;<@Vcvq?2VeZjLApq2HP z^T;YydA1|=?vacm%oxIv`pV4cnx@ilU&z%zqHtl9hJHtd!R^{k*PS)ld!ORFW01rx zLn9ISl%Lee@oTFxaP02&jzmkAOXae2r*3t_s4{P=25+*C5qZdr2me@jQ|9x^5LXo?ZVL3Zlc0b^B#?FX_ESH2GyPA#}%O;e}Y!XM={9(vFGaG$j7 zQzZADpSmRe3bG2BDyQm1Gste9iy3oNMi99(NGg5@K7?ZCbk1vapFqsv{-bz zZT6~+X<)W`fs*#DtyUzPYp5L)bu}Sph+!}TJ@M+GSDX}8Ey!EKh}utF_?q=WZD^gD 
z7>C#MzMH^1$@yR%!aAo@@?cT|!PXoMJBCb~Q#jO2v^T;4FAU9LHs{M{wIcC&DhS_E?TCyZy&C0n!4VCCc3vI3VP~k@Rx*yj_xWX}R2{qj~-cA}`X} z2IwbeQR=Nc&=u_w_;b;i^Q6$=X4(`zH>!m?E$}Xc6ao*;-Ray)F}fkQ zVT-xL2czT`UwSaZDW;X{3PLfhCfUQ@$kyVlACDM2;TGi7INt{OH6<*@|LhOEqHIHc zCif{R8bqFx3Qs9DXHge&5SrWHaAej_lzl}i?tWzDG2Nn`Wns6}N+ieueCc*OM4@_t*RPUQN(ZWDf6BcCTy7i9nd zfPGM8Fcw7ynU#P+KoAHbzvi2a5`hY;rr4%ziU5HyJ^;!G4MPb+!EO+!1jyD6ErNpm z2?7!jS6me7f5`t&6#6H^{=$PO@jrXP5`y7`K>zR%LKF!lCnbV5KIxbbXo`LvA3sgY& zaA)Q#6VMT~#1Wm!T+onH)AJJw_2tEL#wo^K0n1N5W8mFoHxigMO_z1O@_tOnKB%lfJKo7Q`>;i)n9 zZ{f0ly3EFHN+4A{elYjW@r+aM*k42jq)zUF=chfA8C>2XILJED$-?wGOg{#-Av&+l z4tninQW!)|t-$j6M_8RU_w$RvysDayQz_noYa}_Xy}=tXY3uF$>ZA5^+P=)oX^hTd m(u%Qpi5VyAB$Cx?ukECx)$a90Y2lqfCqbE=b Date: Sun, 26 May 2019 22:38:00 +0200 Subject: [PATCH 56/63] Upgrade plugin and dependency versions --- pom.xml | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/pom.xml b/pom.xml index 4740415..ed44e6a 100644 --- a/pom.xml +++ b/pom.xml @@ -59,7 +59,7 @@ org.seleniumhq.selenium htmlunit-driver - 2.33.3 + 2.35.1 net.lightbody.bmp @@ -69,7 +69,7 @@ com.google.guava guava - 27.0.1-jre + 27.1-jre org.eclipse.jetty @@ -94,12 +94,12 @@ com.fasterxml.jackson.datatype jackson-datatype-jdk8 - 2.8.9 + 2.9.9 org.slf4j slf4j-api - 1.7.25 + 1.7.26 com.auth0 @@ -126,7 +126,7 @@ com.github.tomakehurst wiremock-jre8-standalone - 2.21.0 + 2.23.2 test @@ -148,7 +148,7 @@ org.apache.maven.plugins maven-source-plugin - 3.0.1 + 3.1.0 attach-source @@ -161,7 +161,7 @@ org.apache.maven.plugins maven-javadoc-plugin - 3.0.1 + 3.1.0 attach-javadoc @@ -185,7 +185,7 @@ com.puppycrawl.tools checkstyle - 8.18 + 8.20 @@ -203,7 +203,7 @@ org.apache.maven.plugins maven-failsafe-plugin - 2.22.1 + 2.22.2 -Djdk.net.URLClassPath.disableClassPathURLCheck=true From f00aa52e9d115e9a4e4d67098399b749b61156f1 Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Mon, 27 May 2019 21:28:31 +0200 Subject: [PATCH 57/63] Add thread safety comments --- 
.../java/com/github/peterbencze/serritor/api/Crawler.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/main/java/com/github/peterbencze/serritor/api/Crawler.java b/src/main/java/com/github/peterbencze/serritor/api/Crawler.java index 0de2d0e..7505656 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/Crawler.java +++ b/src/main/java/com/github/peterbencze/serritor/api/Crawler.java @@ -126,7 +126,7 @@ protected Crawler(final CrawlerState state) { } /** - * Returns the configuration of the crawler. + * Returns the configuration of the crawler. This method is thread-safe. * * @return the configuration of the crawler */ @@ -135,7 +135,7 @@ public final CrawlerConfiguration getCrawlerConfiguration() { } /** - * Returns summary statistics about the crawl progress. + * Returns summary statistics about the crawl progress. This method is thread-safe. * * @return summary statistics about the crawl progress */ @@ -331,7 +331,7 @@ protected final void registerCustomCallback( } /** - * Gracefully stops the crawler. + * Gracefully stops the crawler. This method is thread-safe. 
*/ protected final void stop() { Validate.validState(!isStopped.get(), "The crawler is not started."); From 857cd3d5cc933ae7976869fd1564f88509afbfad Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Tue, 28 May 2019 19:39:31 +0200 Subject: [PATCH 58/63] Add helper class for sending JSON objects as HTTP response or WebSocket message --- .../serritor/api/helper/JsonSender.java | 97 +++++++++++++++++++ 1 file changed, 97 insertions(+) create mode 100644 src/main/java/com/github/peterbencze/serritor/api/helper/JsonSender.java diff --git a/src/main/java/com/github/peterbencze/serritor/api/helper/JsonSender.java b/src/main/java/com/github/peterbencze/serritor/api/helper/JsonSender.java new file mode 100644 index 0000000..9a12ca6 --- /dev/null +++ b/src/main/java/com/github/peterbencze/serritor/api/helper/JsonSender.java @@ -0,0 +1,97 @@ +/* + * Copyright 2019 Peter Bencze. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.github.peterbencze.serritor.api.helper; + +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.ObjectMapper; +import java.io.IOException; +import java.io.UncheckedIOException; +import java.util.concurrent.Future; +import javax.servlet.ServletResponse; +import org.eclipse.jetty.http.MimeTypes.Type; +import org.eclipse.jetty.websocket.api.Session; + +/** + * A helper class that can be used to send JSON objects as HTTP response or WebSocket message. 
+ */ +public final class JsonSender { + + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + /** + * Private constructor to hide the implicit public one. + */ + private JsonSender() { + } + + /** + * Returns the singleton object mapper that is used for reading and writing JSON. + * + * @return the singleton object mapper that is used for reading and writing JSON + */ + public static ObjectMapper getObjectMapper() { + return OBJECT_MAPPER; + } + + /** + * Sends a JSON HTTP response. + * + * @param response the response + * @param object the object to send as JSON + */ + public static void sendJsonResponse(final ServletResponse response, final Object object) { + response.setContentType(Type.APPLICATION_JSON_UTF_8.asString()); + + try { + OBJECT_MAPPER.writeValue(response.getOutputStream(), object); + } catch (IOException e) { + throw new UncheckedIOException(e); + } + } + + /** + * Sends a JSON WebSocket message, blocking until all bytes of the message has been + * transmitted. + * + * @param session the WebSocket session + * @param object the object to send as JSON + */ + public static void sendJsonMessage(final Session session, final Object object) { + try { + session.getRemote().sendString(OBJECT_MAPPER.writeValueAsString(object)); + } catch (IOException e) { + throw new UncheckedIOException(e); + } + } + + /** + * Initiates the asynchronous transmission of a JSON WebSocket message. This method may return + * before the message is transmitted. 
+ * + * @param session the WebSocket session + * @param object the object to send as JSON + * + * @return a Future object that can be used to track progress of the transmission + */ + public static Future sendJsonMessageByFuture(final Session session, final Object object) { + try { + return session.getRemote().sendStringByFuture(OBJECT_MAPPER.writeValueAsString(object)); + } catch (JsonProcessingException e) { + throw new UncheckedIOException(e); + } + } +} From 5e584118e674cce1d30496f0ddd762c970ed579c Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Wed, 29 May 2019 22:07:19 +0200 Subject: [PATCH 59/63] Add functionality for reading JSON structures --- .../serritor/api/helper/JsonReaderWriter.java | 139 ++++++++++++++++++ .../serritor/api/helper/JsonSender.java | 97 ------------ 2 files changed, 139 insertions(+), 97 deletions(-) create mode 100644 src/main/java/com/github/peterbencze/serritor/api/helper/JsonReaderWriter.java delete mode 100644 src/main/java/com/github/peterbencze/serritor/api/helper/JsonSender.java diff --git a/src/main/java/com/github/peterbencze/serritor/api/helper/JsonReaderWriter.java b/src/main/java/com/github/peterbencze/serritor/api/helper/JsonReaderWriter.java new file mode 100644 index 0000000..5751d8a --- /dev/null +++ b/src/main/java/com/github/peterbencze/serritor/api/helper/JsonReaderWriter.java @@ -0,0 +1,139 @@ +/* + * Copyright 2019 Peter Bencze. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.github.peterbencze.serritor.api.helper; + +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.ObjectMapper; +import java.io.IOException; +import java.io.UncheckedIOException; +import java.util.concurrent.Future; +import javax.servlet.http.HttpServletRequest; +import javax.servlet.http.HttpServletResponse; +import org.eclipse.jetty.http.MimeTypes.Type; +import org.eclipse.jetty.websocket.api.Session; + +/** + * A helper class that is intended to make it easier for users to read and write JSON structures to + * HTTP or WebSocket streams. + */ +public final class JsonReaderWriter { + + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + /** + * Private constructor to hide the implicit public one. + */ + private JsonReaderWriter() { + } + + /** + * Returns the singleton object mapper instance that is used for reading and writing JSON. + * + * @return the singleton object mapper instance that is used for reading and writing JSON + */ + public static ObjectMapper getObjectMapper() { + return OBJECT_MAPPER; + } + + /** + * Deserializes the JSON structure contained by the HTTP request body into an instance of the + * specified type. + * + * @param request the HTTP request + * @param objectType the runtime class of the object + * @param the type of the object + * + * @return the deserialized object + */ + public static T readJsonRequest( + final HttpServletRequest request, + final Class objectType) { + try { + return OBJECT_MAPPER.readValue(request.getInputStream(), objectType); + } catch (IOException e) { + throw new UncheckedIOException(e); + } + } + + /** + * Serializes the specified object and writes the JSON structure to the response. 
+ * + * @param response the HTTP response + * @param object the object to serialize + */ + public static void writeJsonResponse(final HttpServletResponse response, final Object object) { + response.setContentType(Type.APPLICATION_JSON_UTF_8.asString()); + + try { + OBJECT_MAPPER.writeValue(response.getOutputStream(), object); + } catch (IOException e) { + throw new UncheckedIOException(e); + } + } + + /** + * Deserializes the JSON structure contained by the WebSocket message into an instance of the + * specified type. + * + * @param message the WebSocket message + * @param objectType the runtime class of the object + * @param the type of the object + * + * @return the deserialized object + */ + public static T readJsonMessage(final String message, final Class objectType) { + try { + return OBJECT_MAPPER.readValue(message, objectType); + } catch (IOException e) { + throw new UncheckedIOException(e); + } + } + + /** + * Serializes the specified object and writes the JSON structure to the WebSocket, blocking + * until all bytes of the message have been transmitted. + * + * @param session the WebSocket session + * @param object the object to serialize + */ + public static void writeJsonMessage(final Session session, final Object object) { + try { + session.getRemote().sendString(OBJECT_MAPPER.writeValueAsString(object)); + } catch (IOException e) { + throw new UncheckedIOException(e); + } + } + + /** + * Serializes the specified object and asynchronously writes the JSON structure to the + * WebSocket. This method may return before the message is transmitted. 
+ * + * @param session the WebSocket session + * @param object the object to serialize + * + * @return a Future object that can be used to track progress of the transmission + */ + public static Future writeJsonMessageByFuture( + final Session session, + final Object object) { + try { + return session.getRemote().sendStringByFuture(OBJECT_MAPPER.writeValueAsString(object)); + } catch (JsonProcessingException e) { + throw new UncheckedIOException(e); + } + } +} diff --git a/src/main/java/com/github/peterbencze/serritor/api/helper/JsonSender.java b/src/main/java/com/github/peterbencze/serritor/api/helper/JsonSender.java deleted file mode 100644 index 9a12ca6..0000000 --- a/src/main/java/com/github/peterbencze/serritor/api/helper/JsonSender.java +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Copyright 2019 Peter Bencze. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.github.peterbencze.serritor.api.helper; - -import com.fasterxml.jackson.core.JsonProcessingException; -import com.fasterxml.jackson.databind.ObjectMapper; -import java.io.IOException; -import java.io.UncheckedIOException; -import java.util.concurrent.Future; -import javax.servlet.ServletResponse; -import org.eclipse.jetty.http.MimeTypes.Type; -import org.eclipse.jetty.websocket.api.Session; - -/** - * A helper class that can be used to send JSON objects as HTTP response or WebSocket message. 
- */ -public final class JsonSender { - - private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - - /** - * Private constructor to hide the implicit public one. - */ - private JsonSender() { - } - - /** - * Returns the singleton object mapper that is used for reading and writing JSON. - * - * @return the singleton object mapper that is used for reading and writing JSON - */ - public static ObjectMapper getObjectMapper() { - return OBJECT_MAPPER; - } - - /** - * Sends a JSON HTTP response. - * - * @param response the response - * @param object the object to send as JSON - */ - public static void sendJsonResponse(final ServletResponse response, final Object object) { - response.setContentType(Type.APPLICATION_JSON_UTF_8.asString()); - - try { - OBJECT_MAPPER.writeValue(response.getOutputStream(), object); - } catch (IOException e) { - throw new UncheckedIOException(e); - } - } - - /** - * Sends a JSON WebSocket message, blocking until all bytes of the message has been - * transmitted. - * - * @param session the WebSocket session - * @param object the object to send as JSON - */ - public static void sendJsonMessage(final Session session, final Object object) { - try { - session.getRemote().sendString(OBJECT_MAPPER.writeValueAsString(object)); - } catch (IOException e) { - throw new UncheckedIOException(e); - } - } - - /** - * Initiates the asynchronous transmission of a JSON WebSocket message. This method may return - * before the message is transmitted. 
- * - * @param session the WebSocket session - * @param object the object to send as JSON - * - * @return a Future object that can be used to track progress of the transmission - */ - public static Future sendJsonMessageByFuture(final Session session, final Object object) { - try { - return session.getRemote().sendStringByFuture(OBJECT_MAPPER.writeValueAsString(object)); - } catch (JsonProcessingException e) { - throw new UncheckedIOException(e); - } - } -} From 755acf7f85754d0c0e00ed97addfac5f7f590bb8 Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Wed, 29 May 2019 22:48:45 +0200 Subject: [PATCH 60/63] Use helper class instead of multiple object mapper instances --- .../internal/web/JsonErrorHandler.java | 7 +- .../web/http/auth/JwtAuthenticator.java | 8 +- .../peterbencze/serritor/it/web/WebApiIT.java | 84 ++++++++++--------- 3 files changed, 51 insertions(+), 48 deletions(-) diff --git a/src/main/java/com/github/peterbencze/serritor/internal/web/JsonErrorHandler.java b/src/main/java/com/github/peterbencze/serritor/internal/web/JsonErrorHandler.java index e0d925d..44a4466 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/web/JsonErrorHandler.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/web/JsonErrorHandler.java @@ -16,7 +16,7 @@ package com.github.peterbencze.serritor.internal.web; -import com.fasterxml.jackson.databind.ObjectMapper; +import com.github.peterbencze.serritor.api.helper.JsonReaderWriter; import com.github.peterbencze.serritor.internal.web.http.dto.ErrorDto; import java.io.IOException; import java.io.Writer; @@ -68,9 +68,6 @@ protected void writeErrorPage( final int code, final String message, final boolean showStacks) throws IOException { - ObjectMapper mapper = new ObjectMapper(); - ErrorDto errorDto = new ErrorDto(code, message); - - mapper.writeValue(writer, errorDto); + JsonReaderWriter.getObjectMapper().writeValue(writer, new ErrorDto(code, message)); } } diff --git 
a/src/main/java/com/github/peterbencze/serritor/internal/web/http/auth/JwtAuthenticator.java b/src/main/java/com/github/peterbencze/serritor/internal/web/http/auth/JwtAuthenticator.java index ba808fa..e6f74e8 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/web/http/auth/JwtAuthenticator.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/web/http/auth/JwtAuthenticator.java @@ -23,7 +23,7 @@ import com.auth0.jwt.interfaces.DecodedJWT; import com.fasterxml.jackson.core.JsonParseException; import com.fasterxml.jackson.databind.JsonMappingException; -import com.fasterxml.jackson.databind.ObjectMapper; +import com.github.peterbencze.serritor.api.helper.JsonReaderWriter; import com.github.peterbencze.serritor.api.web.AccessControlConfiguration; import com.github.peterbencze.serritor.internal.util.KeyFactory; import com.github.peterbencze.serritor.internal.web.http.CsrfFilter; @@ -193,10 +193,8 @@ public boolean secureResponse( private Authentication authenticateWithCredentials( final HttpServletRequest request, final HttpServletResponse response) throws IOException { - ObjectMapper mapper = new ObjectMapper(); - try { - LoginDto loginDto = mapper.readValue(request.getInputStream(), LoginDto.class); + LoginDto loginDto = JsonReaderWriter.readJsonRequest(request, LoginDto.class); String username = loginDto.getUsername(); @@ -248,7 +246,7 @@ private Authentication authenticateWithCredentials( response.setContentType(Type.APPLICATION_JSON.asString()); JwtDto jwtDto = new JwtDto(username, userRoles, expiryDate, jwt); - mapper.writeValue(response.getOutputStream(), jwtDto); + JsonReaderWriter.writeJsonResponse(response, jwtDto); return new UserAuthentication(getAuthMethod(), userIdentity); } catch (JsonParseException | JsonMappingException e) { diff --git a/src/test/java/com/github/peterbencze/serritor/it/web/WebApiIT.java b/src/test/java/com/github/peterbencze/serritor/it/web/WebApiIT.java index 335086e..a1646b5 100644 --- 
a/src/test/java/com/github/peterbencze/serritor/it/web/WebApiIT.java +++ b/src/test/java/com/github/peterbencze/serritor/it/web/WebApiIT.java @@ -21,6 +21,7 @@ import com.auth0.jwt.algorithms.Algorithm; import com.auth0.jwt.interfaces.DecodedJWT; import com.fasterxml.jackson.databind.ObjectMapper; +import com.github.peterbencze.serritor.api.helper.JsonReaderWriter; import com.github.peterbencze.serritor.api.web.AccessControlConfiguration; import com.github.peterbencze.serritor.api.web.AccessControlConfiguration.AccessControlConfigurationBuilder; import com.github.peterbencze.serritor.api.web.ServerConfiguration; @@ -57,6 +58,7 @@ import org.apache.http.conn.ssl.SSLConnectionSocketFactory; import org.apache.http.conn.ssl.TrustSelfSignedStrategy; import org.apache.http.cookie.Cookie; +import org.apache.http.entity.ContentType; import org.apache.http.entity.StringEntity; import org.apache.http.impl.client.BasicCookieStore; import org.apache.http.impl.client.CloseableHttpClient; @@ -68,7 +70,6 @@ import org.awaitility.Awaitility; import org.eclipse.jetty.http.HttpHeader; import org.eclipse.jetty.http.HttpStatus; -import org.eclipse.jetty.http.MimeTypes.Type; import org.eclipse.jetty.util.ssl.SslContextFactory; import org.eclipse.jetty.websocket.api.WebSocketAdapter; import org.eclipse.jetty.websocket.client.ClientUpgradeRequest; @@ -99,7 +100,7 @@ public final class WebApiIT { private static final CookieStore COOKIE_STORE = new BasicCookieStore(); - private static final ObjectMapper MAPPER = new ObjectMapper(); + private static final ObjectMapper OBJECT_MAPPER = JsonReaderWriter.getObjectMapper(); private static CloseableHttpClient HTTP_CLIENT; @@ -156,7 +157,8 @@ public void testHttpEndpointWhenEndpointDoesNotExist() throws IOException { HttpGet request = new HttpGet("http://localhost:8080/http/nonexistent"); try (CloseableHttpResponse response = HTTP_CLIENT.execute(request)) { String content = EntityUtils.toString(response.getEntity(), StandardCharsets.UTF_8); - 
ErrorDto errorDto = MAPPER.readValue(content, ErrorDto.class); + ErrorDto errorDto = JsonReaderWriter.getObjectMapper() + .readValue(content, ErrorDto.class); Assert.assertEquals(HttpStatus.NOT_FOUND_404, response.getStatusLine().getStatusCode()); @@ -211,7 +213,8 @@ public void testHttpEndpointWhenNoJwtPresentInRequest() throws IOException { HttpGet request = new HttpGet("http://localhost:8080/api/http/test"); try (CloseableHttpResponse response = HTTP_CLIENT.execute(request)) { String content = EntityUtils.toString(response.getEntity(), StandardCharsets.UTF_8); - ErrorDto errorDto = MAPPER.readValue(content, ErrorDto.class); + ErrorDto errorDto = JsonReaderWriter.getObjectMapper() + .readValue(content, ErrorDto.class); Assert.assertEquals(HttpStatus.UNAUTHORIZED_401, response.getStatusLine().getStatusCode()); @@ -242,7 +245,8 @@ public void testHttpEndpointWhenInvalidJwtProvidedInHeader() throws IOException request.setHeader(HttpHeader.AUTHORIZATION.asString(), "Bearer invalid"); try (CloseableHttpResponse response = HTTP_CLIENT.execute(request)) { String content = EntityUtils.toString(response.getEntity(), StandardCharsets.UTF_8); - ErrorDto errorDto = MAPPER.readValue(content, ErrorDto.class); + ErrorDto errorDto = JsonReaderWriter.getObjectMapper() + .readValue(content, ErrorDto.class); Assert.assertEquals(HttpStatus.UNAUTHORIZED_401, response.getStatusLine().getStatusCode()); @@ -278,7 +282,8 @@ public void testHttpEndpointWhenInvalidJwtProvidedInCookie() throws IOException try (CloseableHttpResponse response = HTTP_CLIENT.execute(request)) { String content = EntityUtils.toString(response.getEntity(), StandardCharsets.UTF_8); - ErrorDto errorDto = MAPPER.readValue(content, ErrorDto.class); + ErrorDto errorDto = JsonReaderWriter.getObjectMapper() + .readValue(content, ErrorDto.class); Assert.assertEquals(HttpStatus.UNAUTHORIZED_401, response.getStatusLine().getStatusCode()); @@ -310,13 +315,13 @@ public void testLoginWhenCredentialsAreCorrect() throws 
IOException { LoginDto loginDto = new LoginDto(USERNAME, PASSWORD); HttpPost request = new HttpPost("http://localhost:8080/api/auth"); - request.addHeader(HttpHeader.CONTENT_TYPE.asString(), - Type.APPLICATION_JSON_UTF_8.asString()); - request.setEntity(new StringEntity(MAPPER.writeValueAsString(loginDto))); + StringEntity entity = new StringEntity(OBJECT_MAPPER.writeValueAsString(loginDto), + ContentType.APPLICATION_JSON); + request.setEntity(entity); try (CloseableHttpResponse response = HTTP_CLIENT.execute(request)) { String content = EntityUtils.toString(response.getEntity(), StandardCharsets.UTF_8); - JwtDto jwtDto = MAPPER.readValue(content, JwtDto.class); + JwtDto jwtDto = JsonReaderWriter.getObjectMapper().readValue(content, JwtDto.class); DecodedJWT decodedJwt = JWT_VERIFIER.verify(jwtDto.getJwt()); Assert.assertEquals(HttpStatus.OK_200, response.getStatusLine().getStatusCode()); @@ -351,13 +356,13 @@ public void testLoginWhenCredentialsAreCorrectAndCookieAuthenticationIsEnabled() LoginDto loginDto = new LoginDto(USERNAME, PASSWORD); HttpPost request = new HttpPost("http://localhost:8080/api/auth"); - request.addHeader(HttpHeader.CONTENT_TYPE.asString(), - Type.APPLICATION_JSON_UTF_8.asString()); - request.setEntity(new StringEntity(MAPPER.writeValueAsString(loginDto))); + StringEntity entity = new StringEntity(OBJECT_MAPPER.writeValueAsString(loginDto), + ContentType.APPLICATION_JSON); + request.setEntity(entity); try (CloseableHttpResponse response = HTTP_CLIENT.execute(request)) { String content = EntityUtils.toString(response.getEntity(), StandardCharsets.UTF_8); - JwtDto jwtDto = MAPPER.readValue(content, JwtDto.class); + JwtDto jwtDto = JsonReaderWriter.getObjectMapper().readValue(content, JwtDto.class); DecodedJWT decodedJwt = JWT_VERIFIER.verify(jwtDto.getJwt()); Optional jwtCookieOpt = COOKIE_STORE.getCookies() @@ -412,13 +417,14 @@ public void testLoginWhenCredentialsAreIncorrect() throws IOException { LoginDto loginDto = new LoginDto(USERNAME, 
"wrong-password"); HttpPost request = new HttpPost("http://localhost:8080/api/auth"); - request.addHeader(HttpHeader.CONTENT_TYPE.asString(), - Type.APPLICATION_JSON_UTF_8.asString()); - request.setEntity(new StringEntity(MAPPER.writeValueAsString(loginDto))); + StringEntity entity = new StringEntity(OBJECT_MAPPER.writeValueAsString(loginDto), + ContentType.APPLICATION_JSON); + request.setEntity(entity); try (CloseableHttpResponse response = HTTP_CLIENT.execute(request)) { String content = EntityUtils.toString(response.getEntity(), StandardCharsets.UTF_8); - ErrorDto errorDto = MAPPER.readValue(content, ErrorDto.class); + ErrorDto errorDto = JsonReaderWriter.getObjectMapper() + .readValue(content, ErrorDto.class); Assert.assertEquals(HttpStatus.UNAUTHORIZED_401, response.getStatusLine().getStatusCode()); @@ -450,9 +456,9 @@ public void testHttpEndpointWhenUserIsNotAuthorized() throws IOException { // Log in LoginDto loginDto = new LoginDto(USERNAME, PASSWORD); HttpPost loginRequest = new HttpPost("http://localhost:8080/api/auth"); - loginRequest.addHeader(HttpHeader.CONTENT_TYPE.asString(), - Type.APPLICATION_JSON_UTF_8.asString()); - loginRequest.setEntity(new StringEntity(MAPPER.writeValueAsString(loginDto))); + StringEntity entity = new StringEntity(OBJECT_MAPPER.writeValueAsString(loginDto), + ContentType.APPLICATION_JSON); + loginRequest.setEntity(entity); try (CloseableHttpResponse response = HTTP_CLIENT.execute(loginRequest)) { EntityUtils.consumeQuietly(response.getEntity()); @@ -463,7 +469,8 @@ public void testHttpEndpointWhenUserIsNotAuthorized() throws IOException { HttpGet endpointRequest = new HttpGet("http://localhost:8080/api/http/test-with-role"); try (CloseableHttpResponse response = HTTP_CLIENT.execute(endpointRequest)) { String content = EntityUtils.toString(response.getEntity(), StandardCharsets.UTF_8); - ErrorDto errorDto = MAPPER.readValue(content, ErrorDto.class); + ErrorDto errorDto = JsonReaderWriter.getObjectMapper() + .readValue(content, 
ErrorDto.class); Assert.assertEquals(HttpStatus.FORBIDDEN_403, response.getStatusLine().getStatusCode()); @@ -497,9 +504,9 @@ public void testHttpEndpointWhenUserIsAuthorized() throws IOException { // Log in LoginDto loginDto = new LoginDto(USERNAME, PASSWORD); HttpPost loginRequest = new HttpPost("http://localhost:8080/api/auth"); - loginRequest.addHeader(HttpHeader.CONTENT_TYPE.asString(), - Type.APPLICATION_JSON_UTF_8.asString()); - loginRequest.setEntity(new StringEntity(MAPPER.writeValueAsString(loginDto))); + StringEntity entity = new StringEntity(OBJECT_MAPPER.writeValueAsString(loginDto), + ContentType.APPLICATION_JSON); + loginRequest.setEntity(entity); try (CloseableHttpResponse response = HTTP_CLIENT.execute(loginRequest)) { EntityUtils.consumeQuietly(response.getEntity()); @@ -539,9 +546,9 @@ public void testHttpEndpointCsrfProtectionWhenHeaderIsNotPresentInRequest() thro // Log in LoginDto loginDto = new LoginDto(USERNAME, PASSWORD); HttpPost loginRequest = new HttpPost("http://localhost:8080/api/auth"); - loginRequest.addHeader(HttpHeader.CONTENT_TYPE.asString(), - Type.APPLICATION_JSON_UTF_8.asString()); - loginRequest.setEntity(new StringEntity(MAPPER.writeValueAsString(loginDto))); + StringEntity entity = new StringEntity(OBJECT_MAPPER.writeValueAsString(loginDto), + ContentType.APPLICATION_JSON); + loginRequest.setEntity(entity); try (CloseableHttpResponse response = HTTP_CLIENT.execute(loginRequest)) { EntityUtils.consumeQuietly(response.getEntity()); @@ -552,7 +559,8 @@ public void testHttpEndpointCsrfProtectionWhenHeaderIsNotPresentInRequest() thro HttpPost endpointRequest = new HttpPost("http://localhost:8080/api/http/test-csrf"); try (CloseableHttpResponse response = HTTP_CLIENT.execute(endpointRequest)) { String content = EntityUtils.toString(response.getEntity(), StandardCharsets.UTF_8); - ErrorDto errorDto = MAPPER.readValue(content, ErrorDto.class); + ErrorDto errorDto = JsonReaderWriter.getObjectMapper() + .readValue(content, 
ErrorDto.class); Assert.assertEquals(HttpStatus.UNAUTHORIZED_401, response.getStatusLine().getStatusCode()); @@ -584,9 +592,9 @@ public void testHttpEndpointCsrfProtectionWhenHeaderIsPresentInRequest() throws // Log in LoginDto loginDto = new LoginDto(USERNAME, PASSWORD); HttpPost loginRequest = new HttpPost("http://localhost:8080/api/auth"); - loginRequest.addHeader(HttpHeader.CONTENT_TYPE.asString(), - Type.APPLICATION_JSON_UTF_8.asString()); - loginRequest.setEntity(new StringEntity(MAPPER.writeValueAsString(loginDto))); + StringEntity entity = new StringEntity(OBJECT_MAPPER.writeValueAsString(loginDto), + ContentType.APPLICATION_JSON); + loginRequest.setEntity(entity); try (CloseableHttpResponse response = HTTP_CLIENT.execute(loginRequest)) { EntityUtils.consumeQuietly(response.getEntity()); @@ -791,9 +799,9 @@ public void testWebSocketEndpointWhenUserIsNotAuthorized() throws Exception { // Log in LoginDto loginDto = new LoginDto(USERNAME, PASSWORD); HttpPost loginRequest = new HttpPost("http://localhost:8080/api/auth"); - loginRequest.addHeader(HttpHeader.CONTENT_TYPE.asString(), - Type.APPLICATION_JSON_UTF_8.asString()); - loginRequest.setEntity(new StringEntity(MAPPER.writeValueAsString(loginDto))); + StringEntity entity = new StringEntity(OBJECT_MAPPER.writeValueAsString(loginDto), + ContentType.APPLICATION_JSON); + loginRequest.setEntity(entity); try (CloseableHttpResponse response = HTTP_CLIENT.execute(loginRequest)) { EntityUtils.consumeQuietly(response.getEntity()); @@ -840,9 +848,9 @@ public void testWebSocketEndpointWhenUserIsAuthorized() throws Exception { // Log in LoginDto loginDto = new LoginDto(USERNAME, PASSWORD); HttpPost loginRequest = new HttpPost("http://localhost:8080/api/auth"); - loginRequest.addHeader(HttpHeader.CONTENT_TYPE.asString(), - Type.APPLICATION_JSON_UTF_8.asString()); - loginRequest.setEntity(new StringEntity(MAPPER.writeValueAsString(loginDto))); + StringEntity entity = new 
StringEntity(OBJECT_MAPPER.writeValueAsString(loginDto), + ContentType.APPLICATION_JSON); + loginRequest.setEntity(entity); try (CloseableHttpResponse response = HTTP_CLIENT.execute(loginRequest)) { Assert.assertEquals(HttpStatus.OK_200, response.getStatusLine().getStatusCode()); From 17a0cad8d1d4f85f5ed9446cefc7a700e972375e Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Thu, 30 May 2019 00:40:19 +0200 Subject: [PATCH 61/63] Add JDK 8 module to object mapper --- pom.xml | 3 ++- .../peterbencze/serritor/api/helper/JsonReaderWriter.java | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/pom.xml b/pom.xml index ed44e6a..09ba7a6 100644 --- a/pom.xml +++ b/pom.xml @@ -94,7 +94,8 @@ com.fasterxml.jackson.datatype jackson-datatype-jdk8 - 2.9.9 + + 2.8.9 org.slf4j diff --git a/src/main/java/com/github/peterbencze/serritor/api/helper/JsonReaderWriter.java b/src/main/java/com/github/peterbencze/serritor/api/helper/JsonReaderWriter.java index 5751d8a..cae871e 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/helper/JsonReaderWriter.java +++ b/src/main/java/com/github/peterbencze/serritor/api/helper/JsonReaderWriter.java @@ -18,6 +18,7 @@ import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.datatype.jdk8.Jdk8Module; import java.io.IOException; import java.io.UncheckedIOException; import java.util.concurrent.Future; @@ -32,7 +33,8 @@ */ public final class JsonReaderWriter { - private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper() + .registerModule(new Jdk8Module()); /** * Private constructor to hide the implicit public one. 
From 24feaa135ce9431b5abe4d366c4b670fa616b4a5 Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Thu, 30 May 2019 09:01:24 +0200 Subject: [PATCH 62/63] Modify param description --- .../peterbencze/serritor/api/CrawlerWithSecuredWebApi.java | 2 +- .../github/peterbencze/serritor/api/CrawlerWithWebApi.java | 2 +- .../github/peterbencze/serritor/internal/web/WebApi.java | 2 +- .../internal/web/socket/WebSocketSessionManager.java | 6 +++--- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/main/java/com/github/peterbencze/serritor/api/CrawlerWithSecuredWebApi.java b/src/main/java/com/github/peterbencze/serritor/api/CrawlerWithSecuredWebApi.java index 8592770..b681782 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/CrawlerWithSecuredWebApi.java +++ b/src/main/java/com/github/peterbencze/serritor/api/CrawlerWithSecuredWebApi.java @@ -116,7 +116,7 @@ protected void onStop() { * Returns a set of open WebSocket sessions that represent connections to the specific * endpoint. * - * @param socketHandlerClass the class of the WebSocket endpoint handler + * @param socketHandlerClass the runtime class of the WebSocket endpoint handler * * @return a set of open WebSocket sessions that represent connections to the specific endpoint */ diff --git a/src/main/java/com/github/peterbencze/serritor/api/CrawlerWithWebApi.java b/src/main/java/com/github/peterbencze/serritor/api/CrawlerWithWebApi.java index 195c2bb..2a3f2f9 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/CrawlerWithWebApi.java +++ b/src/main/java/com/github/peterbencze/serritor/api/CrawlerWithWebApi.java @@ -98,7 +98,7 @@ protected void onStop() { * Returns a set of open WebSocket sessions that represent connections to the specific * endpoint. 
* - * @param socketHandlerClass the class of the WebSocket endpoint handler + * @param socketHandlerClass the runtime class of the WebSocket endpoint handler * * @return a set of open WebSocket sessions that represent connections to the specific endpoint */ diff --git a/src/main/java/com/github/peterbencze/serritor/internal/web/WebApi.java b/src/main/java/com/github/peterbencze/serritor/internal/web/WebApi.java index 78de5e2..53fa2cb 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/web/WebApi.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/web/WebApi.java @@ -130,7 +130,7 @@ public void stop() { * Returns a set of open WebSocket sessions that represent connections to the specific * endpoint. * - * @param socketHandlerClass the class of the WebSocket endpoint handler + * @param socketHandlerClass the runtime class of the WebSocket endpoint handler * * @return a set of open WebSocket sessions that represent connections to the specific endpoint */ diff --git a/src/main/java/com/github/peterbencze/serritor/internal/web/socket/WebSocketSessionManager.java b/src/main/java/com/github/peterbencze/serritor/internal/web/socket/WebSocketSessionManager.java index d59d60a..16acec1 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/web/socket/WebSocketSessionManager.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/web/socket/WebSocketSessionManager.java @@ -41,7 +41,7 @@ public WebSocketSessionManager() { * Returns a set of open WebSocket sessions that represent connections to the specific * endpoint. * - * @param socketHandlerClass the class of the WebSocket endpoint handler + * @param socketHandlerClass the runtime class of the WebSocket endpoint handler * * @return a set of open WebSocket sessions that represent connections to the specific endpoint */ @@ -57,7 +57,7 @@ public Set getOpenSessions( * Adds a WebSocket session to the set of open sessions. 
This method is called when a client * connects to a WebSocket endpoint. * - * @param socketHandlerClass the class of the WebSocket endpoint handler + * @param socketHandlerClass the runtime class of the WebSocket endpoint handler * @param session the open WebSocket session */ public void addSession( @@ -71,7 +71,7 @@ public void addSession( * Removes a WebSocket session from the set of open sessions. This method is called when a * client disconnects from a WebSocket endpoint. * - * @param socketHandlerClass the class of the WebSocket endpoint handler + * @param socketHandlerClass the runtime class of the WebSocket endpoint handler * @param session the no longer open WebSocket session */ public void removeSession( From a23b29ee860b5ae7e766a785ef746cbb781f504e Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Thu, 30 May 2019 21:27:04 +0200 Subject: [PATCH 63/63] Update README --- README.md | 45 ++++++++++++++++++++++++++------------------- 1 file changed, 26 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index b21d5cd..f60790a 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,9 @@ Serritor ======== -Serritor is an open source web crawler framework built upon [Selenium](http://www.seleniumhq.org/) and written in Java. It can be used to crawl dynamic web pages that use JavaScript. +Serritor is an open source web crawler framework built upon [Selenium](http://www.seleniumhq.org/) +and written in Java. It can be used to crawl dynamic web pages that require JavaScript to render +data. 
## Using Serritor in your build ### Maven @@ -11,7 +13,7 @@ Add the following dependency to your pom.xml: com.github.peterbencze serritor - 1.6.0 + 2.0.0 ``` @@ -19,37 +21,39 @@ Add the following dependency to your pom.xml: Add the following dependency to your build.gradle: ```groovy -compile group: 'com.github.peterbencze', name: 'serritor', version: '1.6.0' +compile group: 'com.github.peterbencze', name: 'serritor', version: '2.0.0' ``` ### Manual dependencies -The standalone JAR files are available on the [releases](https://github.com/peterbencze/serritor/releases) page. +The standalone JAR files are available on the +[releases](https://github.com/peterbencze/serritor/releases) page. ## Documentation * The [Wiki](https://github.com/peterbencze/serritor/wiki) contains usage information and examples * The Javadoc is available [here](https://peterbencze.github.io/serritor/) ## Quickstart -The `BaseCrawler` abstract class provides a skeletal implementation of a crawler to minimize the effort to create your own. The extending class should define the logic of the crawler. +The `Crawler` abstract class provides a skeletal implementation of a crawler to minimize the effort +to create your own. The extending class should implement the logic of the crawler. 
Below you can find a simple example that is enough to get you started: ```java -public class MyCrawler extends BaseCrawler { +public class MyCrawler extends Crawler { private final UrlFinder urlFinder; public MyCrawler(final CrawlerConfiguration config) { super(config); - // Extract URLs from links on the crawled page + // A helper class that is intended to make it easier to find URLs on web pages urlFinder = UrlFinder.createDefault(); } @Override - protected void onPageLoad(final PageLoadEvent event) { - // Crawl every URL that match the given pattern - urlFinder.findUrlsInPage(event) + protected void onResponseSuccess(final ResponseSuccessEvent event) { + // Crawl every URL found on the page + urlFinder.findUrlsInPage(event.getCompleteCrawlResponse()) .stream() .map(CrawlRequest::createDefault) .forEach(this::crawl); @@ -58,11 +62,11 @@ public class MyCrawler extends BaseCrawler { } } ``` -By default, the crawler uses [HtmlUnit headless browser](http://htmlunit.sourceforge.net/): +By default, the crawler uses the [HtmlUnit](http://htmlunit.sourceforge.net/) headless browser: ```java // Create the configuration CrawlerConfiguration config = new CrawlerConfigurationBuilder() - .setOffsiteRequestFiltering(true) + .setOffsiteRequestFilterEnabled(true) .addAllowedCrawlDomain("example.com") .addCrawlSeed(CrawlRequest.createDefault("http://example.com")) .build(); @@ -70,14 +74,14 @@ CrawlerConfiguration config = new CrawlerConfigurationBuilder() // Create the crawler using the configuration above MyCrawler crawler = new MyCrawler(config); -// Start it +// Start crawling with HtmlUnit crawler.start(); ``` -Of course, you can also use any other browsers by specifying a corresponding `WebDriver` instance: +Of course, you can also use other browsers. Currently Chrome and Firefox are supported. 
```java // Create the configuration CrawlerConfiguration config = new CrawlerConfigurationBuilder() - .setOffsiteRequestFiltering(true) + .setOffsiteRequestFilterEnabled(true) .addAllowedCrawlDomain("example.com") .addCrawlSeed(CrawlRequest.createDefault("http://example.com")) .build(); @@ -85,11 +89,13 @@ CrawlerConfiguration config = new CrawlerConfigurationBuilder() // Create the crawler using the configuration above MyCrawler crawler = new MyCrawler(config); -// Start it -crawler.start(new ChromeDriver()); +// Start crawling with Chrome +crawler.start(Browser.CHROME); ``` -That's it! In just a few lines you can create a crawler that crawls every link it finds, while filtering duplicate and offsite requests. You also get access to the `WebDriver` instance, so you can use all the features that are provided by Selenium. +That's it! In just a few lines you can create a crawler that crawls every link it finds, while +filtering duplicate and offsite requests. You also get access to the `WebDriver`, so you can use +all the features that are provided by Selenium. ## Support If this framework helped you in any way, or you would like to support the development: @@ -99,4 +105,5 @@ If this framework helped you in any way, or you would like to support the develo Any amount you choose to give will be greatly appreciated. ## License -The source code of Serritor is made available under the [Apache License, Version 2.0](https://www.apache.org/licenses/LICENSE-2.0). +The source code of Serritor is made available under the +[Apache License, Version 2.0](https://www.apache.org/licenses/LICENSE-2.0).