From 8ddaf41db27a358f99e989656b885a00fc7faa45 Mon Sep 17 00:00:00 2001
From: kristinn
Date: Fri, 5 Jul 2019 11:27:15 +0000
Subject: [PATCH 1/6] Add crawler-commons dependency. Bump commons-io
 dependency version to match crawler-commons.

---
 commons/pom.xml | 2 +-
 modules/pom.xml | 5 +++++
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/commons/pom.xml b/commons/pom.xml
index 3a9da7dcc..e847c8b92 100644
--- a/commons/pom.xml
+++ b/commons/pom.xml
@@ -57,7 +57,7 @@
     <dependency>
       <groupId>commons-io</groupId>
       <artifactId>commons-io</artifactId>
-      <version>1.4</version>
+      <version>2.4</version>
       <scope>compile</scope>
     </dependency>
diff --git a/modules/pom.xml b/modules/pom.xml
index d38cd346a..3db49f545 100644
--- a/modules/pom.xml
+++ b/modules/pom.xml
@@ -70,6 +70,11 @@
       <version>1.6.6</version>
       <scope>test</scope>
     </dependency>
+    <dependency>
+      <groupId>com.github.crawler-commons</groupId>
+      <artifactId>crawler-commons</artifactId>
+      <version>1.0</version>
+    </dependency>

From 0f7675ab5edce6c78a85e2c657beab229161bc70 Mon Sep 17 00:00:00 2001
From: kristinn
Date: Fri, 5 Jul 2019 11:27:43 +0000
Subject: [PATCH 2/6] Add support for new hop type, (M)anifest

---
 modules/src/main/java/org/archive/modules/extractor/Hop.java | 3 +++
 .../main/java/org/archive/modules/extractor/LinkContext.java | 4 ++++
 2 files changed, 7 insertions(+)

diff --git a/modules/src/main/java/org/archive/modules/extractor/Hop.java b/modules/src/main/java/org/archive/modules/extractor/Hop.java
index 23f4c688b..914e9fc58 100644
--- a/modules/src/main/java/org/archive/modules/extractor/Hop.java
+++ b/modules/src/main/java/org/archive/modules/extractor/Hop.java
@@ -55,6 +55,9 @@ public enum Hop {
      * material, but deduced by convention. */
     INFERRED('I'),
 
+    /** Found in some form of site-provided URL manifest (e.g. a sitemap). */
+    MANIFEST('M'),
+
     /** Synthesized form-submit */
     SUBMIT('S');

diff --git a/modules/src/main/java/org/archive/modules/extractor/LinkContext.java b/modules/src/main/java/org/archive/modules/extractor/LinkContext.java
index b41e79622..ef2e81502 100644
--- a/modules/src/main/java/org/archive/modules/extractor/LinkContext.java
+++ b/modules/src/main/java/org/archive/modules/extractor/LinkContext.java
@@ -76,6 +76,10 @@ public String toString() {
     final public static LinkContext PREREQ_MISC
         = new SimpleLinkContext("=PREREQ_MISC");
 
+    /** Stand-in value for manifest URLs without other context. */
+    final public static LinkContext MANIFEST_MISC
+        = new SimpleLinkContext("=MANIFEST_MISC");
+
     public boolean equals(Object o) {
         if (o == this) {
             return true;
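A note on the new hop type: Heritrix records how each URI was discovered as a
"hop path", a string of one-character codes appended hop by hop ('L' link,
'P' prerequisite, 'I' inferred, and now 'M' manifest). The sketch below shows
how the new 'M' code is meant to compose; it is a standalone illustration
with made-up values, not code from this patch series:

    // Standalone sketch: how the new 'M' hop code composes into a hop path.
    // All values are hypothetical; Heritrix builds hop paths internally.
    public class HopPathSketch {
        public static void main(String[] args) {
            String seedPath = "";                // a seed has an empty hop path
            String robotsPath = seedPath + "P";  // robots.txt fetched as a (P)rerequisite
            // ExtractorRobotsTxt (patch 3) strips the "P" before queueing
            // discovered sitemaps, so a sitemap gets the (M)anifest code:
            String sitemapPath = robotsPath.replace("P", "") + "M";
            String pagePath = sitemapPath + "M"; // a URL listed in that sitemap
            System.out.println(pagePath);        // prints "MM"
        }
    }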
From 5941d59b0e4684b0d3f3bc643f17228086f78671 Mon Sep 17 00:00:00 2001
From: kristinn
Date: Fri, 5 Jul 2019 11:28:19 +0000
Subject: [PATCH 3/6] Add extractor to get sitemap URLs from robots.txt

---
 .../modules/extractor/ExtractorRobotsTxt.java | 115 ++++++++++++++++++
 1 file changed, 115 insertions(+)
 create mode 100644 modules/src/main/java/org/archive/modules/extractor/ExtractorRobotsTxt.java

diff --git a/modules/src/main/java/org/archive/modules/extractor/ExtractorRobotsTxt.java b/modules/src/main/java/org/archive/modules/extractor/ExtractorRobotsTxt.java
new file mode 100644
index 000000000..03aebca56
--- /dev/null
+++ b/modules/src/main/java/org/archive/modules/extractor/ExtractorRobotsTxt.java
@@ -0,0 +1,115 @@
+package org.archive.modules.extractor;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.commons.httpclient.URIException;
+import org.archive.modules.CrawlURI;
+import org.archive.modules.extractor.ContentExtractor;
+import org.archive.modules.extractor.Hop;
+import org.archive.modules.extractor.LinkContext;
+
+public class ExtractorRobotsTxt extends ContentExtractor {
+    private static final Logger LOGGER = Logger
+            .getLogger(ExtractorRobotsTxt.class.getName());
+    private static final Pattern ROBOTS_PATTERN = Pattern
+            .compile("^https?://[^/]+/robots\\.txt$");
+    private static final Pattern SITEMAP_PATTERN = Pattern
+            .compile("(?i)Sitemap:\\s*(.+)$");
+
+    public static final String ANNOTATION_IS_SITEMAP = "isSitemap";
+
+    @Override
+    protected boolean shouldExtract(CrawlURI uri) {
+        // Only robots.txt prerequisite fetches are of interest:
+        boolean shouldExtract = false;
+        if (uri.isPrerequisite()) {
+            shouldExtract = ROBOTS_PATTERN.matcher(uri.getURI()).matches();
+            LOGGER.finest("Checked prerequisite " + uri + " GOT " + shouldExtract);
+        }
+        return shouldExtract;
+    }
+
+    public List<String> parseRobotsTxt(InputStream input) {
+        List<String> links = new ArrayList<>();
+        BufferedReader reader = new BufferedReader(new InputStreamReader(input));
+        try {
+            String line;
+            Matcher matcher;
+            while ((line = reader.readLine()) != null) {
+                matcher = SITEMAP_PATTERN.matcher(line);
+                if (matcher.matches()) {
+                    links.add(matcher.group(1));
+                }
+            }
+        } catch (IOException e) {
+            LOGGER.warning(e.toString());
+        }
+        return links;
+    }
+
+    @Override
+    protected boolean innerExtract(CrawlURI curi) {
+        try {
+            // Clone the CrawlURI with the prerequisite marker stripped from
+            // its hop path, so discovered sitemaps are not queued as
+            // prerequisites themselves:
+            CrawlURI curiClone = new CrawlURI(curi.getUURI(),
+                    curi.getPathFromSeed().replace("P", ""), curi.getVia(),
+                    curi.getViaContext());
+
+            // Also copy the source tag over:
+            curiClone.setSourceTag(curi.getSourceTag());
+
+            // Parse the robots.txt for sitemap declarations:
+            List<String> links = parseRobotsTxt(
+                    curi.getRecorder()
+                            .getContentReplayInputStream());
+            LOGGER.finest("Checked " + curi + " GOT " + links);
+
+            // Get the max outlinks (needed by the add method):
+            int max = getExtractorParameters().getMaxOutlinks();
+
+            // Accrue links:
+            for (String link : links) {
+                try {
+                    // We've found a sitemap:
+                    LOGGER.fine("Found site map: " + link);
+                    numberOfLinksExtracted.incrementAndGet();
+
+                    // Add the link, using the cloned CrawlURI as the crawl
+                    // context:
+                    CrawlURI newCuri = addRelativeToBase(curiClone, max, link,
+                            LinkContext.MANIFEST_MISC, Hop.MANIFEST);
+
+                    // Annotate it as a sitemap:
+                    newCuri.getAnnotations().add(
+                            ExtractorRobotsTxt.ANNOTATION_IS_SITEMAP);
+
+                } catch (URIException e) {
+                    logUriError(e, curi.getUURI(), link);
+                }
+            }
+
+            // Patch the outlinks back into the original CrawlURI:
+            for (CrawlURI outlink : curiClone.getOutLinks()) {
+                curi.getOutLinks().add(outlink);
+            }
+
+            // Report whether any links were discovered:
+            return !links.isEmpty();
+
+        } catch (IOException e) {
+            LOGGER.log(Level.WARNING, curi.getURI(), e);
+            curi.getNonFatalFailures().add(e);
+        }
+        return false;
+    }
+
+}
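Since parseRobotsTxt() is public, its line matching is easy to check in
isolation. A minimal sketch (the robots.txt content here is made up; only the
class above is assumed):

    import java.io.ByteArrayInputStream;
    import java.nio.charset.StandardCharsets;
    import java.util.List;
    import org.archive.modules.extractor.ExtractorRobotsTxt;

    // Standalone sketch: feed a made-up robots.txt through parseRobotsTxt().
    public class ParseRobotsTxtSketch {
        public static void main(String[] args) {
            String robots = "User-agent: *\n"
                    + "Disallow: /private/\n"
                    + "Sitemap: https://example.com/sitemap.xml\n"
                    + "sitemap: https://example.com/news-sitemap.xml\n";
            List<String> sitemaps = new ExtractorRobotsTxt().parseRobotsTxt(
                    new ByteArrayInputStream(
                            robots.getBytes(StandardCharsets.UTF_8)));
            // The (?i) flag makes the match case-insensitive, so both the
            // "Sitemap:" and "sitemap:" lines are returned:
            System.out.println(sitemaps);
        }
    }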
From ba8f669e1c2802683695348d3d0fd21390c100ee Mon Sep 17 00:00:00 2001
From: kristinn
Date: Fri, 5 Jul 2019 11:28:38 +0000
Subject: [PATCH 4/6] Add extractor that handles sitemaps

---
 .../modules/extractor/ExtractorSitemap.java | 178 ++++++++++++++++++
 1 file changed, 178 insertions(+)
 create mode 100644 modules/src/main/java/org/archive/modules/extractor/ExtractorSitemap.java

diff --git a/modules/src/main/java/org/archive/modules/extractor/ExtractorSitemap.java b/modules/src/main/java/org/archive/modules/extractor/ExtractorSitemap.java
new file mode 100644
index 000000000..66b64495b
--- /dev/null
+++ b/modules/src/main/java/org/archive/modules/extractor/ExtractorSitemap.java
@@ -0,0 +1,178 @@
+package org.archive.modules.extractor;
+
+import java.io.IOException;
+import java.net.URL;
+import java.util.Collection;
+import java.util.Date;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+
+import org.apache.commons.httpclient.URIException;
+import org.apache.commons.io.IOUtils;
+import org.archive.modules.CrawlURI;
+import org.archive.modules.extractor.ContentExtractor;
+import org.archive.modules.extractor.Hop;
+import org.archive.modules.extractor.LinkContext;
+
+import crawlercommons.sitemaps.AbstractSiteMap;
+import crawlercommons.sitemaps.SiteMap;
+import crawlercommons.sitemaps.SiteMapIndex;
+import crawlercommons.sitemaps.SiteMapParser;
+import crawlercommons.sitemaps.SiteMapURL;
+import crawlercommons.sitemaps.UnknownFormatException;
+
+/**
+ * Extracts URLs from XML sitemaps and sitemap indexes.
+ *
+ * @author Andrew Jackson
+ */
+public class ExtractorSitemap extends ContentExtractor {
+    private static final Logger LOGGER = Logger
+            .getLogger(ExtractorSitemap.class.getName());
+
+    /* (non-Javadoc)
+     * @see org.archive.modules.extractor.ContentExtractor#shouldExtract(org.archive.modules.CrawlURI)
+     */
+    @Override
+    protected boolean shouldExtract(CrawlURI uri) {
+        // If declared as such:
+        if (uri.getAnnotations()
+                .contains(ExtractorRobotsTxt.ANNOTATION_IS_SITEMAP)) {
+            if (uri.is2XXSuccess()) {
+                LOGGER.fine("This url (" + uri
+                        + ") is declared to be a sitemap (via robots.txt) and is a HTTP 200.");
+                return true;
+            } else {
+                LOGGER.fine("This url (" + uri
+                        + ") is declared to be a sitemap (via robots.txt) but is a HTTP "
+                        + uri.getFetchStatus() + ".");
+            }
+        }
+
+        // Via content type:
+        String mimeType = uri.getContentType();
+        if (mimeType != null) {
+            // Looks like XML:
+            if (mimeType.toLowerCase().startsWith("text/xml")
+                    || mimeType.toLowerCase().startsWith("application/xml")) {
+
+                // Check whether the content starts with the XML preamble
+                // "<?xml" and mentions "<urlset" or "<sitemapindex" early on:
+                String contentStartingChunk = uri.getRecorder()
+                        .getContentReplayPrefixString(400);
+                if (contentStartingChunk.matches("(?is)<\\?xml\\s.*")
+                        && contentStartingChunk.matches(
+                                "(?is).*(?:<urlset|<sitemapindex[>\\s]).*")) {
+                    LOGGER.info("Based on content sniffing, this is a sitemap: "
+                            + uri);
+                    return true;
+                }
+            }
+        }
+
+        // Otherwise, not a sitemap:
+        return false;
+    }
+
+    /* (non-Javadoc)
+     * @see org.archive.modules.extractor.ContentExtractor#innerExtract(org.archive.modules.CrawlURI)
+     */
+    @Override
+    protected boolean innerExtract(CrawlURI uri) {
+        // Parse the sitemap:
+        AbstractSiteMap sitemap = parseSiteMap(uri);
+
+        // Did that work?
+        if (sitemap != null) {
+            // Process results:
+            if (sitemap.isIndex()) {
+                final Collection<AbstractSiteMap> links = ((SiteMapIndex) sitemap)
+                        .getSitemaps();
+                for (final AbstractSiteMap asm : links) {
+                    if (asm == null) {
+                        continue;
+                    }
+                    this.recordOutlink(uri, asm.getUrl(), asm.getLastModified(),
+                            true);
+                }
+            } else {
+                final Collection<SiteMapURL> links = ((SiteMap) sitemap)
+                        .getSiteMapUrls();
+                for (final SiteMapURL url : links) {
+                    if (url == null) {
+                        continue;
+                    }
+                    this.recordOutlink(uri, url.getUrl(), url.getLastModified(),
+                            false);
+                }
+            }
+        }
+
+        return false;
+    }
+
+    /**
+     * Parse the sitemap using the Crawler Commons content-sniffing parser.
+     *
+     * @param uri the CrawlURI whose recorded content should be parsed
+     * @return the parsed sitemap (or sitemap index), or null on failure
+     */
+    private AbstractSiteMap parseSiteMap(CrawlURI uri) {
+        // The thing we will create:
+        AbstractSiteMap sitemap = null;
+
+        // Be strict about URLs but allow partial extraction:
+        SiteMapParser smp = new SiteMapParser(true, true);
+        // Parse it up:
+        try {
+            // Sitemaps are not supposed to be bigger than 50MB (according to
+            // Google), so if we hit problems we can implement that limit:
+            byte[] content = IOUtils.toByteArray(
+                    uri.getRecorder().getContentReplayInputStream());
+            if (content.length > 52428800) {
+                LOGGER.warning("Found sitemap exceeding 50MB " + uri + " "
+                        + content.length);
+            }
+            // Now we can process it:
+            sitemap = smp.parseSiteMap(content, new URL(uri.getURI()));
+        } catch (IOException e) {
+            LOGGER.log(Level.WARNING,
+                    "I/O Exception when parsing sitemap " + uri, e);
+        } catch (UnknownFormatException e) {
+            LOGGER.log(Level.WARNING,
+                    "UnknownFormatException when parsing sitemap " + uri, e);
+        }
+        return sitemap;
+    }
+
+    private void recordOutlink(CrawlURI curi, URL newUri, Date lastModified,
+            boolean isSitemap) {
+        try {
+            // Because sitemaps are really important, this extractor is
+            // excused from the general limit that other extractors read from
+            // getExtractorParameters().getMaxOutlinks(); instead we use the
+            // maximum number of URLs allowed in a single sitemap file:
+            int max = 50000;
+
+            // Add the 'regular' URL listed in the sitemap:
+            addRelativeToBase(curi, max, newUri.toString(),
+                    LinkContext.MANIFEST_MISC, Hop.MANIFEST);
+
+            // And log about it:
+            LOGGER.fine("Found " + newUri + " from " + curi + " Dated "
+                    + lastModified + " and with isSitemap = " + isSitemap);
+            // Count it:
+            numberOfLinksExtracted.incrementAndGet();
+        } catch (URIException e) {
+            LOGGER.log(Level.WARNING,
+                    "URIException when recording outlink " + newUri, e);
+        }
+    }
+
+}
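The heavy lifting above is delegated to crawler-commons (added in patch 1). A
minimal standalone sketch of the same calls, with a made-up one-URL sitemap
(crawler-commons 1.0 API, as used in parseSiteMap() above):

    import java.net.URL;
    import java.nio.charset.StandardCharsets;
    import crawlercommons.sitemaps.AbstractSiteMap;
    import crawlercommons.sitemaps.SiteMap;
    import crawlercommons.sitemaps.SiteMapParser;
    import crawlercommons.sitemaps.SiteMapURL;

    // Standalone sketch: parse a tiny in-memory sitemap the same way
    // ExtractorSitemap.parseSiteMap() does.
    public class SiteMapParserSketch {
        public static void main(String[] args) throws Exception {
            byte[] content = ("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
                    + "<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">\n"
                    + "  <url><loc>https://example.com/page1</loc></url>\n"
                    + "</urlset>").getBytes(StandardCharsets.UTF_8);
            // Strict about URLs, but allow partial extraction:
            SiteMapParser parser = new SiteMapParser(true, true);
            AbstractSiteMap sm = parser.parseSiteMap(content,
                    new URL("https://example.com/sitemap.xml"));
            if (!sm.isIndex()) {
                for (SiteMapURL u : ((SiteMap) sm).getSiteMapUrls()) {
                    System.out.println(u.getUrl()); // https://example.com/page1
                }
            }
        }
    }

In strict mode the parser drops URLs that fall outside the sitemap's own
location, per the sitemaps.org rules, which is why the sitemap URL itself is
passed in as the base.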
From 308cee85a3c0217e5c7f8f71df5eacb5ba9835d9 Mon Sep 17 00:00:00 2001
From: kristinn
Date: Fri, 5 Jul 2019 11:28:53 +0000
Subject: [PATCH 5/6] Add sitemap extraction to default profile

---
 .../archive/crawler/restlet/profile-crawler-beans.cxml | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/engine/src/main/resources/org/archive/crawler/restlet/profile-crawler-beans.cxml b/engine/src/main/resources/org/archive/crawler/restlet/profile-crawler-beans.cxml
index 97b35c27a..d32a327e2 100644
--- a/engine/src/main/resources/org/archive/crawler/restlet/profile-crawler-beans.cxml
+++ b/engine/src/main/resources/org/archive/crawler/restlet/profile-crawler-beans.cxml
@@ -288,6 +288,11 @@ http://example.example/example
+ <!-- Extract sitemap URIs from robots.txt, and URIs from sitemaps: -->
+ <bean id="extractorRobotsTxt" class="org.archive.modules.extractor.ExtractorRobotsTxt">
+ </bean>
+ <bean id="extractorSitemap" class="org.archive.modules.extractor.ExtractorSitemap">
+ </bean>
@@ -320,6 +325,10 @@ http://example.example/example
+ <!-- ...extract sitemap URIs from robots.txt... -->
+ <ref bean="extractorRobotsTxt"/>
+ <!-- ...extract URIs from sitemaps... -->
+ <ref bean="extractorSitemap"/>

From 52093eb25f8f17b48eea5acad06bb6eaeb99adf4 Mon Sep 17 00:00:00 2001
From: Kristinn Sigurdsson
Date: Tue, 20 Apr 2021 13:18:29 +0000
Subject: [PATCH 6/6] Only copy source tag if not null.

---
 .../archive/modules/extractor/ExtractorRobotsTxt.java | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/modules/src/main/java/org/archive/modules/extractor/ExtractorRobotsTxt.java b/modules/src/main/java/org/archive/modules/extractor/ExtractorRobotsTxt.java
index 03aebca56..c63b23be2 100644
--- a/modules/src/main/java/org/archive/modules/extractor/ExtractorRobotsTxt.java
+++ b/modules/src/main/java/org/archive/modules/extractor/ExtractorRobotsTxt.java
@@ -12,9 +12,6 @@
 
 import org.apache.commons.httpclient.URIException;
 import org.archive.modules.CrawlURI;
-import org.archive.modules.extractor.ContentExtractor;
-import org.archive.modules.extractor.Hop;
-import org.archive.modules.extractor.LinkContext;
 
 public class ExtractorRobotsTxt extends ContentExtractor {
     private static final Logger LOGGER = Logger
@@ -65,8 +62,10 @@ protected boolean innerExtract(CrawlURI curi) {
                     curi.getViaContext());
 
             // Also copy the source tag over:
-            curiClone.setSourceTag(curi.getSourceTag());
-
+            if (curi.getSourceTag() != null) {
+                curiClone.setSourceTag(curi.getSourceTag());
+            }
+
             // Parse the robots.txt for sitemap declarations:
             List<String> links = parseRobotsTxt(
                     curi.getRecorder()
                             .getContentReplayInputStream());
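Taken together, the series forms a pipeline: the robots.txt prerequisite fetch
is parsed for Sitemap: lines, each discovered sitemap is queued with the
isSitemap annotation and the new 'M' hop, and ExtractorSitemap then recognises
those URIs (or sniffs XML content) and emits the listed URLs. A minimal sketch
of that handoff, using real Heritrix classes but made-up values, and assuming
CrawlURI's single-argument constructor:

    import org.archive.modules.CrawlURI;
    import org.archive.modules.extractor.ExtractorRobotsTxt;
    import org.archive.net.UURIFactory;

    // Standalone sketch: the annotation handoff between the two extractors,
    // reduced to its core checks.
    public class SitemapHandoffSketch {
        public static void main(String[] args) throws Exception {
            CrawlURI sitemapUri = new CrawlURI(
                    UURIFactory.getInstance("https://example.com/sitemap.xml"));
            sitemapUri.setFetchStatus(200);
            // ExtractorRobotsTxt marks the URI when it queues it...
            sitemapUri.getAnnotations().add(
                    ExtractorRobotsTxt.ANNOTATION_IS_SITEMAP);
            // ...and ExtractorSitemap.shouldExtract() keys off that mark plus
            // a successful fetch:
            boolean handled = sitemapUri.getAnnotations().contains(
                    ExtractorRobotsTxt.ANNOTATION_IS_SITEMAP)
                    && sitemapUri.is2XXSuccess();
            System.out.println(handled); // true
        }
    }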