diff --git a/commons/pom.xml b/commons/pom.xml
index 6d51a381e..44c2adf6e 100644
--- a/commons/pom.xml
+++ b/commons/pom.xml
@@ -57,7 +57,7 @@
       <groupId>commons-io</groupId>
       <artifactId>commons-io</artifactId>
-      <version>1.4</version>
+      <version>2.4</version>
       <scope>compile</scope>
diff --git a/engine/src/main/resources/org/archive/crawler/restlet/profile-crawler-beans.cxml b/engine/src/main/resources/org/archive/crawler/restlet/profile-crawler-beans.cxml
index d3d02f3fc..58a6d4493 100644
--- a/engine/src/main/resources/org/archive/crawler/restlet/profile-crawler-beans.cxml
+++ b/engine/src/main/resources/org/archive/crawler/restlet/profile-crawler-beans.cxml
@@ -288,6 +288,11 @@ http://example.example/example
+        <dependency>
+            <groupId>com.github.crawler-commons</groupId>
+            <artifactId>crawler-commons</artifactId>
+            <version>1.0</version>
+        </dependency>
com.jcraft
jsch
diff --git a/modules/src/main/java/org/archive/modules/extractor/ExtractorRobotsTxt.java b/modules/src/main/java/org/archive/modules/extractor/ExtractorRobotsTxt.java
new file mode 100644
index 000000000..c63b23be2
--- /dev/null
+++ b/modules/src/main/java/org/archive/modules/extractor/ExtractorRobotsTxt.java
@@ -0,0 +1,114 @@
+package org.archive.modules.extractor;
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.commons.httpclient.URIException;
+import org.archive.modules.CrawlURI;
+
+public class ExtractorRobotsTxt extends ContentExtractor {
+ private static final Logger LOGGER = Logger
+ .getLogger(ExtractorRobotsTxt.class.getName());
+ private static final Pattern ROBOTS_PATTERN = Pattern
+ .compile("^https?://[^/]+/robots.txt$");
+ private static final Pattern SITEMAP_PATTERN = Pattern
+ .compile("(?i)Sitemap:\\s*(.+)$");
+
+ public static final String ANNOTATION_IS_SITEMAP = "isSitemap";
+
+ @Override
+ protected boolean shouldExtract(CrawlURI uri) {
+ boolean shouldExtract = false;
+ if (uri.isPrerequisite()) {
+ shouldExtract = ROBOTS_PATTERN.matcher(uri.getURI()).matches();
+ LOGGER.finest("Checked prerequisite " + uri + " GOT " + shouldExtract);
+ }
+ return shouldExtract;
+ }
+
+ public List parseRobotsTxt(InputStream input) {
+ ArrayList links = new ArrayList<>();
+ BufferedReader reader = new BufferedReader(new InputStreamReader(input));
+ try {
+ String line;
+ Matcher matcher;
+ while ((line = reader.readLine()) != null) {
+ matcher = SITEMAP_PATTERN.matcher(line);
+ if (matcher.matches()) {
+ links.add(matcher.group(1));
+ }
+ }
+ } catch (IOException e) {
+ LOGGER.warning(e.toString());
+ }
+ return links;
+ }
+
+ @Override
+ protected boolean innerExtract(CrawlURI curi) {
+ try {
+
+ // Clone the CrawlURI and change hop path and avoid queueing
+ // sitemaps as prerequisites (i.e. strip P from hop path).
+ CrawlURI curiClone = new CrawlURI(curi.getUURI(),
+ curi.getPathFromSeed().replace("P", ""), curi.getVia(),
+ curi.getViaContext());
+
+ // Also copy the source over:
+ if (curi.getSourceTag() != null) {
+ curiClone.setSourceTag(curi.getSourceTag());
+ }
+
+ // Parse the robots for the sitemaps.
+ List links = parseRobotsTxt(
+ curi.getRecorder()
+ .getContentReplayInputStream());
+ LOGGER.finest("Checked " + curi + " GOT " + links);
+
+ // Get the max outlinks (needed by add method):
+ int max = getExtractorParameters().getMaxOutlinks();
+
+ // Accrue links:
+ for (String link : links) {
+ try {
+ // We've found a sitemap:
+ LOGGER.fine("Found site map: " + link);
+ numberOfLinksExtracted.incrementAndGet();
+
+ // Add links but using the cloned CrawlURI as the crawl
+ // context.
+ CrawlURI newCuri = addRelativeToBase(curiClone, max, link,
+ LinkContext.MANIFEST_MISC, Hop.MANIFEST);
+
+ // Annotate as a Site Map:
+ newCuri.getAnnotations().add(
+ ExtractorRobotsTxt.ANNOTATION_IS_SITEMAP);
+
+ } catch (URIException e) {
+ logUriError(e, curi.getUURI(), link);
+ }
+ }
+
+ // Patch outlinks back into original curi:
+ for (CrawlURI outlink : curiClone.getOutLinks()) {
+ curi.getOutLinks().add(outlink);
+ }
+
+ // Return number of links discovered:
+ return !links.isEmpty();
+
+ } catch (IOException e) {
+ LOGGER.log(Level.WARNING, curi.getURI(), e);
+ curi.getNonFatalFailures().add(e);
+ }
+ return false;
+ }
+
+}
diff --git a/modules/src/main/java/org/archive/modules/extractor/ExtractorSitemap.java b/modules/src/main/java/org/archive/modules/extractor/ExtractorSitemap.java
new file mode 100644
index 000000000..66b64495b
--- /dev/null
+++ b/modules/src/main/java/org/archive/modules/extractor/ExtractorSitemap.java
@@ -0,0 +1,178 @@
+package org.archive.modules.extractor;
+
+import java.io.IOException;
+import java.net.URL;
+import java.util.Collection;
+import java.util.Date;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+
+import org.apache.commons.httpclient.URIException;
+import org.apache.commons.io.IOUtils;
+import org.archive.modules.CrawlURI;
+import org.archive.modules.extractor.ContentExtractor;
+import org.archive.modules.extractor.Hop;
+import org.archive.modules.extractor.LinkContext;
+
+import crawlercommons.sitemaps.AbstractSiteMap;
+import crawlercommons.sitemaps.SiteMap;
+import crawlercommons.sitemaps.SiteMapIndex;
+import crawlercommons.sitemaps.SiteMapParser;
+import crawlercommons.sitemaps.SiteMapURL;
+import crawlercommons.sitemaps.UnknownFormatException;
+
+/**
+ *
+ * @author Andrew Jackson
+ *
+ */
+public class ExtractorSitemap extends ContentExtractor {
+ private static final Logger LOGGER = Logger
+ .getLogger(ExtractorSitemap.class.getName());
+
+ /* (non-Javadoc)
+ * @see org.archive.modules.extractor.ContentExtractor#shouldExtract(org.archive.modules.CrawlURI)
+ */
+ @Override
+ protected boolean shouldExtract(CrawlURI uri) {
+ // If declared as such:
+ if (uri.getAnnotations()
+ .contains(ExtractorRobotsTxt.ANNOTATION_IS_SITEMAP)) {
+ if (uri.is2XXSuccess()) {
+ LOGGER.fine("This url (" + uri
+ + ") is declared to be a sitemap (via robots.txt) and is a HTTP 200.");
+ return true;
+ } else {
+ LOGGER.fine("This url (" + uri
+ + ") is declared to be a sitemap (via robots.txt) but is a HTTP "
+ + uri.getFetchStatus() + ".");
+ }
+ }
+
+ // Via content type:
+ String mimeType = uri.getContentType();
+ if (mimeType != null ) {
+ // Looks like XML:
+ if (mimeType.toLowerCase().startsWith("text/xml")
+ || mimeType.toLowerCase().startsWith("application/xml")) {
+
+ // check if content starts with xml preamble "\\s]).*")) {
+ LOGGER.info("Based on content sniffing, this is a sitemap: "
+ + uri);
+ return true;
+ }
+ }
+ }
+
+ // Otherwise, not
+ return false;
+ }
+
+ /* (non-Javadoc)
+ * @see org.archive.modules.extractor.ContentExtractor#innerExtract(org.archive.modules.CrawlURI)
+ */
+ @Override
+ protected boolean innerExtract(CrawlURI uri) {
+ // Parse the sitemap:
+ AbstractSiteMap sitemap = parseSiteMap(uri);
+
+ // Did that work?
+ if (sitemap != null) {
+ // Process results:
+ if (sitemap.isIndex()) {
+ final Collection links = ((SiteMapIndex) sitemap)
+ .getSitemaps();
+ for (final AbstractSiteMap asm : links) {
+ if (asm == null) {
+ continue;
+ }
+ this.recordOutlink(uri, asm.getUrl(), asm.getLastModified(),
+ true);
+ }
+ } else {
+ final Collection links = ((SiteMap) sitemap)
+ .getSiteMapUrls();
+ for (final SiteMapURL url : links) {
+ if (url == null) {
+ continue;
+ }
+ this.recordOutlink(uri, url.getUrl(), url.getLastModified(),
+ false);
+ }
+ }
+ }
+
+ return false;
+ }
+
+ /**
+ * Parse the sitemap using the Crawler Commons content-sniffing parser.
+ *
+ * @param uri
+ * @return
+ */
+ private AbstractSiteMap parseSiteMap(CrawlURI uri) {
+ // The thing we will create:
+ AbstractSiteMap sitemap = null;
+
+ // Be strict about URLs but allow partial extraction:
+ SiteMapParser smp = new SiteMapParser(true, true);
+ // Parse it up:
+ try {
+ // Sitemaps are not supposed to be bigger than 50MB (according to
+ // Google) so if we hit problems we can implement that limit:
+ byte[] content = IOUtils.toByteArray(
+ uri.getRecorder().getContentReplayInputStream());
+ if (content.length > 52428800) {
+ LOGGER.warning("Found sitemap exceeding 50MB " + uri + " "
+ + content.length);
+ }
+ // Now we can process it:
+ sitemap = smp.parseSiteMap(content, new URL(uri.getURI()));
+ } catch (IOException e) {
+ LOGGER.log(Level.WARNING,
+ "I/O Exception when parsing sitemap " + uri, e);
+ } catch (UnknownFormatException e) {
+ LOGGER.log(Level.WARNING,
+ "UnknownFormatException when parsing sitemap " + uri, e);
+ }
+ return sitemap;
+ }
+
+ private void recordOutlink(CrawlURI curi, URL newUri, Date lastModified,
+ boolean isSitemap) {
+ try {
+ // Get the max outlinks (needed by add method):
+ //
+ // Because sitemaps are really important we excuse this extractor
+ // from the general setting:
+ //
+ // getExtractorParameters().getMaxOutlinks();
+ //
+ // And instead use the maximum that is allowed for a sitemap:
+ int max = 50000;
+
+ // Add the URI:
+ // Adding 'regular' URL listed in the sitemap
+ addRelativeToBase(curi, max, newUri.toString(),
+ LinkContext.MANIFEST_MISC, Hop.MANIFEST);
+
+ // And log about it:
+ LOGGER.fine("Found " + newUri + " from " + curi + " Dated "
+ + lastModified + " and with isSitemap = " + isSitemap);
+ // Count it:
+ numberOfLinksExtracted.incrementAndGet();
+ } catch (URIException e) {
+ LOGGER.log(Level.WARNING,
+ "URIException when recording outlink " + newUri, e);
+ }
+
+ }
+
+}
diff --git a/modules/src/main/java/org/archive/modules/extractor/Hop.java b/modules/src/main/java/org/archive/modules/extractor/Hop.java
index 23f4c688b..914e9fc58 100644
--- a/modules/src/main/java/org/archive/modules/extractor/Hop.java
+++ b/modules/src/main/java/org/archive/modules/extractor/Hop.java
@@ -55,6 +55,9 @@ public enum Hop {
* material, but deduced by convention.
*/
INFERRED('I'),
+
+ /** Found in some form of site provided URL manifest (e.g. site map) */
+ MANIFEST('M'),
/** Synthesized form-submit */
SUBMIT('S');
diff --git a/modules/src/main/java/org/archive/modules/extractor/LinkContext.java b/modules/src/main/java/org/archive/modules/extractor/LinkContext.java
index b41e79622..ef2e81502 100644
--- a/modules/src/main/java/org/archive/modules/extractor/LinkContext.java
+++ b/modules/src/main/java/org/archive/modules/extractor/LinkContext.java
@@ -76,6 +76,10 @@ public String toString() {
final public static LinkContext PREREQ_MISC
= new SimpleLinkContext("=PREREQ_MISC");
+    /** Stand-in value for manifest urls without other context. */
+ final public static LinkContext MANIFEST_MISC
+ = new SimpleLinkContext("=MANIFEST_MISC");
+
public boolean equals(Object o) {
if (o == this) {
return true;