Merge pull request #262 from kris-sigur/sitemaps
Support for extracting URLs in sitemaps
anjackson authored May 20, 2021
2 parents c1bcdd9 + 396467c commit d7869de
Showing 7 changed files with 314 additions and 1 deletion.
2 changes: 1 addition & 1 deletion commons/pom.xml
@@ -57,7 +57,7 @@
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<version>1.4</version>
<version>2.4</version>
<scope>compile</scope>
</dependency>
<dependency>
@@ -288,6 +288,11 @@ http://example.example/example
</bean>
<bean id="extractorHttp" class="org.archive.modules.extractor.ExtractorHTTP">
</bean>
<bean id="extractorRobotsTxt" class="org.archive.modules.extractor.ExtractorRobotsTxt">
</bean>
<bean id="extractorSitemap" class="org.archive.modules.extractor.ExtractorSitemap">
</bean>

<bean id="extractorHtml" class="org.archive.modules.extractor.ExtractorHTML">
<!-- <property name="extractJavascript" value="true" /> -->
<!-- <property name="extractValueAttributes" value="true" /> -->
@@ -320,6 +325,10 @@ http://example.example/example
<ref bean="fetchHttp"/>
<!-- ...extract outlinks from HTTP headers... -->
<ref bean="extractorHttp"/>
<!-- ...extract sitemap urls from robots.txt... -->
<ref bean="extractorRobotsTxt"/>
<!-- ...extract links from sitemaps... -->
<ref bean="extractorSitemap"/>
<!-- ...extract outlinks from HTML content... -->
<ref bean="extractorHtml"/>
<!-- ...extract outlinks from CSS content... -->
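As wired above, the two new extractors cooperate across fetches: robots.txt is fetched as a prerequisite, ExtractorRobotsTxt pulls any "Sitemap:" URLs out of it and queues them annotated as isSitemap, and when those URIs are later fetched, ExtractorSitemap recognises the annotation (or sniffs the XML) and extracts the listed URLs.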
5 changes: 5 additions & 0 deletions modules/pom.xml
@@ -62,6 +62,11 @@
<version>1.6.6</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>com.github.crawler-commons</groupId>
<artifactId>crawler-commons</artifactId>
<version>1.0</version>
</dependency>
<dependency>
<groupId>com.jcraft</groupId>
<artifactId>jsch</artifactId>
114 changes: 114 additions & 0 deletions modules/src/main/java/org/archive/modules/extractor/ExtractorRobotsTxt.java
@@ -0,0 +1,114 @@
package org.archive.modules.extractor;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.httpclient.URIException;
import org.archive.modules.CrawlURI;

public class ExtractorRobotsTxt extends ContentExtractor {
private static final Logger LOGGER = Logger
.getLogger(ExtractorRobotsTxt.class.getName());
    private static final Pattern ROBOTS_PATTERN = Pattern
            .compile("^https?://[^/]+/robots\\.txt$");
private static final Pattern SITEMAP_PATTERN = Pattern
.compile("(?i)Sitemap:\\s*(.+)$");

public static final String ANNOTATION_IS_SITEMAP = "isSitemap";

@Override
protected boolean shouldExtract(CrawlURI uri) {
boolean shouldExtract = false;
if (uri.isPrerequisite()) {
shouldExtract = ROBOTS_PATTERN.matcher(uri.getURI()).matches();
LOGGER.finest("Checked prerequisite " + uri + " GOT " + shouldExtract);
}
return shouldExtract;
}

public List<String> parseRobotsTxt(InputStream input) {
ArrayList<String> links = new ArrayList<>();
BufferedReader reader = new BufferedReader(new InputStreamReader(input));
try {
String line;
Matcher matcher;
while ((line = reader.readLine()) != null) {
matcher = SITEMAP_PATTERN.matcher(line);
if (matcher.matches()) {
links.add(matcher.group(1));
}
}
} catch (IOException e) {
LOGGER.warning(e.toString());
}
return links;
}

@Override
protected boolean innerExtract(CrawlURI curi) {
try {

            // Clone the CrawlURI, stripping 'P' from the hop path so the
            // discovered sitemaps are not queued as prerequisites.
CrawlURI curiClone = new CrawlURI(curi.getUURI(),
curi.getPathFromSeed().replace("P", ""), curi.getVia(),
curi.getViaContext());

// Also copy the source over:
if (curi.getSourceTag() != null) {
curiClone.setSourceTag(curi.getSourceTag());
}

// Parse the robots for the sitemaps.
List<String> links = parseRobotsTxt(
curi.getRecorder()
.getContentReplayInputStream());
LOGGER.finest("Checked " + curi + " GOT " + links);

// Get the max outlinks (needed by add method):
int max = getExtractorParameters().getMaxOutlinks();

// Accrue links:
for (String link : links) {
try {
// We've found a sitemap:
LOGGER.fine("Found site map: " + link);
numberOfLinksExtracted.incrementAndGet();

// Add links but using the cloned CrawlURI as the crawl
// context.
CrawlURI newCuri = addRelativeToBase(curiClone, max, link,
LinkContext.MANIFEST_MISC, Hop.MANIFEST);

// Annotate as a Site Map:
newCuri.getAnnotations().add(
ExtractorRobotsTxt.ANNOTATION_IS_SITEMAP);

} catch (URIException e) {
logUriError(e, curi.getUURI(), link);
}
}

// Patch outlinks back into original curi:
for (CrawlURI outlink : curiClone.getOutLinks()) {
curi.getOutLinks().add(outlink);
}

            // Report whether any links were discovered:
return !links.isEmpty();

} catch (IOException e) {
LOGGER.log(Level.WARNING, curi.getURI(), e);
curi.getNonFatalFailures().add(e);
}
return false;
}

}
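As a quick illustration (not part of this commit), here is a minimal sketch of how parseRobotsTxt could be exercised on its own, assuming ExtractorRobotsTxt is on the classpath; the robots.txt body and the class name RobotsTxtSketch are invented for the example:

import java.io.ByteArrayInputStream;
import java.nio.charset.StandardCharsets;
import java.util.List;

public class RobotsTxtSketch {
    public static void main(String[] args) {
        // Invented robots.txt body; only the "Sitemap:" lines should match.
        String robots = "User-agent: *\n"
                + "Disallow: /private/\n"
                + "Sitemap: https://example.com/sitemap.xml\n"
                + "sitemap: https://example.com/sitemap-news.xml\n";
        List<String> sitemaps = new ExtractorRobotsTxt().parseRobotsTxt(
                new ByteArrayInputStream(robots.getBytes(StandardCharsets.UTF_8)));
        // Both URLs are returned: the pattern is case-insensitive ("(?i)").
        System.out.println(sitemaps);
    }
}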
178 changes: 178 additions & 0 deletions modules/src/main/java/org/archive/modules/extractor/ExtractorSitemap.java
@@ -0,0 +1,178 @@
package org.archive.modules.extractor;

import java.io.IOException;
import java.net.URL;
import java.util.Collection;
import java.util.Date;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.commons.httpclient.URIException;
import org.apache.commons.io.IOUtils;
import org.archive.modules.CrawlURI;
import org.archive.modules.extractor.ContentExtractor;
import org.archive.modules.extractor.Hop;
import org.archive.modules.extractor.LinkContext;

import crawlercommons.sitemaps.AbstractSiteMap;
import crawlercommons.sitemaps.SiteMap;
import crawlercommons.sitemaps.SiteMapIndex;
import crawlercommons.sitemaps.SiteMapParser;
import crawlercommons.sitemaps.SiteMapURL;
import crawlercommons.sitemaps.UnknownFormatException;

/**
 * Extracts URLs from XML sitemaps and sitemap indexes using the
 * crawler-commons sitemap parser.
 *
 * @author Andrew Jackson <Andrew.Jackson@bl.uk>
 */
public class ExtractorSitemap extends ContentExtractor {
private static final Logger LOGGER = Logger
.getLogger(ExtractorSitemap.class.getName());

/* (non-Javadoc)
* @see org.archive.modules.extractor.ContentExtractor#shouldExtract(org.archive.modules.CrawlURI)
*/
@Override
protected boolean shouldExtract(CrawlURI uri) {
// If declared as such:
if (uri.getAnnotations()
.contains(ExtractorRobotsTxt.ANNOTATION_IS_SITEMAP)) {
if (uri.is2XXSuccess()) {
LOGGER.fine("This url (" + uri
+ ") is declared to be a sitemap (via robots.txt) and is a HTTP 200.");
return true;
} else {
LOGGER.fine("This url (" + uri
+ ") is declared to be a sitemap (via robots.txt) but is a HTTP "
+ uri.getFetchStatus() + ".");
}
}

// Via content type:
String mimeType = uri.getContentType();
        if (mimeType != null) {
// Looks like XML:
if (mimeType.toLowerCase().startsWith("text/xml")
|| mimeType.toLowerCase().startsWith("application/xml")) {

                // Check whether the content starts with the XML preamble "<?xml"
                // and contains "<urlset" or "<sitemapindex" early in the content:
String contentStartingChunk = uri.getRecorder()
.getContentReplayPrefixString(400);
if (contentStartingChunk.matches("(?is)[\\ufeff]?<\\?xml\\s.*")
&& contentStartingChunk.matches(
"(?is).*(?:<urlset|<sitemapindex[>\\s]).*")) {
LOGGER.info("Based on content sniffing, this is a sitemap: "
+ uri);
return true;
}
}
}

// Otherwise, not
return false;
}

/* (non-Javadoc)
* @see org.archive.modules.extractor.ContentExtractor#innerExtract(org.archive.modules.CrawlURI)
*/
@Override
protected boolean innerExtract(CrawlURI uri) {
// Parse the sitemap:
AbstractSiteMap sitemap = parseSiteMap(uri);

// Did that work?
if (sitemap != null) {
// Process results:
if (sitemap.isIndex()) {
final Collection<AbstractSiteMap> links = ((SiteMapIndex) sitemap)
.getSitemaps();
for (final AbstractSiteMap asm : links) {
if (asm == null) {
continue;
}
this.recordOutlink(uri, asm.getUrl(), asm.getLastModified(),
true);
}
} else {
final Collection<SiteMapURL> links = ((SiteMap) sitemap)
.getSiteMapUrls();
for (final SiteMapURL url : links) {
if (url == null) {
continue;
}
this.recordOutlink(uri, url.getUrl(), url.getLastModified(),
false);
}
}
}

return false;
}

    /**
     * Parse the sitemap using the Crawler Commons content-sniffing parser.
     *
     * @param uri the CrawlURI whose recorded content should be parsed
     * @return the parsed sitemap, or null if parsing failed
     */
private AbstractSiteMap parseSiteMap(CrawlURI uri) {
// The thing we will create:
AbstractSiteMap sitemap = null;

// Be strict about URLs but allow partial extraction:
SiteMapParser smp = new SiteMapParser(true, true);
// Parse it up:
try {
            // Sitemaps are not supposed to be bigger than 50MB (according to
            // Google), so if that causes problems we can enforce a hard limit:
byte[] content = IOUtils.toByteArray(
uri.getRecorder().getContentReplayInputStream());
if (content.length > 52428800) {
LOGGER.warning("Found sitemap exceeding 50MB " + uri + " "
+ content.length);
}
// Now we can process it:
sitemap = smp.parseSiteMap(content, new URL(uri.getURI()));
} catch (IOException e) {
LOGGER.log(Level.WARNING,
"I/O Exception when parsing sitemap " + uri, e);
} catch (UnknownFormatException e) {
LOGGER.log(Level.WARNING,
"UnknownFormatException when parsing sitemap " + uri, e);
}
return sitemap;
}

private void recordOutlink(CrawlURI curi, URL newUri, Date lastModified,
boolean isSitemap) {
try {
            // Because sitemaps are really important, this extractor is excused
            // from the general getExtractorParameters().getMaxOutlinks() limit;
            // instead, use the maximum number of URLs allowed in a single sitemap:
            int max = 50000;

            // Add the 'regular' URL listed in the sitemap:
addRelativeToBase(curi, max, newUri.toString(),
LinkContext.MANIFEST_MISC, Hop.MANIFEST);

// And log about it:
LOGGER.fine("Found " + newUri + " from " + curi + " Dated "
+ lastModified + " and with isSitemap = " + isSitemap);
// Count it:
numberOfLinksExtracted.incrementAndGet();
} catch (URIException e) {
LOGGER.log(Level.WARNING,
"URIException when recording outlink " + newUri, e);
}

}

}
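For reference, a standalone sketch of the crawler-commons calls this extractor builds on; the XML body, URL, and class name SitemapParseSketch are invented, while the parser calls mirror parseSiteMap() above:

import java.net.URL;
import java.nio.charset.StandardCharsets;

import crawlercommons.sitemaps.AbstractSiteMap;
import crawlercommons.sitemaps.SiteMap;
import crawlercommons.sitemaps.SiteMapParser;
import crawlercommons.sitemaps.SiteMapURL;

public class SitemapParseSketch {
    public static void main(String[] args) throws Exception {
        // Invented single-entry sitemap:
        String xml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
                + "<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">\n"
                + "  <url><loc>https://example.com/page-1</loc></url>\n"
                + "</urlset>";
        // Strict about URLs, but allow partial extraction, as above:
        SiteMapParser parser = new SiteMapParser(true, true);
        AbstractSiteMap parsed = parser.parseSiteMap(
                xml.getBytes(StandardCharsets.UTF_8),
                new URL("https://example.com/sitemap.xml"));
        if (!parsed.isIndex()) {
            for (SiteMapURL u : ((SiteMap) parsed).getSiteMapUrls()) {
                System.out.println(u.getUrl()); // https://example.com/page-1
            }
        }
    }
}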
3 changes: 3 additions & 0 deletions modules/src/main/java/org/archive/modules/extractor/Hop.java
@@ -55,6 +55,9 @@ public enum Hop {
* material, but deduced by convention.
*/
INFERRED('I'),

/** Found in some form of site-provided URL manifest (e.g. a sitemap). */
MANIFEST('M'),

/** Synthesized form-submit */
SUBMIT('S');
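In Heritrix, each discovered URI carries a hop path, one letter per hop from the seed; with this change, a URL found in a sitemap gets an 'M' appended, so, for example, a page listed in a sitemap that was itself discovered via robots.txt ends up with a hop path ending in "MM".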
4 changes: 4 additions & 0 deletions modules/src/main/java/org/archive/modules/extractor/LinkContext.java
@@ -76,6 +76,10 @@ public String toString() {
final public static LinkContext PREREQ_MISC
= new SimpleLinkContext("=PREREQ_MISC");

/** Stand-in value for manifest urls without other context. */
final public static LinkContext MANIFEST_MISC
= new SimpleLinkContext("=MANIFEST_MISC");

public boolean equals(Object o) {
if (o == this) {
return true;