Support for extracting URLs in sitemaps #262

Merged 7 commits, May 20, 2021. Changes shown from 5 commits.
2 changes: 1 addition & 1 deletion commons/pom.xml
@@ -57,7 +57,7 @@
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
-<version>1.4</version>
+<version>2.4</version>
<scope>compile</scope>
</dependency>
<dependency>
@@ -288,6 +288,11 @@ http://example.example/example
</bean>
<bean id="extractorHttp" class="org.archive.modules.extractor.ExtractorHTTP">
</bean>
<bean id="extractorRobotsTxt" class="org.archive.modules.extractor.ExtractorRobotsTxt">
</bean>
<bean id="extractorSitemap" class="org.archive.modules.extractor.ExtractorSitemap">
</bean>

<bean id="extractorHtml" class="org.archive.modules.extractor.ExtractorHTML">
<!-- <property name="extractJavascript" value="true" /> -->
<!-- <property name="extractValueAttributes" value="true" /> -->
@@ -320,6 +325,10 @@ http://example.example/example
<ref bean="fetchHttp"/>
<!-- ...extract outlinks from HTTP headers... -->
<ref bean="extractorHttp"/>
<!-- ...extract sitemap urls from robots.txt... -->
<ref bean="extractorRobotsTxt"/>
<!-- ...extract links from sitemaps... -->
<ref bean="extractorSitemap"/>
<!-- ...extract outlinks from HTML content... -->
<ref bean="extractorHtml"/>
<!-- ...extract outlinks from CSS content... -->
5 changes: 5 additions & 0 deletions modules/pom.xml
@@ -70,6 +70,11 @@
<version>1.6.6</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>com.github.crawler-commons</groupId>
<artifactId>crawler-commons</artifactId>
<version>1.0</version>
</dependency>
</dependencies>
<build>
<plugins>
115 changes: 115 additions & 0 deletions modules/src/main/java/org/archive/modules/extractor/ExtractorRobotsTxt.java
@@ -0,0 +1,115 @@
package org.archive.modules.extractor;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.httpclient.URIException;
import org.archive.modules.CrawlURI;
import org.archive.modules.extractor.ContentExtractor;
import org.archive.modules.extractor.Hop;
import org.archive.modules.extractor.LinkContext;

public class ExtractorRobotsTxt extends ContentExtractor {
private static final Logger LOGGER = Logger
.getLogger(ExtractorRobotsTxt.class.getName());
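// Matches the target URI of a robots.txt fetch,
// e.g. "https://example.com/robots.txt":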
private static final Pattern ROBOTS_PATTERN = Pattern
.compile("^https?://[^/]+/robots\\.txt$");
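// Matches a "Sitemap: <url>" directive line and captures the URL in
// group(1), e.g. "Sitemap: https://example.com/sitemap.xml":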
private static final Pattern SITEMAP_PATTERN = Pattern
.compile("(?i)Sitemap:\\s*(.+)$");

public static final String ANNOTATION_IS_SITEMAP = "isSitemap";

@Override
protected boolean shouldExtract(CrawlURI uri) {
boolean shouldExtract = false;
if (uri.isPrerequisite()) {
shouldExtract = ROBOTS_PATTERN.matcher(uri.getURI()).matches();
LOGGER.finest("Checked prerequisite " + uri + " GOT " + shouldExtract);
}
return shouldExtract;
}

public List<String> parseRobotsTxt(InputStream input) {
ArrayList<String> links = new ArrayList<>();
BufferedReader reader = new BufferedReader(new InputStreamReader(input));
try {
String line;
Matcher matcher;
while ((line = reader.readLine()) != null) {
matcher = SITEMAP_PATTERN.matcher(line);
if (matcher.matches()) {
links.add(matcher.group(1));
}
}
} catch (IOException e) {
LOGGER.warning(e.toString());
}
return links;
}

@Override
protected boolean innerExtract(CrawlURI curi) {
try {

// Clone the CrawlURI and change hop path and avoid queueing
// sitemaps as prerequisites (i.e. strip P from hop path).
CrawlURI curiClone = new CrawlURI(curi.getUURI(),
curi.getPathFromSeed().replace("P", ""), curi.getVia(),
curi.getViaContext());

// Also copy the source over:
curiClone.setSourceTag(curi.getSourceTag());

// Parse the robots for the sitemaps.
List<String> links = parseRobotsTxt(
curi.getRecorder()
.getContentReplayInputStream());
LOGGER.finest("Checked " + curi + " GOT " + links);

// Get the max outlinks (needed by add method):
int max = getExtractorParameters().getMaxOutlinks();

// Accrue links:
for (String link : links) {
try {
// We've found a sitemap:
LOGGER.fine("Found site map: " + link);
numberOfLinksExtracted.incrementAndGet();

// Add links but using the cloned CrawlURI as the crawl
// context.
CrawlURI newCuri = addRelativeToBase(curiClone, max, link,
LinkContext.MANIFEST_MISC, Hop.MANIFEST);

// Annotate as a Site Map:
newCuri.getAnnotations().add(
ExtractorRobotsTxt.ANNOTATION_IS_SITEMAP);

} catch (URIException e) {
logUriError(e, curi.getUURI(), link);
}
}

// Patch outlinks back into original curi:
for (CrawlURI outlink : curiClone.getOutLinks()) {
curi.getOutLinks().add(outlink);
}

// Return whether any sitemap links were discovered:
return !links.isEmpty();

} catch (IOException e) {
LOGGER.log(Level.WARNING, curi.getURI(), e);
curi.getNonFatalFailures().add(e);
}
return false;
}

}
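
As a quick illustration of the robots.txt parsing above (not part of the change itself), the following throwaway class feeds parseRobotsTxt a hand-written robots.txt; the class name and URLs are invented:

package org.archive.modules.extractor;

import java.io.ByteArrayInputStream;
import java.nio.charset.StandardCharsets;
import java.util.List;

// Hypothetical smoke test for ExtractorRobotsTxt.parseRobotsTxt().
public class ExtractorRobotsTxtDemo {
    public static void main(String[] args) {
        String robots = "User-agent: *\n"
                + "Disallow: /private/\n"
                + "Sitemap: https://example.com/sitemap.xml\n"
                + "sitemap: https://example.com/sitemap-news.xml\n";
        // SITEMAP_PATTERN is case-insensitive, so both directive lines
        // should yield a URL.
        List<String> sitemaps = new ExtractorRobotsTxt().parseRobotsTxt(
                new ByteArrayInputStream(robots.getBytes(StandardCharsets.UTF_8)));
        System.out.println(sitemaps); // Expect both sitemap URLs, in order.
    }
}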
178 changes: 178 additions & 0 deletions modules/src/main/java/org/archive/modules/extractor/ExtractorSitemap.java
@@ -0,0 +1,178 @@
package org.archive.modules.extractor;

import java.io.IOException;
import java.net.URL;
import java.util.Collection;
import java.util.Date;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.commons.httpclient.URIException;
import org.apache.commons.io.IOUtils;
import org.archive.modules.CrawlURI;
import org.archive.modules.extractor.ContentExtractor;
import org.archive.modules.extractor.Hop;
import org.archive.modules.extractor.LinkContext;

import crawlercommons.sitemaps.AbstractSiteMap;
import crawlercommons.sitemaps.SiteMap;
import crawlercommons.sitemaps.SiteMapIndex;
import crawlercommons.sitemaps.SiteMapParser;
import crawlercommons.sitemaps.SiteMapURL;
import crawlercommons.sitemaps.UnknownFormatException;

/**
*
* @author Andrew Jackson <Andrew.Jackson@bl.uk>
*
*/
public class ExtractorSitemap extends ContentExtractor {
private static final Logger LOGGER = Logger
.getLogger(ExtractorSitemap.class.getName());

/* (non-Javadoc)
* @see org.archive.modules.extractor.ContentExtractor#shouldExtract(org.archive.modules.CrawlURI)
*/
@Override
protected boolean shouldExtract(CrawlURI uri) {
// If declared as such:
if (uri.getAnnotations()
.contains(ExtractorRobotsTxt.ANNOTATION_IS_SITEMAP)) {
if (uri.is2XXSuccess()) {
LOGGER.fine("This url (" + uri
+ ") is declared to be a sitemap (via robots.txt) and is an HTTP 200.");
return true;
} else {
LOGGER.fine("This url (" + uri
+ ") is declared to be a sitemap (via robots.txt) but is an HTTP "
+ uri.getFetchStatus() + ".");
}
}

// Via content type:
String mimeType = uri.getContentType();
if (mimeType != null) {
// Looks like XML:
if (mimeType.toLowerCase().startsWith("text/xml")
|| mimeType.toLowerCase().startsWith("application/xml")) {

// Check whether the content starts with the XML preamble "<?xml"
// and contains "<urlset " or "<sitemapindex" early in the content:
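// For instance, a prefix like
//   <?xml version="1.0" encoding="UTF-8"?><urlset xmlns="...">
// passes both checks below.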
String contentStartingChunk = uri.getRecorder()
.getContentReplayPrefixString(400);
if (contentStartingChunk.matches("(?is)[\\ufeff]?<\\?xml\\s.*")
&& contentStartingChunk.matches(
"(?is).*(?:<urlset|<sitemapindex[>\\s]).*")) {
LOGGER.info("Based on content sniffing, this is a sitemap: "
+ uri);
return true;
}
}
}

// Otherwise, not a sitemap:
return false;
}

/* (non-Javadoc)
* @see org.archive.modules.extractor.ContentExtractor#innerExtract(org.archive.modules.CrawlURI)
*/
@Override
protected boolean innerExtract(CrawlURI uri) {
// Parse the sitemap:
AbstractSiteMap sitemap = parseSiteMap(uri);

// Did that work?
if (sitemap != null) {
// Process results:
if (sitemap.isIndex()) {
final Collection<AbstractSiteMap> links = ((SiteMapIndex) sitemap)
.getSitemaps();
for (final AbstractSiteMap asm : links) {
if (asm == null) {
continue;
}
this.recordOutlink(uri, asm.getUrl(), asm.getLastModified(),
true);
}
} else {
final Collection<SiteMapURL> links = ((SiteMap) sitemap)
.getSiteMapUrls();
for (final SiteMapURL url : links) {
if (url == null) {
continue;
}
this.recordOutlink(uri, url.getUrl(), url.getLastModified(),
false);
}
}
}

return false;
}

/**
* Parse the sitemap using the Crawler Commons content-sniffing parser.
*
* @param uri the CrawlURI of the fetched sitemap
* @return the parsed sitemap, or null if parsing failed
*/
private AbstractSiteMap parseSiteMap(CrawlURI uri) {
// The thing we will create:
AbstractSiteMap sitemap = null;

// Be strict about URLs but allow partial extraction:
SiteMapParser smp = new SiteMapParser(true, true);
// Parse it up:
try {
// Sitemaps are not supposed to be bigger than 50MB (according to
// Google) so if we hit problems we can implement that limit:
byte[] content = IOUtils.toByteArray(
uri.getRecorder().getContentReplayInputStream());
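// 52428800 bytes = 50 * 1024 * 1024, i.e. 50 MiB: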
if (content.length > 52428800) {
LOGGER.warning("Found sitemap exceeding 50MB " + uri + " "
+ content.length);
}
// Now we can process it:
sitemap = smp.parseSiteMap(content, new URL(uri.getURI()));
} catch (IOException e) {
LOGGER.log(Level.WARNING,
"I/O Exception when parsing sitemap " + uri, e);
} catch (UnknownFormatException e) {
LOGGER.log(Level.WARNING,
"UnknownFormatException when parsing sitemap " + uri, e);
}
return sitemap;
}

private void recordOutlink(CrawlURI curi, URL newUri, Date lastModified,
boolean isSitemap) {
try {
// Get the max outlinks (needed by add method):
//
// Because sitemaps are really important we excuse this extractor
// from the general setting:
//
// getExtractorParameters().getMaxOutlinks();
//
// And instead use the maximum that is allowed for a sitemap:
int max = 50000;

// Add the URI:
// Adding 'regular' URL listed in the sitemap
addRelativeToBase(curi, max, newUri.toString(),
LinkContext.MANIFEST_MISC, Hop.MANIFEST);

// And log about it:
LOGGER.fine("Found " + newUri + " from " + curi + " Dated "
+ lastModified + " and with isSitemap = " + isSitemap);
// Count it:
numberOfLinksExtracted.incrementAndGet();
} catch (URIException e) {
LOGGER.log(Level.WARNING,
"URIException when recording outlink " + newUri, e);
}

}

}
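
For context, the crawler-commons parser doing the heavy lifting here can be exercised on its own. A minimal sketch, assuming crawler-commons 1.0 on the classpath; the class name, sitemap XML, and URLs are invented:

import java.net.URL;
import java.nio.charset.StandardCharsets;

import crawlercommons.sitemaps.AbstractSiteMap;
import crawlercommons.sitemaps.SiteMap;
import crawlercommons.sitemaps.SiteMapParser;
import crawlercommons.sitemaps.SiteMapURL;

// Hypothetical standalone use of the crawler-commons sitemap parser.
public class SiteMapParserDemo {
    public static void main(String[] args) throws Exception {
        String xml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>"
                + "<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">"
                + "<url><loc>https://example.com/page1</loc></url>"
                + "<url><loc>https://example.com/page2</loc></url>"
                + "</urlset>";
        // Same flags as ExtractorSitemap above: strict URL checking,
        // partial extraction allowed.
        SiteMapParser smp = new SiteMapParser(true, true);
        AbstractSiteMap sm = smp.parseSiteMap(
                xml.getBytes(StandardCharsets.UTF_8),
                new URL("https://example.com/sitemap.xml"));
        if (!sm.isIndex()) {
            for (SiteMapURL u : ((SiteMap) sm).getSiteMapUrls()) {
                System.out.println(u.getUrl());
            }
        }
    }
}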
3 changes: 3 additions & 0 deletions modules/src/main/java/org/archive/modules/extractor/Hop.java
@@ -55,6 +55,9 @@ public enum Hop {
* material, but deduced by convention.
*/
INFERRED('I'),

/** Found in some form of site provided URL manifest (e.g. site map) */
MANIFEST('M'),

/** Synthesized form-submit */
SUBMIT('S');
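Taken with the extractors above, a page URL listed in a sitemap that was discovered via a seed's robots.txt should carry the hop path "MM": ExtractorRobotsTxt strips the prerequisite 'P' from the cloned path and queues the sitemap as a MANIFEST ('M') hop, and ExtractorSitemap then adds each listed URL as a further MANIFEST hop; a sitemap index would contribute one extra 'M' per level of nesting.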
4 changes: 4 additions & 0 deletions modules/src/main/java/org/archive/modules/extractor/LinkContext.java
@@ -76,6 +76,10 @@ public String toString() {
final public static LinkContext PREREQ_MISC
= new SimpleLinkContext("=PREREQ_MISC");

/** Stand-in value for manifest urls without other context. */
final public static LinkContext MANIFEST_MISC
= new SimpleLinkContext("=MANIFEST_MISC");

public boolean equals(Object o) {
if (o == this) {
return true;