Merge pull request #262 from kris-sigur/sitemaps
Support for extracting URLs in sitemaps
Showing 7 changed files with 314 additions and 1 deletion.
modules/src/main/java/org/archive/modules/extractor/ExtractorRobotsTxt.java (new file, 114 additions)

This extractor runs only against robots.txt prerequisite fetches and pulls any Sitemap: directives out of the response body, queueing them as outlinks annotated as sitemaps.
package org.archive.modules.extractor;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.httpclient.URIException;
import org.archive.modules.CrawlURI;

public class ExtractorRobotsTxt extends ContentExtractor {
    private static final Logger LOGGER = Logger
            .getLogger(ExtractorRobotsTxt.class.getName());
    private static final Pattern ROBOTS_PATTERN = Pattern
            .compile("^https?://[^/]+/robots\\.txt$");
    private static final Pattern SITEMAP_PATTERN = Pattern
            .compile("(?i)Sitemap:\\s*(.+)$");

    public static final String ANNOTATION_IS_SITEMAP = "isSitemap";

    @Override
    protected boolean shouldExtract(CrawlURI uri) {
        boolean shouldExtract = false;
        if (uri.isPrerequisite()) {
            shouldExtract = ROBOTS_PATTERN.matcher(uri.getURI()).matches();
            LOGGER.finest("Checked prerequisite " + uri + " GOT "
                    + shouldExtract);
        }
        return shouldExtract;
    }

    public List<String> parseRobotsTxt(InputStream input) {
        ArrayList<String> links = new ArrayList<>();
        BufferedReader reader = new BufferedReader(new InputStreamReader(input));
        try {
            String line;
            Matcher matcher;
            while ((line = reader.readLine()) != null) {
                matcher = SITEMAP_PATTERN.matcher(line);
                if (matcher.matches()) {
                    links.add(matcher.group(1));
                }
            }
        } catch (IOException e) {
            LOGGER.warning(e.toString());
        }
        return links;
    }

    @Override
    protected boolean innerExtract(CrawlURI curi) {
        try {
            // Clone the CrawlURI and change the hop path, to avoid queueing
            // the sitemaps as prerequisites (i.e. strip P from the hop path).
            CrawlURI curiClone = new CrawlURI(curi.getUURI(),
                    curi.getPathFromSeed().replace("P", ""), curi.getVia(),
                    curi.getViaContext());

            // Also copy the source tag over:
            if (curi.getSourceTag() != null) {
                curiClone.setSourceTag(curi.getSourceTag());
            }

            // Parse the robots.txt for sitemap declarations:
            List<String> links = parseRobotsTxt(
                    curi.getRecorder()
                            .getContentReplayInputStream());
            LOGGER.finest("Checked " + curi + " GOT " + links);

            // Get the max outlinks (needed by the add method):
            int max = getExtractorParameters().getMaxOutlinks();

            // Accrue links:
            for (String link : links) {
                try {
                    // We've found a sitemap:
                    LOGGER.fine("Found site map: " + link);
                    numberOfLinksExtracted.incrementAndGet();

                    // Add the link, using the cloned CrawlURI as the crawl
                    // context:
                    CrawlURI newCuri = addRelativeToBase(curiClone, max, link,
                            LinkContext.MANIFEST_MISC, Hop.MANIFEST);

                    // Annotate it as a sitemap:
                    newCuri.getAnnotations().add(
                            ExtractorRobotsTxt.ANNOTATION_IS_SITEMAP);

                } catch (URIException e) {
                    logUriError(e, curi.getUURI(), link);
                }
            }

            // Patch the outlinks back into the original curi:
            for (CrawlURI outlink : curiClone.getOutLinks()) {
                curi.getOutLinks().add(outlink);
            }

            // Report whether any links were discovered:
            return !links.isEmpty();

        } catch (IOException e) {
            LOGGER.log(Level.WARNING, curi.getURI(), e);
            curi.getNonFatalFailures().add(e);
        }
        return false;
    }

}
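A quick way to see what parseRobotsTxt picks up is to feed it a robots.txt body by hand. The sketch below is not part of the commit: the RobotsTxtParseDemo class and the sample robots.txt content are invented for illustration, but the parseRobotsTxt call is the public method defined above, and only Heritrix's modules jar needs to be on the classpath.

import java.io.ByteArrayInputStream;
import java.nio.charset.StandardCharsets;
import java.util.List;

import org.archive.modules.extractor.ExtractorRobotsTxt;

public class RobotsTxtParseDemo {
    public static void main(String[] args) {
        // Hypothetical robots.txt body carrying one Sitemap directive:
        String robots = "User-agent: *\n"
                + "Disallow: /private/\n"
                + "Sitemap: https://example.com/sitemap.xml\n";

        ExtractorRobotsTxt extractor = new ExtractorRobotsTxt();
        List<String> sitemaps = extractor.parseRobotsTxt(
                new ByteArrayInputStream(
                        robots.getBytes(StandardCharsets.UTF_8)));

        // Prints: [https://example.com/sitemap.xml]
        System.out.println(sitemaps);
    }
}

Note that because the parser uses Matcher.matches() against each whole line, a Sitemap: directive is only recognised when it starts at the beginning of the line; a line with leading whitespace would be skipped.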
modules/src/main/java/org/archive/modules/extractor/ExtractorSitemap.java (new file, 178 additions)

This extractor parses fetched sitemaps and sitemap indexes with the Crawler Commons parser and records the listed URLs as outlinks.
package org.archive.modules.extractor;

import java.io.IOException;
import java.net.URL;
import java.util.Collection;
import java.util.Date;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.commons.httpclient.URIException;
import org.apache.commons.io.IOUtils;
import org.archive.modules.CrawlURI;

import crawlercommons.sitemaps.AbstractSiteMap;
import crawlercommons.sitemaps.SiteMap;
import crawlercommons.sitemaps.SiteMapIndex;
import crawlercommons.sitemaps.SiteMapParser;
import crawlercommons.sitemaps.SiteMapURL;
import crawlercommons.sitemaps.UnknownFormatException;

/**
 * Extracts URLs from XML sitemaps, i.e. from URIs annotated as sitemaps by
 * {@link ExtractorRobotsTxt} or sniffed as sitemap XML.
 *
 * @author Andrew Jackson <Andrew.Jackson@bl.uk>
 */
public class ExtractorSitemap extends ContentExtractor {
    private static final Logger LOGGER = Logger
            .getLogger(ExtractorSitemap.class.getName());

    /* (non-Javadoc)
     * @see org.archive.modules.extractor.ContentExtractor#shouldExtract(org.archive.modules.CrawlURI)
     */
    @Override
    protected boolean shouldExtract(CrawlURI uri) {
        // If declared as such:
        if (uri.getAnnotations()
                .contains(ExtractorRobotsTxt.ANNOTATION_IS_SITEMAP)) {
            if (uri.is2XXSuccess()) {
                LOGGER.fine("This url (" + uri
                        + ") is declared to be a sitemap (via robots.txt) and is an HTTP 200.");
                return true;
            } else {
                LOGGER.fine("This url (" + uri
                        + ") is declared to be a sitemap (via robots.txt) but is an HTTP "
                        + uri.getFetchStatus() + ".");
            }
        }

        // Via content type:
        String mimeType = uri.getContentType();
        if (mimeType != null) {
            // Looks like XML:
            if (mimeType.toLowerCase().startsWith("text/xml")
                    || mimeType.toLowerCase().startsWith("application/xml")) {

                // Check if the content starts with the XML preamble "<?xml"
                // and contains "<urlset " or "<sitemapindex" early in the
                // content:
                String contentStartingChunk = uri.getRecorder()
                        .getContentReplayPrefixString(400);
                if (contentStartingChunk.matches("(?is)[\\ufeff]?<\\?xml\\s.*")
                        && contentStartingChunk.matches(
                                "(?is).*(?:<urlset|<sitemapindex[>\\s]).*")) {
                    LOGGER.info("Based on content sniffing, this is a sitemap: "
                            + uri);
                    return true;
                }
            }
        }

        // Otherwise, not a sitemap:
        return false;
    }

    /* (non-Javadoc)
     * @see org.archive.modules.extractor.ContentExtractor#innerExtract(org.archive.modules.CrawlURI)
     */
    @Override
    protected boolean innerExtract(CrawlURI uri) {
        // Parse the sitemap:
        AbstractSiteMap sitemap = parseSiteMap(uri);

        // Did that work?
        if (sitemap != null) {
            // Process the results:
            if (sitemap.isIndex()) {
                // A sitemap index: the outlinks are themselves sitemaps.
                final Collection<AbstractSiteMap> links = ((SiteMapIndex) sitemap)
                        .getSitemaps();
                for (final AbstractSiteMap asm : links) {
                    if (asm == null) {
                        continue;
                    }
                    this.recordOutlink(uri, asm.getUrl(), asm.getLastModified(),
                            true);
                }
            } else {
                // A plain sitemap: the outlinks are ordinary URLs.
                final Collection<SiteMapURL> links = ((SiteMap) sitemap)
                        .getSiteMapUrls();
                for (final SiteMapURL url : links) {
                    if (url == null) {
                        continue;
                    }
                    this.recordOutlink(uri, url.getUrl(), url.getLastModified(),
                            false);
                }
            }
        }

        // Returning false leaves the URI open to other extractors:
        return false;
    }

    /**
     * Parse the sitemap using the Crawler Commons content-sniffing parser.
     *
     * @param uri the CrawlURI whose recorded content should be parsed
     * @return the parsed sitemap (or sitemap index), or null on failure
     */
    private AbstractSiteMap parseSiteMap(CrawlURI uri) {
        // The thing we will create:
        AbstractSiteMap sitemap = null;

        // Be strict about URLs but allow partial extraction:
        SiteMapParser smp = new SiteMapParser(true, true);
        // Parse it up:
        try {
            // Sitemaps are not supposed to be bigger than 50MB (according to
            // Google), so if we hit problems we can implement that limit:
            byte[] content = IOUtils.toByteArray(
                    uri.getRecorder().getContentReplayInputStream());
            if (content.length > 52428800) {
                LOGGER.warning("Found sitemap exceeding 50MB " + uri + " "
                        + content.length);
            }
            // Now we can process it:
            sitemap = smp.parseSiteMap(content, new URL(uri.getURI()));
        } catch (IOException e) {
            LOGGER.log(Level.WARNING,
                    "I/O Exception when parsing sitemap " + uri, e);
        } catch (UnknownFormatException e) {
            LOGGER.log(Level.WARNING,
                    "UnknownFormatException when parsing sitemap " + uri, e);
        }
        return sitemap;
    }

    private void recordOutlink(CrawlURI curi, URL newUri, Date lastModified,
            boolean isSitemap) {
        try {
            // Because sitemaps are really important, we excuse this extractor
            // from the general getExtractorParameters().getMaxOutlinks()
            // setting and instead use the maximum number of URLs allowed in
            // a single sitemap file:
            int max = 50000;

            // Add the URL listed in the sitemap:
            addRelativeToBase(curi, max, newUri.toString(),
                    LinkContext.MANIFEST_MISC, Hop.MANIFEST);

            // And log about it:
            LOGGER.fine("Found " + newUri + " from " + curi + " Dated "
                    + lastModified + " and with isSitemap = " + isSitemap);
            // Count it:
            numberOfLinksExtracted.incrementAndGet();
        } catch (URIException e) {
            LOGGER.log(Level.WARNING,
                    "URIException when recording outlink " + newUri, e);
        }
    }

}
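The heavy lifting above is delegated to the Crawler Commons parser, which sniffs whether the content is a plain sitemap or a sitemap index. The standalone sketch below (not part of the commit; the SitemapParseDemo class and sample XML are invented) exercises the same parseSiteMap, isIndex, and getSiteMapUrls calls the extractor relies on.

import java.net.URL;
import java.nio.charset.StandardCharsets;

import crawlercommons.sitemaps.AbstractSiteMap;
import crawlercommons.sitemaps.SiteMap;
import crawlercommons.sitemaps.SiteMapParser;
import crawlercommons.sitemaps.SiteMapURL;

public class SitemapParseDemo {
    public static void main(String[] args) throws Exception {
        // Hypothetical minimal sitemap body:
        String xml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
                + "<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">\n"
                + "  <url><loc>https://example.com/page1</loc></url>\n"
                + "  <url><loc>https://example.com/page2</loc></url>\n"
                + "</urlset>";

        // Same settings as the extractor: strict URL checking, partial
        // extraction allowed.
        SiteMapParser parser = new SiteMapParser(true, true);
        AbstractSiteMap parsed = parser.parseSiteMap(
                xml.getBytes(StandardCharsets.UTF_8),
                new URL("https://example.com/sitemap.xml"));

        if (!parsed.isIndex()) {
            for (SiteMapURL u : ((SiteMap) parsed).getSiteMapUrls()) {
                System.out.println(u.getUrl() + " lastMod=" + u.getLastModified());
            }
        }
    }
}

With strict checking enabled, the parser only accepts URLs that fall under the sitemap's own location, which is why the extractor passes the fetched sitemap's URI as the base when parsing.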