Skip to content

Commit

Permalink
Merge pull request #70 from nlevitt/ari-3712
Browse files Browse the repository at this point in the history
for ARI-3712, add extracted links relative to both via and base, and annotate with "extractorSWFRelToVia", "extractorSWFRelToBase", or "extractorSWFRelToBoth" if resulting link is the same whether relative to base or via
  • Loading branch information
vonrosen committed May 22, 2014
2 parents 26222ef + aff5b9a commit 1c0d6c6
Show file tree
Hide file tree
Showing 3 changed files with 93 additions and 12 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -334,17 +334,38 @@ public void getURL(String url, String target)
}
} else {
int max = ext.getExtractorParameters().getMaxOutlinks();
Link.addRelativeToVia(curi, max, url, LinkContext.EMBED_MISC,
Hop.EMBED);
Link relToVia = Link.addRelativeToVia(curi, max, url,
LinkContext.EMBED_MISC, Hop.EMBED);
Link relToBase = Link.addRelativeToBase(curi, max, url,
LinkContext.EMBED_MISC, Hop.EMBED);
addAnnotations(relToVia, relToBase);
linkCount++;
}
}

/**
 * Annotates the links obtained by resolving one extracted string both
 * relative to the via and relative to the base URI.
 *
 * <p>When both links exist and point at the same destination, each one is
 * annotated with "extractorSWFRelToBoth". Otherwise every link that exists
 * is annotated with its own origin: "extractorSWFRelToVia" or
 * "extractorSWFRelToBase".
 *
 * @param relToVia link resolved relative to the via; may be null, in which
 *        case it is ignored
 * @param relToBase link resolved relative to the base; may be null, in
 *        which case it is ignored
 */
protected void addAnnotations(Link relToVia, Link relToBase) {
    boolean hasVia = relToVia != null;
    boolean hasBase = relToBase != null;

    // Same destination either way: mark both links as such and stop here.
    if (hasVia && hasBase
            && relToVia.getDestination().equals(relToBase.getDestination())) {
        relToVia.getAnnotations().add("extractorSWFRelToBoth");
        relToBase.getAnnotations().add("extractorSWFRelToBoth");
        return;
    }

    // Destinations differ (or one link is missing): tag each by its origin.
    if (hasVia) {
        relToVia.getAnnotations().add("extractorSWFRelToVia");
    }
    if (hasBase) {
        relToBase.getAnnotations().add("extractorSWFRelToBase");
    }
}

/**
 * Treats a string found during extraction as a possible URI.
 *
 * <p>If UriUtils.isVeryLikelyUri(str) accepts the string, it is added as a
 * speculative outlink of curi twice: once resolved relative to the via and
 * once relative to the base. The two resulting links are passed to
 * addAnnotations, and linkCount is incremented once for the string (not
 * once per link).
 *
 * @param str candidate URI string
 * @throws IOException declared by the signature; the visible body does not
 *         show which call raises it
 */
public void considerStringAsUri(String str) throws IOException {
if (UriUtils.isVeryLikelyUri(str)) {
int max = ext.getExtractorParameters().getMaxOutlinks();
// NOTE(review): the next line is the pre-change call whose result was
// dropped; the line after it is its replacement, which keeps the Link.
Link.addRelativeToVia(curi, max, str,
Link relToVia = Link.addRelativeToVia(curi, max, str,
LinkContext.SPECULATIVE_MISC, Hop.SPECULATIVE);
Link relToBase = Link.addRelativeToBase(curi, max, str,
LinkContext.SPECULATIVE_MISC, Hop.SPECULATIVE);
addAnnotations(relToVia, relToBase);
linkCount++;
}
}
Expand Down
38 changes: 29 additions & 9 deletions modules/src/main/java/org/archive/modules/extractor/Link.java
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,12 @@
*/
package org.archive.modules.extractor;

import static org.archive.modules.CoreAttributeConstants.A_ANNOTATIONS;

import java.io.Serializable;
import java.util.Collection;
import java.util.HashMap;
import java.util.LinkedHashSet;
import java.util.Map;
import java.util.logging.Logger;

Expand All @@ -43,7 +47,7 @@
public class Link implements Serializable, Comparable<Link> {
private static final Logger LOGGER = Logger.getLogger(Link.class.getName());

private static final long serialVersionUID = 2L;
private static final long serialVersionUID = 3L;


/** URI where this Link was discovered */
Expand Down Expand Up @@ -165,14 +169,14 @@ public int hashCode() {
^ context.hashCode() ^ hop.hashCode();
}

/**
 * Resolves newUri against uri.getBaseURI() and offers the resulting
 * destination to uri's outlinks via addOrDiscard.
 *
 * @param uri the CrawlURI whose base URI is used for resolution and whose
 *        outlinks receive the link
 * @param max outlink limit handed on to addOrDiscard
 * @param newUri URI string to resolve
 * @param context link context recorded on the new link
 * @param hop hop type recorded on the new link
 * @return whatever addOrDiscard returns for the resolved destination
 * @throws URIException declared by the signature; presumably raised when
 *         the URI cannot be built -- confirm against UURIFactory
 */
// NOTE(review): this listing shows the old void signature and old add2 call
// next to their replacements (Link return type, addOrDiscard).
public static void addRelativeToBase(CrawlURI uri, int max,
public static Link addRelativeToBase(CrawlURI uri, int max,
String newUri, LinkContext context, Hop hop) throws URIException {
UURI dest = UURIFactory.getInstance(uri.getBaseURI(), newUri);
add2(uri, max, dest, context, hop);
return addOrDiscard(uri, max, dest, context, hop);
}


public static void addRelativeToVia(CrawlURI uri, int max, String newUri,
public static Link addRelativeToVia(CrawlURI uri, int max, String newUri,
LinkContext context, Hop hop) throws URIException {
UURI relTo = uri.getVia();
if (relTo == null) {
Expand All @@ -183,25 +187,26 @@ public static void addRelativeToVia(CrawlURI uri, int max, String newUri,
relTo = uri.getBaseURI();
}
UURI dest = UURIFactory.getInstance(relTo, newUri);
add2(uri, max, dest, context, hop);
return addOrDiscard(uri, max, dest, context, hop);
}

/**
 * Builds a destination UURI from newUri alone (no base URI is passed to
 * UURIFactory.getInstance) and offers it to uri's outlinks via
 * addOrDiscard.
 *
 * @param uri the CrawlURI whose outlinks receive the link
 * @param max outlink limit handed on to addOrDiscard
 * @param newUri URI string for the destination
 * @param context link context recorded on the new link
 * @param hop hop type recorded on the new link
 * @return whatever addOrDiscard returns for the destination
 * @throws URIException declared by the signature; presumably raised when
 *         the URI cannot be built -- confirm against UURIFactory
 */
// NOTE(review): this listing shows the old void signature and old add2 call
// next to their replacements (Link return type, addOrDiscard).
public static void add(CrawlURI uri, int max, String newUri,
public static Link add(CrawlURI uri, int max, String newUri,
LinkContext context, Hop hop) throws URIException {
UURI dest = UURIFactory.getInstance(newUri);
add2(uri, max, dest, context, hop);
return addOrDiscard(uri, max, dest, context, hop);
}


/**
 * Adds a link to dest to uri's outlinks, or counts it as discarded when
 * the outlink limit has been reached.
 *
 * <p>While uri.getOutLinks().size() is below max, a new Link from
 * uri.getUURI() to dest is created, added to the outlinks and returned.
 * Otherwise uri.incrementDiscardedOutLinks() is called and null is
 * returned.
 *
 * @param uri the CrawlURI that is the link source and owns the outlinks
 * @param max maximum number of outlinks to keep
 * @param dest destination of the link
 * @param context link context recorded on the new link
 * @param hop hop type recorded on the new link
 * @return the newly created link, or null if it was discarded
 * @throws URIException declared by the signature; the visible body does not
 *         show which call raises it
 */
// NOTE(review): this listing shows the old name/signature (add2, void) and
// the old commented-out return next to their replacements.
private static void add2(CrawlURI uri, int max, UURI dest,
private static Link addOrDiscard(CrawlURI uri, int max, UURI dest,
LinkContext context, Hop hop) throws URIException {
if (uri.getOutLinks().size() < max) {
UURI src = uri.getUURI();
Link link = new Link(src, dest, context, hop);
uri.getOutLinks().add(link);
// return link;
return link;
} else {
uri.incrementDiscardedOutLinks();
return null;
}
}

Expand All @@ -219,4 +224,19 @@ public int compareTo(Link o) {
return cmp;
}

/**
 * Returns the annotations collection kept in this link's data map under
 * {@code A_ANNOTATIONS}.
 *
 * <p>If no collection has been stored yet, an empty {@link LinkedHashSet}
 * is created, stored under that key and returned, so callers can add to
 * the result directly.
 *
 * @return the annotations collection for this link; never null
 */
// XXX copied from CrawlURI :-\ let's get HER-2039 in there
public Collection<String> getAnnotations() {
    @SuppressWarnings("unchecked")
    Collection<String> existing = (Collection<String>) getData().get(A_ANNOTATIONS);
    if (existing != null) {
        return existing;
    }

    // First access: create the collection and remember it in the data map.
    Collection<String> created = new LinkedHashSet<String>();
    getData().put(A_ANNOTATIONS, created);
    return created;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -177,4 +177,44 @@ public void xestNonAsciiLink() throws MalformedURLException, IOException {
+ "\" from " + url, foundIt);
}
}

/**
 * Check for ARI-3712: links extracted from a SWF are added both relative to
 * the via and relative to the base, each carrying exactly one annotation
 * that names its origin.
 *
 * <p>Every outlink found must be one of the expected destinations with the
 * expected annotation, and every expected destination must be found.
 *
 * <p>NOTE(review): presumably named {@code xest...} rather than
 * {@code test...} to keep it out of the automatic test run, since it reads
 * a live wayback URL -- confirm before renaming.
 *
 * @throws MalformedURLException declared by the signature
 * @throws IOException declared by the signature
 */
public void xestAri3712() throws MalformedURLException, IOException {
    String url = "https://wayback.archive-it.org/3771/20131119163257/http://nyumedecs.kk5.org/_app/28727/en/resources/container.swf";
    CrawlURI curi = setupURI(url);
    curi.setVia(UURIFactory.getInstance("http://nyumedecs.kk5.org/"));
    long startTime = System.currentTimeMillis();
    this.extractor.extract(curi);
    long elapsed = System.currentTimeMillis() - startTime;
    logger.info(this.extractor.getClass().getSimpleName() + " took "
            + elapsed + "ms to process " + url);

    // destination -> annotation expected on the link to that destination
    HashMap<CharSequence, String> expected = new HashMap<CharSequence, String>();
    expected.put("http://nyumedecs.kk5.org/sm4/portal", "extractorSWFRelToVia");
    expected.put("https://wayback.archive-it.org/3771/20131119163257/http://nyumedecs.kk5.org/_app/28727/en/resources/sm4/portal", "extractorSWFRelToBase");
    expected.put("http://nyumedecs.kk5.org/", "extractorSWFRelToVia");
    expected.put("https://wayback.archive-it.org/", "extractorSWFRelToBase");
    expected.put("http://nyumedecs.kk5.org/loadingBarEdit.swf", "extractorSWFRelToVia");
    expected.put("https://wayback.archive-it.org/3771/20131119163257/http://nyumedecs.kk5.org/_app/28727/en/resources/loadingBarEdit.swf", "extractorSWFRelToBase");
    expected.put("http://nyumedecs.kk5.org/containermain.swf", "extractorSWFRelToVia");
    expected.put("https://wayback.archive-it.org/3771/20131119163257/http://nyumedecs.kk5.org/_app/28727/en/resources/containermain.swf", "extractorSWFRelToBase");

    for (Link link : curi.getOutLinks()) {
        System.out.println(link + " " + link.getData());
        assertEquals(1, link.getAnnotations().size());

        String dest = link.getDestination().toString();
        assertTrue("unexpected outlink " + dest, expected.containsKey(dest));

        // Remove each match so that whatever is left over afterwards is
        // exactly the set of expected links that were never extracted.
        String expectedAnnotation = expected.remove(dest);
        String annotation = link.getAnnotations().iterator().next();
        assertEquals(expectedAnnotation, annotation);
    }

    // Without this check a run that extracts too few links would still pass.
    assertTrue("expected links were not extracted: " + expected.keySet(),
            expected.isEmpty());
}
}

0 comments on commit 1c0d6c6

Please sign in to comment.