Skip to content

Commit

Permalink
Refactoring of WashingtonPost collection for Core18 (#1092)
Browse files Browse the repository at this point in the history
Refactoring of WashingtonPost collection based on the clarified definition of
contents() and raw() in SourceDocument, per #1048.
  • Loading branch information
lintool authored Apr 12, 2020
1 parent 585229f commit 35f9f82
Show file tree
Hide file tree
Showing 13 changed files with 165 additions and 91 deletions.
4 changes: 2 additions & 2 deletions docs/regressions-core18.md
Original file line number Diff line number Diff line change
Expand Up @@ -76,12 +76,12 @@ With the above commands, you should be able to replicate the following results:

MAP | BM25 | +RM3 | +Ax | QL | +RM3 | +Ax |
:---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
[TREC 2018 Common Core Track Topics](../src/main/resources/topics-and-qrels/topics.core18.txt)| 0.2495 | 0.3135 | 0.2925 | 0.2526 | 0.3073 | 0.2966 |
[TREC 2018 Common Core Track Topics](../src/main/resources/topics-and-qrels/topics.core18.txt)| 0.2495 | 0.3135 | 0.2841 | 0.2526 | 0.3073 | 0.2919 |


P30 | BM25 | +RM3 | +Ax | QL | +RM3 | +Ax |
:---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
[TREC 2018 Common Core Track Topics](../src/main/resources/topics-and-qrels/topics.core18.txt)| 0.3567 | 0.4200 | 0.4027 | 0.3653 | 0.4000 | 0.4060 |
[TREC 2018 Common Core Track Topics](../src/main/resources/topics-and-qrels/topics.core18.txt)| 0.3567 | 0.4200 | 0.3947 | 0.3653 | 0.4000 | 0.4020 |

## Replication Log

Expand Down
60 changes: 50 additions & 10 deletions src/main/java/io/anserini/collection/WashingtonPostCollection.java
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,12 @@
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.fasterxml.jackson.datatype.jdk8.Jdk8Module;
import io.anserini.index.generator.WashingtonPostGenerator;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.jsoup.Jsoup;

import java.io.BufferedReader;
import java.io.FileInputStream;
Expand Down Expand Up @@ -103,45 +107,85 @@ private void parseRecord(String record) {
bufferedRecord.articleUrl = wapoObj.getArticleUrl();
bufferedRecord.author = wapoObj.getAuthor();
bufferedRecord.obj = wapoObj;
bufferedRecord.content = record;
bufferedRecord.raw = record;
}
}

/**
* A document from the <a href="https://trec.nist.gov/data/wapost/">TREC Washington Post Corpus</a>.
*/
public static class Document implements SourceDocument {
private static final Logger LOG = LogManager.getLogger(Document.class);
public static final List<String> CONTENT_TYPE_TAG = Arrays.asList("sanitized_html", "tweet");

// Required fields
protected String id;
protected Optional<String> articleUrl;
protected Optional<String> author;
protected long publishDate;
protected Optional<String> title;
protected String content;
protected String raw;
protected WashingtonPostObject obj;

protected String fullCaption = null;
protected String kicker = null;

/**
 * Strips markup from a content fragment by parsing it with Jsoup and keeping only the text.
 *
 * @param content fragment that may contain HTML tags
 * @return the text of the parsed fragment
 */
private String removeTags(String content) {
  String plainText = Jsoup.parse(content).text();
  return plainText;
}

/**
 * Returns the value of this document's {@code id} field.
 *
 * @return the document id
 */
@Override
public String id() {
return id;
}

@Override
public String contents() {
return content;
StringBuilder contentBuilder = new StringBuilder();
getTitle().ifPresent(title -> contentBuilder.append(title).append("\n"));

getObj().getContents().ifPresent(contents -> {
for (WashingtonPostObject.Content contentObj : contents) {
if (contentObj == null) continue;
if (contentObj.getType().isPresent() && contentObj.getContent().isPresent()) {
contentObj.getType().ifPresent(type -> {
contentObj.getContent().ifPresent(content -> {
if (CONTENT_TYPE_TAG.contains(type)) {
contentBuilder.append(removeTags(content)).append("\n");
} else if (type.compareToIgnoreCase("kicker") == 0) {
kicker = content;
contentBuilder.append(content).append("\n");
}
});
});
}
contentObj.getFullCaption().ifPresent(caption -> {
fullCaption = contentObj.getFullCaption().get();
contentBuilder.append(removeTags(fullCaption)).append("\n");
});
}
});

return contentBuilder.toString();
}

@Override
public String raw() {
return content;
return raw;
}

/**
 * Reports whether this document should be indexed. This implementation unconditionally answers yes.
 *
 * @return always {@code true}
 */
@Override
public boolean indexable() {
return true;
}


/**
 * Returns the full caption recorded for this document.
 *
 * <p>The backing field starts out as {@code null} and is only assigned while {@code contents()} walks the
 * content objects, so this returns {@code null} if {@code contents()} has not been called or no content
 * object carried a full caption. The stored value is the caption as found in the record (tags are not
 * removed), and a later caption overwrites an earlier one.
 *
 * @return the full caption, or {@code null} if none has been recorded
 */
public String getFullCaption() {
return fullCaption;
}

/**
 * Returns the kicker recorded for this document.
 *
 * <p>The backing field starts out as {@code null} and is only assigned while {@code contents()} walks the
 * content objects (for entries whose type equals "kicker", ignoring case), so this returns {@code null}
 * if {@code contents()} has not been called or no such entry exists.
 *
 * @return the kicker, or {@code null} if none has been recorded
 */
public String getKicker() {
return kicker;
}

/**
 * Returns the article URL held by this document.
 *
 * @return the value of the {@code articleUrl} field
 */
public Optional<String> getArticleUrl() {
return articleUrl;
}
Expand All @@ -158,10 +202,6 @@ public Optional<String> getTitle() {
return title;
}

public String getContent() {
return content;
}

/**
 * Returns the {@link WashingtonPostObject} associated with this document.
 *
 * @return the value of the {@code obj} field
 */
public WashingtonPostObject getObj() {
return obj;
}
Expand Down
28 changes: 28 additions & 0 deletions src/main/java/io/anserini/index/IndexReaderUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -411,6 +411,34 @@ public static String documentContents(IndexReader reader, String docid) {
}
}

/**
 * Looks up a Lucene document by the exact value of a field other than its unique collection docid.
 * For example, scientific articles might be looked up by DOI.
 *
 * <p>The lookup is a single-term query limited to one hit. Failures are not reported to the caller:
 * both "no match" and an {@link IOException} during the lookup produce {@code null}.
 *
 * @param reader index reader
 * @param field name of the field to match against
 * @param query exact term to look for in {@code field}
 * @return the first matching Lucene document, or {@code null} if nothing matches or an I/O error occurs
 */
public static Document documentByField(IndexReader reader, String field, String query) {
  try {
    IndexSearcher indexSearcher = new IndexSearcher(reader);
    TopDocs topDocs = indexSearcher.search(new TermQuery(new Term(field, query)), 1);
    ScoreDoc[] matches = topDocs.scoreDocs;

    // No hit is not treated as an error; the caller simply gets null.
    boolean found = matches != null && matches.length > 0;
    return found ? reader.document(matches[0].doc) : null;
  } catch (IOException ignored) {
    // Deliberately swallowed: an I/O failure is reported the same way as "not found".
    return null;
  }
}

/**
* Computes the BM25 weight of a term (prior to analysis) in a particular document.
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,9 @@
public class DefaultLuceneDocumentGenerator<T extends SourceDocument> implements LuceneDocumentGenerator<T> {
protected IndexArgs args;

/**
 * No-argument constructor available to subclasses. It does not initialize {@code args}, so a subclass
 * that uses it must assign {@code args} itself.
 */
protected DefaultLuceneDocumentGenerator() {
}

/**
* Constructor with config and counters
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,10 +36,7 @@
/**
* Converts a {@link WashingtonPostCollection.Document} into a Lucene {@link Document}, ready to be indexed.
*/
public class WashingtonPostGenerator implements LuceneDocumentGenerator<WashingtonPostCollection.Document> {
public static final List<String> CONTENT_TYPE_TAG = Arrays.asList("sanitized_html", "tweet");

private IndexArgs args;
public class WashingtonPostGenerator extends DefaultLuceneDocumentGenerator<WashingtonPostCollection.Document> {

public enum WashingtonPostField {
AUTHOR("author"),
Expand All @@ -55,28 +52,17 @@ public enum WashingtonPostField {
name = s;
}
}

public WashingtonPostGenerator(IndexArgs args) {
this.args = args;
super.args = args;
}

public static String removeTags(String content) {
return Jsoup.parse(content).text();
}

@Override
public Document createDocument(WashingtonPostCollection.Document src) throws GeneratorException {
String id = src.id();

if (src.contents().trim().isEmpty()) {
throw new EmptyDocumentException();
}

Document doc = new Document();
doc.add(new StringField(IndexArgs.ID, id, Field.Store.YES));
// Use the superclass to create a document with all the default fields.
Document doc = super.createDocument(src);

// This is needed to break score ties by docid.
doc.add(new SortedDocValuesField(IndexArgs.ID, new BytesRef(id)));
// Add additional fields that are specialized for the Washington Post
doc.add(new LongPoint(WashingtonPostField.PUBLISHED_DATE.name, src.getPublishDate()));
doc.add(new StoredField(WashingtonPostField.PUBLISHED_DATE.name, src.getPublishDate()));

Expand All @@ -87,55 +73,16 @@ public Document createDocument(WashingtonPostCollection.Document src) throws Gen
src.getTitle().ifPresent(title ->
doc.add(new StringField(WashingtonPostField.TITLE.name, title, Field.Store.NO)));

StringBuilder contentBuilder = new StringBuilder();
src.getTitle().ifPresent(title -> contentBuilder.append(title).append("\n"));

src.getObj().getContents().ifPresent(contents -> {
for (WashingtonPostObject.Content contentObj : contents) {
if (contentObj == null) continue;
if (contentObj.getType().isPresent() && contentObj.getContent().isPresent()) {
contentObj.getType().ifPresent(type -> {
contentObj.getContent().ifPresent(content -> {
if (CONTENT_TYPE_TAG.contains(type)) {
contentBuilder.append(removeTags(content)).append("\n");
} else if (type.compareToIgnoreCase("kicker") == 0) {
doc.add(new StringField(WashingtonPostField.KICKER.name, content, Field.Store.NO));
contentBuilder.append(content).append("\n");
}
});
});
}
contentObj.getFullCaption().ifPresent(caption -> {
String fullCaption = contentObj.getFullCaption().get();
doc.add(new StringField(WashingtonPostField.FULL_CAPTION.name, fullCaption, Field.Store.NO));
contentBuilder.append(removeTags(fullCaption)).append("\n");
});
}
});

if (args.storeRaw) { // store the raw json string as one single field
doc.add(new StoredField(IndexArgs.RAW, src.getContent()));
if (src.getKicker() != null) {
doc.add(new StringField(WashingtonPostGenerator.WashingtonPostField.KICKER.name,
src.getKicker(), Field.Store.NO));
}

FieldType fieldType = new FieldType();

fieldType.setStored(args.storeContents);

// Are we storing document vectors?
if (args.storeDocvectors) {
fieldType.setStoreTermVectors(true);
fieldType.setStoreTermVectorPositions(true);
if (src.getFullCaption() != null) {
doc.add(new StringField(WashingtonPostGenerator.WashingtonPostField.FULL_CAPTION.name,
src.getFullCaption(), Field.Store.NO));
}

// Are we building a "positional" or "count" index?
if (args.storePositions) {
fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
} else {
fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
}

doc.add(new Field(IndexArgs.CONTENTS, contentBuilder.toString(), fieldType));

return doc;
}
}
5 changes: 5 additions & 0 deletions src/main/java/io/anserini/search/SimpleSearcher.java
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
Expand Down Expand Up @@ -297,6 +298,10 @@ public Result[] search(String q, int k, long t) throws IOException {
return search(query, queryTokens, q, k, t);
}

/**
 * Searches with an already-constructed Lucene query. Delegates to the five-argument {@code search},
 * passing {@code null} for both the query tokens and the query string and {@code -1} for {@code t}.
 *
 * @param query Lucene query to execute
 * @param k value forwarded as the {@code k} argument of the underlying search
 * @return the results of the underlying search
 * @throws IOException if the underlying search throws it
 */
public Result[] search(Query query, int k) throws IOException {
return search(query, null, null, k, -1);
}

protected Result[] search(Query query, List<String> queryTokens, String queryString, int k,
long t) throws IOException {
// Create an IndexSearch only once. Note that the object is thread safe.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.jsoup.Jsoup;

import java.io.BufferedReader;
import java.io.IOException;
Expand Down Expand Up @@ -217,7 +218,11 @@ private static WashingtonPostCollection.Document.WashingtonPostObject getWapoObj
}
return wapoObj;
}


/**
 * Strips markup from a content fragment by parsing it with Jsoup and keeping only the text.
 *
 * @param content fragment that may contain HTML tags
 * @return the text of the parsed fragment
 */
private static String removeTags(String content) {
  String plainText = Jsoup.parse(content).text();
  return plainText;
}

private static String getRawContents(String record) {
WashingtonPostCollection.Document.WashingtonPostObject wapoObj = getWapoObj(record);

Expand All @@ -230,15 +235,15 @@ private static String getRawContents(String record) {
if (contentObj.getType().isPresent() && contentObj.getContent().isPresent()) {
contentObj.getType().ifPresent(type -> {
contentObj.getContent().ifPresent(content -> {
if (WashingtonPostGenerator.CONTENT_TYPE_TAG.contains(type)) {
contentBuilder.append(WashingtonPostGenerator.removeTags(content)).append("\n");
if (WashingtonPostCollection.Document.CONTENT_TYPE_TAG.contains(type)) {
contentBuilder.append(removeTags(content)).append("\n");
}
});
});
}
contentObj.getFullCaption().ifPresent(caption -> {
String fullCaption = contentObj.getFullCaption().get();
contentBuilder.append(WashingtonPostGenerator.removeTags(fullCaption)).append("\n");
contentBuilder.append(removeTags(fullCaption)).append("\n");
});
}
});
Expand All @@ -255,8 +260,8 @@ private static List<String> getParagraphs(String record) {
if (contentObj.getType().isPresent() && contentObj.getContent().isPresent()) {
contentObj.getType().ifPresent(type -> {
contentObj.getContent().ifPresent(content -> {
if (WashingtonPostGenerator.CONTENT_TYPE_TAG.contains(type)) {
String sanityContent = WashingtonPostGenerator.removeTags(content);
if (WashingtonPostCollection.Document.CONTENT_TYPE_TAG.contains(type)) {
String sanityContent = removeTags(content);
if (sanityContent.trim().length() > 0) {
paragraphs.add(sanityContent);
}
Expand Down
2 changes: 1 addition & 1 deletion src/main/resources/regression/backgroundlinking18.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ input: collections/newswire/WashingtonPost.v2/data/
index_path: indexes/lucene-index.core18.pos+docvectors+raw
collection: WashingtonPostCollection
index_stats:
documents: 595037
documents: 595031
documents (non-empty): 595030
total terms: 318219945
topics:
Expand Down
2 changes: 1 addition & 1 deletion src/main/resources/regression/backgroundlinking19.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ input: collections/newswire/WashingtonPost.v2/data/
index_path: indexes/lucene-index.core18.pos+docvectors+raw
collection: WashingtonPostCollection
index_stats:
documents: 595037
documents: 595031
documents (non-empty): 595030
total terms: 318219945
topics:
Expand Down
Loading

0 comments on commit 35f9f82

Please sign in to comment.