Refactoring of WashingtonPost collection for Core18 (#1092)

Refactoring of WashingtonPost collection based on the clarified definition of contents() and raw() in SourceDocument, per #1048.
castorini · Apr 12, 2020 · 35f9f82 · 35f9f82
1 parent 585229f
commit 35f9f82
Show file tree

Hide file tree

Showing 13 changed files with 165 additions and 91 deletions.
diff --git a/docs/regressions-core18.md b/docs/regressions-core18.md
@@ -76,12 +76,12 @@ With the above commands, you should be able to replicate the following results:
 
 MAP                                     | BM25      | +RM3      | +Ax       | QL        | +RM3      | +Ax       |
 :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
-[TREC 2018 Common Core Track Topics](../src/main/resources/topics-and-qrels/topics.core18.txt)| 0.2495    | 0.3135    | 0.2925    | 0.2526    | 0.3073    | 0.2966    |
+[TREC 2018 Common Core Track Topics](../src/main/resources/topics-and-qrels/topics.core18.txt)| 0.2495    | 0.3135    | 0.2841    | 0.2526    | 0.3073    | 0.2919    |
 
 
 P30                                     | BM25      | +RM3      | +Ax       | QL        | +RM3      | +Ax       |
 :---------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
-[TREC 2018 Common Core Track Topics](../src/main/resources/topics-and-qrels/topics.core18.txt)| 0.3567    | 0.4200    | 0.4027    | 0.3653    | 0.4000    | 0.4060    |
+[TREC 2018 Common Core Track Topics](../src/main/resources/topics-and-qrels/topics.core18.txt)| 0.3567    | 0.4200    | 0.3947    | 0.3653    | 0.4000    | 0.4020    |
 
 ## Replication Log
 

diff --git a/src/main/java/io/anserini/collection/WashingtonPostCollection.java b/src/main/java/io/anserini/collection/WashingtonPostCollection.java
@@ -26,8 +26,12 @@
 import com.fasterxml.jackson.databind.ObjectMapper;
 import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
 import com.fasterxml.jackson.datatype.jdk8.Jdk8Module;
+import io.anserini.index.generator.WashingtonPostGenerator;
 import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.StringField;
+import org.jsoup.Jsoup;
 
 import java.io.BufferedReader;
 import java.io.FileInputStream;
@@ -103,45 +107,85 @@ private void parseRecord(String record) {
       bufferedRecord.articleUrl = wapoObj.getArticleUrl();
       bufferedRecord.author = wapoObj.getAuthor();
       bufferedRecord.obj = wapoObj;
-      bufferedRecord.content = record;
+      bufferedRecord.raw = record;
     }
   }
 
   /**
    * A document from the <a href="https://trec.nist.gov/data/wapost/">TREC Washington Post Corpus</a>.
    */
   public static class Document implements SourceDocument {
-    private static final Logger LOG = LogManager.getLogger(Document.class);
+    public static final List<String> CONTENT_TYPE_TAG = Arrays.asList("sanitized_html", "tweet");
 
     // Required fields
     protected String id;
     protected Optional<String> articleUrl;
     protected Optional<String> author;
     protected long publishDate;
     protected Optional<String> title;
-    protected String content;
+    protected String raw;
     protected WashingtonPostObject obj;
 
+    protected String fullCaption = null;
+    protected String kicker = null;
+
+    private String removeTags(String content) {
+      return Jsoup.parse(content).text();
+    }
+
     @Override
     public String id() {
       return id;
     }
 
     @Override
     public String contents() {
-      return content;
+      StringBuilder contentBuilder = new StringBuilder();
+      getTitle().ifPresent(title -> contentBuilder.append(title).append("\n"));
+
+      getObj().getContents().ifPresent(contents -> {
+        for (WashingtonPostObject.Content contentObj : contents) {
+          if (contentObj == null) continue;
+          if (contentObj.getType().isPresent() && contentObj.getContent().isPresent()) {
+            contentObj.getType().ifPresent(type -> {
+              contentObj.getContent().ifPresent(content -> {
+                if (CONTENT_TYPE_TAG.contains(type)) {
+                  contentBuilder.append(removeTags(content)).append("\n");
+                } else if (type.compareToIgnoreCase("kicker") == 0) {
+                  kicker = content;
+                  contentBuilder.append(content).append("\n");
+                }
+              });
+            });
+          }
+          contentObj.getFullCaption().ifPresent(caption -> {
+            fullCaption = contentObj.getFullCaption().get();
+            contentBuilder.append(removeTags(fullCaption)).append("\n");
+          });
+        }
+      });
+
+      return contentBuilder.toString();
     }
 
     @Override
     public String raw() {
-      return content;
+      return raw;
     }
 
     @Override
     public boolean indexable() {
       return true;
     }
-
+
+    public String getFullCaption() {
+      return fullCaption;
+    }
+
+    public String getKicker() {
+      return kicker;
+    }
+
     public Optional<String> getArticleUrl() {
       return articleUrl;
     }
@@ -158,10 +202,6 @@ public Optional<String> getTitle() {
       return title;
     }
 
-    public String getContent() {
-      return content;
-    }
-
     public WashingtonPostObject getObj() {
       return obj;
     }

diff --git a/src/main/java/io/anserini/index/IndexReaderUtils.java b/src/main/java/io/anserini/index/IndexReaderUtils.java
@@ -411,6 +411,34 @@ public static String documentContents(IndexReader reader, String docid) {
     }
   }
 
+  /**
+   * Returns the Lucene document based on some field besides its unique collection docid. For example, scientific
+   * articles might have DOIs.
+   *
+   * @param reader index reader
+   * @param field name of the field to match against
+   * @param query exact term value to look up in that field
+   * @return the first matching Lucene document, or null if nothing matches or an I/O error occurs
+   */
+  public static Document documentByField(IndexReader reader, String field, String query) {
+    try {
+      IndexSearcher searcher = new IndexSearcher(reader);
+      Query q = new TermQuery(new Term(field, query));
+      TopDocs rs = searcher.search(q, 1);
+      ScoreDoc[] hits = rs.scoreDocs;
+
+      if (hits == null || hits.length == 0) {
+        // No document matched the term: return null rather than raising an error
+        return null;
+      }
+
+      return reader.document(hits[0].doc);
+    } catch (IOException e) {
+      // Silently eat the error and return null
+      return null;
+    }
+  }
+
   /**
    * Computes the BM25 weight of a term (prior to analysis) in a particular document.
    *

diff --git a/src/main/java/io/anserini/index/generator/DefaultLuceneDocumentGenerator.java b/src/main/java/io/anserini/index/generator/DefaultLuceneDocumentGenerator.java
@@ -37,6 +37,9 @@
 public class DefaultLuceneDocumentGenerator<T extends SourceDocument> implements LuceneDocumentGenerator<T> {
   protected IndexArgs args;
 
+  protected DefaultLuceneDocumentGenerator() {
+  }
+
   /**
    * Constructor with config and counters
    *

diff --git a/src/main/java/io/anserini/index/generator/WashingtonPostGenerator.java b/src/main/java/io/anserini/index/generator/WashingtonPostGenerator.java
@@ -36,10 +36,7 @@
 /**
  * Converts a {@link WashingtonPostCollection.Document} into a Lucene {@link Document}, ready to be indexed.
  */
-public class WashingtonPostGenerator implements LuceneDocumentGenerator<WashingtonPostCollection.Document> {
-  public static final List<String> CONTENT_TYPE_TAG = Arrays.asList("sanitized_html", "tweet");
-
-  private IndexArgs args;
+public class WashingtonPostGenerator extends DefaultLuceneDocumentGenerator<WashingtonPostCollection.Document> {
 
   public enum WashingtonPostField {
     AUTHOR("author"),
@@ -55,28 +52,17 @@ public enum WashingtonPostField {
       name = s;
     }
   }
-  
+
   public WashingtonPostGenerator(IndexArgs args) {
-    this.args = args;
+    super.args = args;
   }
 
-  public static String removeTags(String content) {
-    return Jsoup.parse(content).text();
-  }
-
   @Override
   public Document createDocument(WashingtonPostCollection.Document src) throws GeneratorException {
-    String id = src.id();
-
-    if (src.contents().trim().isEmpty()) {
-      throw new EmptyDocumentException();
-    }
-
-    Document doc = new Document();
-    doc.add(new StringField(IndexArgs.ID, id, Field.Store.YES));
+    // Use the superclass to create a document with all the default fields.
+    Document doc = super.createDocument(src);
 
-    // This is needed to break score ties by docid.
-    doc.add(new SortedDocValuesField(IndexArgs.ID, new BytesRef(id)));
+    // Add additional fields that are specialized for the Washington Post
     doc.add(new LongPoint(WashingtonPostField.PUBLISHED_DATE.name, src.getPublishDate()));
     doc.add(new StoredField(WashingtonPostField.PUBLISHED_DATE.name, src.getPublishDate()));
 
@@ -87,55 +73,16 @@ public Document createDocument(WashingtonPostCollection.Document src) throws Gen
     src.getTitle().ifPresent(title ->
         doc.add(new StringField(WashingtonPostField.TITLE.name, title, Field.Store.NO)));
 
-    StringBuilder contentBuilder = new StringBuilder();
-    src.getTitle().ifPresent(title -> contentBuilder.append(title).append("\n"));
-
-    src.getObj().getContents().ifPresent(contents -> {
-      for (WashingtonPostObject.Content contentObj : contents) {
-        if (contentObj == null) continue;
-        if (contentObj.getType().isPresent() && contentObj.getContent().isPresent()) {
-          contentObj.getType().ifPresent(type -> {
-            contentObj.getContent().ifPresent(content -> {
-              if (CONTENT_TYPE_TAG.contains(type)) {
-                contentBuilder.append(removeTags(content)).append("\n");
-              } else if (type.compareToIgnoreCase("kicker") == 0) {
-                doc.add(new StringField(WashingtonPostField.KICKER.name, content, Field.Store.NO));
-                contentBuilder.append(content).append("\n");
-              }
-            });
-          });
-        }
-        contentObj.getFullCaption().ifPresent(caption -> {
-          String fullCaption = contentObj.getFullCaption().get();
-          doc.add(new StringField(WashingtonPostField.FULL_CAPTION.name, fullCaption, Field.Store.NO));
-          contentBuilder.append(removeTags(fullCaption)).append("\n");
-        });
-      }
-    });
-
-    if (args.storeRaw) { // store the raw json string as one single field
-      doc.add(new StoredField(IndexArgs.RAW, src.getContent()));
+    if (src.getKicker() != null) {
+      doc.add(new StringField(WashingtonPostGenerator.WashingtonPostField.KICKER.name,
+          src.getKicker(), Field.Store.NO));
     }
 
-    FieldType fieldType = new FieldType();
-
-    fieldType.setStored(args.storeContents);
-
-    // Are we storing document vectors?
-    if (args.storeDocvectors) {
-      fieldType.setStoreTermVectors(true);
-      fieldType.setStoreTermVectorPositions(true);
+    if (src.getFullCaption() != null) {
+      doc.add(new StringField(WashingtonPostGenerator.WashingtonPostField.FULL_CAPTION.name,
+          src.getFullCaption(), Field.Store.NO));
     }
 
-    // Are we building a "positional" or "count" index?
-    if (args.storePositions) {
-      fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
-    } else {
-      fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
-    }
-
-    doc.add(new Field(IndexArgs.CONTENTS, contentBuilder.toString(), fieldType));
-
     return doc;
   }
 }
diff --git a/src/main/java/io/anserini/search/SimpleSearcher.java b/src/main/java/io/anserini/search/SimpleSearcher.java
@@ -62,6 +62,7 @@
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.nio.file.Paths;
+import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
@@ -297,6 +298,10 @@ public Result[] search(String q, int k, long t) throws IOException {
     return search(query, queryTokens, q, k, t);
   }
 
+  public Result[] search(Query query, int k) throws IOException {
+    return search(query, null, null, k, -1);
+  }
+
   protected Result[] search(Query query, List<String> queryTokens, String queryString, int k,
                             long t) throws IOException {
     // Create an IndexSearch only once. Note that the object is thread safe.

diff --git a/src/main/java/io/anserini/search/topicreader/BackgroundLinkingTopicReader.java b/src/main/java/io/anserini/search/topicreader/BackgroundLinkingTopicReader.java
@@ -34,6 +34,7 @@
 import org.apache.lucene.search.ScoreDoc;
 import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.search.TopDocs;
+import org.jsoup.Jsoup;
 
 import java.io.BufferedReader;
 import java.io.IOException;
@@ -217,7 +218,11 @@ private static WashingtonPostCollection.Document.WashingtonPostObject getWapoObj
     }
     return wapoObj;
   }
-
+
+  private static String removeTags(String content) {
+    return Jsoup.parse(content).text();
+  }
+
   private static String getRawContents(String record) {
     WashingtonPostCollection.Document.WashingtonPostObject wapoObj = getWapoObj(record);
 
@@ -230,15 +235,15 @@ private static String getRawContents(String record) {
         if (contentObj.getType().isPresent() && contentObj.getContent().isPresent()) {
           contentObj.getType().ifPresent(type -> {
             contentObj.getContent().ifPresent(content -> {
-              if (WashingtonPostGenerator.CONTENT_TYPE_TAG.contains(type)) {
-                contentBuilder.append(WashingtonPostGenerator.removeTags(content)).append("\n");
+              if (WashingtonPostCollection.Document.CONTENT_TYPE_TAG.contains(type)) {
+                contentBuilder.append(removeTags(content)).append("\n");
               }
             });
           });
         }
         contentObj.getFullCaption().ifPresent(caption -> {
           String fullCaption = contentObj.getFullCaption().get();
-          contentBuilder.append(WashingtonPostGenerator.removeTags(fullCaption)).append("\n");
+          contentBuilder.append(removeTags(fullCaption)).append("\n");
         });
       }
     });
@@ -255,8 +260,8 @@ private static List<String> getParagraphs(String record) {
         if (contentObj.getType().isPresent() && contentObj.getContent().isPresent()) {
           contentObj.getType().ifPresent(type -> {
             contentObj.getContent().ifPresent(content -> {
-              if (WashingtonPostGenerator.CONTENT_TYPE_TAG.contains(type)) {
-                String sanityContent = WashingtonPostGenerator.removeTags(content);
+              if (WashingtonPostCollection.Document.CONTENT_TYPE_TAG.contains(type)) {
+                String sanityContent = removeTags(content);
                 if (sanityContent.trim().length() > 0) {
                   paragraphs.add(sanityContent);
                 }

diff --git a/src/main/resources/regression/backgroundlinking18.yaml b/src/main/resources/regression/backgroundlinking18.yaml
@@ -21,7 +21,7 @@ input: collections/newswire/WashingtonPost.v2/data/
 index_path: indexes/lucene-index.core18.pos+docvectors+raw
 collection: WashingtonPostCollection
 index_stats:
-  documents: 595037
+  documents: 595031
   documents (non-empty): 595030
   total terms: 318219945
 topics:

diff --git a/src/main/resources/regression/backgroundlinking19.yaml b/src/main/resources/regression/backgroundlinking19.yaml
@@ -21,7 +21,7 @@ input: collections/newswire/WashingtonPost.v2/data/
 index_path: indexes/lucene-index.core18.pos+docvectors+raw
 collection: WashingtonPostCollection
 index_stats:
-  documents: 595037
+  documents: 595031
   documents (non-empty): 595030
   total terms: 318219945
 topics: