COVID Passage Indexing (#1049)
nikhilro authored Mar 22, 2020
1 parent 99e9709 commit a90a0ce
Showing 6 changed files with 425 additions and 27 deletions.
1 change: 1 addition & 0 deletions README.md
@@ -90,6 +90,7 @@ For the most part, these runs are based on [_default_ parameter settings](https:
The experiments described below are not associated with rigorous end-to-end regression testing and thus provide a lower standard of replicability.
For the most part, manual copying and pasting of commands into a shell is required to replicate our results:

+ [Experiments on COVID-19 Open Research Dataset](docs/experiments-covid.md)
+ [Replicating "Neural Hype" Experiments](docs/experiments-forum2018.md)
+ [Guide to running BM25 baselines on the MS MARCO Passage Retrieval Task](docs/experiments-msmarco-passage.md)
+ [Guide to running BM25 baselines on the MS MARCO Document Retrieval Task](docs/experiments-msmarco-doc.md)
80 changes: 60 additions & 20 deletions docs/experiments-covid.md
@@ -7,30 +7,70 @@ This document describes the steps to index [COVID-19 Open Research Dataset](http
First, we need to download and extract the [COVID-19 Open Research Dataset](https://pages.semanticscholar.org/coronavirus-research):

```bash
- DATA_DIR=./covid
- mkdir ${DATA_DIR}
+ DATE=2020-03-20
+ DATA_DIR=./covid-"${DATE}"
+ mkdir "${DATA_DIR}"

- wget https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-03-20/comm_use_subset.tar.gz -P ${DATA_DIR}
- wget https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-03-20/noncomm_use_subset.tar.gz -P ${DATA_DIR}
- wget https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-03-20/custom_license.tar.gz -P ${DATA_DIR}
- wget https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-03-20/biorxiv_medrxiv.tar.gz -P ${DATA_DIR}
- wget https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-03-20/metadata.csv -P ${DATA_DIR}
+ wget https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/"${DATE}"/comm_use_subset.tar.gz -P "${DATA_DIR}"
+ wget https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/"${DATE}"/noncomm_use_subset.tar.gz -P "${DATA_DIR}"
+ wget https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/"${DATE}"/custom_license.tar.gz -P "${DATA_DIR}"
+ wget https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/"${DATE}"/biorxiv_medrxiv.tar.gz -P "${DATA_DIR}"
+ wget https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/"${DATE}"/metadata.csv -P "${DATA_DIR}"

- ls ${DATA_DIR}/*.tar.gz | xargs --replace tar -zxvf {} -C ${DATA_DIR}
+ ls "${DATA_DIR}"/*.tar.gz | xargs -I {} tar -zxvf {} -C "${DATA_DIR}"
+ # If the above doesn't work due to compatibility issues with xargs, untar each archive individually:
+ # tar -zxvf "${DATA_DIR}"/comm_use_subset.tar.gz -C "${DATA_DIR}"
+ # tar -zxvf "${DATA_DIR}"/noncomm_use_subset.tar.gz -C "${DATA_DIR}"
+ # tar -zxvf "${DATA_DIR}"/custom_license.tar.gz -C "${DATA_DIR}"
+ # tar -zxvf "${DATA_DIR}"/biorxiv_medrxiv.tar.gz -C "${DATA_DIR}"
```
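A quick sanity check on the download and extraction (a minimal sketch using standard tools; exact file counts vary by CORD-19 release):

```bash
# Confirm the metadata file is present and count the extracted full-text JSON files.
# The counts differ between releases, so treat the number only as a rough check.
ls -lh "${DATA_DIR}"/metadata.csv
find "${DATA_DIR}" -name '*.json' | wc -l
```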

- We can now index these docs as a `CovidCollection` using Anserini:
+ We can now index these docs using Anserini; we have three versions:

- ```bash
- sh target/appassembler/bin/IndexCollection \
-   -collection CovidCollection -generator CovidGenerator \
-   -threads 8 -input ${DATA_DIR} \
-   -index ${DATA_DIR}/lucene-index-covid \
-   -storePositions -storeDocvectors -storeRawDocs -storeTransformedDocs
- ```
+ * `CovidCollection` which adds `title` + `abstract` to Lucene Document's `content`

- The output message should be something like this:
+ ```bash
+ sh target/appassembler/bin/IndexCollection \
+   -collection CovidCollection -generator CovidGenerator \
+   -threads 8 -input "${DATA_DIR}" \
+   -index "${DATA_DIR}"/lucene-index-covid \
+   -storePositions -storeDocvectors -storeRawDocs -storeTransformedDocs
+ ```

- ```
- 2020-03-22 00:04:40,382 INFO [main] index.IndexCollection (IndexCollection.java:845) - Total 44,220 documents indexed in 00:05:06
- ```
+ The output message should be something like this:

+ ```bash
+ 2020-03-22 16:55:00,711 INFO [main] index.IndexCollection (IndexCollection.java:845) - Total 44,145 documents indexed in 00:01:07
+ ```

+ * `CovidFullTextCollection` which adds `title` + `abstract` + `full json text` to Lucene Document's `content`

+ ```bash
+ sh target/appassembler/bin/IndexCollection \
+   -collection CovidFullTextCollection -generator CovidGenerator \
+   -threads 8 -input "${DATA_DIR}" \
+   -index "${DATA_DIR}"/lucene-index-covid-full-text \
+   -storePositions -storeDocvectors -storeRawDocs -storeTransformedDocs
+ ```

+ The output message should be something like this:

+ ```bash
+ 2020-03-22 16:55:00,711 INFO [main] index.IndexCollection (IndexCollection.java:845) - Total 44,145 documents indexed in 00:01:07
+ ```

+ * `CovidParagraphCollection` which adds `title` + `abstract` + `paragraph number x` to Lucene Document's `content`. This produces multiple Lucene Documents per record, one for each paragraph in the record's full text, hence `paragraph number x` (a sketch of this splitting follows the expected output below).

+ ```bash
+ sh target/appassembler/bin/IndexCollection \
+   -collection CovidParagraphCollection -generator CovidGenerator \
+   -threads 8 -input "${DATA_DIR}" \
+   -index "${DATA_DIR}"/lucene-index-covid-paragraph \
+   -storePositions -storeDocvectors -storeRawDocs -storeTransformedDocs
+ ```

+ The output message should be something like this:

+ ```bash
+ 2020-03-22 15:24:49,305 INFO [main] index.IndexCollection (IndexCollection.java:845) - Total 1,096,241 documents indexed in 00:11:35
+ ```
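To make the per-paragraph behavior concrete, here is a hypothetical sketch of how one record's full text could be expanded into paragraph-level contents. It assumes the CORD-19 full-text JSON schema (a `body_text` array whose entries carry a `text` field) and uses Jackson for parsing; it is not the actual `CovidParagraphCollection` implementation.

```java
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;

import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;

public class ParagraphSplitSketch {
  // Returns one content string per paragraph: title + abstract + paragraph x.
  public static List<String> paragraphContents(String title, String abstractText, String fullTextJsonPath)
      throws Exception {
    JsonNode root = new ObjectMapper().readTree(Files.readAllBytes(Paths.get(fullTextJsonPath)));
    List<String> contents = new ArrayList<>();
    for (JsonNode paragraph : root.get("body_text")) {
      String content = title.replace("\n", " ");
      content += abstractText.isEmpty() ? "" : "\n" + abstractText;
      content += "\n" + paragraph.get("text").asText();  // paragraph number x
      contents.add(content);
    }
    return contents;
  }
}
```

Each returned string would back its own Lucene Document, which is why the paragraph index above ends up with roughly 1.1 million documents from about 44 thousand records.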
25 changes: 19 additions & 6 deletions src/main/java/io/anserini/collection/CovidCollection.java
@@ -105,26 +105,35 @@ public void close() {
*/
public class Document implements SourceDocument {
private String id;
- private String contents;
+ private String content;
+ private String raw;
private CSVRecord record;

public Document(CSVRecord record) {
id = Long.toString(record.getRecordNumber());
- contents = record.toString();
+ content = record.get("title").replace("\n", " ");
+ content += record.get("abstract").isEmpty() ? "" : "\n" + record.get("abstract");
this.record = record;

- // index full text into raw contents
+ String fullTextJson = "";
+ // index full text into raw content
if (record.get("has_full_text").contains("True")) {
String[] hashes = record.get("sha").split(";");
String fullTextPath = "/" + record.get("full_text_file") + "/" + hashes[hashes.length - 1].strip() + ".json";
try {
- String fullTextJson = new String(Files.readAllBytes(
+ fullTextJson = new String(Files.readAllBytes(
Paths.get(CovidCollection.this.path.toString() + fullTextPath)));
- contents += "\n " + fullTextJson;
} catch (IOException e) {
LOG.error("Error parsing file at " + fullTextPath);
+ raw = record.toString();
}
}

+ if (!fullTextJson.isEmpty()) {
+   raw = fullTextJson;
+ } else {
+   raw = record.toString();
+ }
}

@Override
@@ -134,14 +143,18 @@ public String id() {

@Override
public String content() {
- return contents;
+ return content;
}

@Override
public boolean indexable() {
return true;
}

+ public String raw() {
+   return raw;
+ }

public CSVRecord record() {
return record;
}
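For reference, the `content` built by the constructor above (title plus abstract) can be previewed outside Anserini with the same commons-csv calls. A minimal sketch; the class name and metadata path are illustrative, not part of this commit:

```java
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVRecord;

import java.io.FileReader;

public class PreviewCovidContent {
  public static void main(String[] args) throws Exception {
    try (CSVParser parser = new CSVParser(new FileReader("covid-2020-03-20/metadata.csv"),
        CSVFormat.DEFAULT.withFirstRecordAsHeader().withIgnoreHeaderCase().withTrim())) {
      int shown = 0;
      for (CSVRecord record : parser) {
        // Same composition as CovidCollection.Document: title, then abstract if present.
        String content = record.get("title").replace("\n", " ");
        content += record.get("abstract").isEmpty() ? "" : "\n" + record.get("abstract");
        System.out.println(record.getRecordNumber() + ": " + content);
        if (++shown == 3) {
          break;  // preview only the first few records
        }
      }
    }
  }
}
```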
163 changes: 163 additions & 0 deletions src/main/java/io/anserini/collection/CovidFullTextCollection.java
@@ -0,0 +1,163 @@
/*
* Anserini: A Lucene toolkit for replicable information retrieval research
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package io.anserini.collection;

import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVRecord;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Iterator;
import java.util.NoSuchElementException;
import java.util.Set;

/**
* A document collection for the CORD-19 dataset provided by Semantic Scholar.
*/
public class CovidFullTextCollection extends DocumentCollection<CovidFullTextCollection.Document> {
private static final Logger LOG = LogManager.getLogger(CovidFullTextCollection.class);

public CovidFullTextCollection(Path path){
this.path = path;
this.allowedFileSuffix = Set.of(".csv");
}

@Override
public FileSegment<CovidFullTextCollection.Document> createFileSegment(Path p) throws IOException {
return new Segment(p);
}

/**
* A file containing a single CSV document.
*/
public class Segment extends FileSegment<CovidFullTextCollection.Document> {
CSVParser csvParser = null;
private CSVRecord record = null;
private Iterator<CSVRecord> iterator = null; // iterator for CSV records

public Segment(Path path) throws IOException {
super(path);
bufferedReader = new BufferedReader(new InputStreamReader(
new FileInputStream(path.toString())));

csvParser = new CSVParser(bufferedReader, CSVFormat.DEFAULT
.withFirstRecordAsHeader()
.withIgnoreHeaderCase()
.withTrim());

iterator = csvParser.iterator();
if (iterator.hasNext()) {
record = iterator.next();
}
}

@Override
public void readNext() throws NoSuchElementException {
if (record == null) {
throw new NoSuchElementException("Record is empty");
} else {
bufferedRecord = new CovidFullTextCollection.Document(record);
if (iterator.hasNext()) { // if CSV contains more lines, we parse the next record
record = iterator.next();
} else {
atEOF = true; // there are no more CSV records in the bufferedReader
}
}
}

@Override
public void close() {
super.close();
if (csvParser != null) {
try {
csvParser.close();
} catch (IOException e) {
// do nothing
}
}
}
}

/**
* A document in a CORD-19 collection.
*/
public class Document implements SourceDocument {
private String id;
private String content;
private String raw;
private CSVRecord record;

public Document(CSVRecord record) {
id = Long.toString(record.getRecordNumber());
content = record.get("title").replace("\n", " ");
content += record.get("abstract").isEmpty() ? "" : "\n" + record.get("abstract");
this.record = record;

String fullTextJson = "";
// index full text into raw content
if (record.get("has_full_text").contains("True")) {
String[] hashes = record.get("sha").split(";");
String fullTextPath = "/" + record.get("full_text_file") + "/" + hashes[hashes.length - 1].strip() + ".json";
try {
fullTextJson = new String(Files.readAllBytes(
Paths.get(CovidFullTextCollection.this.path.toString() + fullTextPath)));
} catch (IOException e) {
LOG.error("Error parsing file at " + fullTextPath);
raw = record.toString();
}
}

if (!fullTextJson.isEmpty()) {
content += fullTextJson.isEmpty() ? "" : "\n " + fullTextJson;
raw = fullTextJson;
} else {
raw = record.toString();
}
}

@Override
public String id() {
return id;
}

@Override
public String content() {
return content;
}

@Override
public boolean indexable() {
return true;
}

public String raw() {
return raw;
}

public CSVRecord record() {
return record;
}
}
}
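Summarizing the constructor logic above: `content` always starts with the title and abstract, the full-text JSON is appended when it is available, and `raw` falls back to the CSV record string when it is not. A compact restatement (the helper below is illustrative, not part of the class):

```java
// Illustrative restatement of the content/raw rules in CovidFullTextCollection.Document.
static String[] contentAndRaw(String title, String abstractText, String fullTextJson, String csvRecord) {
  String content = title.replace("\n", " ");
  content += abstractText.isEmpty() ? "" : "\n" + abstractText;
  if (!fullTextJson.isEmpty()) {
    content += "\n " + fullTextJson;               // full text goes into content...
    return new String[] {content, fullTextJson};   // ...and becomes the raw form
  }
  return new String[] {content, csvRecord};        // otherwise raw is the CSV record string
}
```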