From 75c88507a0fa14d77ae95a29152d5789469b4b5e Mon Sep 17 00:00:00 2001
From: Nikhil Gupta <niks.gupta09@gmail.com>
Date: Sun, 22 Mar 2020 15:03:59 +0000
Subject: [PATCH 1/3] Passage Indexing Works. Content contain paragraph. Raw
 contains full text

---
 docs/experiments-covid.md                     | 32 +++++---
 .../anserini/collection/CovidCollection.java  | 79 +++++++++++++------
 .../index/generator/CovidGenerator.java       |  3 +-
 3 files changed, 75 insertions(+), 39 deletions(-)

diff --git a/docs/experiments-covid.md b/docs/experiments-covid.md
index a13440f537..2eadfa7fb6 100644
--- a/docs/experiments-covid.md
+++ b/docs/experiments-covid.md
@@ -7,16 +7,22 @@ This document describes the steps to index [COVID-19 Open Research Dataset](http
 First, we need to download and extract the [COVID-19 Open Research Dataset](https://pages.semanticscholar.org/coronavirus-research):
 
 ```bash
-DATA_DIR=./covid
-mkdir ${DATA_DIR}
-
-wget https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-03-20/comm_use_subset.tar.gz -P ${DATA_DIR}
-wget https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-03-20/noncomm_use_subset.tar.gz -P ${DATA_DIR}
-wget https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-03-20/custom_license.tar.gz -P ${DATA_DIR}
-wget https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-03-20/biorxiv_medrxiv.tar.gz -P ${DATA_DIR}
-wget https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-03-20/metadata.csv -P ${DATA_DIR}
-
-ls ${DATA_DIR}/*.tar.gz | xargs --replace tar -zxvf {} -C ${DATA_DIR}
+DATE=2020-03-20
+DATA_DIR=./covid-"${DATE}"
+mkdir "${DATA_DIR}"
+
+wget https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/"${DATE}"/comm_use_subset.tar.gz -P "${DATA_DIR}"
+wget https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/"${DATE}"/noncomm_use_subset.tar.gz -P "${DATA_DIR}"
+wget https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/"${DATE}"/custom_license.tar.gz -P "${DATA_DIR}"
+wget https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/"${DATE}"/biorxiv_medrxiv.tar.gz -P "${DATA_DIR}"
+wget https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/"${DATE}"/metadata.csv -P "${DATA_DIR}"
+
+ls "${DATA_DIR}"/*.tar.gz | xargs -I {} tar -zxvf {} -C "${DATA_DIR}"
+# If the above doesn't work due to cross compatibility issues with xargs, untar all folders individually
+# tar -zxvf "${DATA_DIR}"/comm_use_subset.tar.gz -C "${DATA_DIR}"
+# tar -zxvf "${DATA_DIR}"/noncomm_use_subset.tar.gz -C "${DATA_DIR}"
+# tar -zxvf "${DATA_DIR}"/custom_license.tar.gz -C "${DATA_DIR}"
+# tar -zxvf "${DATA_DIR}"/biorxiv_medrxiv.tar.gz -C "${DATA_DIR}"
 ```
 
 We can now index these docs as a `CovidCollection` using Anserini:
@@ -24,13 +30,13 @@ We can now index these docs as a `CovidCollection` using Anserini:
 ```bash
 sh target/appassembler/bin/IndexCollection \
   -collection CovidCollection -generator CovidGenerator \
-  -threads 8 -input ${DATA_DIR} \
-  -index ${DATA_DIR}/lucene-index-covid \
+  -threads 8 -input "${DATA_DIR}" \
+  -index "${DATA_DIR}"/lucene-index-covid \
   -storePositions -storeDocvectors -storeRawDocs -storeTransformedDocs
 ```
 
 The output message should be something like this:
 
-```
+```bash
 2020-03-22 00:04:40,382 INFO  [main] index.IndexCollection (IndexCollection.java:845) - Total 44,220 documents indexed in 00:05:06
 ```
diff --git a/src/main/java/io/anserini/collection/CovidCollection.java b/src/main/java/io/anserini/collection/CovidCollection.java
index b288c85977..047ec950a6 100644
--- a/src/main/java/io/anserini/collection/CovidCollection.java
+++ b/src/main/java/io/anserini/collection/CovidCollection.java
@@ -24,6 +24,7 @@
 
 import java.io.BufferedReader;
 import java.io.FileInputStream;
+import java.io.FileReader;
 import java.io.IOException;
 import java.io.InputStreamReader;
 import java.nio.file.Files;
@@ -33,6 +34,9 @@
 import java.util.NoSuchElementException;
 import java.util.Set;
 
+import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.ObjectMapper;
+
 /**
  * A document collection for the CORD-19 dataset provided by Semantic Scholar.
  */
@@ -56,6 +60,9 @@ public class Segment extends FileSegment<CovidCollection.Document> {
     CSVParser csvParser = null;
     private CSVRecord record = null;
     private Iterator<CSVRecord> iterator = null; // iterator for CSV records
+    private String recordFullText = "";
+    private Iterator<JsonNode> paragraphIterator = null; // iterator for paragraphs in a CSV record
+    private Integer paragraphNumber = 0;
 
     public Segment(Path path) throws IOException {
       super(path);
@@ -75,17 +82,36 @@ record = iterator.next();
 
     @Override
     public void readNext() throws NoSuchElementException {
-      if (record == null) {
-        throw new NoSuchElementException("Record is empty");
-      } else {
-        bufferedRecord = new CovidCollection.Document(record);
-        if (iterator.hasNext()) { // if CSV contains more lines, we parse the next record
-          record = iterator.next();
+      if (paragraphIterator != null && paragraphIterator.hasNext()) { // if the record contains more paragraphs, we parse them
+        String paragraph = paragraphIterator.next().get("text").asText();
+        paragraphNumber += 1;
+        bufferedRecord = new CovidCollection.Document(record, recordFullText, paragraph, paragraphNumber);
+      } else if (iterator.hasNext()) { // if CSV contains more lines, we parse the next record
+        record = iterator.next();
+        if (record.get("has_full_text").contains("True")) {
+          String[] hashes = record.get("sha").split(";");
+          String fullTextPath = "/" + record.get("full_text_file") + "/" + hashes[hashes.length - 1].strip() + ".json";
+          try {
+            String recordFullTextPath = CovidCollection.this.path.toString() + fullTextPath;
+            recordFullText = new String(Files.readAllBytes(Paths.get(recordFullTextPath)));
+            FileReader recordFullTextFileReader = new FileReader(recordFullTextPath);
+            ObjectMapper mapper = new ObjectMapper();
+            JsonNode recordJsonNode = mapper.readerFor(JsonNode.class).readTree(recordFullTextFileReader);
+            paragraphIterator = recordJsonNode.get("body_text").elements(); 
+            
+          } catch (IOException e) {
+            LOG.error("Error parsing file at " + fullTextPath + "\n" + e.getMessage());
+          }
         } else {
-          atEOF = true; // there is no more JSON object in the bufferedReader
+          paragraphIterator = null;
+          recordFullText = "";
         }
-      }
+        paragraphNumber = 0;
+        bufferedRecord = new CovidCollection.Document(record, recordFullText);
+    } else {
+      throw new NoSuchElementException("Reached end of CSVRecord Entries Iterator");
     }
+  }
 
     @Override
     public void close() {
@@ -105,26 +131,25 @@ public void close() {
    */
   public class Document implements SourceDocument {
     private String id;
-    private String contents;
+    private String content;
+    private String raw;
     private CSVRecord record;
 
-    public Document(CSVRecord record) {
-      id = Long.toString(record.getRecordNumber());
-      contents = record.toString();
+    public Document(CSVRecord record, String recordFullText, String paragraph, Integer paragraphNumber) {
+      if (paragraphNumber == 0) {
+        id = Long.toString(record.getRecordNumber());
+      } else {
+        id = Long.toString(record.getRecordNumber()) + "." + String.format("%04d", paragraphNumber);
+      }
+      content = record.get("title").replace("\n", " ");
+      content += record.get("abstract").isEmpty() ? "" : "\n" + record.get("abstract");
+      content += paragraph.isEmpty() ? "" : "\n" + paragraph;
+      this.raw = recordFullText;
       this.record = record;
+    }
 
-      // index full text into raw contents
-      if (record.get("has_full_text").contains("True")) {
-        String[] hashes = record.get("sha").split(";");
-        String fullTextPath = "/" + record.get("full_text_file") + "/" + hashes[hashes.length - 1].strip() + ".json";
-        try {
-          String fullTextJson = new String(Files.readAllBytes(
-            Paths.get(CovidCollection.this.path.toString() + fullTextPath)));
-          contents += "\n " + fullTextJson;
-        } catch (IOException e) {
-          LOG.error("Error parsing file at " + fullTextPath);
-        }
-      }
+    public Document(CSVRecord record, String recordFullText) {
+      this(record, recordFullText, "", 0);
     }
 
     @Override
@@ -134,7 +159,11 @@ public String id() {
 
     @Override
     public String content() {
-      return contents;
+      return content;
+    }
+
+    public String raw() {
+      return raw;
     }
 
     @Override
diff --git a/src/main/java/io/anserini/index/generator/CovidGenerator.java b/src/main/java/io/anserini/index/generator/CovidGenerator.java
index d66102f892..2069a5d51b 100644
--- a/src/main/java/io/anserini/index/generator/CovidGenerator.java
+++ b/src/main/java/io/anserini/index/generator/CovidGenerator.java
@@ -75,6 +75,7 @@ public CovidGenerator(IndexArgs args, IndexCollection.Counters counters) {
   public Document createDocument(CovidCollection.Document covidDoc) {
     String id = covidDoc.id();
     String content = covidDoc.content();
+    String raw = covidDoc.raw();
 
     if (content == null || content.trim().isEmpty()) {
       counters.empty.incrementAndGet();
@@ -89,7 +90,7 @@ public Document createDocument(CovidCollection.Document covidDoc) {
     doc.add(new SortedDocValuesField(IndexArgs.ID, new BytesRef(id)));
 
     if (args.storeRawDocs) {
-      doc.add(new StoredField(IndexArgs.RAW, content));
+      doc.add(new StoredField(IndexArgs.RAW, raw));
     }
 
     FieldType fieldType = new FieldType();

From adc1aa30a5ef059e5393591a0db290935f57b070 Mon Sep 17 00:00:00 2001
From: Nikhil Gupta <niks.gupta09@gmail.com>
Date: Sun, 22 Mar 2020 15:06:12 +0000
Subject: [PATCH 2/3] Readme edit to link covid doc. Change to raw for passage
 docs. Only 1st one contains it now

---
 README.md                                        |  1 +
 docs/experiments-covid.md                        |  2 +-
 .../io/anserini/collection/CovidCollection.java  | 16 +++++++++-------
 3 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index ba2bb63556..48f00f7443 100644
--- a/README.md
+++ b/README.md
@@ -90,6 +90,7 @@ For the most part, these runs are based on [_default_ parameter settings](https:
 The experiments described below are not associated with rigorous end-to-end regression testing and thus provide a lower standard of replicability.
 For the most part, manual copying and pasting of commands into a shell is required to replicate our results:
 
++ [Experiments on COVID-19 Open Research Dataset](docs/experiments-covid.md)
 + [Replicating "Neural Hype" Experiments](docs/experiments-forum2018.md)
 + [Guide to running BM25 baselines on the MS MARCO Passage Retrieval Task](docs/experiments-msmarco-passage.md)
 + [Guide to running BM25 baselines on the MS MARCO Document Retrieval Task](docs/experiments-msmarco-doc.md)
diff --git a/docs/experiments-covid.md b/docs/experiments-covid.md
index 2eadfa7fb6..4bec7bda07 100644
--- a/docs/experiments-covid.md
+++ b/docs/experiments-covid.md
@@ -38,5 +38,5 @@ sh target/appassembler/bin/IndexCollection \
 The output message should be something like this:
 
 ```bash
-2020-03-22 00:04:40,382 INFO  [main] index.IndexCollection (IndexCollection.java:845) - Total 44,220 documents indexed in 00:05:06
+2020-03-22 15:24:49,305 INFO  [main] index.IndexCollection (IndexCollection.java:845) - Total 1,096,241 documents indexed in 00:11:35
 ```
diff --git a/src/main/java/io/anserini/collection/CovidCollection.java b/src/main/java/io/anserini/collection/CovidCollection.java
index 047ec950a6..1908419e27 100644
--- a/src/main/java/io/anserini/collection/CovidCollection.java
+++ b/src/main/java/io/anserini/collection/CovidCollection.java
@@ -60,7 +60,6 @@ public class Segment extends FileSegment<CovidCollection.Document> {
     CSVParser csvParser = null;
     private CSVRecord record = null;
     private Iterator<CSVRecord> iterator = null; // iterator for CSV records
-    private String recordFullText = "";
     private Iterator<JsonNode> paragraphIterator = null; // iterator for paragraphs in a CSV record
     private Integer paragraphNumber = 0;
 
@@ -85,9 +84,10 @@ public void readNext() throws NoSuchElementException {
       if (paragraphIterator != null && paragraphIterator.hasNext()) { // if the record contains more paragraphs, we parse them
         String paragraph = paragraphIterator.next().get("text").asText();
         paragraphNumber += 1;
-        bufferedRecord = new CovidCollection.Document(record, recordFullText, paragraph, paragraphNumber);
+        bufferedRecord = new CovidCollection.Document(record, paragraph, paragraphNumber);
       } else if (iterator.hasNext()) { // if CSV contains more lines, we parse the next record
         record = iterator.next();
+        String recordFullText = "";
         if (record.get("has_full_text").contains("True")) {
           String[] hashes = record.get("sha").split(";");
           String fullTextPath = "/" + record.get("full_text_file") + "/" + hashes[hashes.length - 1].strip() + ".json";
@@ -97,14 +97,12 @@ record = iterator.next();
             FileReader recordFullTextFileReader = new FileReader(recordFullTextPath);
             ObjectMapper mapper = new ObjectMapper();
             JsonNode recordJsonNode = mapper.readerFor(JsonNode.class).readTree(recordFullTextFileReader);
-            paragraphIterator = recordJsonNode.get("body_text").elements(); 
-            
+            paragraphIterator = recordJsonNode.get("body_text").elements();
           } catch (IOException e) {
             LOG.error("Error parsing file at " + fullTextPath + "\n" + e.getMessage());
           }
         } else {
           paragraphIterator = null;
-          recordFullText = "";
         }
         paragraphNumber = 0;
         bufferedRecord = new CovidCollection.Document(record, recordFullText);
@@ -135,7 +133,7 @@ public class Document implements SourceDocument {
     private String raw;
     private CSVRecord record;
 
-    public Document(CSVRecord record, String recordFullText, String paragraph, Integer paragraphNumber) {
+    public Document(CSVRecord record, String paragraph, Integer paragraphNumber, String recordFullText) {
       if (paragraphNumber == 0) {
         id = Long.toString(record.getRecordNumber());
       } else {
@@ -148,8 +146,12 @@ public Document(CSVRecord record, String recordFullText, String paragraph, Integ
       this.record = record;
     }
 
+    public Document(CSVRecord record, String paragraph, Integer paragraphNumber) {
+      this(record, paragraph, paragraphNumber, "");
+    }
+
     public Document(CSVRecord record, String recordFullText) {
-      this(record, recordFullText, "", 0);
+      this(record, "", 0, recordFullText);
     }
 
     @Override

From 5aafcd6cde4c427a2d4ca6af8f79e2a98d6a7812 Mon Sep 17 00:00:00 2001
From: Nikhil Gupta <niks.gupta09@gmail.com>
Date: Sun, 22 Mar 2020 16:44:24 +0000
Subject: [PATCH 3/3] CR. Add 3 versions of Covid Collections to support
 different indexing techniques

---
 docs/experiments-covid.md                     |  58 ++++--
 .../anserini/collection/CovidCollection.java  |  82 ++++----
 .../collection/CovidFullTextCollection.java   | 163 ++++++++++++++++
 .../collection/CovidParagraphCollection.java  | 180 ++++++++++++++++++
 4 files changed, 421 insertions(+), 62 deletions(-)
 create mode 100644 src/main/java/io/anserini/collection/CovidFullTextCollection.java
 create mode 100644 src/main/java/io/anserini/collection/CovidParagraphCollection.java

diff --git a/docs/experiments-covid.md b/docs/experiments-covid.md
index 4bec7bda07..9ed6f8e7ba 100644
--- a/docs/experiments-covid.md
+++ b/docs/experiments-covid.md
@@ -25,18 +25,52 @@ ls "${DATA_DIR}"/*.tar.gz | xargs -I {} tar -zxvf {} -C "${DATA_DIR}"
 # tar -zxvf "${DATA_DIR}"/biorxiv_medrxiv.tar.gz -C "${DATA_DIR}"
 ```
 
-We can now index these docs as a `CovidCollection` using Anserini:
+We can now index these docs using Anserini; there are three collection variants:
 
-```bash
-sh target/appassembler/bin/IndexCollection \
-  -collection CovidCollection -generator CovidGenerator \
-  -threads 8 -input "${DATA_DIR}" \
-  -index "${DATA_DIR}"/lucene-index-covid \
-  -storePositions -storeDocvectors -storeRawDocs -storeTransformedDocs
-```
+* `CovidCollection` which adds `title` + `abstract` to Lucene Document's `content`
 
-The output message should be something like this:
+  ```bash
+  sh target/appassembler/bin/IndexCollection \
+    -collection CovidCollection -generator CovidGenerator \
+    -threads 8 -input "${DATA_DIR}" \
+    -index "${DATA_DIR}"/lucene-index-covid \
+    -storePositions -storeDocvectors -storeRawDocs -storeTransformedDocs
+  ```
 
-```bash
-2020-03-22 15:24:49,305 INFO  [main] index.IndexCollection (IndexCollection.java:845) - Total 1,096,241 documents indexed in 00:11:35
-```
+  The output message should be something like this:
+
+  ```bash
+  2020-03-22 16:55:00,711 INFO  [main] index.IndexCollection (IndexCollection.java:845) - Total 44,145 documents indexed in 00:01:07
+  ```
+
+* `CovidFullTextCollection` which adds `title` + `abstract` + `full json text` to Lucene Document's `content`
+
+  ```bash
+  sh target/appassembler/bin/IndexCollection \
+    -collection CovidFullTextCollection -generator CovidGenerator \
+    -threads 8 -input "${DATA_DIR}" \
+    -index "${DATA_DIR}"/lucene-index-covid-full-text \
+    -storePositions -storeDocvectors -storeRawDocs -storeTransformedDocs
+  ```
+
+  The output message should be something like this:
+
+  ```bash
+  2020-03-22 16:55:00,711 INFO  [main] index.IndexCollection (IndexCollection.java:845) - Total 44,145 documents indexed in 00:01:07
+  ```
+
+* `CovidParagraphCollection` which adds `title` + `abstract` + `paragraph number x` to Lucene Document's `content`. Each record produces multiple Lucene Documents: specifically, one for each paragraph in the record's full text, hence `paragraph number x`.
+
+  ```bash
+  sh target/appassembler/bin/IndexCollection \
+    -collection CovidParagraphCollection -generator CovidGenerator \
+    -threads 8 -input "${DATA_DIR}" \
+    -index "${DATA_DIR}"/lucene-index-covid-paragraph \
+    -storePositions -storeDocvectors -storeRawDocs -storeTransformedDocs
+  ```
+
+  The output message should be something like this:
+
+  ```bash
+  2020-03-22 15:24:49,305 INFO  [main] index.IndexCollection (IndexCollection.java:845) - Total 1,096,241 documents indexed in 00:11:35
+  ```
diff --git a/src/main/java/io/anserini/collection/CovidCollection.java b/src/main/java/io/anserini/collection/CovidCollection.java
index 1908419e27..edd009ce8c 100644
--- a/src/main/java/io/anserini/collection/CovidCollection.java
+++ b/src/main/java/io/anserini/collection/CovidCollection.java
@@ -24,7 +24,6 @@
 
 import java.io.BufferedReader;
 import java.io.FileInputStream;
-import java.io.FileReader;
 import java.io.IOException;
 import java.io.InputStreamReader;
 import java.nio.file.Files;
@@ -34,9 +33,6 @@
 import java.util.NoSuchElementException;
 import java.util.Set;
 
-import com.fasterxml.jackson.databind.JsonNode;
-import com.fasterxml.jackson.databind.ObjectMapper;
-
 /**
  * A document collection for the CORD-19 dataset provided by Semantic Scholar.
  */
@@ -60,8 +56,6 @@ public class Segment extends FileSegment<CovidCollection.Document> {
     CSVParser csvParser = null;
     private CSVRecord record = null;
     private Iterator<CSVRecord> iterator = null; // iterator for CSV records
-    private Iterator<JsonNode> paragraphIterator = null; // iterator for paragraphs in a CSV record
-    private Integer paragraphNumber = 0;
 
     public Segment(Path path) throws IOException {
       super(path);
@@ -81,35 +75,17 @@ record = iterator.next();
 
     @Override
     public void readNext() throws NoSuchElementException {
-      if (paragraphIterator != null && paragraphIterator.hasNext()) { // if the record contains more paragraphs, we parse them
-        String paragraph = paragraphIterator.next().get("text").asText();
-        paragraphNumber += 1;
-        bufferedRecord = new CovidCollection.Document(record, paragraph, paragraphNumber);
-      } else if (iterator.hasNext()) { // if CSV contains more lines, we parse the next record
-        record = iterator.next();
-        String recordFullText = "";
-        if (record.get("has_full_text").contains("True")) {
-          String[] hashes = record.get("sha").split(";");
-          String fullTextPath = "/" + record.get("full_text_file") + "/" + hashes[hashes.length - 1].strip() + ".json";
-          try {
-            String recordFullTextPath = CovidCollection.this.path.toString() + fullTextPath;
-            recordFullText = new String(Files.readAllBytes(Paths.get(recordFullTextPath)));
-            FileReader recordFullTextFileReader = new FileReader(recordFullTextPath);
-            ObjectMapper mapper = new ObjectMapper();
-            JsonNode recordJsonNode = mapper.readerFor(JsonNode.class).readTree(recordFullTextFileReader);
-            paragraphIterator = recordJsonNode.get("body_text").elements();
-          } catch (IOException e) {
-            LOG.error("Error parsing file at " + fullTextPath + "\n" + e.getMessage());
-          }
+      if (record == null) {
+        throw new NoSuchElementException("Record is empty");
+      } else {
+        bufferedRecord = new CovidCollection.Document(record);
+        if (iterator.hasNext()) { // if CSV contains more lines, we parse the next record
+          record = iterator.next();
         } else {
-          paragraphIterator = null;
+          atEOF = true; // there are no more CSV records to read
         }
-        paragraphNumber = 0;
-        bufferedRecord = new CovidCollection.Document(record, recordFullText);
-    } else {
-      throw new NoSuchElementException("Reached end of CSVRecord Entries Iterator");
+      }
     }
-  }
 
     @Override
     public void close() {
@@ -133,25 +109,31 @@ public class Document implements SourceDocument {
     private String raw;
     private CSVRecord record;
 
-    public Document(CSVRecord record, String paragraph, Integer paragraphNumber, String recordFullText) {
-      if (paragraphNumber == 0) {
-        id = Long.toString(record.getRecordNumber());
-      } else {
-        id = Long.toString(record.getRecordNumber()) + "." + String.format("%04d", paragraphNumber);
-      }
+    public Document(CSVRecord record) {
+      id = Long.toString(record.getRecordNumber());
       content = record.get("title").replace("\n", " ");
       content += record.get("abstract").isEmpty() ? "" : "\n" + record.get("abstract");
-      content += paragraph.isEmpty() ? "" : "\n" + paragraph;
-      this.raw = recordFullText;
       this.record = record;
-    }
 
-    public Document(CSVRecord record, String paragraph, Integer paragraphNumber) {
-      this(record, paragraph, paragraphNumber, "");
-    }
+      String fullTextJson = "";
+      // index full text into raw content
+      if (record.get("has_full_text").contains("True")) {
+        String[] hashes = record.get("sha").split(";");
+        String fullTextPath = "/" + record.get("full_text_file") + "/" + hashes[hashes.length - 1].strip() + ".json";
+        try {
+          fullTextJson = new String(Files.readAllBytes(
+            Paths.get(CovidCollection.this.path.toString() + fullTextPath)));
+        } catch (IOException e) {
+          LOG.error("Error parsing file at " + fullTextPath);
+          raw = record.toString();
+        }
+      }
 
-    public Document(CSVRecord record, String recordFullText) {
-      this(record, "", 0, recordFullText);
+      if (!fullTextJson.isEmpty()) {
+        raw = fullTextJson;
+      } else {
+        raw = record.toString();
+      }
     }
 
     @Override
@@ -164,15 +146,15 @@ public String content() {
       return content;
     }
 
-    public String raw() {
-      return raw;
-    }
-
     @Override
     public boolean indexable() {
       return true;
     }
 
+    public String raw() {
+      return raw;
+    }
+
     public CSVRecord record() {
       return record;
     }
diff --git a/src/main/java/io/anserini/collection/CovidFullTextCollection.java b/src/main/java/io/anserini/collection/CovidFullTextCollection.java
new file mode 100644
index 0000000000..da11e485b5
--- /dev/null
+++ b/src/main/java/io/anserini/collection/CovidFullTextCollection.java
@@ -0,0 +1,163 @@
+/*
+ * Anserini: A Lucene toolkit for replicable information retrieval research
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package io.anserini.collection;
+
+import org.apache.commons.csv.CSVFormat;
+import org.apache.commons.csv.CSVParser;
+import org.apache.commons.csv.CSVRecord;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+
+import java.io.BufferedReader;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.Iterator;
+import java.util.NoSuchElementException;
+import java.util.Set;
+
+/**
+ * A document collection for the CORD-19 dataset provided by Semantic Scholar.
+ */
+public class CovidFullTextCollection extends DocumentCollection<CovidFullTextCollection.Document> {
+  private static final Logger LOG = LogManager.getLogger(CovidFullTextCollection.class);
+
+  public CovidFullTextCollection(Path path){
+    this.path = path;
+    this.allowedFileSuffix = Set.of(".csv");
+  }
+
+  @Override
+  public FileSegment<CovidFullTextCollection.Document> createFileSegment(Path p) throws IOException {
+    return new Segment(p);
+  }
+
+  /**
+   * A segment backed by one CSV file; each CSV record in it yields one document.
+   */
+  public class Segment extends FileSegment<CovidFullTextCollection.Document> {
+    CSVParser csvParser = null;
+    private CSVRecord record = null; // next record to emit; stays null when the CSV has no records
+    private Iterator<CSVRecord> iterator = null; // iterator for CSV records
+
+    /** Opens the CSV at path, decoded as UTF-8 with its first record as header, and pre-fetches one record. */
+    public Segment(Path path) throws IOException {
+      super(path);
+      // Decode explicitly as UTF-8 instead of relying on the platform default charset.
+      bufferedReader = new BufferedReader(new InputStreamReader(
+          new FileInputStream(path.toString()), java.nio.charset.StandardCharsets.UTF_8));
+      csvParser = new CSVParser(bufferedReader,
+          CSVFormat.DEFAULT.withFirstRecordAsHeader().withIgnoreHeaderCase().withTrim());
+      iterator = csvParser.iterator();
+      if (iterator.hasNext()) {
+        record = iterator.next();
+      }
+    }
+
+    /** Buffers a document for the pending record, then pre-fetches the next record or sets atEOF. */
+    @Override
+    public void readNext() throws NoSuchElementException {
+      if (record == null) {
+        throw new NoSuchElementException("Record is empty");
+      } else {
+        bufferedRecord = new CovidFullTextCollection.Document(record);
+        if (iterator.hasNext()) { // if CSV contains more lines, we parse the next record
+          record = iterator.next();
+        } else {
+          atEOF = true; // there are no more CSV records to read
+        }
+      }
+    }
+
+    /** Calls the superclass close, then closes the CSV parser if one was created. */
+    @Override
+    public void close() {
+      super.close();
+      if (csvParser != null) {
+        try {
+          csvParser.close();
+        } catch (IOException e) {
+          // best-effort close: a failure here is deliberately ignored
+        }
+      }
+    }
+  }
+
+  /**
+   * A document in a CORD-19 collection.
+   */
+  public class Document implements SourceDocument {
+    private String id;
+    private String content;
+    private String raw;
+    private CSVRecord record;
+
+    /**
+     * Builds a document from one CSV record: content is title + abstract, plus the full-text JSON when readable.
+     * raw holds that JSON, or the record's string form when there is no full text or it cannot be read.
+     */
+    public Document(CSVRecord record) {
+      id = Long.toString(record.getRecordNumber());
+      content = record.get("title").replace("\n", " ");
+      content += record.get("abstract").isEmpty() ? "" : "\n" + record.get("abstract");
+      this.record = record;
+      String fullTextJson = "";
+      if (record.get("has_full_text").contains("True")) {
+        String[] hashes = record.get("sha").split(";"); // "sha" may list several hashes; the last one is used
+        String fullTextPath = "/" + record.get("full_text_file") + "/" + hashes[hashes.length - 1].strip() + ".json";
+        try {
+          Path jsonFile = Paths.get(CovidFullTextCollection.this.path.toString() + fullTextPath);
+          fullTextJson = new String(Files.readAllBytes(jsonFile), java.nio.charset.StandardCharsets.UTF_8);
+        } catch (IOException e) {
+          LOG.error("Error parsing file at " + fullTextPath, e); // keep the cause; raw falls back below
+        }
+      }
+      if (fullTextJson.isEmpty()) {
+        raw = record.toString();
+      } else {
+        content += "\n " + fullTextJson;
+        raw = fullTextJson;
+      }
+    }
+
+    @Override
+    public String id() {
+      return id;
+    }
+
+    @Override
+    public String content() {
+      return content;
+    }
+
+    @Override
+    public boolean indexable() {
+      return true;
+    }
+
+    public String raw() {
+      return raw;
+    }
+
+    public CSVRecord record() {
+      return record;
+    }
+  }
+}
diff --git a/src/main/java/io/anserini/collection/CovidParagraphCollection.java b/src/main/java/io/anserini/collection/CovidParagraphCollection.java
new file mode 100644
index 0000000000..126f568612
--- /dev/null
+++ b/src/main/java/io/anserini/collection/CovidParagraphCollection.java
@@ -0,0 +1,180 @@
+/*
+ * Anserini: A Lucene toolkit for replicable information retrieval research
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package io.anserini.collection;
+
+import org.apache.commons.csv.CSVFormat;
+import org.apache.commons.csv.CSVParser;
+import org.apache.commons.csv.CSVRecord;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+
+import java.io.BufferedReader;
+import java.io.FileInputStream;
+import java.io.FileReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.Iterator;
+import java.util.NoSuchElementException;
+import java.util.Set;
+
+import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+/**
+ * A document collection for the CORD-19 dataset provided by Semantic Scholar, indexed by paragraph.
+ */
+public class CovidParagraphCollection extends DocumentCollection<CovidParagraphCollection.Document> {
+  private static final Logger LOG = LogManager.getLogger(CovidParagraphCollection.class);
+  private static final ObjectMapper MAPPER = new ObjectMapper(); // reused instead of one per record
+
+  public CovidParagraphCollection(Path path) {
+    this.path = path;
+    this.allowedFileSuffix = Set.of(".csv");
+  }
+
+  @Override
+  public FileSegment<CovidParagraphCollection.Document> createFileSegment(Path p) throws IOException {
+    return new Segment(p);
+  }
+
+  /**
+   * A metadata CSV file; a row with full text yields extra documents, one per body paragraph.
+   */
+  public class Segment extends FileSegment<CovidParagraphCollection.Document> {
+    CSVParser csvParser = null;
+    private CSVRecord record = null; // CSV record whose paragraphs are currently being emitted
+    private Iterator<CSVRecord> iterator = null; // iterator for CSV records
+    private Iterator<JsonNode> paragraphIterator = null; // iterator for paragraphs in a CSV record
+    private int paragraphNumber = 0;
+
+    public Segment(Path path) throws IOException {
+      super(path);
+      bufferedReader = new BufferedReader(new InputStreamReader(
+          new FileInputStream(path.toString()), java.nio.charset.StandardCharsets.UTF_8));
+
+      csvParser = new CSVParser(bufferedReader, CSVFormat.DEFAULT
+        .withFirstRecordAsHeader()
+        .withIgnoreHeaderCase()
+        .withTrim());
+
+      // No pre-fetch: readNext() advances the iterator itself, so a record read here would be skipped.
+      iterator = csvParser.iterator();
+    }
+
+    /** Buffers the next paragraph of the current record or, failing that, the next record itself. */
+    @Override
+    public void readNext() throws NoSuchElementException {
+      if (paragraphIterator != null && paragraphIterator.hasNext()) {
+        // path() gives an empty text instead of a NullPointerException when "text" is missing
+        String paragraph = paragraphIterator.next().path("text").asText();
+        paragraphNumber += 1;
+        bufferedRecord = new CovidParagraphCollection.Document(record, paragraph, paragraphNumber);
+        return;
+      }
+      if (!iterator.hasNext()) {
+        throw new NoSuchElementException("Reached end of CSVRecord Entries Iterator");
+      }
+      record = iterator.next();
+      paragraphIterator = null; // never carry paragraphs over from the previous record
+      paragraphNumber = 0;
+      String recordFullText = "";
+      if (record.get("has_full_text").contains("True")) {
+        String[] hashes = record.get("sha").split(";");
+        String fullTextPath = "/" + record.get("full_text_file") + "/" + hashes[hashes.length - 1].strip() + ".json";
+        try {
+          // read the file once and parse the in-memory text, so no reader is left open
+          byte[] bytes = Files.readAllBytes(Paths.get(CovidParagraphCollection.this.path.toString() + fullTextPath));
+          recordFullText = new String(bytes, java.nio.charset.StandardCharsets.UTF_8);
+          JsonNode root = MAPPER.readTree(recordFullText);
+          paragraphIterator = root == null ? null : root.path("body_text").elements();
+        } catch (IOException e) {
+          LOG.error("Error parsing file at " + fullTextPath, e);
+        }
+      }
+      bufferedRecord = new CovidParagraphCollection.Document(record, recordFullText);
+    }
+
+    @Override
+    public void close() {
+      super.close();
+      if (csvParser != null) {
+        try {
+          csvParser.close();
+        } catch (IOException e) {
+          LOG.warn("Error closing CSV parser", e); // best effort: nothing left to recover
+        }
+      }
+    }
+  }
+
+  /**
+   * A document in a CORD-19 collection: a whole article (paragraph number 0) or one of its paragraphs.
+   */
+  public class Document implements SourceDocument {
+    private final String id;
+    private final String content;
+    private final String raw;
+    private final CSVRecord record;
+
+    public Document(CSVRecord record, String paragraph, Integer paragraphNumber, String recordFullText) {
+      String recordId = Long.toString(record.getRecordNumber());
+      // Locale.ROOT keeps the zero-padded suffix in ASCII digits whatever the JVM default locale is
+      id = paragraphNumber == 0 ? recordId
+          : recordId + "." + String.format(java.util.Locale.ROOT, "%05d", paragraphNumber);
+      String text = record.get("title").replace("\n", " ");
+      text += record.get("abstract").isEmpty() ? "" : "\n" + record.get("abstract");
+      text += paragraph.isEmpty() ? "" : "\n" + paragraph;
+      content = text;
+      this.raw = recordFullText;
+      this.record = record;
+    }
+
+    public Document(CSVRecord record, String paragraph, Integer paragraphNumber) {
+      this(record, paragraph, paragraphNumber, "");
+    }
+
+    public Document(CSVRecord record, String recordFullText) {
+      this(record, "", 0, recordFullText);
+    }
+
+    @Override
+    public String id() {
+      return id;
+    }
+
+    @Override
+    public String content() {
+      return content;
+    }
+
+    public String raw() {
+      return raw;
+    }
+
+    @Override
+    public boolean indexable() {
+      return true;
+    }
+
+    public CSVRecord record() {
+      return record;
+    }
+  }
+}