diff --git a/README.md b/README.md
index ba2bb63556..48f00f7443 100644
--- a/README.md
+++ b/README.md
@@ -90,6 +90,7 @@ For the most part, these runs are based on [_default_ parameter settings](https:
 The experiments described below are not associated with rigorous end-to-end regression testing and thus provide a lower standard of replicability. For the most part, manual copying and pasting of commands into a shell is required to replicate our results:
 
++ [Experiments on the COVID-19 Open Research Dataset](docs/experiments-covid.md)
 + [Replicating "Neural Hype" Experiments](docs/experiments-forum2018.md)
 + [Guide to running BM25 baselines on the MS MARCO Passage Retrieval Task](docs/experiments-msmarco-passage.md)
 + [Guide to running BM25 baselines on the MS MARCO Document Retrieval Task](docs/experiments-msmarco-doc.md)
diff --git a/docs/experiments-covid.md b/docs/experiments-covid.md
index a13440f537..9ed6f8e7ba 100644
--- a/docs/experiments-covid.md
+++ b/docs/experiments-covid.md
@@ -7,30 +7,70 @@ This document describes the steps to index [COVID-19 Open Research Dataset](http
 First, we need to download and extract the [COVID-19 Open Research Dataset](https://pages.semanticscholar.org/coronavirus-research):
 
 ```bash
-DATA_DIR=./covid
-mkdir ${DATA_DIR}
+DATE=2020-03-20
+DATA_DIR=./covid-"${DATE}"
+mkdir "${DATA_DIR}"
 
-wget https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-03-20/comm_use_subset.tar.gz -P ${DATA_DIR}
-wget https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-03-20/noncomm_use_subset.tar.gz -P ${DATA_DIR}
-wget https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-03-20/custom_license.tar.gz -P ${DATA_DIR}
-wget https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-03-20/biorxiv_medrxiv.tar.gz -P ${DATA_DIR}
-wget https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-03-20/metadata.csv -P ${DATA_DIR}
+wget https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/"${DATE}"/comm_use_subset.tar.gz -P "${DATA_DIR}"
+wget https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/"${DATE}"/noncomm_use_subset.tar.gz -P "${DATA_DIR}"
+wget https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/"${DATE}"/custom_license.tar.gz -P "${DATA_DIR}"
+wget https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/"${DATE}"/biorxiv_medrxiv.tar.gz -P "${DATA_DIR}"
+wget https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/"${DATE}"/metadata.csv -P "${DATA_DIR}"
 
-ls ${DATA_DIR}/*.tar.gz | xargs --replace tar -zxvf {} -C ${DATA_DIR}
+ls "${DATA_DIR}"/*.tar.gz | xargs -I {} tar -zxvf {} -C "${DATA_DIR}"
+# If the above doesn't work due to cross-platform compatibility issues with xargs, untar each archive individually:
+# tar -zxvf "${DATA_DIR}"/comm_use_subset.tar.gz -C "${DATA_DIR}"
+# tar -zxvf "${DATA_DIR}"/noncomm_use_subset.tar.gz -C "${DATA_DIR}"
+# tar -zxvf "${DATA_DIR}"/custom_license.tar.gz -C "${DATA_DIR}"
+# tar -zxvf "${DATA_DIR}"/biorxiv_medrxiv.tar.gz -C "${DATA_DIR}"
 ```
 
-We can now index these docs as a `CovidCollection` using Anserini:
+We can now index these docs using Anserini; we have three versions of the collection:
 
-```bash
-sh target/appassembler/bin/IndexCollection \
- -collection CovidCollection -generator CovidGenerator \
- -threads 8 -input ${DATA_DIR} \
- -index ${DATA_DIR}/lucene-index-covid \
- -storePositions -storeDocvectors -storeRawDocs -storeTransformedDocs
-```
+* `CovidCollection`, which adds `title` + `abstract` to the Lucene Document's `content` field:
 
-The output message should be something like this:
+  ```bash
+  sh target/appassembler/bin/IndexCollection \
+    -collection CovidCollection -generator CovidGenerator \
+    -threads 8 -input "${DATA_DIR}" \
+    -index "${DATA_DIR}"/lucene-index-covid \
+    -storePositions -storeDocvectors -storeRawDocs -storeTransformedDocs
+  ```
 
-```
-2020-03-22 00:04:40,382 INFO [main] index.IndexCollection (IndexCollection.java:845) - Total 44,220 documents indexed in 00:05:06
-```
+  The output message should be something like this:
+
+  ```bash
+  2020-03-22 16:55:00,711 INFO [main] index.IndexCollection (IndexCollection.java:845) - Total 44,145 documents indexed in 00:01:07
+  ```
+
+* `CovidFullTextCollection`, which adds `title` + `abstract` + the full JSON text to the Lucene Document's `content` field:
+
+  ```bash
+  sh target/appassembler/bin/IndexCollection \
+    -collection CovidFullTextCollection -generator CovidGenerator \
+    -threads 8 -input "${DATA_DIR}" \
+    -index "${DATA_DIR}"/lucene-index-covid-full-text \
+    -storePositions -storeDocvectors -storeRawDocs -storeTransformedDocs
+  ```
+
+  The output message should be something like this:
+
+  ```bash
+  2020-03-22 16:55:00,711 INFO [main] index.IndexCollection (IndexCollection.java:845) - Total 44,145 documents indexed in 00:01:07
+  ```
+
+* `CovidParagraphCollection`, which adds `title` + `abstract` + `paragraph number x` to the Lucene Document's `content` field. This collection produces multiple Lucene Documents per record, one for each paragraph in the record's full text (hence `paragraph number x`):
+
+  ```bash
+  sh target/appassembler/bin/IndexCollection \
+    -collection CovidParagraphCollection -generator CovidGenerator \
+    -threads 8 -input "${DATA_DIR}" \
+    -index "${DATA_DIR}"/lucene-index-covid-paragraph \
+    -storePositions -storeDocvectors -storeRawDocs -storeTransformedDocs
+  ```
+
+  The output message should be something like this:
+
+  ```bash
+  2020-03-22 15:24:49,305 INFO [main] index.IndexCollection (IndexCollection.java:845) - Total 1,096,241 documents indexed in 00:11:35
+  ```
diff --git a/src/main/java/io/anserini/collection/CovidCollection.java b/src/main/java/io/anserini/collection/CovidCollection.java
index b288c85977..edd009ce8c 100644
--- a/src/main/java/io/anserini/collection/CovidCollection.java
+++ b/src/main/java/io/anserini/collection/CovidCollection.java
@@ -105,26 +105,35 @@ public void close() {
    */
   public class Document implements SourceDocument {
     private String id;
-    private String contents;
+    private String content;
+    private String raw;
     private CSVRecord record;
 
     public Document(CSVRecord record) {
       id = Long.toString(record.getRecordNumber());
-      contents = record.toString();
+      content = record.get("title").replace("\n", " ");
+      content += record.get("abstract").isEmpty() ? "" : "\n" + record.get("abstract");
       this.record = record;
 
-      // index full text into raw contents
+      String fullTextJson = "";
+      // index full text into raw content
       if (record.get("has_full_text").contains("True")) {
         String[] hashes = record.get("sha").split(";");
         String fullTextPath = "/" + record.get("full_text_file") + "/" + hashes[hashes.length - 1].strip() + ".json";
         try {
-          String fullTextJson = new String(Files.readAllBytes(
+          fullTextJson = new String(Files.readAllBytes(
             Paths.get(CovidCollection.this.path.toString() + fullTextPath)));
-          contents += "\n " + fullTextJson;
         } catch (IOException e) {
           LOG.error("Error parsing file at " + fullTextPath);
+          raw = record.toString();
         }
       }
+
+      if (!fullTextJson.isEmpty()) {
+        raw = fullTextJson;
+      } else {
+        raw = record.toString();
+      }
     }
 
     @Override
@@ -134,7 +143,7 @@ public String id() {
 
     @Override
     public String content() {
-      return contents;
+      return content;
     }
 
     @Override
@@ -142,6 +151,10 @@ public boolean indexable() {
       return true;
     }
 
+    public String raw() {
+      return raw;
+    }
+
     public CSVRecord record() {
       return record;
     }
diff --git a/src/main/java/io/anserini/collection/CovidFullTextCollection.java b/src/main/java/io/anserini/collection/CovidFullTextCollection.java
new file mode 100644
index 0000000000..da11e485b5
--- /dev/null
+++ b/src/main/java/io/anserini/collection/CovidFullTextCollection.java
@@ -0,0 +1,163 @@
+/*
+ * Anserini: A Lucene toolkit for replicable information retrieval research
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package io.anserini.collection;
+
+import org.apache.commons.csv.CSVFormat;
+import org.apache.commons.csv.CSVParser;
+import org.apache.commons.csv.CSVRecord;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+
+import java.io.BufferedReader;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.Iterator;
+import java.util.NoSuchElementException;
+import java.util.Set;
+
+/**
+ * A document collection for the CORD-19 dataset provided by Semantic Scholar.
+ */
+public class CovidFullTextCollection extends DocumentCollection<CovidFullTextCollection.Document> {
+  private static final Logger LOG = LogManager.getLogger(CovidFullTextCollection.class);
+
+  public CovidFullTextCollection(Path path) {
+    this.path = path;
+    this.allowedFileSuffix = Set.of(".csv");
+  }
+
+  @Override
+  public FileSegment<CovidFullTextCollection.Document> createFileSegment(Path p) throws IOException {
+    return new Segment(p);
+  }
+
+  /**
+   * A single CSV file containing the collection's document records.
+   */
+  public class Segment extends FileSegment<CovidFullTextCollection.Document> {
+    CSVParser csvParser = null;
+    private CSVRecord record = null;
+    private Iterator<CSVRecord> iterator = null; // iterator for CSV records
+
+    public Segment(Path path) throws IOException {
+      super(path);
+      bufferedReader = new BufferedReader(new InputStreamReader(
+          new FileInputStream(path.toString())));
+
+      csvParser = new CSVParser(bufferedReader, CSVFormat.DEFAULT
+          .withFirstRecordAsHeader()
+          .withIgnoreHeaderCase()
+          .withTrim());
+
+      iterator = csvParser.iterator();
+      if (iterator.hasNext()) {
+        record = iterator.next();
+      }
+    }
+
+    @Override
+    public void readNext() throws NoSuchElementException {
+      if (record == null) {
+        throw new NoSuchElementException("Record is empty");
+      } else {
+        bufferedRecord = new CovidFullTextCollection.Document(record);
+        if (iterator.hasNext()) { // if the CSV contains more lines, we parse the next record
+          record = iterator.next();
+        } else {
+          atEOF = true; // there are no more records in the CSV
+        }
+      }
+    }
+
+    @Override
+    public void close() {
+      super.close();
+      if (csvParser != null) {
+        try {
+          csvParser.close();
+        } catch (IOException e) {
+          // do nothing
+        }
+      }
+    }
+  }
+
+  /**
+   * A document in a CORD-19 collection.
+   */
+  public class Document implements SourceDocument {
+    private String id;
+    private String content;
+    private String raw;
+    private CSVRecord record;
+
+    public Document(CSVRecord record) {
+      id = Long.toString(record.getRecordNumber());
+      content = record.get("title").replace("\n", " ");
+      content += record.get("abstract").isEmpty() ? "" : "\n" + record.get("abstract");
+      this.record = record;
+
+      String fullTextJson = "";
+      // index full text into raw content
+      if (record.get("has_full_text").contains("True")) {
+        String[] hashes = record.get("sha").split(";");
+        String fullTextPath = "/" + record.get("full_text_file") + "/" + hashes[hashes.length - 1].strip() + ".json";
+        try {
+          fullTextJson = new String(Files.readAllBytes(
+              Paths.get(CovidFullTextCollection.this.path.toString() + fullTextPath)));
+        } catch (IOException e) {
+          LOG.error("Error parsing file at " + fullTextPath);
+          raw = record.toString();
+        }
+      }
+
+      if (!fullTextJson.isEmpty()) {
+        content += "\n " + fullTextJson;
+        raw = fullTextJson;
+      } else {
+        raw = record.toString();
+      }
+    }
+
+    @Override
+    public String id() {
+      return id;
+    }
+
+    @Override
+    public String content() {
+      return content;
+    }
+
+    @Override
+    public boolean indexable() {
+      return true;
+    }
+
+    public String raw() {
+      return raw;
+    }
+
+    public CSVRecord record() {
+      return record;
+    }
+  }
+}
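Before the paragraph-level collection that follows, it may help to see the shape of the CORD-19 full-text JSON these collections consume: paragraphs live in a `body_text` array, each an object with a `text` field. The following is a hedged, standalone sketch (not part of the patch) using Jackson; the class name and the example `<sha>.json` path are assumptions:

```java
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;

import java.io.File;
import java.io.IOException;
import java.util.Iterator;

public class BodyTextPeek {
  public static void main(String[] args) throws IOException {
    // Hypothetical extracted full-text file; real files are named <sha>.json.
    ObjectMapper mapper = new ObjectMapper();
    JsonNode root = mapper.readTree(new File("covid-2020-03-20/biorxiv_medrxiv/example-sha.json"));

    // CORD-19 stores paragraphs as objects with a "text" field under "body_text".
    Iterator<JsonNode> paragraphs = root.get("body_text").elements();
    int paragraphNumber = 0;
    while (paragraphs.hasNext()) {
      String paragraph = paragraphs.next().get("text").asText();
      paragraphNumber += 1;
      // CovidParagraphCollection emits one Lucene Document per paragraph,
      // with docids like <recordNumber>.00001, <recordNumber>.00002, ...
      System.out.printf("%05d: %s%n", paragraphNumber, paragraph);
    }
  }
}
```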
diff --git a/src/main/java/io/anserini/collection/CovidParagraphCollection.java b/src/main/java/io/anserini/collection/CovidParagraphCollection.java
new file mode 100644
index 0000000000..126f568612
--- /dev/null
+++ b/src/main/java/io/anserini/collection/CovidParagraphCollection.java
@@ -0,0 +1,180 @@
+/*
+ * Anserini: A Lucene toolkit for replicable information retrieval research
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package io.anserini.collection;
+
+import org.apache.commons.csv.CSVFormat;
+import org.apache.commons.csv.CSVParser;
+import org.apache.commons.csv.CSVRecord;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+
+import java.io.BufferedReader;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.Iterator;
+import java.util.NoSuchElementException;
+import java.util.Set;
+
+import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+/**
+ * A document collection for the CORD-19 dataset provided by Semantic Scholar.
+ */
+public class CovidParagraphCollection extends DocumentCollection<CovidParagraphCollection.Document> {
+  private static final Logger LOG = LogManager.getLogger(CovidParagraphCollection.class);
+
+  public CovidParagraphCollection(Path path) {
+    this.path = path;
+    this.allowedFileSuffix = Set.of(".csv");
+  }
+
+  @Override
+  public FileSegment<CovidParagraphCollection.Document> createFileSegment(Path p) throws IOException {
+    return new Segment(p);
+  }
+
+  /**
+   * A single CSV file containing the collection's document records.
+   */
+  public class Segment extends FileSegment<CovidParagraphCollection.Document> {
+    CSVParser csvParser = null;
+    private CSVRecord record = null;
+    private Iterator<CSVRecord> iterator = null; // iterator for CSV records
+    private Iterator<JsonNode> paragraphIterator = null; // iterator for paragraphs in a CSV record
+    private Integer paragraphNumber = 0;
+
+    public Segment(Path path) throws IOException {
+      super(path);
+      bufferedReader = new BufferedReader(new InputStreamReader(
+          new FileInputStream(path.toString())));
+
+      csvParser = new CSVParser(bufferedReader, CSVFormat.DEFAULT
+          .withFirstRecordAsHeader()
+          .withIgnoreHeaderCase()
+          .withTrim());
+
+      iterator = csvParser.iterator();
+      if (iterator.hasNext()) {
+        record = iterator.next();
+      }
+    }
+
+    @Override
+    public void readNext() throws NoSuchElementException {
+      if (paragraphIterator != null && paragraphIterator.hasNext()) { // if the record contains more paragraphs, we parse them
+        String paragraph = paragraphIterator.next().get("text").asText();
+        paragraphNumber += 1;
+        bufferedRecord = new CovidParagraphCollection.Document(record, paragraph, paragraphNumber);
+      } else if (iterator.hasNext()) { // if the CSV contains more lines, we parse the next record
+        record = iterator.next();
+        String recordFullText = "";
+        if (record.get("has_full_text").contains("True")) {
+          String[] hashes = record.get("sha").split(";");
+          String fullTextPath = "/" + record.get("full_text_file") + "/" + hashes[hashes.length - 1].strip() + ".json";
+          try {
+            String recordFullTextPath = CovidParagraphCollection.this.path.toString() + fullTextPath;
+            recordFullText = new String(Files.readAllBytes(Paths.get(recordFullTextPath)));
+            ObjectMapper mapper = new ObjectMapper();
+            JsonNode recordJsonNode = mapper.readTree(recordFullText);
+            paragraphIterator = recordJsonNode.get("body_text").elements();
+          } catch (IOException e) {
+            LOG.error("Error parsing file at " + fullTextPath + "\n" + e.getMessage());
+            paragraphIterator = null; // don't carry over the previous record's paragraph iterator
+          }
+        } else {
+          paragraphIterator = null;
+        }
+        paragraphNumber = 0;
+        bufferedRecord = new CovidParagraphCollection.Document(record, recordFullText);
+      } else {
+        throw new NoSuchElementException("Reached end of CSVRecord entries iterator");
+      }
+    }
+
+    @Override
+    public void close() {
+      super.close();
+      if (csvParser != null) {
+        try {
+          csvParser.close();
+        } catch (IOException e) {
+          // do nothing
+        }
+      }
+    }
+  }
+
+  /**
+   * A document in a CORD-19 collection.
+   */
+  public class Document implements SourceDocument {
+    private String id;
+    private String content;
+    private String raw;
+    private CSVRecord record;
+
+    public Document(CSVRecord record, String paragraph, Integer paragraphNumber, String recordFullText) {
+      if (paragraphNumber == 0) {
+        id = Long.toString(record.getRecordNumber());
+      } else {
+        id = Long.toString(record.getRecordNumber()) + "." + String.format("%05d", paragraphNumber);
+      }
+      content = record.get("title").replace("\n", " ");
+      content += record.get("abstract").isEmpty() ? "" : "\n" + record.get("abstract");
+      content += paragraph.isEmpty() ? "" : "\n" + paragraph;
+      this.raw = recordFullText;
+      this.record = record;
+    }
+
+    public Document(CSVRecord record, String paragraph, Integer paragraphNumber) {
+      this(record, paragraph, paragraphNumber, "");
+    }
+
+    public Document(CSVRecord record, String recordFullText) {
+      this(record, "", 0, recordFullText);
+    }
+
+    @Override
+    public String id() {
+      return id;
+    }
+
+    @Override
+    public String content() {
+      return content;
+    }
+
+    public String raw() {
+      return raw;
+    }
+
+    @Override
+    public boolean indexable() {
+      return true;
+    }
+
+    public CSVRecord record() {
+      return record;
+    }
+  }
+}
diff --git a/src/main/java/io/anserini/index/generator/CovidGenerator.java b/src/main/java/io/anserini/index/generator/CovidGenerator.java
index d66102f892..2069a5d51b 100644
--- a/src/main/java/io/anserini/index/generator/CovidGenerator.java
+++ b/src/main/java/io/anserini/index/generator/CovidGenerator.java
@@ -75,6 +75,7 @@ public CovidGenerator(IndexArgs args, IndexCollection.Counters counters) {
   public Document createDocument(CovidCollection.Document covidDoc) {
     String id = covidDoc.id();
     String content = covidDoc.content();
+    String raw = covidDoc.raw();
 
     if (content == null || content.trim().isEmpty()) {
       counters.empty.incrementAndGet();
@@ -89,7 +90,7 @@ public Document createDocument(CovidCollection.Document covidDoc) {
     doc.add(new SortedDocValuesField(IndexArgs.ID, new BytesRef(id)));
 
     if (args.storeRawDocs) {
-      doc.add(new StoredField(IndexArgs.RAW, content));
+      doc.add(new StoredField(IndexArgs.RAW, raw));
     }
 
     FieldType fieldType = new FieldType();
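To see the effect of the generator change end to end, one could read the stored raw field back out of a built index. The sketch below is an assumption-laden illustration, not part of the patch: it presumes a Lucene 8.x API, the index path from the docs above, and that Anserini's `IndexArgs.ID`/`IndexArgs.RAW` constants resolve to the literal field names `id` and `raw`:

```java
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.FSDirectory;

import java.io.IOException;
import java.nio.file.Paths;

public class RawFieldPeek {
  public static void main(String[] args) throws IOException {
    // Hypothetical path to the CovidCollection index built in the docs above.
    try (IndexReader reader = DirectoryReader.open(
        FSDirectory.open(Paths.get("covid-2020-03-20/lucene-index-covid")))) {
      Document doc = reader.document(0); // first document in the index
      System.out.println("docid: " + doc.get("id"));
      String raw = doc.get("raw");
      if (raw != null) { // null if the index was built without -storeRawDocs
        System.out.println("raw: " + raw.substring(0, Math.min(200, raw.length())));
      }
    }
  }
}
```

After this patch, the stored `raw` field holds the record's full JSON (when full text is available) rather than the searchable `content` string, so the printout above should begin with a JSON object for full-text records.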