From 75c88507a0fa14d77ae95a29152d5789469b4b5e Mon Sep 17 00:00:00 2001 From: Nikhil Gupta Date: Sun, 22 Mar 2020 15:03:59 +0000 Subject: [PATCH 1/3] Passage Indexing Works. Content contain paragraph. Raw contains full text --- docs/experiments-covid.md | 32 +++++--- .../anserini/collection/CovidCollection.java | 79 +++++++++++++------ .../index/generator/CovidGenerator.java | 3 +- 3 files changed, 75 insertions(+), 39 deletions(-) diff --git a/docs/experiments-covid.md b/docs/experiments-covid.md index a13440f537..2eadfa7fb6 100644 --- a/docs/experiments-covid.md +++ b/docs/experiments-covid.md @@ -7,16 +7,22 @@ This document describes the steps to index [COVID-19 Open Research Dataset](http First, we need to download and extract the [COVID-19 Open Research Dataset](https://pages.semanticscholar.org/coronavirus-research): ```bash -DATA_DIR=./covid -mkdir ${DATA_DIR} - -wget https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-03-20/comm_use_subset.tar.gz -P ${DATA_DIR} -wget https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-03-20/noncomm_use_subset.tar.gz -P ${DATA_DIR} -wget https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-03-20/custom_license.tar.gz -P ${DATA_DIR} -wget https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-03-20/biorxiv_medrxiv.tar.gz -P ${DATA_DIR} -wget https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-03-20/metadata.csv -P ${DATA_DIR} - -ls ${DATA_DIR}/*.tar.gz | xargs --replace tar -zxvf {} -C ${DATA_DIR} +DATE=2020-03-20 +DATA_DIR=./covid-"${DATE}" +mkdir "${DATA_DIR}" + +wget https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/"${DATE}"/comm_use_subset.tar.gz -P "${DATA_DIR}" +wget https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/"${DATE}"/noncomm_use_subset.tar.gz -P "${DATA_DIR}" +wget https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/"${DATE}"/custom_license.tar.gz -P "${DATA_DIR}" +wget 
https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/"${DATE}"/biorxiv_medrxiv.tar.gz -P "${DATA_DIR}" +wget https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/"${DATE}"/metadata.csv -P "${DATA_DIR}" + +ls "${DATA_DIR}"/*.tar.gz | xargs -I {} tar -zxvf {} -C "${DATA_DIR}" +# If the above doesn't work due to cross compatibility issues with xargs, untar all folders individually +# tar -zxvf "${DATA_DIR}"/comm_use_subset.tar.gz -C "${DATA_DIR}" +# tar -zxvf "${DATA_DIR}"/noncomm_use_subset.tar.gz -C "${DATA_DIR}" +# tar -zxvf "${DATA_DIR}"/custom_license.tar.gz -C "${DATA_DIR}" +# tar -zxvf "${DATA_DIR}"/biorxiv_medrxiv.tar.gz -C "${DATA_DIR}" ``` We can now index these docs as a `CovidCollection` using Anserini: @@ -24,13 +30,13 @@ We can now index these docs as a `CovidCollection` using Anserini: ```bash sh target/appassembler/bin/IndexCollection \ -collection CovidCollection -generator CovidGenerator \ - -threads 8 -input ${DATA_DIR} \ - -index ${DATA_DIR}/lucene-index-covid \ + -threads 8 -input "${DATA_DIR}" \ + -index "${DATA_DIR}"/lucene-index-covid \ -storePositions -storeDocvectors -storeRawDocs -storeTransformedDocs ``` The output message should be something like this: -``` +```bash 2020-03-22 00:04:40,382 INFO [main] index.IndexCollection (IndexCollection.java:845) - Total 44,220 documents indexed in 00:05:06 ``` diff --git a/src/main/java/io/anserini/collection/CovidCollection.java b/src/main/java/io/anserini/collection/CovidCollection.java index b288c85977..047ec950a6 100644 --- a/src/main/java/io/anserini/collection/CovidCollection.java +++ b/src/main/java/io/anserini/collection/CovidCollection.java @@ -24,6 +24,7 @@ import java.io.BufferedReader; import java.io.FileInputStream; +import java.io.FileReader; import java.io.IOException; import java.io.InputStreamReader; import java.nio.file.Files; @@ -33,6 +34,9 @@ import java.util.NoSuchElementException; import java.util.Set; +import com.fasterxml.jackson.databind.JsonNode; +import 
com.fasterxml.jackson.databind.ObjectMapper; + /** * A document collection for the CORD-19 dataset provided by Semantic Scholar. */ @@ -56,6 +60,9 @@ public class Segment extends FileSegment { CSVParser csvParser = null; private CSVRecord record = null; private Iterator iterator = null; // iterator for CSV records + private String recordFullText = ""; + private Iterator paragraphIterator = null; // iterator for paragraphs in a CSV record + private Integer paragraphNumber = 0; public Segment(Path path) throws IOException { super(path); @@ -75,17 +82,36 @@ record = iterator.next(); @Override public void readNext() throws NoSuchElementException { - if (record == null) { - throw new NoSuchElementException("Record is empty"); - } else { - bufferedRecord = new CovidCollection.Document(record); - if (iterator.hasNext()) { // if CSV contains more lines, we parse the next record - record = iterator.next(); + if (paragraphIterator != null && paragraphIterator.hasNext()) { // if the record contains more paragraphs, we parse them + String paragraph = paragraphIterator.next().get("text").asText(); + paragraphNumber += 1; + bufferedRecord = new CovidCollection.Document(record, recordFullText, paragraph, paragraphNumber); + } else if (iterator.hasNext()) { // if CSV contains more lines, we parse the next record + record = iterator.next(); + if (record.get("has_full_text").contains("True")) { + String[] hashes = record.get("sha").split(";"); + String fullTextPath = "/" + record.get("full_text_file") + "/" + hashes[hashes.length - 1].strip() + ".json"; + try { + String recordFullTextPath = CovidCollection.this.path.toString() + fullTextPath; + recordFullText = new String(Files.readAllBytes(Paths.get(recordFullTextPath))); + FileReader recordFullTextFileReader = new FileReader(recordFullTextPath); + ObjectMapper mapper = new ObjectMapper(); + JsonNode recordJsonNode = mapper.readerFor(JsonNode.class).readTree(recordFullTextFileReader); + paragraphIterator = 
recordJsonNode.get("body_text").elements(); + + } catch (IOException e) { + LOG.error("Error parsing file at " + fullTextPath + "\n" + e.getMessage()); + } } else { - atEOF = true; // there is no more JSON object in the bufferedReader + paragraphIterator = null; + recordFullText = ""; } - } + paragraphNumber = 0; + bufferedRecord = new CovidCollection.Document(record, recordFullText); + } else { + throw new NoSuchElementException("Reached end of CSVRecord Entries Iterator"); } + } @Override public void close() { @@ -105,26 +131,25 @@ public void close() { */ public class Document implements SourceDocument { private String id; - private String contents; + private String content; + private String raw; private CSVRecord record; - public Document(CSVRecord record) { - id = Long.toString(record.getRecordNumber()); - contents = record.toString(); + public Document(CSVRecord record, String recordFullText, String paragraph, Integer paragraphNumber) { + if (paragraphNumber == 0) { + id = Long.toString(record.getRecordNumber()); + } else { + id = Long.toString(record.getRecordNumber()) + "." + String.format("%04d", paragraphNumber); + } + content = record.get("title").replace("\n", " "); + content += record.get("abstract").isEmpty() ? "" : "\n" + record.get("abstract"); + content += paragraph.isEmpty() ? 
"" : "\n" + paragraph; + this.raw = recordFullText; this.record = record; + } - // index full text into raw contents - if (record.get("has_full_text").contains("True")) { - String[] hashes = record.get("sha").split(";"); - String fullTextPath = "/" + record.get("full_text_file") + "/" + hashes[hashes.length - 1].strip() + ".json"; - try { - String fullTextJson = new String(Files.readAllBytes( - Paths.get(CovidCollection.this.path.toString() + fullTextPath))); - contents += "\n " + fullTextJson; - } catch (IOException e) { - LOG.error("Error parsing file at " + fullTextPath); - } - } + public Document(CSVRecord record, String recordFullText) { + this(record, recordFullText, "", 0); } @Override @@ -134,7 +159,11 @@ public String id() { @Override public String content() { - return contents; + return content; + } + + public String raw() { + return raw; } @Override diff --git a/src/main/java/io/anserini/index/generator/CovidGenerator.java b/src/main/java/io/anserini/index/generator/CovidGenerator.java index d66102f892..2069a5d51b 100644 --- a/src/main/java/io/anserini/index/generator/CovidGenerator.java +++ b/src/main/java/io/anserini/index/generator/CovidGenerator.java @@ -75,6 +75,7 @@ public CovidGenerator(IndexArgs args, IndexCollection.Counters counters) { public Document createDocument(CovidCollection.Document covidDoc) { String id = covidDoc.id(); String content = covidDoc.content(); + String raw = covidDoc.raw(); if (content == null || content.trim().isEmpty()) { counters.empty.incrementAndGet(); @@ -89,7 +90,7 @@ public Document createDocument(CovidCollection.Document covidDoc) { doc.add(new SortedDocValuesField(IndexArgs.ID, new BytesRef(id))); if (args.storeRawDocs) { - doc.add(new StoredField(IndexArgs.RAW, content)); + doc.add(new StoredField(IndexArgs.RAW, raw)); } FieldType fieldType = new FieldType(); From adc1aa30a5ef059e5393591a0db290935f57b070 Mon Sep 17 00:00:00 2001 From: Nikhil Gupta Date: Sun, 22 Mar 2020 15:06:12 +0000 Subject: [PATCH 2/3] Readme 
edit to link covid doc. Change to raw for passage docs. Only 1st one contains it now --- README.md | 1 + docs/experiments-covid.md | 2 +- .../io/anserini/collection/CovidCollection.java | 16 +++++++++------- 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index ba2bb63556..48f00f7443 100644 --- a/README.md +++ b/README.md @@ -90,6 +90,7 @@ For the most part, these runs are based on [_default_ parameter settings](https: The experiments described below are not associated with rigorous end-to-end regression testing and thus provide a lower standard of replicability. For the most part, manual copying and pasting of commands into a shell is required to replicate our results: ++ [Experiments on COVID-19 Open Research Dataset](docs/experiments-covid.md) + [Replicating "Neural Hype" Experiments](docs/experiments-forum2018.md) + [Guide to running BM25 baselines on the MS MARCO Passage Retrieval Task](docs/experiments-msmarco-passage.md) + [Guide to running BM25 baselines on the MS MARCO Document Retrieval Task](docs/experiments-msmarco-doc.md) diff --git a/docs/experiments-covid.md b/docs/experiments-covid.md index 2eadfa7fb6..4bec7bda07 100644 --- a/docs/experiments-covid.md +++ b/docs/experiments-covid.md @@ -38,5 +38,5 @@ sh target/appassembler/bin/IndexCollection \ The output message should be something like this: ```bash -2020-03-22 00:04:40,382 INFO [main] index.IndexCollection (IndexCollection.java:845) - Total 44,220 documents indexed in 00:05:06 +2020-03-22 15:24:49,305 INFO [main] index.IndexCollection (IndexCollection.java:845) - Total 1,096,241 documents indexed in 00:11:35 ``` diff --git a/src/main/java/io/anserini/collection/CovidCollection.java b/src/main/java/io/anserini/collection/CovidCollection.java index 047ec950a6..1908419e27 100644 --- a/src/main/java/io/anserini/collection/CovidCollection.java +++ b/src/main/java/io/anserini/collection/CovidCollection.java @@ -60,7 +60,6 @@ public class Segment extends FileSegment 
{ CSVParser csvParser = null; private CSVRecord record = null; private Iterator iterator = null; // iterator for CSV records - private String recordFullText = ""; private Iterator paragraphIterator = null; // iterator for paragraphs in a CSV record private Integer paragraphNumber = 0; @@ -85,9 +84,10 @@ public void readNext() throws NoSuchElementException { if (paragraphIterator != null && paragraphIterator.hasNext()) { // if the record contains more paragraphs, we parse them String paragraph = paragraphIterator.next().get("text").asText(); paragraphNumber += 1; - bufferedRecord = new CovidCollection.Document(record, recordFullText, paragraph, paragraphNumber); + bufferedRecord = new CovidCollection.Document(record, paragraph, paragraphNumber); } else if (iterator.hasNext()) { // if CSV contains more lines, we parse the next record record = iterator.next(); + String recordFullText = ""; if (record.get("has_full_text").contains("True")) { String[] hashes = record.get("sha").split(";"); String fullTextPath = "/" + record.get("full_text_file") + "/" + hashes[hashes.length - 1].strip() + ".json"; @@ -97,14 +97,12 @@ record = iterator.next(); FileReader recordFullTextFileReader = new FileReader(recordFullTextPath); ObjectMapper mapper = new ObjectMapper(); JsonNode recordJsonNode = mapper.readerFor(JsonNode.class).readTree(recordFullTextFileReader); - paragraphIterator = recordJsonNode.get("body_text").elements(); - + paragraphIterator = recordJsonNode.get("body_text").elements(); } catch (IOException e) { LOG.error("Error parsing file at " + fullTextPath + "\n" + e.getMessage()); } } else { paragraphIterator = null; - recordFullText = ""; } paragraphNumber = 0; bufferedRecord = new CovidCollection.Document(record, recordFullText); @@ -135,7 +133,7 @@ public class Document implements SourceDocument { private String raw; private CSVRecord record; - public Document(CSVRecord record, String recordFullText, String paragraph, Integer paragraphNumber) { + public 
Document(CSVRecord record, String paragraph, Integer paragraphNumber, String recordFullText) { if (paragraphNumber == 0) { id = Long.toString(record.getRecordNumber()); } else { @@ -148,8 +146,12 @@ public Document(CSVRecord record, String recordFullText, String paragraph, Integ this.record = record; } + public Document(CSVRecord record, String paragraph, Integer paragraphNumber) { + this(record, paragraph, paragraphNumber, ""); + } + public Document(CSVRecord record, String recordFullText) { - this(record, recordFullText, "", 0); + this(record, "", 0, recordFullText); } @Override From 5aafcd6cde4c427a2d4ca6af8f79e2a98d6a7812 Mon Sep 17 00:00:00 2001 From: Nikhil Gupta Date: Sun, 22 Mar 2020 16:44:24 +0000 Subject: [PATCH 3/3] CR. Add 3 versions of Covid Collections to support different indexing techniques --- docs/experiments-covid.md | 58 ++++-- .../anserini/collection/CovidCollection.java | 82 ++++---- .../collection/CovidFullTextCollection.java | 163 ++++++++++++++++ .../collection/CovidParagraphCollection.java | 180 ++++++++++++++++++ 4 files changed, 421 insertions(+), 62 deletions(-) create mode 100644 src/main/java/io/anserini/collection/CovidFullTextCollection.java create mode 100644 src/main/java/io/anserini/collection/CovidParagraphCollection.java diff --git a/docs/experiments-covid.md b/docs/experiments-covid.md index 4bec7bda07..9ed6f8e7ba 100644 --- a/docs/experiments-covid.md +++ b/docs/experiments-covid.md @@ -25,18 +25,52 @@ ls "${DATA_DIR}"/*.tar.gz | xargs -I {} tar -zxvf {} -C "${DATA_DIR}" # tar -zxvf "${DATA_DIR}"/biorxiv_medrxiv.tar.gz -C "${DATA_DIR}" ``` -We can now index these docs as a `CovidCollection` using Anserini: +We can now index these docs using Anserini; we have three versions: -```bash -sh target/appassembler/bin/IndexCollection \ - -collection CovidCollection -generator CovidGenerator \ - -threads 8 -input "${DATA_DIR}" \ - -index "${DATA_DIR}"/lucene-index-covid \ - -storePositions -storeDocvectors -storeRawDocs
-storeTransformedDocs -``` +* `CovidCollection` which adds `title` + `abstract` to Lucene Document's `content` -The output message should be something like this: + ```bash + sh target/appassembler/bin/IndexCollection \ + -collection CovidCollection -generator CovidGenerator \ + -threads 8 -input "${DATA_DIR}" \ + -index "${DATA_DIR}"/lucene-index-covid \ + -storePositions -storeDocvectors -storeRawDocs -storeTransformedDocs + ``` -```bash -2020-03-22 15:24:49,305 INFO [main] index.IndexCollection (IndexCollection.java:845) - Total 1,096,241 documents indexed in 00:11:35 -``` + The output message should be something like this: + + ```bash + 2020-03-22 16:55:00,711 INFO [main] index.IndexCollection (IndexCollection.java:845) - Total 44,145 documents indexed in 00:01:07 + ``` + +* `CovidFullTextCollection` which adds `title` + `abstract` + `full json text` to Lucene Document's `content` + + ```bash + sh target/appassembler/bin/IndexCollection \ + -collection CovidFullTextCollection -generator CovidGenerator \ + -threads 8 -input "${DATA_DIR}" \ + -index "${DATA_DIR}"/lucene-index-covid-full-text \ + -storePositions -storeDocvectors -storeRawDocs -storeTransformedDocs + ``` + + The output message should be something like this: + + ```bash + 2020-03-22 16:55:00,711 INFO [main] index.IndexCollection (IndexCollection.java:845) - Total 44,145 documents indexed in 00:01:07 + ``` + +* `CovidParagraphCollection` which adds `title` + `abstract` + `paragraph number x` to Lucene Document's `content`. There are multiple Lucene Documents for each record: one for each paragraph in the full text of the record, hence `paragraph number x`.
+ + ```bash + sh target/appassembler/bin/IndexCollection \ + -collection CovidParagraphCollection -generator CovidGenerator \ + -threads 8 -input "${DATA_DIR}" \ + -index "${DATA_DIR}"/lucene-index-covid-paragraph \ + -storePositions -storeDocvectors -storeRawDocs -storeTransformedDocs + ``` + + The output message should be something like this: + + ```bash + 2020-03-22 15:24:49,305 INFO [main] index.IndexCollection (IndexCollection.java:845) - Total 1,096,241 documents indexed in 00:11:35 + ``` diff --git a/src/main/java/io/anserini/collection/CovidCollection.java b/src/main/java/io/anserini/collection/CovidCollection.java index 1908419e27..edd009ce8c 100644 --- a/src/main/java/io/anserini/collection/CovidCollection.java +++ b/src/main/java/io/anserini/collection/CovidCollection.java @@ -24,7 +24,6 @@ import java.io.BufferedReader; import java.io.FileInputStream; -import java.io.FileReader; import java.io.IOException; import java.io.InputStreamReader; import java.nio.file.Files; @@ -34,9 +33,6 @@ import java.util.NoSuchElementException; import java.util.Set; -import com.fasterxml.jackson.databind.JsonNode; -import com.fasterxml.jackson.databind.ObjectMapper; - /** * A document collection for the CORD-19 dataset provided by Semantic Scholar. 
*/ @@ -60,8 +56,6 @@ public class Segment extends FileSegment { CSVParser csvParser = null; private CSVRecord record = null; private Iterator iterator = null; // iterator for CSV records - private Iterator paragraphIterator = null; // iterator for paragraphs in a CSV record - private Integer paragraphNumber = 0; public Segment(Path path) throws IOException { super(path); @@ -81,35 +75,17 @@ record = iterator.next(); @Override public void readNext() throws NoSuchElementException { - if (paragraphIterator != null && paragraphIterator.hasNext()) { // if the record contains more paragraphs, we parse them - String paragraph = paragraphIterator.next().get("text").asText(); - paragraphNumber += 1; - bufferedRecord = new CovidCollection.Document(record, paragraph, paragraphNumber); - } else if (iterator.hasNext()) { // if CSV contains more lines, we parse the next record - record = iterator.next(); - String recordFullText = ""; - if (record.get("has_full_text").contains("True")) { - String[] hashes = record.get("sha").split(";"); - String fullTextPath = "/" + record.get("full_text_file") + "/" + hashes[hashes.length - 1].strip() + ".json"; - try { - String recordFullTextPath = CovidCollection.this.path.toString() + fullTextPath; - recordFullText = new String(Files.readAllBytes(Paths.get(recordFullTextPath))); - FileReader recordFullTextFileReader = new FileReader(recordFullTextPath); - ObjectMapper mapper = new ObjectMapper(); - JsonNode recordJsonNode = mapper.readerFor(JsonNode.class).readTree(recordFullTextFileReader); - paragraphIterator = recordJsonNode.get("body_text").elements(); - } catch (IOException e) { - LOG.error("Error parsing file at " + fullTextPath + "\n" + e.getMessage()); - } + if (record == null) { + throw new NoSuchElementException("Record is empty"); + } else { + bufferedRecord = new CovidCollection.Document(record); + if (iterator.hasNext()) { // if CSV contains more lines, we parse the next record + record = iterator.next(); } else { - 
paragraphIterator = null; + atEOF = true; // there is no more JSON object in the bufferedReader } - paragraphNumber = 0; - bufferedRecord = new CovidCollection.Document(record, recordFullText); - } else { - throw new NoSuchElementException("Reached end of CSVRecord Entries Iterator"); + } } - } @Override public void close() { @@ -133,25 +109,31 @@ public class Document implements SourceDocument { private String raw; private CSVRecord record; - public Document(CSVRecord record, String paragraph, Integer paragraphNumber, String recordFullText) { - if (paragraphNumber == 0) { - id = Long.toString(record.getRecordNumber()); - } else { - id = Long.toString(record.getRecordNumber()) + "." + String.format("%04d", paragraphNumber); - } + public Document(CSVRecord record) { + id = Long.toString(record.getRecordNumber()); content = record.get("title").replace("\n", " "); content += record.get("abstract").isEmpty() ? "" : "\n" + record.get("abstract"); - content += paragraph.isEmpty() ? "" : "\n" + paragraph; - this.raw = recordFullText; this.record = record; - } - public Document(CSVRecord record, String paragraph, Integer paragraphNumber) { - this(record, paragraph, paragraphNumber, ""); - } + String fullTextJson = ""; + // index full text into raw content + if (record.get("has_full_text").contains("True")) { + String[] hashes = record.get("sha").split(";"); + String fullTextPath = "/" + record.get("full_text_file") + "/" + hashes[hashes.length - 1].strip() + ".json"; + try { + fullTextJson = new String(Files.readAllBytes( + Paths.get(CovidCollection.this.path.toString() + fullTextPath))); + } catch (IOException e) { + LOG.error("Error parsing file at " + fullTextPath); + raw = record.toString(); + } + } - public Document(CSVRecord record, String recordFullText) { - this(record, "", 0, recordFullText); + if (!fullTextJson.isEmpty()) { + raw = fullTextJson; + } else { + raw = record.toString(); + } } @Override @@ -164,15 +146,15 @@ public String content() { return content; } 
- public String raw() { - return raw; - } - @Override public boolean indexable() { return true; } + public String raw() { + return raw; + } + public CSVRecord record() { return record; } diff --git a/src/main/java/io/anserini/collection/CovidFullTextCollection.java b/src/main/java/io/anserini/collection/CovidFullTextCollection.java new file mode 100644 index 0000000000..da11e485b5 --- /dev/null +++ b/src/main/java/io/anserini/collection/CovidFullTextCollection.java @@ -0,0 +1,163 @@ +/* + * Anserini: A Lucene toolkit for replicable information retrieval research + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.anserini.collection; + +import org.apache.commons.csv.CSVFormat; +import org.apache.commons.csv.CSVParser; +import org.apache.commons.csv.CSVRecord; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; + +import java.io.BufferedReader; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStreamReader; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.Iterator; +import java.util.NoSuchElementException; +import java.util.Set; + +/** + * A document collection for the CORD-19 dataset provided by Semantic Scholar. 
+ */ +public class CovidFullTextCollection extends DocumentCollection { + private static final Logger LOG = LogManager.getLogger(CovidFullTextCollection.class); + + public CovidFullTextCollection(Path path){ + this.path = path; + this.allowedFileSuffix = Set.of(".csv"); + } + + @Override + public FileSegment createFileSegment(Path p) throws IOException { + return new Segment(p); + } + + /** + * A file containing a single CSV document. + */ + public class Segment extends FileSegment { + CSVParser csvParser = null; + private CSVRecord record = null; + private Iterator iterator = null; // iterator for CSV records + + public Segment(Path path) throws IOException { + super(path); + bufferedReader = new BufferedReader(new InputStreamReader( + new FileInputStream(path.toString()))); + + csvParser = new CSVParser(bufferedReader, CSVFormat.DEFAULT + .withFirstRecordAsHeader() + .withIgnoreHeaderCase() + .withTrim()); + + iterator = csvParser.iterator(); + if (iterator.hasNext()) { + record = iterator.next(); + } + } + + @Override + public void readNext() throws NoSuchElementException { + if (record == null) { + throw new NoSuchElementException("Record is empty"); + } else { + bufferedRecord = new CovidFullTextCollection.Document(record); + if (iterator.hasNext()) { // if CSV contains more lines, we parse the next record + record = iterator.next(); + } else { + atEOF = true; // there is no more JSON object in the bufferedReader + } + } + } + + @Override + public void close() { + super.close(); + if (csvParser != null) { + try { + csvParser.close(); + } catch (IOException e) { + // do nothing + } + } + } + } + + /** + * A document in a CORD-19 collection. 
+ */ + public class Document implements SourceDocument { + private String id; + private String content; + private String raw; + private CSVRecord record; + + public Document(CSVRecord record) { + id = Long.toString(record.getRecordNumber()); + content = record.get("title").replace("\n", " "); + content += record.get("abstract").isEmpty() ? "" : "\n" + record.get("abstract"); + this.record = record; + + String fullTextJson = ""; + // index full text into raw content + if (record.get("has_full_text").contains("True")) { + String[] hashes = record.get("sha").split(";"); + String fullTextPath = "/" + record.get("full_text_file") + "/" + hashes[hashes.length - 1].strip() + ".json"; + try { + fullTextJson = new String(Files.readAllBytes( + Paths.get(CovidFullTextCollection.this.path.toString() + fullTextPath))); + } catch (IOException e) { + LOG.error("Error parsing file at " + fullTextPath); + raw = record.toString(); + } + } + + if (!fullTextJson.isEmpty()) { + content += fullTextJson.isEmpty() ? "" : "\n " + fullTextJson; + raw = fullTextJson; + } else { + raw = record.toString(); + } + } + + @Override + public String id() { + return id; + } + + @Override + public String content() { + return content; + } + + @Override + public boolean indexable() { + return true; + } + + public String raw() { + return raw; + } + + public CSVRecord record() { + return record; + } + } +} diff --git a/src/main/java/io/anserini/collection/CovidParagraphCollection.java b/src/main/java/io/anserini/collection/CovidParagraphCollection.java new file mode 100644 index 0000000000..126f568612 --- /dev/null +++ b/src/main/java/io/anserini/collection/CovidParagraphCollection.java @@ -0,0 +1,180 @@ +/* + * Anserini: A Lucene toolkit for replicable information retrieval research + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.anserini.collection; + +import org.apache.commons.csv.CSVFormat; +import org.apache.commons.csv.CSVParser; +import org.apache.commons.csv.CSVRecord; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; + +import java.io.BufferedReader; +import java.io.FileInputStream; +import java.io.FileReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.Iterator; +import java.util.NoSuchElementException; +import java.util.Set; + +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; + +/** + * A document collection for the CORD-19 dataset provided by Semantic Scholar. + */ +public class CovidParagraphCollection extends DocumentCollection { + private static final Logger LOG = LogManager.getLogger(CovidParagraphCollection.class); + + public CovidParagraphCollection(Path path){ + this.path = path; + this.allowedFileSuffix = Set.of(".csv"); + } + + @Override + public FileSegment createFileSegment(Path p) throws IOException { + return new Segment(p); + } + + /** + * A file containing a single CSV document. 
+ */ + public class Segment extends FileSegment { + CSVParser csvParser = null; + private CSVRecord record = null; + private Iterator iterator = null; // iterator for CSV records + private Iterator paragraphIterator = null; // iterator for paragraphs in a CSV record + private Integer paragraphNumber = 0; + + public Segment(Path path) throws IOException { + super(path); + bufferedReader = new BufferedReader(new InputStreamReader( + new FileInputStream(path.toString()))); + + csvParser = new CSVParser(bufferedReader, CSVFormat.DEFAULT + .withFirstRecordAsHeader() + .withIgnoreHeaderCase() + .withTrim()); + + iterator = csvParser.iterator(); + if (iterator.hasNext()) { + record = iterator.next(); + } + } + + @Override + public void readNext() throws NoSuchElementException { + if (paragraphIterator != null && paragraphIterator.hasNext()) { // if the record contains more paragraphs, we parse them + String paragraph = paragraphIterator.next().get("text").asText(); + paragraphNumber += 1; + bufferedRecord = new CovidParagraphCollection.Document(record, paragraph, paragraphNumber); + } else if (iterator.hasNext()) { // if CSV contains more lines, we parse the next record + record = iterator.next(); + String recordFullText = ""; + if (record.get("has_full_text").contains("True")) { + String[] hashes = record.get("sha").split(";"); + String fullTextPath = "/" + record.get("full_text_file") + "/" + hashes[hashes.length - 1].strip() + ".json"; + try { + String recordFullTextPath = CovidParagraphCollection.this.path.toString() + fullTextPath; + recordFullText = new String(Files.readAllBytes(Paths.get(recordFullTextPath))); + FileReader recordFullTextFileReader = new FileReader(recordFullTextPath); + ObjectMapper mapper = new ObjectMapper(); + JsonNode recordJsonNode = mapper.readerFor(JsonNode.class).readTree(recordFullTextFileReader); + paragraphIterator = recordJsonNode.get("body_text").elements(); + } catch (IOException e) { + LOG.error("Error parsing file at " + 
fullTextPath + "\n" + e.getMessage()); + } + } else { + paragraphIterator = null; + } + paragraphNumber = 0; + bufferedRecord = new CovidParagraphCollection.Document(record, recordFullText); + } else { + throw new NoSuchElementException("Reached end of CSVRecord Entries Iterator"); + } + } + + @Override + public void close() { + super.close(); + if (csvParser != null) { + try { + csvParser.close(); + } catch (IOException e) { + // do nothing + } + } + } + } + + /** + * A document in a CORD-19 collection. + */ + public class Document implements SourceDocument { + private String id; + private String content; + private String raw; + private CSVRecord record; + + public Document(CSVRecord record, String paragraph, Integer paragraphNumber, String recordFullText) { + if (paragraphNumber == 0) { + id = Long.toString(record.getRecordNumber()); + } else { + id = Long.toString(record.getRecordNumber()) + "." + String.format("%05d", paragraphNumber); + } + content = record.get("title").replace("\n", " "); + content += record.get("abstract").isEmpty() ? "" : "\n" + record.get("abstract"); + content += paragraph.isEmpty() ? "" : "\n" + paragraph; + this.raw = recordFullText; + this.record = record; + } + + public Document(CSVRecord record, String paragraph, Integer paragraphNumber) { + this(record, paragraph, paragraphNumber, ""); + } + + public Document(CSVRecord record, String recordFullText) { + this(record, "", 0, recordFullText); + } + + @Override + public String id() { + return id; + } + + @Override + public String content() { + return content; + } + + public String raw() { + return raw; + } + + @Override + public boolean indexable() { + return true; + } + + public CSVRecord record() { + return record; + } + } +}