From 9b49a79adc5276e812c5cf16109c37f976e28e44 Mon Sep 17 00:00:00 2001 From: nsndimt Date: Tue, 16 Jun 2020 15:06:39 -0400 Subject: [PATCH 1/3] add option to output external did --- .../java/io/anserini/util/ExtractDocumentLengths.java | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/main/java/io/anserini/util/ExtractDocumentLengths.java b/src/main/java/io/anserini/util/ExtractDocumentLengths.java index 232f2ee99b..71b38006e2 100644 --- a/src/main/java/io/anserini/util/ExtractDocumentLengths.java +++ b/src/main/java/io/anserini/util/ExtractDocumentLengths.java @@ -17,6 +17,7 @@ package io.anserini.util; import io.anserini.index.IndexArgs; +import io.anserini.index.IndexReaderUtils; import io.anserini.index.NotStoredException; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; @@ -47,6 +48,9 @@ public static class Args { @Option(name = "-output", metaVar = "[file]", required = true, usage = "output file") String output; + + @Option(name = "-outputdid", usage = "output collection id") + boolean lookupLuceneDocid = false; } public static void main(String[] args) throws Exception { @@ -90,7 +94,10 @@ public static void main(String[] args) throws Exception { // See https://github.com/apache/lucene-solr/blob/master/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java int lossyDoclength = SmallFloat.byte4ToInt(SmallFloat.intToByte4((int) exactDoclength)); int lossyTermCount = SmallFloat.byte4ToInt(SmallFloat.intToByte4((int) exactTermCount)); - out.println(String.format("%d\t%d\t%d\t%d\t%d", i, exactDoclength, exactTermCount, lossyDoclength, lossyTermCount)); + if (!myArgs.lookupLuceneDocid) + out.println(String.format("%d\t%d\t%d\t%d\t%d", i, exactDoclength, exactTermCount, lossyDoclength, lossyTermCount)); + else out.println(String.format("%s\t%d\t%d\t%d\t%d", IndexReaderUtils.convertLuceneDocidToDocid(reader, i), + exactDoclength, exactTermCount, lossyDoclength, lossyTermCount)); lossyTotalTerms += lossyDoclength; exactTotalTerms += exactDoclength; } From 83ab3866dfe74f40e1904c53ad5209ffa1bd2503 Mon Sep 17 00:00:00 2001 From: nsndimt Date: Thu, 18 Jun 2020 17:46:06 -0400 Subject: [PATCH 2/3] Output external document ids directly; Change test case accordingly; --- .../io/anserini/util/ExtractDocumentLengths.java | 12 ++++-------- .../io/anserini/util/ExtractDocumentLengthsTest.java | 8 ++++---- 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/src/main/java/io/anserini/util/ExtractDocumentLengths.java b/src/main/java/io/anserini/util/ExtractDocumentLengths.java index 71b38006e2..3be0392a94 100644 --- a/src/main/java/io/anserini/util/ExtractDocumentLengths.java +++ b/src/main/java/io/anserini/util/ExtractDocumentLengths.java @@ -48,9 +48,6 @@ public static class Args { @Option(name = "-output", metaVar = "[file]", required = true, usage = "output file") String output; - - @Option(name = "-outputdid", usage = "output collection id") - boolean lookupLuceneDocid = false; } public static void main(String[] args) throws Exception { @@ -82,8 +79,9 @@ public static void main(String[] args) throws Exception { if (terms == null) { // It could be the case that TermVectors weren't stored when constructing the index, or we're just missing a // TermVector for a zero-length document. Warn, but don't throw exception. - System.err.println(String.format("Warning: TermVector not available for docid %d.", i)); - out.println(String.format("%d\t0\t0\t0\t0", i)); + String external_did = IndexReaderUtils.convertLuceneDocidToDocid(reader, i); + System.err.println(String.format("Warning: TermVector not available for docid %s.", external_did)); + out.println(String.format("%s\t0\t0\t0\t0", external_did)); continue; } @@ -94,9 +92,7 @@ public static void main(String[] args) throws Exception { // See https://github.com/apache/lucene-solr/blob/master/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java int lossyDoclength = SmallFloat.byte4ToInt(SmallFloat.intToByte4((int) exactDoclength)); int lossyTermCount = SmallFloat.byte4ToInt(SmallFloat.intToByte4((int) exactTermCount)); - if (!myArgs.lookupLuceneDocid) - out.println(String.format("%d\t%d\t%d\t%d\t%d", i, exactDoclength, exactTermCount, lossyDoclength, lossyTermCount)); - else out.println(String.format("%s\t%d\t%d\t%d\t%d", IndexReaderUtils.convertLuceneDocidToDocid(reader, i), + out.println(String.format("%s\t%d\t%d\t%d\t%d", IndexReaderUtils.convertLuceneDocidToDocid(reader, i), exactDoclength, exactTermCount, lossyDoclength, lossyTermCount)); lossyTotalTerms += lossyDoclength; exactTotalTerms += exactDoclength; diff --git a/src/test/java/io/anserini/util/ExtractDocumentLengthsTest.java b/src/test/java/io/anserini/util/ExtractDocumentLengthsTest.java index bbdd25daa7..24914caa25 100644 --- a/src/test/java/io/anserini/util/ExtractDocumentLengthsTest.java +++ b/src/test/java/io/anserini/util/ExtractDocumentLengthsTest.java @@ -72,9 +72,9 @@ public void test() throws Exception { List lines = Files.readAllLines(Paths.get(randomFileName)); assertEquals(5, lines.size()); - assertEquals("0\t8\t5\t8\t5", lines.get(1)); - assertEquals("1\t2\t2\t2\t2", lines.get(2)); - assertEquals("2\t2\t2\t2\t2", lines.get(3)); - assertEquals("3\t0\t0\t0\t0", lines.get(4)); + assertEquals("doc1\t8\t5\t8\t5", lines.get(1)); + assertEquals("doc2\t2\t2\t2\t2", lines.get(2)); + assertEquals("doc3\t2\t2\t2\t2", lines.get(3)); + assertEquals("doc4\t0\t0\t0\t0", lines.get(4)); } } From 4757427d7000109b6bb3c5c46a3ac16edb5e8554 Mon Sep 17 00:00:00 2001 From: nsndimt Date: Sun, 16 Aug 2020 20:21:07 -0400 Subject: [PATCH 3/3] keep both internal and external docid --- .../java/io/anserini/util/ExtractDocumentLengths.java | 6 +++--- .../java/io/anserini/util/ExtractDocumentLengthsTest.java | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/main/java/io/anserini/util/ExtractDocumentLengths.java b/src/main/java/io/anserini/util/ExtractDocumentLengths.java index 3be0392a94..ca8820672f 100644 --- a/src/main/java/io/anserini/util/ExtractDocumentLengths.java +++ b/src/main/java/io/anserini/util/ExtractDocumentLengths.java @@ -73,7 +73,7 @@ public static void main(String[] args) throws Exception { long lossyTotalTerms = 0; long exactTotalTerms = 0; - out.println("docid\tdoc_length\tunique_term_count\tlossy_doc_length\tlossy_unique_term_count"); + out.println("internal_docid\texternal_docid\tdoc_length\tunique_term_count\tlossy_doc_length\tlossy_unique_term_count"); for (int i = 0; i < numDocs; i++) { Terms terms = reader.getTermVector(i, IndexArgs.CONTENTS); if (terms == null) { @@ -81,7 +81,7 @@ public static void main(String[] args) throws Exception { // TermVector for a zero-length document. Warn, but don't throw exception. String external_did = IndexReaderUtils.convertLuceneDocidToDocid(reader, i); System.err.println(String.format("Warning: TermVector not available for docid %s.", external_did)); - out.println(String.format("%s\t0\t0\t0\t0", external_did)); + out.println(String.format("%d\t%s\t0\t0\t0\t0", i, external_did)); continue; } @@ -92,7 +92,7 @@ public static void main(String[] args) throws Exception { // See https://github.com/apache/lucene-solr/blob/master/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java int lossyDoclength = SmallFloat.byte4ToInt(SmallFloat.intToByte4((int) exactDoclength)); int lossyTermCount = SmallFloat.byte4ToInt(SmallFloat.intToByte4((int) exactTermCount)); - out.println(String.format("%s\t%d\t%d\t%d\t%d", IndexReaderUtils.convertLuceneDocidToDocid(reader, i), + out.println(String.format("%d\t%s\t%d\t%d\t%d\t%d", i, IndexReaderUtils.convertLuceneDocidToDocid(reader, i), exactDoclength, exactTermCount, lossyDoclength, lossyTermCount)); lossyTotalTerms += lossyDoclength; exactTotalTerms += exactDoclength; diff --git a/src/test/java/io/anserini/util/ExtractDocumentLengthsTest.java b/src/test/java/io/anserini/util/ExtractDocumentLengthsTest.java index 24914caa25..882fee79f2 100644 --- a/src/test/java/io/anserini/util/ExtractDocumentLengthsTest.java +++ b/src/test/java/io/anserini/util/ExtractDocumentLengthsTest.java @@ -72,9 +72,9 @@ public void test() throws Exception { List lines = Files.readAllLines(Paths.get(randomFileName)); assertEquals(5, lines.size()); - assertEquals("doc1\t8\t5\t8\t5", lines.get(1)); - assertEquals("doc2\t2\t2\t2\t2", lines.get(2)); - assertEquals("doc3\t2\t2\t2\t2", lines.get(3)); - assertEquals("doc4\t0\t0\t0\t0", lines.get(4)); + assertEquals("0\tdoc1\t8\t5\t8\t5", lines.get(1)); + assertEquals("1\tdoc2\t2\t2\t2\t2", lines.get(2)); + assertEquals("2\tdoc3\t2\t2\t2\t2", lines.get(3)); + assertEquals("3\tdoc4\t0\t0\t0\t0", lines.get(4)); } }