diff --git a/src/main/java/io/anserini/util/ExtractDocumentLengths.java b/src/main/java/io/anserini/util/ExtractDocumentLengths.java index 232f2ee99b..ca8820672f 100644 --- a/src/main/java/io/anserini/util/ExtractDocumentLengths.java +++ b/src/main/java/io/anserini/util/ExtractDocumentLengths.java @@ -17,6 +17,7 @@ package io.anserini.util; import io.anserini.index.IndexArgs; +import io.anserini.index.IndexReaderUtils; import io.anserini.index.NotStoredException; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; @@ -72,14 +73,15 @@ public static void main(String[] args) throws Exception { long lossyTotalTerms = 0; long exactTotalTerms = 0; - out.println("docid\tdoc_length\tunique_term_count\tlossy_doc_length\tlossy_unique_term_count"); + out.println("internal_docid\texternal_docid\tdoc_length\tunique_term_count\tlossy_doc_length\tlossy_unique_term_count"); for (int i = 0; i < numDocs; i++) { Terms terms = reader.getTermVector(i, IndexArgs.CONTENTS); if (terms == null) { // It could be the case that TermVectors weren't stored when constructing the index, or we're just missing a // TermVector for a zero-length document. Warn, but don't throw exception. - System.err.println(String.format("Warning: TermVector not available for docid %d.", i)); - out.println(String.format("%d\t0\t0\t0\t0", i)); + String external_did = IndexReaderUtils.convertLuceneDocidToDocid(reader, i); + System.err.println(String.format("Warning: TermVector not available for docid %s.", external_did)); + out.println(String.format("%d\t%s\t0\t0\t0\t0", i, external_did)); continue; } @@ -90,7 +92,8 @@ public static void main(String[] args) throws Exception { // See https://github.com/apache/lucene-solr/blob/master/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java int lossyDoclength = SmallFloat.byte4ToInt(SmallFloat.intToByte4((int) exactDoclength)); int lossyTermCount = SmallFloat.byte4ToInt(SmallFloat.intToByte4((int) exactTermCount)); - out.println(String.format("%d\t%d\t%d\t%d\t%d", i, exactDoclength, exactTermCount, lossyDoclength, lossyTermCount)); + out.println(String.format("%d\t%s\t%d\t%d\t%d\t%d", i, IndexReaderUtils.convertLuceneDocidToDocid(reader, i), + exactDoclength, exactTermCount, lossyDoclength, lossyTermCount)); lossyTotalTerms += lossyDoclength; exactTotalTerms += exactDoclength; } diff --git a/src/test/java/io/anserini/util/ExtractDocumentLengthsTest.java b/src/test/java/io/anserini/util/ExtractDocumentLengthsTest.java index bbdd25daa7..882fee79f2 100644 --- a/src/test/java/io/anserini/util/ExtractDocumentLengthsTest.java +++ b/src/test/java/io/anserini/util/ExtractDocumentLengthsTest.java @@ -72,9 +72,9 @@ public void test() throws Exception { List lines = Files.readAllLines(Paths.get(randomFileName)); assertEquals(5, lines.size()); - assertEquals("0\t8\t5\t8\t5", lines.get(1)); - assertEquals("1\t2\t2\t2\t2", lines.get(2)); - assertEquals("2\t2\t2\t2\t2", lines.get(3)); - assertEquals("3\t0\t0\t0\t0", lines.get(4)); + assertEquals("0\tdoc1\t8\t5\t8\t5", lines.get(1)); + assertEquals("1\tdoc2\t2\t2\t2\t2", lines.get(2)); + assertEquals("2\tdoc3\t2\t2\t2\t2", lines.get(3)); + assertEquals("3\tdoc4\t0\t0\t0\t0", lines.get(4)); } }