Skip to content

Commit

Permalink
Add option to output external document ids in ExtractDocumentLengths (#…
Browse files Browse the repository at this point in the history
  • Loading branch information
nsndimt authored Aug 17, 2020
1 parent ac266de commit 857f6da
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 8 deletions.
11 changes: 7 additions & 4 deletions src/main/java/io/anserini/util/ExtractDocumentLengths.java
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
package io.anserini.util;

import io.anserini.index.IndexArgs;
import io.anserini.index.IndexReaderUtils;
import io.anserini.index.NotStoredException;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
Expand Down Expand Up @@ -72,14 +73,15 @@ public static void main(String[] args) throws Exception {
long lossyTotalTerms = 0;
long exactTotalTerms = 0;

out.println("docid\tdoc_length\tunique_term_count\tlossy_doc_length\tlossy_unique_term_count");
out.println("internal_docid\texternal_docid\tdoc_length\tunique_term_count\tlossy_doc_length\tlossy_unique_term_count");
for (int i = 0; i < numDocs; i++) {
Terms terms = reader.getTermVector(i, IndexArgs.CONTENTS);
if (terms == null) {
// It could be the case that TermVectors weren't stored when constructing the index, or we're just missing a
// TermVector for a zero-length document. Warn, but don't throw an exception.
System.err.println(String.format("Warning: TermVector not available for docid %d.", i));
out.println(String.format("%d\t0\t0\t0\t0", i));
String external_did = IndexReaderUtils.convertLuceneDocidToDocid(reader, i);
System.err.println(String.format("Warning: TermVector not available for docid %s.", external_did));
out.println(String.format("%d\t%s\t0\t0\t0\t0", i, external_did));
continue;
}

Expand All @@ -90,7 +92,8 @@ public static void main(String[] args) throws Exception {
// See https://github.com/apache/lucene-solr/blob/master/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java
int lossyDoclength = SmallFloat.byte4ToInt(SmallFloat.intToByte4((int) exactDoclength));
int lossyTermCount = SmallFloat.byte4ToInt(SmallFloat.intToByte4((int) exactTermCount));
out.println(String.format("%d\t%d\t%d\t%d\t%d", i, exactDoclength, exactTermCount, lossyDoclength, lossyTermCount));
out.println(String.format("%d\t%s\t%d\t%d\t%d\t%d", i, IndexReaderUtils.convertLuceneDocidToDocid(reader, i),
exactDoclength, exactTermCount, lossyDoclength, lossyTermCount));
lossyTotalTerms += lossyDoclength;
exactTotalTerms += exactDoclength;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -72,9 +72,9 @@ public void test() throws Exception {

List<String> lines = Files.readAllLines(Paths.get(randomFileName));
assertEquals(5, lines.size());
assertEquals("0\t8\t5\t8\t5", lines.get(1));
assertEquals("1\t2\t2\t2\t2", lines.get(2));
assertEquals("2\t2\t2\t2\t2", lines.get(3));
assertEquals("3\t0\t0\t0\t0", lines.get(4));
assertEquals("0\tdoc1\t8\t5\t8\t5", lines.get(1));
assertEquals("1\tdoc2\t2\t2\t2\t2", lines.get(2));
assertEquals("2\tdoc3\t2\t2\t2\t2", lines.get(3));
assertEquals("3\tdoc4\t0\t0\t0\t0", lines.get(4));
}
}

0 comments on commit 857f6da

Please sign in to comment.