Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add _sort to DocValues #317

Merged
merged 2 commits into from
May 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 22 additions & 23 deletions src/main/java/com/github/discvrseq/walkers/VcfToLuceneIndexer.java
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,8 @@
import org.apache.lucene.document.*;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.SortedNumericSortField;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.NumericUtils;
Expand Down Expand Up @@ -104,7 +102,7 @@ public void onTraversalStart() {
}

IndexWriterConfig config = new IndexWriterConfig(analyzer);
config.setIndexSort(new Sort(new SortField("genomicPosition", SortField.Type.INT, false)));
config.setIndexSort(new Sort(new SortField("genomicPosition_sort", SortField.Type.INT, false)));

try {
writer = new IndexWriter(index, config);
Expand Down Expand Up @@ -245,26 +243,27 @@ else if (line.getCountType() == VCFHeaderLineCount.INTEGER || line.getCountType(

// Add standard fields
doc.add(new TextField("contig", variant.getContig(), Field.Store.YES));
doc.add(new SortedDocValuesField("contig", new BytesRef(variant.getContig())));
doc.add(new SortedDocValuesField("contig_sort", new BytesRef(variant.getContig())));

doc.add(new TextField("ref", variant.getReference().getDisplayString(), Field.Store.YES));
doc.add(new SortedDocValuesField("ref", new BytesRef(variant.getReference().getDisplayString())));
doc.add(new SortedDocValuesField("ref_sort", new BytesRef(variant.getReference().getDisplayString())));

doc.add(new TextField("alt", alt.getDisplayString(), Field.Store.YES));
doc.add(new SortedDocValuesField("alt", new BytesRef(alt.getDisplayString())));
doc.add(new SortedDocValuesField("alt_sort", new BytesRef(alt.getDisplayString())));

final int genomicPositionStart = getGenomicPosition(variant.getContig(), variant.getStart());
doc.add(new IntPoint("start", variant.getStart()));
doc.add(new StoredField("start", variant.getStart()));
doc.add(new NumericDocValuesField("start", variant.getStart()));
doc.add(new NumericDocValuesField("start_sort", genomicPositionStart));

final int genomicPositionEnd = getGenomicPosition(variant.getContig(), variant.getEnd());
doc.add(new IntPoint("end", variant.getEnd()));
doc.add(new StoredField("end", variant.getEnd()));
doc.add(new NumericDocValuesField("end", variant.getEnd()));
doc.add(new NumericDocValuesField("end_sort", genomicPositionEnd));

final int genomicPosition = getGenomicPosition(variant.getContig(), variant.getStart());
doc.add(new IntPoint("genomicPosition", genomicPosition));
doc.add(new StoredField("genomicPosition", genomicPosition));
doc.add(new NumericDocValuesField("genomicPosition", genomicPosition));
doc.add(new IntPoint("genomicPosition", genomicPositionStart));
doc.add(new StoredField("genomicPosition", genomicPositionStart));
doc.add(new NumericDocValuesField("genomicPosition_sort", genomicPositionStart));

if (variant.hasGenotypes()) {
AtomicReference<String> docValue = new AtomicReference<>(null);
Expand All @@ -278,7 +277,7 @@ else if (line.getCountType() == VCFHeaderLineCount.INTEGER || line.getCountType(
});

if (docValue.get() != null) {
doc.add(new SortedDocValuesField("variableSamples", new BytesRef(docValue.get())));
doc.add(new SortedDocValuesField("variableSamples_sort", new BytesRef(docValue.get())));
docValue.set(null);
}

Expand All @@ -291,29 +290,29 @@ else if (line.getCountType() == VCFHeaderLineCount.INTEGER || line.getCountType(
});

if (docValue.get() != null) {
doc.add(new SortedDocValuesField("homozygousVarSamples", new BytesRef(docValue.get())));
doc.add(new SortedDocValuesField("homozygousVarSamples_sort", new BytesRef(docValue.get())));
docValue.set(null);
}

long nHet = variant.getGenotypes().stream().filter(g -> !g.isFiltered() && !g.isNoCall() && g.getAlleles().contains(alt) && g.isHet()).count();
doc.add(new IntPoint("nHet", (int)nHet));
doc.add(new StoredField("nHet", (int)nHet));
doc.add(new NumericDocValuesField("nHet", (int)nHet));
doc.add(new NumericDocValuesField("nHet_sort", (int)nHet));

long nHomVar = variant.getGenotypes().stream().filter(g -> !g.isFiltered() && !g.isNoCall() && g.getAlleles().contains(alt) && g.isHomVar()).count();
doc.add(new IntPoint("nHomVar", (int)nHomVar));
doc.add(new StoredField("nHomVar", (int)nHomVar));
doc.add(new NumericDocValuesField("nHomVar", (int)nHomVar));
doc.add(new NumericDocValuesField("nHomVar_sort", (int)nHomVar));

long nCalled = variant.getGenotypes().stream().filter(g -> !g.isFiltered() && !g.isNoCall()).count();
doc.add(new IntPoint("nCalled", (int)nCalled));
doc.add(new StoredField("nCalled", (int)nCalled));
doc.add(new NumericDocValuesField("nCalled", (int)nCalled));
doc.add(new NumericDocValuesField("nCalled_sort", (int)nCalled));

float fractionHet = (float) nHet / (float) (nHet + nHomVar);
doc.add(new DoublePoint("fractionHet", fractionHet));
doc.add(new StoredField("fractionHet", fractionHet));
doc.add(new NumericDocValuesField("fractionHet", NumericUtils.doubleToSortableLong(fractionHet)));
doc.add(new NumericDocValuesField("fractionHet_sort", NumericUtils.doubleToSortableLong(fractionHet)));
}

try {
Expand Down Expand Up @@ -408,7 +407,7 @@ synchronized private void addFieldToDocument(Document doc, VCFHeaderLineType var
doc.add(new StringField(key, String.valueOf(value), Field.Store.YES));

if (indexDocValue.get()) {
doc.add(new SortedDocValuesField(key, new BytesRef(String.valueOf(value))));
doc.add(new SortedDocValuesField(key + "_sort", new BytesRef(String.valueOf(value))));
indexDocValue.set(false);
}
}
Expand All @@ -417,7 +416,7 @@ synchronized private void addFieldToDocument(Document doc, VCFHeaderLineType var
doc.add(new IntPoint(key, x));

if (indexDocValue.get()) {
doc.add(new NumericDocValuesField(key, x));
doc.add(new NumericDocValuesField(key + "_sort", x));
indexDocValue.set(false);
}
}
Expand All @@ -436,7 +435,7 @@ synchronized private void addFieldToDocument(Document doc, VCFHeaderLineType var
});

if (docValue.get() != null && indexDocValue.get()) {
doc.add(new NumericDocValuesField(key, NumericUtils.doubleToSortableLong(docValue.get())));
doc.add(new NumericDocValuesField(key + "_sort", NumericUtils.doubleToSortableLong(docValue.get())));
indexDocValue.set(false);
}
}
Expand All @@ -455,7 +454,7 @@ synchronized private void addFieldToDocument(Document doc, VCFHeaderLineType var
});

if (docValue.get() != null && indexDocValue.get()) {
doc.add(new NumericDocValuesField(key, docValue.get()));
doc.add(new NumericDocValuesField(key + "_sort", docValue.get()));
indexDocValue.set(false);
}
}
Expand All @@ -464,7 +463,7 @@ synchronized private void addFieldToDocument(Document doc, VCFHeaderLineType var
doc.add(new TextField(key, String.valueOf(value), Field.Store.YES));

if (indexDocValue.get()) {
doc.add(new SortedDocValuesField(key, new BytesRef(String.valueOf(value))));
doc.add(new SortedDocValuesField(key +"_sort", new BytesRef(String.valueOf(value))));
indexDocValue.set(false);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -471,7 +471,7 @@ public void doExtendedTest() throws Exception {
Assert.assertEquals(topDocs.totalHits.value, 1L);

// Top 6 hits are sorted by genomicPosition
topDocs = indexSearcher.search(new MatchAllDocsQuery(), 6, new Sort(new SortField("genomicPosition", SortField.Type.INT)));
topDocs = indexSearcher.search(new MatchAllDocsQuery(), 6, new Sort(new SortField("genomicPosition_sort", SortField.Type.INT)));
Assert.assertEquals(6, topDocs.scoreDocs.length);

int lastGenomicPosition = -1;
Expand All @@ -485,7 +485,7 @@ public void doExtendedTest() throws Exception {
}

// Results are sorted by REFFIELD
topDocs = indexSearcher.search(new MatchAllDocsQuery(), 6, new Sort(new SortField("REFFIELD", SortField.Type.STRING)));
topDocs = indexSearcher.search(new MatchAllDocsQuery(), 6, new Sort(new SortField("REFFIELD_sort", SortField.Type.STRING)));
Assert.assertEquals(6, topDocs.scoreDocs.length);

String lastRefField = null;
Expand All @@ -499,7 +499,7 @@ public void doExtendedTest() throws Exception {
}

// Results are sorted by start
topDocs = indexSearcher.search(new MatchAllDocsQuery(), 6, new Sort(new SortField("start", SortField.Type.INT)));
topDocs = indexSearcher.search(new MatchAllDocsQuery(), 6, new Sort(new SortField("start_sort", SortField.Type.INT)));
Assert.assertEquals(6, topDocs.scoreDocs.length);

int lastStart = -1;
Expand All @@ -511,7 +511,7 @@ public void doExtendedTest() throws Exception {
}

// Results are sorted by HaplotypeScore
topDocs = indexSearcher.search(new MatchAllDocsQuery(), 6, new Sort(new SortField("HaplotypeScore", SortField.Type.DOUBLE)));
topDocs = indexSearcher.search(new MatchAllDocsQuery(), 6, new Sort(new SortField("HaplotypeScore_sort", SortField.Type.DOUBLE)));
Assert.assertEquals(6, topDocs.scoreDocs.length);

float lastHaplotypeScore = -1.0f;
Expand Down