Skip to content

Commit

Permalink
Upgrade Anserini to Lucene (and Solr) 9.0.0 (#1875)
Browse files: browse the repository at this point in the history
  • Loading branch information
tteofili authored Aug 1, 2022
1 parent 38d8581 commit 5af657d
Show file tree
Hide file tree
Showing 22 changed files with 148 additions and 121 deletions.
65 changes: 60 additions & 5 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@
</developers>

<properties>
<lucene.version>8.11.0</lucene.version>
<lucene.version>9.0.0</lucene.version>
<solr.version>9.0.0</solr.version>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>

Expand Down Expand Up @@ -297,12 +298,17 @@
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-analyzers-kuromoji</artifactId>
<artifactId>lucene-queries</artifactId>
<version>${lucene.version}</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-analysis-kuromoji</artifactId>
<version>${lucene.version}</version>
</dependency>
<dependency> <!-- polish and ukrainian -->
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-analyzers-morfologik</artifactId>
<artifactId>lucene-analysis-morfologik</artifactId>
<version>${lucene.version}</version>
</dependency>
<dependency> <!-- only needed for testing -->
Expand All @@ -320,13 +326,62 @@
<dependency>
<groupId>org.apache.solr</groupId>
<artifactId>solr-solrj</artifactId>
<version>${lucene.version}</version>
<version>${solr.version}</version>
<exclusions>
<exclusion>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-core</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-analysis-common</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-queries</artifactId>
</exclusion>
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-slf4j-impl</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency> <!-- only needed for testing -->
<groupId>org.apache.solr</groupId>
<artifactId>solr-test-framework</artifactId>
<version>${lucene.version}</version>
<version>${solr.version}</version>
<scope>test</scope>
<exclusions>
<exclusion>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-core</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-analysis-common</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-queries</artifactId>
</exclusion>
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-slf4j-impl</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-codecs</artifactId>
<version>${lucene.version}</version>
</dependency>
<dependency>
<groupId>org.elasticsearch.client</groupId>
Expand Down
4 changes: 2 additions & 2 deletions src/main/java/io/anserini/index/IndexCollection.java
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@
import org.apache.lucene.store.FSDirectory;
import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.impl.CloudSolrClient;
import org.apache.solr.client.solrj.impl.Http2SolrClient;
import org.apache.solr.common.SolrInputDocument;
import org.elasticsearch.action.DocWriteRequest;
import org.elasticsearch.action.bulk.BulkRequest;
Expand Down Expand Up @@ -411,8 +412,7 @@ private class SolrClientFactory extends BasePooledObjectFactory<SolrClient> {
@Override
public SolrClient create() {
return new CloudSolrClient.Builder(Splitter.on(',').splitToList(args.zkUrl), Optional.of(args.zkChroot))
.withConnectionTimeout(TIMEOUT)
.withSocketTimeout(TIMEOUT)
.withInternalClientBuilder(new Http2SolrClient.Builder().connectionTimeout(TIMEOUT))
.build();
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,11 @@
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.BinaryDocValuesField;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.IntPoint;
import org.apache.lucene.document.SortedDocValuesField;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.IndexOptions;
Expand Down Expand Up @@ -102,7 +102,7 @@ public Document createDocument(AclAnthology.Document aclDoc) throws GeneratorExc
// Store the collection docid.
doc.add(new StringField(IndexArgs.ID, id, Field.Store.YES));
// This is needed to break score ties by docid.
doc.add(new SortedDocValuesField(IndexArgs.ID, new BytesRef(id)));
doc.add(new BinaryDocValuesField(IndexArgs.ID, new BytesRef(id)));

if (args.storeRaw) {
doc.add(new StoredField(IndexArgs.RAW, aclDoc.raw()));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,11 @@
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.BinaryDocValuesField;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.IntPoint;
import org.apache.lucene.document.SortedDocValuesField;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.IndexOptions;
Expand Down Expand Up @@ -101,7 +101,7 @@ public Document createDocument(BibtexCollection.Document bibtexDoc) throws Gener
// Store the collection docid.
doc.add(new StringField(IndexArgs.ID, id, Field.Store.YES));
// This is needed to break score ties by docid.
doc.add(new SortedDocValuesField(IndexArgs.ID, new BytesRef(id)));
doc.add(new BinaryDocValuesField(IndexArgs.ID, new BytesRef(id)));
// Store the collection's bibtex type
doc.add(new StringField(TYPE, type, Field.Store.YES));

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,11 @@
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.BinaryDocValuesField;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.IntPoint;
import org.apache.lucene.document.SortedDocValuesField;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.IndexOptions;
Expand Down Expand Up @@ -115,7 +115,7 @@ public Document createDocument(Cord19BaseDocument covidDoc) throws GeneratorExce
// Store the collection docid.
doc.add(new StringField(IndexArgs.ID, id, Field.Store.YES));
// This is needed to break score ties by docid.
doc.add(new SortedDocValuesField(IndexArgs.ID, new BytesRef(id)));
doc.add(new BinaryDocValuesField(IndexArgs.ID, new BytesRef(id)));

if (args.storeRaw) {
doc.add(new StoredField(IndexArgs.RAW, raw));
Expand Down
6 changes: 3 additions & 3 deletions src/main/java/io/anserini/index/generator/CoreGenerator.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,11 @@
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.BinaryDocValuesField;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.IntPoint;
import org.apache.lucene.document.SortedDocValuesField;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.IndexOptions;
Expand Down Expand Up @@ -106,7 +106,7 @@ public Document createDocument(CoreCollection.Document coreDoc) throws Generator
// Store the collection docid.
doc.add(new StringField(IndexArgs.ID, id, Field.Store.YES));
// This is needed to break score ties by docid.
doc.add(new SortedDocValuesField(IndexArgs.ID, new BytesRef(id)));
doc.add(new BinaryDocValuesField(IndexArgs.ID, new BytesRef(id)));

if (args.storeRaw) {
doc.add(new StoredField(IndexArgs.RAW, coreDoc.raw()));
Expand Down Expand Up @@ -165,10 +165,10 @@ private void addDocumentField(Document doc, String key, JsonNode value, FieldTyp
// index as numeric value to allow range queries
try {
doc.add(new IntPoint(key, Integer.parseInt(valueText)));
doc.add(new StoredField(key, valueText));
} catch(Exception e) {
// year is not a numeric value
}
doc.add(new StoredField(key, valueText));
} else {
doc.add(new Field(key, valueText, fieldType));
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,10 @@
import io.anserini.collection.SourceDocument;
import io.anserini.index.IndexArgs;
import org.apache.commons.lang3.ArrayUtils;
import org.apache.lucene.document.BinaryDocValuesField;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.SortedDocValuesField;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.IndexOptions;
Expand Down Expand Up @@ -74,7 +74,7 @@ public Document createDocument(T src) throws GeneratorException {
// Store the collection docid.
document.add(new StringField(IndexArgs.ID, id, Field.Store.YES));
// This is needed to break score ties by docid.
document.add(new SortedDocValuesField(IndexArgs.ID, new BytesRef(id)));
document.add(new BinaryDocValuesField(IndexArgs.ID, new BytesRef(id)));

if (args.storeRaw) {
document.add(new StoredField(IndexArgs.RAW, src.raw()));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,10 @@
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.BinaryDocValuesField;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.SortedDocValuesField;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.IndexOptions;
Expand Down Expand Up @@ -76,7 +76,7 @@ public Document createDocument(EpidemicQACollection.Document covidDoc) throws Ge
// Store the collection docid.
doc.add(new StringField(IndexArgs.ID, id, Field.Store.YES));
// This is needed to break score ties by docid.
doc.add(new SortedDocValuesField(IndexArgs.ID, new BytesRef(id)));
doc.add(new BinaryDocValuesField(IndexArgs.ID, new BytesRef(id)));

if (args.storeRaw) {
doc.add(new StoredField(IndexArgs.RAW, raw));
Expand Down
6 changes: 3 additions & 3 deletions src/main/java/io/anserini/rerank/ScoredDocuments.java
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,12 @@
package io.anserini.rerank;

import io.anserini.index.IndexArgs;
import org.apache.lucene.document.BinaryDocValuesField;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.SortedDocValuesField;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
Expand Down Expand Up @@ -97,7 +97,7 @@ public static ScoredDocuments fromSolrDocs(SolrDocumentList rs) {
// Store the collection docid.
document.add(new StringField(IndexArgs.ID, id, Field.Store.YES));
// This is needed to break score ties by docid.
document.add(new SortedDocValuesField(IndexArgs.ID, new BytesRef(id)));
document.add(new BinaryDocValuesField(IndexArgs.ID, new BytesRef(id)));
scoredDocs.documents[i] = document;
scoredDocs.scores[i] = score;
scoredDocs.ids[i] = i; // no internal Lucene ID available, use index as placeholder
Expand Down Expand Up @@ -130,7 +130,7 @@ public static ScoredDocuments fromESDocs(SearchHits rs) {
// Store the collection docid.
document.add(new StringField(IndexArgs.ID, id, Field.Store.YES));
// This is needed to break score ties by docid.
document.add(new SortedDocValuesField(IndexArgs.ID, new BytesRef(id)));
document.add(new BinaryDocValuesField(IndexArgs.ID, new BytesRef(id)));
scoredDocs.documents[i] = document;
scoredDocs.scores[i] = score;
scoredDocs.ids[i] = i; // no internal Lucene ID available, use index as placeholder
Expand Down
4 changes: 2 additions & 2 deletions src/main/java/io/anserini/search/SearchSolr.java
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrQuery.SortClause;
import org.apache.solr.client.solrj.impl.CloudSolrClient;
import org.apache.solr.client.solrj.impl.Http2SolrClient;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocumentList;
import org.kohsuke.args4j.CmdLineException;
Expand Down Expand Up @@ -165,8 +166,7 @@ public SearchSolr(Args args) throws IOException {
LOG.info("Solr ZooKeeper URL: " + args.zkUrl);
this.client = new CloudSolrClient.Builder(Splitter.on(',')
.splitToList(args.zkUrl), Optional.of(args.zkChroot))
.withConnectionTimeout(TIMEOUT)
.withSocketTimeout(TIMEOUT)
.withInternalClientBuilder(new Http2SolrClient.Builder().connectionTimeout(TIMEOUT))
.build();
}

Expand Down
6 changes: 3 additions & 3 deletions src/main/java/io/anserini/search/query/SdmQueryGenerator.java
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,14 @@
import io.anserini.analysis.AnalyzerUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.Term;
import org.apache.lucene.queries.spans.SpanNearQuery;
import org.apache.lucene.queries.spans.SpanQuery;
import org.apache.lucene.queries.spans.SpanTermQuery;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.BoostQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;

import java.util.List;

Expand Down
Loading

0 comments on commit 5af657d

Please sign in to comment.