diff --git a/pom.xml b/pom.xml
index 368f29e610..ecb15d4b84 100644
--- a/pom.xml
+++ b/pom.xml
@@ -2,7 +2,7 @@
<modelVersion>4.0.0</modelVersion>
<groupId>io.anserini</groupId>
<artifactId>anserini</artifactId>
- <version>0.14.5-SNAPSHOT</version>
+ <version>0.15.0-SNAPSHOT</version>
<name>Anserini</name>
<description>An information retrieval toolkit built on Lucene</description>
<url>http://anserini.io/</url>
@@ -26,8 +26,7 @@
- <lucene.version>9.0.0</lucene.version>
- <solr.version>9.0.0</solr.version>
+ <lucene.version>9.3.0</lucene.version>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
@@ -110,14 +109,6 @@
<mainClass>io.anserini.search.SearchCollection</mainClass>
<id>SearchCollection</id>
</program>
- <program>
- <mainClass>io.anserini.search.SearchSolr</mainClass>
- <id>SearchSolr</id>
- </program>
- <program>
- <mainClass>io.anserini.search.SearchElastic</mainClass>
- <id>SearchElastic</id>
- </program>
<program>
<mainClass>io.anserini.search.SearchMsmarco</mainClass>
<id>SearchMsmarco</id>
@@ -296,11 +287,31 @@
<artifactId>lucene-core</artifactId>
<version>${lucene.version}</version>
</dependency>
+ <dependency>
+ <groupId>org.apache.lucene</groupId>
+ <artifactId>lucene-codecs</artifactId>
+ <version>${lucene.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.lucene</groupId>
+ <artifactId>lucene-backward-codecs</artifactId>
+ <version>${lucene.version}</version>
+ </dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-queries</artifactId>
<version>${lucene.version}</version>
</dependency>
+ <dependency>
+ <groupId>org.apache.lucene</groupId>
+ <artifactId>lucene-queryparser</artifactId>
+ <version>${lucene.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.lucene</groupId>
+ <artifactId>lucene-analysis-common</artifactId>
+ <version>${lucene.version}</version>
+ </dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-analysis-kuromoji</artifactId>
@@ -323,71 +334,6 @@
<version>4.13.2</version>
<scope>test</scope>
</dependency>
- <dependency>
- <groupId>org.apache.solr</groupId>
- <artifactId>solr-solrj</artifactId>
- <version>${solr.version}</version>
- <exclusions>
- <exclusion>
- <groupId>org.apache.lucene</groupId>
- <artifactId>lucene-core</artifactId>
- </exclusion>
- <exclusion>
- <groupId>org.apache.lucene</groupId>
- <artifactId>lucene-analysis-common</artifactId>
- </exclusion>
- <exclusion>
- <groupId>org.apache.lucene</groupId>
- <artifactId>lucene-queries</artifactId>
- </exclusion>
- <exclusion>
- <groupId>org.slf4j</groupId>
- <artifactId>slf4j-api</artifactId>
- </exclusion>
- <exclusion>
- <groupId>org.apache.logging.log4j</groupId>
- <artifactId>log4j-slf4j-impl</artifactId>
- </exclusion>
- </exclusions>
- </dependency>
- <dependency>
- <groupId>org.apache.solr</groupId>
- <artifactId>solr-test-framework</artifactId>
- <version>${solr.version}</version>
- <scope>test</scope>
- <exclusions>
- <exclusion>
- <groupId>org.apache.lucene</groupId>
- <artifactId>lucene-core</artifactId>
- </exclusion>
- <exclusion>
- <groupId>org.apache.lucene</groupId>
- <artifactId>lucene-analysis-common</artifactId>
- </exclusion>
- <exclusion>
- <groupId>org.apache.lucene</groupId>
- <artifactId>lucene-queries</artifactId>
- </exclusion>
- <exclusion>
- <groupId>org.slf4j</groupId>
- <artifactId>slf4j-api</artifactId>
- </exclusion>
- <exclusion>
- <groupId>org.apache.logging.log4j</groupId>
- <artifactId>log4j-slf4j-impl</artifactId>
- </exclusion>
- </exclusions>
- </dependency>
- <dependency>
- <groupId>org.apache.lucene</groupId>
- <artifactId>lucene-codecs</artifactId>
- <version>${lucene.version}</version>
- </dependency>
- <dependency>
- <groupId>org.elasticsearch.client</groupId>
- <artifactId>elasticsearch-rest-high-level-client</artifactId>
- <version>7.0.0</version>
- </dependency>
<dependency>
<groupId>org.tukaani</groupId>
<artifactId>xz</artifactId>
@@ -491,6 +437,11 @@
<artifactId>commons-csv</artifactId>
<version>1.8</version>
</dependency>
+ <dependency>
+ <groupId>org.apache.commons</groupId>
+ <artifactId>commons-text</artifactId>
+ <version>1.9</version>
+ </dependency>
<dependency>
<groupId>org.mockito</groupId>
<artifactId>mockito-all</artifactId>
diff --git a/src/main/java/io/anserini/analysis/TweetLowerCaseEntityPreservingFilterFactory.java b/src/main/java/io/anserini/analysis/TweetLowerCaseEntityPreservingFilterFactory.java
index 14a63c6790..948911672f 100644
--- a/src/main/java/io/anserini/analysis/TweetLowerCaseEntityPreservingFilterFactory.java
+++ b/src/main/java/io/anserini/analysis/TweetLowerCaseEntityPreservingFilterFactory.java
@@ -17,7 +17,7 @@
package io.anserini.analysis;
import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.util.TokenFilterFactory;
+import org.apache.lucene.analysis.TokenFilterFactory;
import java.util.Map;
diff --git a/src/main/java/io/anserini/index/IndexArgs.java b/src/main/java/io/anserini/index/IndexArgs.java
index 6997f95886..70957978da 100644
--- a/src/main/java/io/anserini/index/IndexArgs.java
+++ b/src/main/java/io/anserini/index/IndexArgs.java
@@ -69,8 +69,7 @@ public class IndexArgs {
// optional arguments
- @Option(name = "-index", metaVar = "[path]", forbids = {"-solr", "-es"},
- usage = "Index path.")
+ @Option(name = "-index", metaVar = "[path]", usage = "Index path.")
public String index;
@Option(name = "-fields", handler = StringArrayOptionHandler.class,
@@ -160,82 +159,6 @@ public class IndexArgs {
usage = "File that contains deleted tweet ids (longs), one per line; these tweets will be skipped during indexing.")
public String tweetDeletedIdsFile = "";
- // Solr options
-
- @Option(name = "-solr", forbids = {"-index", "-es"},
- usage = "Indexes into Solr.")
- public boolean solr = false;
-
- @Option(name = "-solr.batch", metaVar = "[n]",
- usage = "Solr indexing batch size.")
- public int solrBatch = 1000;
-
- @Option(name = "-solr.commitWithin", metaVar = "[s]",
- usage = "Solr commitWithin setting (in seconds).")
- public int solrCommitWithin = 60;
-
- @Option(name = "-solr.index", metaVar = "[name]",
- usage = "Solr index name.")
- public String solrIndex = null;
-
- @Option(name = "-solr.zkUrl", metaVar = "[urls]",
- usage = "Solr ZooKeeper URLs (comma separated list).")
- public String zkUrl = null;
-
- @Option(name = "-solr.zkChroot", metaVar = "[path]",
- usage = "Solr ZooKeeper chroot")
- public String zkChroot = "/";
-
- @Option(name = "-solr.poolSize", metaVar = "[n]",
- usage = "Solr client pool size.")
- public int solrPoolSize = 16;
-
- // Elasticsearch options
-
- @Option(name = "-es", forbids = {"-index", "-solr"},
- usage = "Indexes into Elasticsearch.")
- public boolean es = false;
-
- @Option(name = "-es.index", metaVar = "[name]",
- usage = "Elasticsearch index name.")
- public String esIndex = null;
-
- @Option(name = "-es.batch", metaVar = "[n]",
- usage = "Elasticsearch batch index requests size.")
- public int esBatch = 1000;
-
- @Option(name = "-es.bulk", metaVar = "[n]",
- usage = "Elasticsearch max bulk requests size in bytes.")
- public int esBulk = 80000000;
-
- @Option(name = "-es.hostname", metaVar = "[host]",
- usage = "Elasticsearch host.")
- public String esHostname = "localhost";
-
- @Option(name = "-es.port", metaVar = "[port]",
- usage = "Elasticsearch port number.")
- public int esPort = 9200;
-
- @Option(name = "-es.user", metaVar = "[username]",
- usage = "Elasticsearch user name.")
- public String esUser = "elastic";
-
- @Option(name = "-es.password", metaVar = "[password]",
- usage = "Elasticsearch password.")
- public String esPassword = "changeme";
-
- @Option(name = "-es.poolSize", metaVar = "[num]",
- usage = "Elasticsearch client pool size.")
- public int esPoolSize = 10;
-
- @Option(name = "-es.connectTimeout", metaVar = "[ms]",
- usage = "Elasticsearch (low level) REST client connect timeout (in ms).")
- public int esConnectTimeout = TIMEOUT;
-
- @Option(name = "-es.socketTimeout", metaVar = "[ms]",
- usage = "Elasticsearch (low level) REST client socket timeout (in ms).")
- public int esSocketTimeout = TIMEOUT;
-
// Sharding options
@Option(name = "-shard.count", metaVar = "[n]",
diff --git a/src/main/java/io/anserini/index/IndexCollection.java b/src/main/java/io/anserini/index/IndexCollection.java
index e768f587fe..b83403a3ae 100644
--- a/src/main/java/io/anserini/index/IndexCollection.java
+++ b/src/main/java/io/anserini/index/IndexCollection.java
@@ -16,8 +16,6 @@
package io.anserini.index;
-import com.google.common.base.Splitter;
-import com.google.common.collect.Lists;
import io.anserini.analysis.DefaultEnglishAnalyzer;
import io.anserini.analysis.TweetAnalyzer;
import io.anserini.collection.DocumentCollection;
@@ -27,22 +25,10 @@
import io.anserini.index.generator.InvalidDocumentException;
import io.anserini.index.generator.LuceneDocumentGenerator;
import io.anserini.index.generator.SkippedDocumentException;
-import io.anserini.index.generator.WashingtonPostGenerator;
import io.anserini.search.similarity.AccurateBM25Similarity;
import io.anserini.search.similarity.ImpactSimilarity;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.time.DurationFormatUtils;
-import org.apache.commons.pool2.BasePooledObjectFactory;
-import org.apache.commons.pool2.ObjectPool;
-import org.apache.commons.pool2.PooledObject;
-import org.apache.commons.pool2.impl.DefaultPooledObject;
-import org.apache.commons.pool2.impl.GenericObjectPool;
-import org.apache.commons.pool2.impl.GenericObjectPoolConfig;
-import org.apache.http.HttpHost;
-import org.apache.http.auth.AuthScope;
-import org.apache.http.auth.UsernamePasswordCredentials;
-import org.apache.http.client.CredentialsProvider;
-import org.apache.http.impl.client.BasicCredentialsProvider;
import org.apache.logging.log4j.Level;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
@@ -72,29 +58,14 @@
import org.apache.lucene.analysis.th.ThaiAnalyzer;
import org.apache.lucene.analysis.tr.TurkishAnalyzer;
import org.apache.lucene.analysis.uk.UkrainianMorfologikAnalyzer;
-
import org.apache.lucene.document.Document;
import org.apache.lucene.index.ConcurrentMergeScheduler;
-import org.apache.lucene.index.DocValuesType;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
-import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.similarities.BM25Similarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
-import org.apache.solr.client.solrj.SolrClient;
-import org.apache.solr.client.solrj.impl.CloudSolrClient;
-import org.apache.solr.client.solrj.impl.Http2SolrClient;
-import org.apache.solr.common.SolrInputDocument;
-import org.elasticsearch.action.DocWriteRequest;
-import org.elasticsearch.action.bulk.BulkRequest;
-import org.elasticsearch.action.index.IndexRequest;
-import org.elasticsearch.client.RequestOptions;
-import org.elasticsearch.client.RestClient;
-import org.elasticsearch.client.RestHighLevelClient;
-import org.elasticsearch.common.xcontent.XContentBuilder;
-import org.elasticsearch.common.xcontent.XContentFactory;
import org.kohsuke.args4j.CmdLineException;
import org.kohsuke.args4j.CmdLineParser;
import org.kohsuke.args4j.OptionHandlerFilter;
@@ -105,32 +76,21 @@
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
-import java.util.ArrayList;
import java.util.Arrays;
-import java.util.Collections;
import java.util.HashSet;
import java.util.List;
-import java.util.Optional;
import java.util.Set;
import java.util.concurrent.Executors;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;
-import java.util.stream.Collectors;
-import java.util.stream.Stream;
public final class IndexCollection {
private static final Logger LOG = LogManager.getLogger(IndexCollection.class);
- private static final int TIMEOUT = 600 * 1000;
// This is the default analyzer used, unless another stemming algorithm or language is specified.
public static final Analyzer DEFAULT_ANALYZER = DefaultEnglishAnalyzer.newDefaultInstance();
- // When duplicates of these fields are attempted to be indexed in Solr, they are ignored. This allows some fields to be multi-valued, but not others.
- // Stored vs. indexed vs. doc values vs. multi-valued vs. ... are controlled via config, rather than code, in Solr.
- private static final List<String> IGNORED_DUPLICATE_FIELDS =
- Lists.newArrayList(WashingtonPostGenerator.WashingtonPostField.PUBLISHED_DATE.name);
-
public final class Counters {
/**
* Counter for successfully indexed documents.
@@ -262,361 +222,6 @@ public void run() {
}
}
- private final class SolrIndexerThread implements Runnable {
- private final Path input;
- private final DocumentCollection collection;
- private final List<SolrInputDocument> buffer = new ArrayList<>(args.solrBatch);
- private FileSegment fileSegment;
-
- private SolrIndexerThread(DocumentCollection collection, Path input) {
- this.input = input;
- this.collection = collection;
- }
-
- @Override
- @SuppressWarnings("unchecked")
- public void run() {
- try {
- LuceneDocumentGenerator generator = (LuceneDocumentGenerator)
- generatorClass.getDeclaredConstructor(IndexArgs.class).newInstance(args);
-
- // We keep track of two separate counts: the total count of documents in this file segment (cnt),
- // and the number of documents in this current "batch" (batch). We update the global counter every
- // 10k documents: this is so that we get intermediate updates, which is informative if a collection
- // has only one file segment; see https://github.com/castorini/anserini/issues/683
- int cnt = 0;
- int batch = 0;
-
- @SuppressWarnings("unchecked")
- FileSegment<SourceDocument> segment = (FileSegment) collection.createFileSegment(input);
- // in order to call close() and clean up resources in case of exception
- this.fileSegment = segment;
-
- for (SourceDocument sourceDocument : segment) {
- if (!sourceDocument.indexable()) {
- counters.unindexable.incrementAndGet();
- continue;
- }
-
- Document document;
- try {
- document = generator.createDocument(sourceDocument);
- } catch (EmptyDocumentException e1) {
- counters.empty.incrementAndGet();
- continue;
- } catch (SkippedDocumentException e2) {
- counters.skipped.incrementAndGet();
- continue;
- } catch (InvalidDocumentException e3) {
- counters.errors.incrementAndGet();
- continue;
- }
-
- if (whitelistDocids != null && !whitelistDocids.contains(sourceDocument.id())) {
- counters.skipped.incrementAndGet();
- continue;
- }
-
- SolrInputDocument solrDocument = new SolrInputDocument();
-
- // Copy all Lucene Document fields to Solr document
- for (IndexableField field : document.getFields()) {
- // Skip docValues fields - this is done via Solr config.
- if (field.fieldType().docValuesType() != DocValuesType.NONE) {
- continue;
- }
- // If the field is already in the doc, skip it.
- // This fixes an issue with WaPo where published_date is in the Lucene doc as LongPoint and StoredField. Solr needs one copy, more fine-grained control in config.
- if (solrDocument.containsKey(field.name()) && IGNORED_DUPLICATE_FIELDS.contains(field.name())) {
- continue;
- }
- if (field.numericValue() != null) {
- solrDocument.addField(field.name(), field.numericValue());
- } else if (field.stringValue() != null) { // For some reason, id is multi-valued with null as one of the values
- solrDocument.addField(field.name(), field.stringValue());
- }
- }
-
- buffer.add(solrDocument);
- if (buffer.size() == args.solrBatch) {
- flush();
- }
-
- cnt++;
- batch++;
-
- // And the counts from this batch, reset batch counter.
- if (batch % 10000 == 0) {
- counters.indexed.addAndGet(batch);
- batch = 0;
- }
- }
-
- // If we have docs in the buffer, flush them.
- if (!buffer.isEmpty()) {
- flush();
- }
-
- // Add the remaining documents.
- counters.indexed.addAndGet(batch);
-
- int skipped = segment.getSkippedCount();
- if (skipped > 0) {
- // When indexing tweets, this is normal, because there are delete messages that are skipped over.
- counters.skipped.addAndGet(skipped);
- LOG.warn(input.getParent().getFileName().toString() + File.separator +
- input.getFileName().toString() + ": " + skipped + " docs skipped.");
- }
-
- if (segment.getErrorStatus()) {
- counters.errors.incrementAndGet();
- LOG.error(input.getParent().getFileName().toString() + File.separator +
- input.getFileName().toString() + ": error iterating through segment.");
- }
-
- // Log at the debug level because this can be quite noisy if there are lots of file segments.
- LOG.debug(input.getParent().getFileName().toString() + File.separator +
- input.getFileName().toString() + ": " + cnt + " docs added.");
- } catch (Exception e) {
- LOG.error(Thread.currentThread().getName() + ": Unexpected Exception:", e);
- } finally {
- if (fileSegment != null) {
- fileSegment.close();
- }
- }
- }
-
- private void flush() {
- if (!buffer.isEmpty()) {
- SolrClient solrClient = null;
- try {
- solrClient = solrPool.borrowObject();
- solrClient.add(args.solrIndex, buffer, args.solrCommitWithin * 1000);
- buffer.clear();
- } catch (Exception e) {
- LOG.error("Error flushing documents to Solr", e);
- } finally {
- if (solrClient != null) {
- try {
- solrPool.returnObject(solrClient);
- } catch (Exception e) {
- LOG.error("Error returning SolrClient to pool", e);
- }
- }
- }
- }
- }
- }
-
- private class SolrClientFactory extends BasePooledObjectFactory<SolrClient> {
- @Override
- public SolrClient create() {
- return new CloudSolrClient.Builder(Splitter.on(',').splitToList(args.zkUrl), Optional.of(args.zkChroot))
- .withInternalClientBuilder(new Http2SolrClient.Builder().connectionTimeout(TIMEOUT))
- .build();
- }
-
- @Override
- public PooledObject<SolrClient> wrap(SolrClient solrClient) {
- return new DefaultPooledObject<>(solrClient);
- }
-
- @Override
- public void destroyObject(PooledObject<SolrClient> pooled) throws Exception {
- pooled.getObject().close();
- }
- }
-
- private final class ESIndexerThread implements Runnable {
- private final Path input;
- private final DocumentCollection collection;
- private BulkRequest bulkRequest;
- private FileSegment fileSegment;
-
- private ESIndexerThread(DocumentCollection collection, Path input) {
- this.input = input;
- this.collection = collection;
- this.bulkRequest = new BulkRequest();
- }
-
- @Override
- @SuppressWarnings("unchecked")
- public void run() {
- try {
- LuceneDocumentGenerator generator = (LuceneDocumentGenerator)
- generatorClass.getDeclaredConstructor(IndexArgs.class).newInstance(args);
-
- // We keep track of two separate counts: the total count of documents in this file segment (cnt),
- // and the number of documents in this current "batch" (batch). We update the global counter every
- // 10k documents: this is so that we get intermediate updates, which is informative if a collection
- // has only one file segment; see https://github.com/castorini/anserini/issues/683
- int cnt = 0;
- int batch = 0;
-
- FileSegment<SourceDocument> segment = collection.createFileSegment(input);
- // in order to call close() and clean up resources in case of exception
- this.fileSegment = segment;
-
- for (SourceDocument sourceDocument : segment) {
- if (!sourceDocument.indexable()) {
- counters.unindexable.incrementAndGet();
- continue;
- }
-
- Document document;
- try {
- document = generator.createDocument(sourceDocument);
- } catch (EmptyDocumentException e1) {
- counters.empty.incrementAndGet();
- continue;
- } catch (SkippedDocumentException e2) {
- counters.skipped.incrementAndGet();
- continue;
- } catch (InvalidDocumentException e3) {
- counters.errors.incrementAndGet();
- continue;
- }
-
- if (whitelistDocids != null && !whitelistDocids.contains(sourceDocument.id())) {
- counters.skipped.incrementAndGet();
- continue;
- }
-
- // Get distinct field names
- List<String> fields = document.getFields().stream().map(field -> field.name()).distinct().collect(Collectors.toList());
-
- XContentBuilder builder = XContentFactory.jsonBuilder().startObject();
-
- for (String field : fields) {
-
- // Skip docValues fields
- if (document.getField(field).fieldType().docValuesType() != DocValuesType.NONE) continue;
-
- // Get field objects for current field name (could be multiple, such as WaPo's fullCaption)
- IndexableField[] indexableFields = document.getFields(field);
-
- if (field.equalsIgnoreCase("id") || indexableFields.length == 1) {
- // Single value fields or "id" field
- Object value = document.getField(field).stringValue() != null ? document.getField(field).stringValue() : document.getField(field).numericValue();
- builder.field(field, value);
- } else {
- // Multi-valued fields
- Object[] values = Stream.of(indexableFields).map(f -> f.stringValue()).toArray();
- builder.array(field, values);
- }
- }
-
- builder.endObject();
-
- String indexName = (args.esIndex != null) ? args.esIndex : input.getFileName().toString();
- bulkRequest.add(new IndexRequest(indexName).id(sourceDocument.id()).source(builder));
-
- // sendBulkRequest when the batch size is reached OR the bulk size is reached
- if (bulkRequest.numberOfActions() == args.esBatch ||
- bulkRequest.estimatedSizeInBytes() >= args.esBulk) {
- sendBulkRequest();
- }
-
- cnt++;
- batch++;
-
- // And the counts from this batch, reset batch counter.
- if (batch % 10000 == 0) {
- counters.indexed.addAndGet(batch);
- batch = 0;
- }
- }
-
- if (bulkRequest.numberOfActions() != 0) {
- sendBulkRequest();
- }
-
- // Add the remaining documents.
- counters.indexed.addAndGet(batch);
-
- int skipped = segment.getSkippedCount();
- if (skipped > 0) {
- // When indexing tweets, this is normal, because there are delete messages that are skipped over.
- counters.skipped.addAndGet(skipped);
- LOG.warn(input.getParent().getFileName().toString() + File.separator +
- input.getFileName().toString() + ": " + skipped + " docs skipped.");
- }
-
- if (segment.getErrorStatus()) {
- counters.errors.incrementAndGet();
- LOG.error(input.getParent().getFileName().toString() + File.separator +
- input.getFileName().toString() + ": error iterating through segment.");
- }
-
- // Log at the debug level because this can be quite noisy if there are lots of file segments.
- LOG.debug(input.getParent().getFileName().toString() + File.separator +
- input.getFileName().toString() + ": " + cnt + " docs added.");
- } catch (Exception e) {
- LOG.error(Thread.currentThread().getName() + ": Unexpected Exception:", e);
- } finally {
- if (fileSegment != null){
- fileSegment.close();
- }
- }
- }
-
- private void sendBulkRequest() {
- if (bulkRequest.numberOfActions() == 0) {
- return;
- }
-
- RestHighLevelClient esClient = null;
- try {
- esClient = esPool.borrowObject();
- esClient.bulk(bulkRequest, RequestOptions.DEFAULT);
- bulkRequest = new BulkRequest();
- } catch (Exception e) {
- LOG.error("Error sending bulk requests to Elasticsearch", e);
-
- // Log the 10 docs that have the largest sizes in this request
- List<DocWriteRequest<?>> docs = bulkRequest.requests();
- Collections.sort(docs, (d1, d2) -> ((IndexRequest) d2).source().length() - ((IndexRequest) d1).source().length());
-
- LOG.info("Error sending bulkRequest. The 10 largest docs in this request are the following cord_uid: ");
- for (int i = 0; i < 10; i++) {
- IndexRequest doc = (IndexRequest) docs.get(i);
- LOG.info(doc.id());
- }
- } finally {
- if (esClient != null) {
- try {
- esPool.returnObject(esClient);
- } catch (Exception e) {
- LOG.error("Error returning ES client to pool", e);
- }
- }
- }
- }
- }
-
- private class ESClientFactory extends BasePooledObjectFactory<RestHighLevelClient> {
- @Override
- public RestHighLevelClient create() {
- final CredentialsProvider credentialsProvider = new BasicCredentialsProvider();
- credentialsProvider.setCredentials(AuthScope.ANY, new UsernamePasswordCredentials(args.esUser, args.esPassword));
- return new RestHighLevelClient(
- RestClient.builder(new HttpHost(args.esHostname, args.esPort, "http"))
- .setHttpClientConfigCallback(builder -> builder.setDefaultCredentialsProvider(credentialsProvider))
- .setRequestConfigCallback(builder -> builder.setConnectTimeout(args.esConnectTimeout).setSocketTimeout(args.esSocketTimeout))
- );
- }
-
- @Override
- public PooledObject<RestHighLevelClient> wrap(RestHighLevelClient esClient) {
- return new DefaultPooledObject<>(esClient);
- }
-
- @Override
- public void destroyObject(PooledObject<RestHighLevelClient> pooled) throws Exception {
- pooled.getObject().close();
- }
- }
-
private final IndexArgs args;
private final Path collectionPath;
private final Set<String> whitelistDocids;
@@ -625,10 +230,6 @@ public void destroyObject(PooledObject<RestHighLevelClient> pooled) throws Excep
private final DocumentCollection collection;
private final Counters counters;
private Path indexPath;
- private ObjectPool<SolrClient> solrPool;
- private ObjectPool<RestHighLevelClient> esPool;
-
-
@SuppressWarnings("unchecked")
public IndexCollection(IndexArgs args) throws Exception {
@@ -665,32 +266,7 @@ public IndexCollection(IndexArgs args) throws Exception {
LOG.info("Optimize (merge segments)? " + args.optimize);
LOG.info("Whitelist: " + args.whitelist);
LOG.info("Pretokenized?: " + args.pretokenized);
-
- if (args.solr) {
- LOG.info("Indexing into Solr...");
- LOG.info("Solr batch size: " + args.solrBatch);
- LOG.info("Solr commitWithin: " + args.solrCommitWithin);
- LOG.info("Solr index: " + args.solrIndex);
- LOG.info("Solr ZooKeeper URL: " + args.zkUrl);
- LOG.info("SolrClient pool size: " + args.solrPoolSize);
- } else if (args.es) {
- LOG.info("Indexing into Elasticsearch...");
- LOG.info("Elasticsearch batch size: " + args.esBatch);
- LOG.info("Elasticsearch index: " + args.esIndex);
- LOG.info("Elasticsearch hostname: " + args.esHostname);
- LOG.info("Elasticsearch host port: " + args.esPort);
- LOG.info("Elasticsearch client connect timeout (in ms): " + args.esConnectTimeout);
- LOG.info("Elasticsearch client socket timeout (in ms): " + args.esSocketTimeout);
- LOG.info("Elasticsearch pool size: " + args.esPoolSize);
- LOG.info("Elasticsearch user: " + args.esUser);
- } else {
- LOG.info("Directly building Lucene indexes...");
- LOG.info("Index path: " + args.index);
- }
-
- if (args.index == null && !args.solr && !args.es) {
- throw new IllegalArgumentException("Must specify one of -index, -solr, or -es");
- }
+ LOG.info("Index path: " + args.index);
if (args.index != null) {
this.indexPath = Paths.get(args.index);
@@ -723,18 +299,6 @@ public IndexCollection(IndexArgs args) throws Exception {
this.whitelistDocids = null;
}
- if (args.solr) {
- GenericObjectPoolConfig<SolrClient> config = new GenericObjectPoolConfig<>();
- config.setMaxTotal(args.solrPoolSize);
- config.setMinIdle(args.solrPoolSize); // To guard against premature discarding of solrClients
- this.solrPool = new GenericObjectPool<>(new SolrClientFactory(), config);
- } else if (args.es) {
- GenericObjectPoolConfig<RestHighLevelClient> config = new GenericObjectPoolConfig<>();
- config.setMaxTotal(args.esPoolSize);
- config.setMinIdle(args.esPoolSize);
- this.esPool = new GenericObjectPool<>(new ESClientFactory(), config);
- }
-
this.counters = new Counters();
}
@@ -865,13 +429,7 @@ public Counters run() throws IOException {
LOG.info("Starting to index...");
for (int i = 0; i < segmentCnt; i++) {
- if (args.solr) {
- executor.execute(new SolrIndexerThread(collection, (Path) segmentPaths.get(i)));
- } else if (args.es) {
- executor.execute(new ESIndexerThread(collection, (Path) segmentPaths.get(i)));
- } else {
- executor.execute(new LocalIndexerThread(writer, collection, (Path) segmentPaths.get(i)));
- }
+ executor.execute(new LocalIndexerThread(writer, collection, (Path) segmentPaths.get(i)));
}
executor.shutdown();
@@ -898,31 +456,9 @@ public Counters run() throws IOException {
" is not equal to completedTaskCount = " + executor.getCompletedTaskCount());
}
- long numIndexed;
-
- if (args.solr || args.es) {
- numIndexed = counters.indexed.get();
- } else {
- numIndexed = writer.getDocStats().maxDoc;
- }
+ long numIndexed = writer.getDocStats().maxDoc;
// Do a final commit
- if (args.solr) {
- try {
- SolrClient client = solrPool.borrowObject();
- client.commit(args.solrIndex);
- // Needed for orderly shutdown so the SolrClient executor does not delay main thread exit
- solrPool.returnObject(client);
- solrPool.close();
- } catch (Exception e) {
- LOG.error("Exception during final Solr commit: ", e);
- }
- }
-
- if (args.es) {
- esPool.close();
- }
-
try {
if (writer != null) {
writer.commit();
diff --git a/src/main/java/io/anserini/rerank/ScoredDocuments.java b/src/main/java/io/anserini/rerank/ScoredDocuments.java
index c215927a31..e4eb692873 100644
--- a/src/main/java/io/anserini/rerank/ScoredDocuments.java
+++ b/src/main/java/io/anserini/rerank/ScoredDocuments.java
@@ -17,30 +17,21 @@
package io.anserini.rerank;
import io.anserini.index.IndexArgs;
-import org.apache.lucene.document.BinaryDocValuesField;
+import org.apache.commons.lang3.ArrayUtils;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import org.apache.lucene.document.StringField;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
-import org.apache.lucene.util.BytesRef;
-import org.apache.solr.common.SolrDocument;
-import org.apache.solr.common.SolrDocumentList;
-import org.apache.commons.lang3.ArrayUtils;
-import org.apache.logging.log4j.LogManager;
-import org.apache.logging.log4j.Logger;
-import org.elasticsearch.search.SearchHit;
-import org.elasticsearch.search.SearchHits;
-
-import java.util.List;
+import java.io.IOException;
import java.util.ArrayList;
+import java.util.List;
import java.util.Map;
-import java.io.IOException;
/**
* ScoredDocuments object that converts TopDocs from the searcher into an Anserini format
@@ -74,71 +65,6 @@ public static ScoredDocuments fromTopDocs(TopDocs rs, IndexSearcher searcher) {
return scoredDocs;
}
- public static ScoredDocuments fromSolrDocs(SolrDocumentList rs) {
-
- ScoredDocuments scoredDocs = new ScoredDocuments();
-
- int length = rs.size();
- scoredDocs.documents = new Document[length];
- scoredDocs.ids = new int[length];
- scoredDocs.scores = new float[length];
-
- for (int i = 0; i < length; i++) {
-
- SolrDocument d = rs.get(i);
-
- // Create placeholder copies of Lucene Documents
- // Intention is for compatibility with ScoreTiesAdjusterReranker without disturbing other aspects of reranker code
-
- Document document = new Document();
- String id = d.getFieldValue("id").toString();
- float score = (float) d.getFieldValue("score");
-
- // Store the collection docid.
- document.add(new StringField(IndexArgs.ID, id, Field.Store.YES));
- // This is needed to break score ties by docid.
- document.add(new BinaryDocValuesField(IndexArgs.ID, new BytesRef(id)));
- scoredDocs.documents[i] = document;
- scoredDocs.scores[i] = score;
- scoredDocs.ids[i] = i; // no internal Lucene ID available, use index as placeholder
- }
-
- return scoredDocs;
- }
-
- public static ScoredDocuments fromESDocs(SearchHits rs) {
-
- ScoredDocuments scoredDocs = new ScoredDocuments();
- SearchHit[] searchHits = rs.getHits();
-
- int length = searchHits.length;
- scoredDocs.documents = new Document[length];
- scoredDocs.ids = new int[length];
- scoredDocs.scores = new float[length];
-
- for (int i = 0; i < length; i++) {
-
- SearchHit hit = searchHits[i];
-
- // Create placeholder copies of Lucene Documents
- // Intention is for compatibility with ScoreTiesAdjusterReranker without disturbing other aspects of reranker code
-
- Document document = new Document();
- String id = hit.getId();
- float score = hit.getScore();
-
- // Store the collection docid.
- document.add(new StringField(IndexArgs.ID, id, Field.Store.YES));
- // This is needed to break score ties by docid.
- document.add(new BinaryDocValuesField(IndexArgs.ID, new BytesRef(id)));
- scoredDocs.documents[i] = document;
- scoredDocs.scores[i] = score;
- scoredDocs.ids[i] = i; // no internal Lucene ID available, use index as placeholder
- }
-
- return scoredDocs;
- }
-
public static ScoredDocuments fromQrels(Map<String, Integer> qrels, IndexReader reader) throws IOException {
ScoredDocuments scoredDocs = new ScoredDocuments();
diff --git a/src/main/java/io/anserini/search/SearchCollection.java b/src/main/java/io/anserini/search/SearchCollection.java
index 286efe8e3c..1a7d1776ff 100644
--- a/src/main/java/io/anserini/search/SearchCollection.java
+++ b/src/main/java/io/anserini/search/SearchCollection.java
@@ -69,7 +69,6 @@
import org.apache.lucene.analysis.th.ThaiAnalyzer;
import org.apache.lucene.analysis.tr.TurkishAnalyzer;
import org.apache.lucene.analysis.uk.UkrainianMorfologikAnalyzer;
-
import org.apache.lucene.document.LongPoint;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
@@ -114,7 +113,6 @@
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.nio.charset.StandardCharsets;
-import java.nio.file.AtomicMoveNotSupportedException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
@@ -130,13 +128,11 @@
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.concurrent.CompletionException;
-import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentSkipListMap;
import java.util.concurrent.Executors;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
-import java.util.zip.GZIPInputStream;
/**
* Main entry point for search.
diff --git a/src/main/java/io/anserini/search/SearchElastic.java b/src/main/java/io/anserini/search/SearchElastic.java
deleted file mode 100644
index fdc01e387f..0000000000
--- a/src/main/java/io/anserini/search/SearchElastic.java
+++ /dev/null
@@ -1,334 +0,0 @@
-/*
- * Anserini: A Lucene toolkit for reproducible information retrieval research
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package io.anserini.search;
-
-import io.anserini.index.IndexArgs;
-import io.anserini.index.generator.TweetGenerator;
-import io.anserini.rerank.ScoredDocuments;
-import io.anserini.rerank.lib.ScoreTiesAdjusterReranker;
-import io.anserini.search.topicreader.TopicReader;
-import org.apache.commons.lang3.time.DurationFormatUtils;
-import org.apache.http.HttpHost;
-import org.apache.http.auth.AuthScope;
-import org.apache.http.auth.UsernamePasswordCredentials;
-import org.apache.http.client.CredentialsProvider;
-import org.apache.http.impl.client.BasicCredentialsProvider;
-import org.apache.logging.log4j.LogManager;
-import org.apache.logging.log4j.Logger;
-import org.elasticsearch.action.search.SearchRequest;
-import org.elasticsearch.action.search.SearchResponse;
-import org.elasticsearch.client.HttpAsyncResponseConsumerFactory;
-import org.elasticsearch.client.RequestOptions;
-import org.elasticsearch.client.RestClient;
-import org.elasticsearch.client.RestHighLevelClient;
-import org.elasticsearch.index.query.BoolQueryBuilder;
-import org.elasticsearch.index.query.QueryBuilders;
-import org.elasticsearch.index.query.QueryStringQueryBuilder;
-import org.elasticsearch.index.query.RangeQueryBuilder;
-import org.elasticsearch.search.SearchHits;
-import org.elasticsearch.search.builder.SearchSourceBuilder;
-import org.elasticsearch.search.sort.FieldSortBuilder;
-import org.elasticsearch.search.sort.ScoreSortBuilder;
-import org.elasticsearch.search.sort.SortOrder;
-import org.kohsuke.args4j.CmdLineException;
-import org.kohsuke.args4j.CmdLineParser;
-import org.kohsuke.args4j.Option;
-import org.kohsuke.args4j.OptionHandlerFilter;
-import org.kohsuke.args4j.ParserProperties;
-import org.kohsuke.args4j.spi.StringArrayOptionHandler;
-
-import java.io.Closeable;
-import java.io.IOException;
-import java.io.PrintWriter;
-import java.nio.charset.StandardCharsets;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.nio.file.Paths;
-import java.util.Locale;
-import java.util.Map;
-import java.util.SortedMap;
-import java.util.TreeMap;
-import java.util.concurrent.TimeUnit;
-
-/*
-* Entry point of the Retrieval.
- */
-public final class SearchElastic implements Closeable {
-
- private static final Logger LOG = LogManager.getLogger(SearchCollection.class);
- private static final int TIMEOUT = 600 * 1000;
- private final Args args;
- private RestHighLevelClient client;
-
- private static final RequestOptions COMMON_OPTIONS;
- static {
- RequestOptions.Builder builder = RequestOptions.DEFAULT.toBuilder();
- builder.setHttpAsyncResponseConsumerFactory(
- new HttpAsyncResponseConsumerFactory.HeapBufferedResponseConsumerFactory(1024 * 1024 * 1024));
- COMMON_OPTIONS = builder.build();
- }
-
- public static final class Args {
-
- // required arguments
-
- @Option(name = "-topics", metaVar = "[file]", handler = StringArrayOptionHandler.class, required = true, usage = "topics file")
- public String[] topics;
-
- @Option(name = "-output", metaVar = "[file]", required = true, usage = "output file")
- public String output;
-
- @Option(name = "-topicreader", required = true, usage = "define how to read the topic(query) file: one of [Trec|Webxml]")
- public String topicReader;
-
- @Option(name = "-es.index", usage = "the name of the index in Elasticsearch")
- public String esIndex = null;
-
- @Option(name = "-es.hostname", usage = "the name of Elasticsearch HTTP host")
- public String esHostname = "localhost";
-
- @Option(name = "-es.port", usage = "the port for Elasticsearch HTTP host")
- public int esPort = 9200;
-
- /**
- * The user and password are defaulted to those pre-configured for docker-elk
- */
- @Option(name = "-es.user", usage = "the user of the ELK stack")
- public String esUser = "elastic";
-
- @Option(name = "-es.password", usage = "the password for the ELK stack")
- public String esPassword = "changeme";
-
- // optional arguments
- @Option(name = "-topicfield", usage = "Which field of the query should be used, default \"title\"." +
- " For TREC ad hoc topics, description or narrative can be used.")
- public String topicfield = "title";
-
- @Option(name = "-searchtweets", usage = "Whether the search is against a tweet " +
- "index created by IndexCollection -collection TweetCollection")
- public Boolean searchtweets = false;
-
- @Option(name = "-hits", metaVar = "[number]", required = false, usage = "max number of hits to return")
- public int hits = 1000;
-
- @Option(name = "-runtag", metaVar = "[tag]", required = false, usage = "runtag")
- public String runtag = null;
-
- }
-
- private final class ESSearcherThread<K> extends Thread {
-
- final private SortedMap<K, Map<String, String>> topics;
- final private String outputPath;
- final private String runTag;
-
- private ESSearcherThread(SortedMap<K, Map<String, String>> topics, String outputPath, String runTag){
-
- this.topics = topics;
- this.runTag = runTag;
- this.outputPath = outputPath;
- setName(outputPath);
- }
-
- @Override
- public void run() {
- try {
- LOG.info("[Start] Retrieval with Elasticsearch collection: " + args.esIndex);
- final long start = System.nanoTime();
- PrintWriter out = new PrintWriter(Files.newBufferedWriter(Paths.get(outputPath), StandardCharsets.US_ASCII));
-
- for (Map.Entry<K, Map<String, String>> entry : topics.entrySet()) {
- K qid = entry.getKey();
- String queryString = entry.getValue().get(args.topicfield);
- ScoredDocuments docs;
- if (args.searchtweets) {
- docs = searchTweets(queryString, Long.parseLong(entry.getValue().get("time")));
- } else {
- docs = search(queryString);
- }
-
- /**
- * the first column is the topic number.
- * the second column is currently unused and should always be "Q0".
- * the third column is the official document identifier of the retrieved document.
- * the fourth column is the rank the document is retrieved.
- * the fifth column shows the score (integer or floating point) that generated the ranking.
- * the sixth column is called the "run tag" and should be a unique identifier for your
- */
- for (int i = 0; i < docs.documents.length; i++) {
- out.println(String.format(Locale.US, "%s Q0 %s %d %f %s", qid,
- docs.documents[i].getField(IndexArgs.ID).stringValue(), (i + 1), docs.scores[i], runTag));
- }
- }
- out.flush();
- out.close();
-
- final long durationMillis = TimeUnit.MILLISECONDS.convert(System.nanoTime() - start, TimeUnit.NANOSECONDS);
- LOG.info("[Finished] Run " + topics.size() + " topics searched in "
- + DurationFormatUtils.formatDuration(durationMillis, "HH:mm:ss"));
- } catch (Exception e) {
- LOG.error(Thread.currentThread().getName() + ": Unexpected Exception:", e);
- }
- }
- }
-
- public SearchElastic(Args args) {
- this.args = args;
- LOG.info("Elasticsearch index: " + args.esIndex);
- LOG.info("Elasticsearch hostname: " + args.esHostname);
- LOG.info("Elasticsearch host port: " + args.esPort);
-
- final CredentialsProvider credentialsProvider = new BasicCredentialsProvider();
- credentialsProvider.setCredentials(AuthScope.ANY, new UsernamePasswordCredentials(args.esUser, args.esPassword));
-
- this.client = new RestHighLevelClient(
- RestClient.builder(new HttpHost(args.esHostname, args.esPort, "http"))
- .setHttpClientConfigCallback(builder -> builder.setDefaultCredentialsProvider(credentialsProvider))
- .setRequestConfigCallback(builder -> builder.setConnectTimeout(TIMEOUT).setSocketTimeout(TIMEOUT)));
- }
-
- @SuppressWarnings("unchecked")
- public <K> void runTopics() throws IOException {
- TopicReader<K> tr;
- SortedMap<K, Map<String, String>> topics = new TreeMap<>();
- for (String singleTopicsFile : args.topics) {
- Path topicsFilePath = Paths.get(singleTopicsFile);
- if (!Files.exists(topicsFilePath) || !Files.isRegularFile(topicsFilePath) || !Files.isReadable(topicsFilePath)) {
- throw new IllegalArgumentException("Topics file : " + topicsFilePath + " does not exist or is not a (readable) file.");
- }
- try {
- tr = (TopicReader<K>) Class.forName("io.anserini.search.topicreader." + args.topicReader + "TopicReader")
- .getConstructor(Path.class).newInstance(topicsFilePath);
- topics.putAll(tr.read());
- } catch (Exception e) {
- throw new IllegalArgumentException("Unable to load topic reader: " + args.topicReader);
- }
- }
-
- final String runTag = args.runtag == null ? "Elastirini" : args.runtag;
- ESSearcherThread<K> esThread = new ESSearcherThread<K>(topics, args.output, runTag);
- esThread.run();
- }
-
- public ScoredDocuments search(String queryString){
-
- SearchHits results = null;
-
- String specials = "+-=&|> ScoredDocuments searchTweets(String queryString, long t){
-
- SearchHits results = null;
-
- String specials = "+-=&|> tag contains the timestamp of the query in terms of the
- // chronologically nearest tweet id within the corpus
- RangeQueryBuilder queryTweetTime = QueryBuilders
- .rangeQuery(TweetGenerator.TweetField.ID_LONG.name)
- .from(0L)
- .to(t);
-
- QueryStringQueryBuilder queryTerms = QueryBuilders
- .queryStringQuery(queryString)
- .defaultField("contents")
- .analyzer("english");
-
- BoolQueryBuilder query = QueryBuilders.boolQuery()
- .filter(queryTweetTime)
- .should(queryTerms);
-
- SearchRequest searchRequest = new SearchRequest(args.esIndex);
- SearchSourceBuilder sourceBuilder = new SearchSourceBuilder();
- sourceBuilder.query(query);
- sourceBuilder.size(args.hits);
- sourceBuilder.sort(new ScoreSortBuilder().order(SortOrder.DESC));
- sourceBuilder.sort(new FieldSortBuilder(TweetGenerator.TweetField.ID_LONG.name).order(SortOrder.DESC));
- searchRequest.source(sourceBuilder);
-
- try {
- SearchResponse searchResponse = client.search(searchRequest, COMMON_OPTIONS);
- results = searchResponse.getHits();
- } catch (Exception e) {
- LOG.error("Exception during ES query: ", e);
- }
-
- ScoreTiesAdjusterReranker reranker = new ScoreTiesAdjusterReranker();
- return reranker.rerank(ScoredDocuments.fromESDocs(results), null);
- }
-
- @Override
- public void close() throws IOException {
- client.close();
- }
-
- public static void main(String[] args) throws Exception {
- Args searchElasticArgs = new Args();
- CmdLineParser parser = new CmdLineParser(searchElasticArgs, ParserProperties.defaults().withUsageWidth(90));
-
- try {
- parser.parseArgument(args);
- } catch (CmdLineException e) {
- System.err.println(e.getMessage());
- parser.printUsage(System.err);
- System.err.println("Example: SearchElastic" + parser.printExample(OptionHandlerFilter.REQUIRED));
- return;
- }
-
- final long start = System.nanoTime();
- SearchElastic searcher = new SearchElastic(searchElasticArgs);
- searcher.runTopics();
- searcher.close();
- final long durationMillis = TimeUnit.MILLISECONDS.convert(System.nanoTime() - start, TimeUnit.NANOSECONDS);
- LOG.info("Total run time: " + DurationFormatUtils.formatDuration(durationMillis, "HH:mm:ss"));
- }
-}
\ No newline at end of file
diff --git a/src/main/java/io/anserini/search/SearchSolr.java b/src/main/java/io/anserini/search/SearchSolr.java
deleted file mode 100644
index 9b01661aa4..0000000000
--- a/src/main/java/io/anserini/search/SearchSolr.java
+++ /dev/null
@@ -1,275 +0,0 @@
-/*
- * Anserini: A Lucene toolkit for reproducible information retrieval research
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package io.anserini.search;
-
-import com.google.common.base.Splitter;
-import io.anserini.index.IndexArgs;
-import io.anserini.index.generator.TweetGenerator;
-import io.anserini.rerank.ScoredDocuments;
-import io.anserini.rerank.lib.ScoreTiesAdjusterReranker;
-import io.anserini.search.topicreader.TopicReader;
-import org.apache.commons.lang3.time.DurationFormatUtils;
-import org.apache.logging.log4j.LogManager;
-import org.apache.logging.log4j.Logger;
-import org.apache.lucene.document.LongPoint;
-import org.apache.lucene.search.Query;
-import org.apache.solr.client.solrj.SolrClient;
-import org.apache.solr.client.solrj.SolrQuery;
-import org.apache.solr.client.solrj.SolrQuery.SortClause;
-import org.apache.solr.client.solrj.impl.CloudSolrClient;
-import org.apache.solr.client.solrj.impl.Http2SolrClient;
-import org.apache.solr.client.solrj.response.QueryResponse;
-import org.apache.solr.common.SolrDocumentList;
-import org.kohsuke.args4j.CmdLineException;
-import org.kohsuke.args4j.CmdLineParser;
-import org.kohsuke.args4j.Option;
-import org.kohsuke.args4j.OptionHandlerFilter;
-import org.kohsuke.args4j.ParserProperties;
-import org.kohsuke.args4j.spi.StringArrayOptionHandler;
-
-import java.io.Closeable;
-import java.io.IOException;
-import java.io.PrintWriter;
-import java.nio.charset.StandardCharsets;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.nio.file.Paths;
-import java.util.Locale;
-import java.util.Map;
-import java.util.Optional;
-import java.util.SortedMap;
-import java.util.TreeMap;
-import java.util.concurrent.TimeUnit;
-
-/*
-* Entry point of the Retrieval.
- */
-public final class SearchSolr implements Closeable {
-
- private static final Logger LOG = LogManager.getLogger(SearchCollection.class);
- private static final int TIMEOUT = 600 * 1000;
- private final Args args;
- private SolrClient client;
-
- public static final class Args {
-
- // required arguments
-
- @Option(name = "-topics", metaVar = "[file]", handler = StringArrayOptionHandler.class, required = true, usage = "topics file")
- public String[] topics;
-
- @Option(name = "-output", metaVar = "[file]", required = true, usage = "output file")
- public String output;
-
- @Option(name = "-topicreader", required = true, usage = "define how to read the topic(query) file: one of [Trec|Webxml]")
- public String topicReader;
-
- @Option(name = "-solr.index", usage = "the name of the index in Solr")
- public String solrIndex = null;
-
- @Option(name = "-solr.zkUrl", usage = "the URL of Solr's ZooKeeper (comma separated list of using ensemble)")
- public String zkUrl = null;
-
- @Option(name = "-solr.zkChroot", usage = "the ZooKeeper chroot")
- public String zkChroot = "/";
-
- // optional arguments
- @Option(name = "-topicfield", usage = "Which field of the query should be used, default \"title\"." +
- " For TREC ad hoc topics, description or narrative can be used.")
- public String topicfield = "title";
-
- @Option(name = "-searchtweets", usage = "Whether the search is against a tweet " +
- "index created by IndexCollection -collection TweetCollection")
- public Boolean searchtweets = false;
-
- @Option(name = "-hits", metaVar = "[number]", required = false, usage = "max number of hits to return")
- public int hits = 1000;
-
- @Option(name = "-runtag", metaVar = "[tag]", required = false, usage = "runtag")
- public String runtag = null;
-
- }
-
- private final class SolrSearcherThread<K> extends Thread {
-
- final private SortedMap<K, Map<String, String>> topics;
- final private String outputPath;
- final private String runTag;
-
- private SolrSearcherThread(SortedMap<K, Map<String, String>> topics, String outputPath, String runTag){
-
- this.topics = topics;
- this.runTag = runTag;
- this.outputPath = outputPath;
- setName(outputPath);
- }
-
- @Override
- public void run() {
- try {
- LOG.info("[Start] Retrieval with Solr collection: " + args.solrIndex);
- final long start = System.nanoTime();
- PrintWriter out = new PrintWriter(Files.newBufferedWriter(Paths.get(outputPath), StandardCharsets.US_ASCII));
-
- for (Map.Entry<K, Map<String, String>> entry : topics.entrySet()) {
- K qid = entry.getKey();
- String queryString = entry.getValue().get(args.topicfield);
- ScoredDocuments docs;
- if (args.searchtweets) {
- docs = searchTweets(queryString, Long.parseLong(entry.getValue().get("time")));
- } else {
- docs = search(queryString);
- }
-
- /**
- * the first column is the topic number.
- * the second column is currently unused and should always be "Q0".
- * the third column is the official document identifier of the retrieved document.
- * the fourth column is the rank the document is retrieved.
- * the fifth column shows the score (integer or floating point) that generated the ranking.
- * the sixth column is called the "run tag" and should be a unique identifier for your
- */
- for (int i = 0; i < docs.documents.length; i++) {
- out.println(String.format(Locale.US, "%s Q0 %s %d %f %s", qid,
- docs.documents[i].getField(IndexArgs.ID).stringValue(), (i + 1), docs.scores[i], runTag));
- }
- }
- out.flush();
- out.close();
-
- final long durationMillis = TimeUnit.MILLISECONDS.convert(System.nanoTime() - start, TimeUnit.NANOSECONDS);
- LOG.info("[Finished] Run " + topics.size() + " topics searched in "
- + DurationFormatUtils.formatDuration(durationMillis, "HH:mm:ss"));
- } catch (Exception e) {
- LOG.error(Thread.currentThread().getName() + ": Unexpected Exception:", e);
- }
- }
- }
-
- public SearchSolr(Args args) throws IOException {
- this.args = args;
- LOG.info("Solr index: " + args.solrIndex);
- LOG.info("Solr ZooKeeper URL: " + args.zkUrl);
- this.client = new CloudSolrClient.Builder(Splitter.on(',')
- .splitToList(args.zkUrl), Optional.of(args.zkChroot))
- .withInternalClientBuilder(new Http2SolrClient.Builder().connectionTimeout(TIMEOUT))
- .build();
- }
-
- @SuppressWarnings("unchecked")
- public <K> void runTopics() throws IOException {
- TopicReader<K> tr;
- SortedMap<K, Map<String, String>> topics = new TreeMap<>();
- for (String singleTopicsFile : args.topics) {
- Path topicsFilePath = Paths.get(singleTopicsFile);
- if (!Files.exists(topicsFilePath) || !Files.isRegularFile(topicsFilePath) || !Files.isReadable(topicsFilePath)) {
- throw new IllegalArgumentException("Topics file : " + topicsFilePath + " does not exist or is not a (readable) file.");
- }
- try {
- tr = (TopicReader<K>) Class.forName("io.anserini.search.topicreader." + args.topicReader + "TopicReader")
- .getConstructor(Path.class).newInstance(topicsFilePath);
- topics.putAll(tr.read());
- } catch (Exception e) {
- throw new IllegalArgumentException("Unable to load topic reader: " + args.topicReader);
- }
- }
-
- final String runTag = args.runtag == null ? "Solrini" : args.runtag;
- SolrSearcherThread<K> solrThread = new SolrSearcherThread<K>(topics, args.output, runTag);
- solrThread.run();
- }
-
- public ScoredDocuments search(String queryString){
-
- SolrDocumentList results = null;
-
- SolrQuery solrq = new SolrQuery();
- solrq.set("df", "contents");
- solrq.set("fl", "* score");
- // Remove some characters in query which are special syntax in Solr query parser
- solrq.setQuery(queryString.replaceAll("[+=&|<>!(){}~*?:/\"\\^\\-\\[\\]\\\\]", " "));
- solrq.setRows(args.hits);
- solrq.setSort(SortClause.desc("score"));
- solrq.addSort(SortClause.asc(IndexArgs.ID));
-
- try {
- QueryResponse response = client.query(args.solrIndex, solrq);
- results = response.getResults();
- } catch (Exception e) {
- LOG.error("Exception during Solr query: ", e);
- }
-
- ScoreTiesAdjusterReranker reranker = new ScoreTiesAdjusterReranker();
- return reranker.rerank(ScoredDocuments.fromSolrDocs(results), null);
- }
-
- public ScoredDocuments searchTweets(String queryString, long t){
-
- SolrDocumentList results = null;
-
- SolrQuery solrq = new SolrQuery();
- solrq.set("df", "contents");
- solrq.set("fl", "* score");
- // Remove double quotes in query since they are special syntax in Solr query parser
- solrq.setQuery(queryString.replace("\"", ""));
- solrq.setRows(args.hits);
- solrq.setSort(SortClause.desc("score"));
- solrq.addSort(SortClause.desc(TweetGenerator.TweetField.ID_LONG.name));
-
- // Do not consider the tweets with tweet ids that are beyond the queryTweetTime
- // <querytweettime> tag contains the timestamp of the query in terms of the
- // chronologically nearest tweet id within the corpus
- Query filter = LongPoint.newRangeQuery(TweetGenerator.TweetField.ID_LONG.name, 0L, t);
- solrq.set("fq", filter.toString());
-
- try {
- QueryResponse response = client.query(args.solrIndex, solrq);
- results = response.getResults();
- } catch (Exception e) {
- LOG.error("Exception during Solr query: ", e);
- }
-
- ScoreTiesAdjusterReranker reranker = new ScoreTiesAdjusterReranker();
- return reranker.rerank(ScoredDocuments.fromSolrDocs(results), null);
- }
-
- @Override
- public void close() throws IOException {
- client.close();
- }
-
- public static void main(String[] args) throws Exception {
- Args searchSolrArgs = new Args();
- CmdLineParser parser = new CmdLineParser(searchSolrArgs, ParserProperties.defaults().withUsageWidth(90));
-
- try {
- parser.parseArgument(args);
- } catch (CmdLineException e) {
- System.err.println(e.getMessage());
- parser.printUsage(System.err);
- System.err.println("Example: SearchSolr" + parser.printExample(OptionHandlerFilter.REQUIRED));
- return;
- }
-
- final long start = System.nanoTime();
- SearchSolr searcher = new SearchSolr(searchSolrArgs);
- searcher.runTopics();
- searcher.close();
- final long durationMillis = TimeUnit.MILLISECONDS.convert(System.nanoTime() - start, TimeUnit.NANOSECONDS);
- LOG.info("Total run time: " + DurationFormatUtils.formatDuration(durationMillis, "HH:mm:ss"));
- }
-}
diff --git a/src/main/python/run_es_regression.py b/src/main/python/run_es_regression.py
deleted file mode 100644
index b2084de21d..0000000000
--- a/src/main/python/run_es_regression.py
+++ /dev/null
@@ -1,256 +0,0 @@
-#
-# Pyserini: Python interface to the Anserini IR toolkit built on Lucene
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-import argparse
-import logging
-import math
-import os
-import requests
-import time
-
-import regression_utils
-
-# Note that this class is specifically written with REST API requests instead of the
-# Elasticsearch client eliminate an additional dependency
-
-logger = logging.getLogger('run_es_regression')
-ch = logging.StreamHandler()
-ch.setFormatter(logging.Formatter('%(asctime)s %(levelname)s - %(message)s'))
-logger.addHandler(ch)
-logger.setLevel(logging.INFO)
-
-
-class ElasticsearchClient:
- def __init__(self):
- pass
-
- @staticmethod
- def is_alive():
- try:
- response = requests.get('http://localhost:9200/')
- response.raise_for_status()
- except requests.exceptions.RequestException:
- return False
- else:
- return True
-
- def does_index_exist(self, collection):
- # Make sure ES is alive:
- if self.is_alive():
- try:
- response = requests.get('http://localhost:9200/{}'.format(collection))
- response.raise_for_status()
- except requests.exceptions.RequestException:
- return False
- else:
- return True
- else:
- raise Exception('ES does not appear to be alive!')
-
- def delete_index(self, collection):
- logger.info('Deleting index {}...'.format(collection))
- # Make sure the index exists:
- if self.does_index_exist(collection):
- try:
- response = requests.request('DELETE', url='http://localhost:9200/{}'.format(collection))
- response.raise_for_status()
- except requests.exceptions.RequestException:
- return False
- else:
- return True
- else:
- raise Exception('The index {} does not exist!'.format(collection))
-
- def create_index(self, collection):
- logger.info('Creating index {}...'.format(collection))
- # Make sure the index does not exist:
- if not self.does_index_exist(collection):
- filename = 'src/main/resources/elasticsearch/index-config.{}.json'.format(collection)
- if not os.path.exists(filename):
- raise Exception('No config found in src/main/resources/elasticsearch/ for {}!'.format(collection))
- logger.info('Using index config for {} at {}'.format(collection, filename))
- with open(filename, mode='r') as file:
- json = file.read()
- response = ''
- try:
- response = requests.request('PUT', url='http://localhost:9200/{}'.format(collection),
- data=json, headers={'Content-type': 'application/json'})
- response.raise_for_status()
- except requests.exceptions.RequestException:
- logger.info(response)
- return False
- else:
- return True
- else:
- raise Exception('The index {} already exists!'.format(collection))
-
- def insert_docs(self, collection, path):
- logger.info('Inserting documents from {} into {}... '.format(path, collection))
- if not os.path.exists(path):
- raise Exception('{} does not exist!'.format(path))
- if not self.does_index_exist(collection):
- raise Exception('The index {} does not exist!'.format(collection))
- # TODO: abstract this into an external config instead of hard-coded.
- if collection == 'robust04':
- command = 'sh target/appassembler/bin/IndexCollection -collection TrecCollection ' + \
- '-generator DefaultLuceneDocumentGenerator -es -es.index robust04 -threads 8 -input ' + \
- path + ' -storePositions -storeDocvectors -storeRaw'
- elif collection == 'msmarco-passage':
- command = 'sh target/appassembler/bin/IndexCollection -collection JsonCollection ' + \
- '-generator DefaultLuceneDocumentGenerator -es -es.index msmarco-passage -threads 8 -input ' + \
- path + ' -storePositions -storeDocvectors -storeRaw'
- elif collection == 'core18':
- command = 'sh target/appassembler/bin/IndexCollection -collection WashingtonPostCollection ' + \
- '-generator WashingtonPostGenerator -es -es.index core18 -threads 8 -input ' + \
- path + ' -storePositions -storeDocvectors -storeContents'
- elif collection == 'msmarco-doc':
- command = 'sh target/appassembler/bin/IndexCollection -collection JsonCollection ' + \
- '-generator DefaultLuceneDocumentGenerator -es -es.index msmarco-doc -threads 8 -input ' + \
- path + ' -storePositions -storeDocvectors -storeRaw'
- else:
- raise Exception('Unknown collection: {}'.format(collection))
- logger.info('Running indexing command: ' + command)
- return regression_utils.run_shell_command(command, logger, echo=True)
-
- def evaluate(self, collection):
- if not self.does_index_exist(collection):
- raise Exception('The index {} does not exist!'.format(collection))
- # TODO: abstract this into an external config instead of hard-coded.
- if collection == 'robust04':
- command = 'sh target/appassembler/bin/SearchElastic -topicreader Trec -es.index robust04 ' + \
- '-topics src/main/resources/topics-and-qrels/topics.robust04.txt ' + \
- '-output runs/run.es.robust04.bm25.topics.robust04.txt'
- elif collection == 'msmarco-passage':
- command = 'sh target/appassembler/bin/SearchElastic -topicreader TsvString -es.index msmarco-passage ' + \
- '-topics src/main/resources/topics-and-qrels/topics.msmarco-passage.dev-subset.txt ' + \
- '-output runs/run.es.msmarco-passage.txt'
- elif collection == 'core18':
- command = 'sh target/appassembler/bin/SearchElastic -topicreader Trec -es.index core18 ' + \
- '-topics src/main/resources/topics-and-qrels/topics.core18.txt ' + \
- '-output runs/run.es.core18.bm25.topics.core18.txt'
- elif collection == 'msmarco-doc':
- command = 'sh target/appassembler/bin/SearchElastic -topicreader TsvInt -es.index msmarco-doc ' + \
- '-topics src/main/resources/topics-and-qrels/topics.msmarco-doc.dev.txt ' + \
- '-output runs/run.es.msmarco-doc.txt'
- else:
- raise Exception('Unknown collection: {}'.format(collection))
-
- logger.info('Retrieval command: ' + command)
- regression_utils.run_shell_command(command, logger, echo=True)
- logger.info('Retrieval complete!')
-
- if collection == 'robust04':
- command = 'tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 ' + \
- 'src/main/resources/topics-and-qrels/qrels.robust04.txt ' + \
- 'runs/run.es.robust04.bm25.topics.robust04.txt'
- elif collection == 'msmarco-passage':
- command = 'tools/eval/trec_eval.9.0.4/trec_eval -c -mrecall.1000 -mmap ' + \
- 'src/main/resources/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt ' + \
- 'runs/run.es.msmarco-passage.txt'
- elif collection == 'core18':
- command = 'tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 ' + \
- 'src/main/resources/topics-and-qrels/qrels.core18.txt runs/run.es.core18.bm25.topics.core18.txt'
- elif collection == 'msmarco-doc':
- command = 'tools/eval/trec_eval.9.0.4/trec_eval -c -mrecall.1000 -mmap ' + \
- 'src/main/resources/topics-and-qrels/qrels.msmarco-doc.dev.txt runs/run.es.msmarco-doc.txt'
- else:
- raise Exception('Unknown collection: {}'.format(collection))
-
- logger.info('Evaluation command: ' + command)
- output = regression_utils.run_shell_command(command, logger, capture=True)
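- # trec_eval prints tab-separated lines ('metric<TAB>query-id<TAB>score'); take the score from the first line.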
- ap = float(output[0].split('\t')[2])
-
- if collection == 'robust04':
- expected = 0.2531
- elif collection == 'msmarco-passage':
- expected = 0.1956
- elif collection == 'core18':
- expected = 0.2496
- elif collection == 'msmarco-doc':
- expected = 0.2307
- else:
- raise Exception('Unknown collection: {}'.format(collection))
-
- if math.isclose(ap, expected):
- logger.info('[SUCCESS] {} MAP verified as expected!'.format(ap))
- else:
- logger.info('[FAILED] {} MAP, expected {} MAP!'.format(ap, expected))
-
-
-if __name__ == '__main__':
- parser = argparse.ArgumentParser(description='Program for running Elasticsearch regressions.')
- parser.add_argument('--ping', action='store_true', default=False, help='Ping ES and exit.')
- parser.add_argument('--check-index-exists', default='', type=str, metavar='collection',
- help='Check if index exists.')
- parser.add_argument('--delete-index', default='', type=str, metavar='collection', help='Delete index.')
- parser.add_argument('--create-index', default='', type=str, metavar='collection', help='Create index.')
- parser.add_argument('--insert-docs', default='', type=str, metavar='collection',
- help='Insert documents into index.')
- parser.add_argument('--input', default='', type=str, metavar='directory',
- help='Location of documents to insert into index.')
- parser.add_argument('--evaluate', default='', type=str, metavar='collection',
- help='Search and evaluate on collection.')
- parser.add_argument('--regression', default='', type=str, metavar='collection', help='Run end-to-end regression.')
-
- args = parser.parse_args()
- es = ElasticsearchClient()
-
- if args.ping:
- logger.info('Pinging Elasticsearch instance...')
- if es.is_alive():
- logger.info('... appears to be alive! :)')
- else:
- logger.info('... appears to be dead! :(')
- elif args.check_index_exists:
- logger.info('Checking if index {} exists...'.format(args.check_index_exists))
- if es.does_index_exist(args.check_index_exists):
- logger.info('... yes indeed!')
- else:
- logger.info('... appears not.')
- elif args.delete_index:
- if es.delete_index(args.delete_index):
- logger.info('... successful!')
- else:
- logger.info('... failed!')
- elif args.create_index:
- if es.create_index(args.create_index):
- logger.info('... successful!')
- else:
- logger.info('... failed!')
- elif args.insert_docs:
- if not args.input:
- raise Exception('Location of corpus not specified (use --input)!')
- else:
- es.insert_docs(args.insert_docs, args.input)
- elif args.evaluate:
- es.evaluate(args.evaluate)
- elif args.regression:
- logger.info('Running BM25 regression on {}...'.format(args.regression))
- if not args.input:
- raise Exception('Location of corpus not specified (use --input)!')
- if not es.is_alive():
- raise Exception('Elasticsearch does not appear to be alive!')
- if es.does_index_exist(args.regression):
- logger.info('Index {} already exists: deleting and recreating.'.format(args.regression))
- es.delete_index(args.regression)
- es.create_index(args.regression)
- es.insert_docs(args.regression, args.input)
- # Documents ingested into ES are not immediately searchable. There are lots of 'refresh' options
- # to control the visibility behavior, but the simplest solution is just to wait for a bit...
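- # (Issuing 'POST /{index}/_refresh' via the REST API would also make newly ingested documents searchable right away.)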
- logger.info('Document ingestion complete. Sleeping now for 120s...')
- time.sleep(120)
- logger.info('Waking up!')
- es.evaluate(args.regression)
diff --git a/src/main/python/run_solr_regression.py b/src/main/python/run_solr_regression.py
deleted file mode 100644
index 3fa8486a4b..0000000000
--- a/src/main/python/run_solr_regression.py
+++ /dev/null
@@ -1,247 +0,0 @@
-#
-# Pyserini: Python interface to the Anserini IR toolkit built on Lucene
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-import argparse
-import logging
-import math
-import os
-import requests
-
-import regression_utils
-
-logger = logging.getLogger('run_solr_regression')
-ch = logging.StreamHandler()
-ch.setFormatter(logging.Formatter('%(asctime)s %(levelname)s - %(message)s'))
-logger.addHandler(ch)
-logger.setLevel(logging.INFO)
-
-
-class SolrClient:
- def __init__(self):
- pass
-
- @staticmethod
- def is_alive():
- try:
- response = requests.get('http://localhost:8983/')
- response.raise_for_status()
- except requests.exceptions.RequestException:
- return False
- else:
- return True
-
- def does_index_exist(self, collection):
- # Make sure Solr is alive:
- if self.is_alive():
- try:
- response = requests.get('http://localhost:8983/solr/admin/collections?action=LIST')
- response.raise_for_status()
- except requests.exceptions.RequestException:
- return False
- else:
- return collection in response.json()['collections']
- else:
- raise Exception('Solr does not appear to be alive!')
-
- def delete_index(self, collection):
- # Make sure the index exists:
- if self.does_index_exist(collection):
- command = 'solrini/bin/solr delete -c {}'.format(collection)
- logger.info('Deleting index {} command: {}'.format(collection, command))
- regression_utils.run_shell_command(command, logger, echo=True)
- return not self.does_index_exist(collection)
- else:
- raise Exception('The index {} does not exist!'.format(collection))
-
- def create_index(self, collection):
- # Make sure the index does not exist:
- if not self.does_index_exist(collection):
- # Re-upload configsets to Solr's internal Zookeeper
- self.upload_configs()
- command = 'solrini/bin/solr create -n anserini -c {}'.format(collection)
- logger.info('Creating index {} command: {}'.format(collection, command))
- regression_utils.run_shell_command(command, logger, echo=True)
- return self.does_index_exist(collection)
- else:
- raise Exception('The index {} already exists!'.format(collection))
-
- def insert_docs(self, collection, path):
- logger.info('Inserting documents from {} into {}... '.format(path, collection))
- if not os.path.exists(path):
- raise Exception('{} does not exist!'.format(path))
- if not self.does_index_exist(collection):
- raise Exception('The index {} does not exist!'.format(collection))
- if collection == 'core18':
- command = 'sh target/appassembler/bin/IndexCollection -collection WashingtonPostCollection ' + \
- '-generator WashingtonPostGenerator -solr -solr.index core18 -solr.zkUrl localhost:9983 ' + \
- '-threads 8 -input ' + path + ' -storePositions -storeDocvectors -storeContents'
- elif collection == 'robust04':
- command = 'sh target/appassembler/bin/IndexCollection -collection TrecCollection ' + \
- '-generator DefaultLuceneDocumentGenerator ' + \
- '-solr -solr.index robust04 -solr.zkUrl localhost:9983 ' + \
- '-threads 8 -input ' + path + ' -storePositions -storeDocvectors -storeRaw'
- elif collection == 'msmarco-passage':
- command = 'sh target/appassembler/bin/IndexCollection -collection JsonCollection ' + \
- '-generator DefaultLuceneDocumentGenerator ' + \
- '-solr -solr.index msmarco-passage -solr.zkUrl localhost:9983 ' + \
- '-threads 8 -input ' + path + ' -storePositions -storeDocvectors -storeRaw'
- elif collection == 'msmarco-doc':
- command = 'sh target/appassembler/bin/IndexCollection -collection JsonCollection ' + \
- '-generator DefaultLuceneDocumentGenerator ' + \
- '-solr -solr.index msmarco-doc -solr.zkUrl localhost:9983 ' + \
- '-threads 8 -input ' + path + ' -storePositions -storeDocvectors -storeRaw'
- else:
- raise Exception('Unknown collection: {}'.format(collection))
- logger.info('Running indexing command: ' + command)
- return regression_utils.run_shell_command(command, logger, echo=True)
-
- @staticmethod
- def upload_configs():
- os.chdir('src/main/resources/solr')
- command = 'rm -rf anserini/conf/lang anserini-twitter/conf/lang'
- logger.info('Deleting existing configs command: ' + command)
- regression_utils.run_shell_command(command, logger, echo=True)
- command = './solr.sh ../../../../solrini localhost:9983'
- logger.info('Uploading configs command: ' + command)
- regression_utils.run_shell_command(command, logger, echo=True)
- os.chdir('../../../..')
- logger.info('Uploading complete!')
-
- def evaluate(self, collection):
- if not self.does_index_exist(collection):
- raise Exception('The index {} does not exist!'.format(collection))
- if collection == 'core18':
- command = 'sh target/appassembler/bin/SearchSolr -topicreader Trec -solr.index core18 ' + \
- '-solr.zkUrl localhost:9983 -topics src/main/resources/topics-and-qrels/topics.core18.txt ' + \
- '-output runs/run.solr.core18.bm25.topics.core18.txt'
- elif collection == 'robust04':
- command = 'sh target/appassembler/bin/SearchSolr -topicreader Trec -solr.index robust04 ' + \
- '-solr.zkUrl localhost:9983 -topics src/main/resources/topics-and-qrels/topics.robust04.txt ' + \
- '-output runs/run.solr.robust04.bm25.topics.robust04.txt'
- elif collection == 'msmarco-passage':
- command = 'sh target/appassembler/bin/SearchSolr -topicreader TsvString -solr.index msmarco-passage ' + \
- '-solr.zkUrl localhost:9983 ' + \
- '-topics src/main/resources/topics-and-qrels/topics.msmarco-passage.dev-subset.txt ' + \
- '-output runs/run.solr.msmarco-passage.txt'
- elif collection == 'msmarco-doc':
- command = 'sh target/appassembler/bin/SearchSolr -topicreader TsvInt -solr.index msmarco-doc ' + \
- '-solr.zkUrl localhost:9983 ' + \
- '-topics src/main/resources/topics-and-qrels/topics.msmarco-doc.dev.txt ' + \
- '-output runs/run.solr.msmarco-doc.txt '
- else:
- raise Exception('Unknown collection: {}'.format(collection))
-
- logger.info('Retrieval command: ' + command)
- regression_utils.run_shell_command(command, logger, echo=True)
- logger.info('Retrieval complete!')
-
- if collection == 'core18':
- command = 'tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 ' + \
- 'src/main/resources/topics-and-qrels/qrels.core18.txt runs/run.solr.core18.bm25.topics.core18.txt'
- elif collection == 'robust04':
- command = 'tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 ' + \
- 'src/main/resources/topics-and-qrels/qrels.robust04.txt ' + \
- 'runs/run.solr.robust04.bm25.topics.robust04.txt'
- elif collection == 'msmarco-passage':
- command = 'tools/eval/trec_eval.9.0.4/trec_eval -c -mrecall.1000 -mmap ' + \
- 'src/main/resources/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt ' + \
- 'runs/run.solr.msmarco-passage.txt'
- elif collection == 'msmarco-doc':
- command = 'tools/eval/trec_eval.9.0.4/trec_eval -c -mrecall.1000 -mmap ' + \
- 'src/main/resources/topics-and-qrels/qrels.msmarco-doc.dev.txt runs/run.solr.msmarco-doc.txt'
- else:
- raise Exception('Unknown collection: {}'.format(collection))
-
- logger.info('Evaluation command: ' + command)
- output = regression_utils.run_shell_command(command, logger, capture=True)
- ap = float(output[0].split('\t')[2])
-
- if collection == 'core18':
- expected = 0.2496
- elif collection == 'robust04':
- expected = 0.2531
- elif collection == 'msmarco-passage':
- expected = 0.1926
- elif collection == 'msmarco-doc':
- expected = 0.2305
- else:
- raise Exception('Unknown collection: {}'.format(collection))
-
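- # math.isclose uses rel_tol=1e-09 by default, so this effectively requires the score to match the expected value exactly.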
- if math.isclose(ap, expected):
- logger.info('[SUCCESS] {} MAP verified as expected!'.format(ap))
- else:
- logger.info('[FAILED] {} MAP, expected {} MAP!'.format(ap, expected))
-
-
-if __name__ == '__main__':
- parser = argparse.ArgumentParser(description='Program for running Solr regressions.')
- parser.add_argument('--ping', action='store_true', default=False, help='Ping Solr and exit.')
- parser.add_argument('--check-index-exists', default='', type=str, metavar='collection',
- help='Check if index exists.')
- parser.add_argument('--delete-index', default='', type=str, metavar='collection', help='Delete index.')
- parser.add_argument('--create-index', default='', type=str, metavar='collection', help='Create index.')
- parser.add_argument('--insert-docs', default='', type=str, metavar='collection',
- help='Insert documents into index.')
- parser.add_argument('--input', default='', type=str, metavar='directory',
- help='Location of documents to insert into index.')
- parser.add_argument('--evaluate', default='', type=str, metavar='collection',
- help='Search and evaluate on collection.')
- parser.add_argument('--regression', default='', type=str, metavar='collection', help='Run end-to-end regression.')
-
- args = parser.parse_args()
- solr = SolrClient()
-
- if args.ping:
- logger.info('Pinging Solr instance...')
- if solr.is_alive():
- logger.info('... appears to be alive! :)')
- else:
- logger.info('... appears to be dead! :(')
- elif args.check_index_exists:
- logger.info('Checking if index {} exists...'.format(args.check_index_exists))
- if solr.does_index_exist(args.check_index_exists):
- logger.info('... yes indeed!')
- else:
- logger.info('... appears not.')
- elif args.delete_index:
- if solr.delete_index(args.delete_index):
- logger.info('... successful!')
- else:
- logger.info('... failed!')
- elif args.create_index:
- if solr.create_index(args.create_index):
- logger.info('... successful!')
- else:
- logger.info('... failed!')
- elif args.insert_docs:
- if not args.input:
- raise Exception('Location of corpus not specified (use --input)!')
- else:
- solr.insert_docs(args.insert_docs, args.input)
- elif args.evaluate:
- solr.evaluate(args.evaluate)
- elif args.regression:
- logger.info('Running BM25 regression on {}...'.format(args.regression))
- if not args.input:
- raise Exception('Location of corpus not specified (use --input)!')
- if not solr.is_alive():
- raise Exception('Solr does not appear to be alive!')
- if solr.does_index_exist(args.regression):
- logger.info('Index {} already exists: deleting and recreating.'.format(args.regression))
- solr.delete_index(args.regression)
- solr.create_index(args.regression)
- solr.insert_docs(args.regression, args.input)
- solr.evaluate(args.regression)
diff --git a/src/main/resources/elasticsearch/index-config.cord19.json b/src/main/resources/elasticsearch/index-config.cord19.json
deleted file mode 100644
index c7c08e4610..0000000000
--- a/src/main/resources/elasticsearch/index-config.cord19.json
+++ /dev/null
@@ -1,32 +0,0 @@
-{
- "mappings": {
- "properties": {
- "id": {
- "type": "keyword"
- },
- "contents": {
- "type": "text",
- "store": false,
- "index": true,
- "analyzer": "english"
- },
- "raw": {
- "type": "text",
- "store": true,
- "index": false
- }
- }
- },
- "settings": {
- "index": {
- "refresh_interval": "60s",
- "similarity": {
- "default": {
- "type": "BM25",
- "k1": "0.9",
- "b": "0.4"
- }
- }
- }
- }
-}
diff --git a/src/main/resources/elasticsearch/index-config.core18.json b/src/main/resources/elasticsearch/index-config.core18.json
deleted file mode 100644
index c7c08e4610..0000000000
--- a/src/main/resources/elasticsearch/index-config.core18.json
+++ /dev/null
@@ -1,32 +0,0 @@
-{
- "mappings": {
- "properties": {
- "id": {
- "type": "keyword"
- },
- "contents": {
- "type": "text",
- "store": false,
- "index": true,
- "analyzer": "english"
- },
- "raw": {
- "type": "text",
- "store": true,
- "index": false
- }
- }
- },
- "settings": {
- "index": {
- "refresh_interval": "60s",
- "similarity": {
- "default": {
- "type": "BM25",
- "k1": "0.9",
- "b": "0.4"
- }
- }
- }
- }
-}
diff --git a/src/main/resources/elasticsearch/index-config.msmarco-doc.json b/src/main/resources/elasticsearch/index-config.msmarco-doc.json
deleted file mode 100644
index c7c08e4610..0000000000
--- a/src/main/resources/elasticsearch/index-config.msmarco-doc.json
+++ /dev/null
@@ -1,32 +0,0 @@
-{
- "mappings": {
- "properties": {
- "id": {
- "type": "keyword"
- },
- "contents": {
- "type": "text",
- "store": false,
- "index": true,
- "analyzer": "english"
- },
- "raw": {
- "type": "text",
- "store": true,
- "index": false
- }
- }
- },
- "settings": {
- "index": {
- "refresh_interval": "60s",
- "similarity": {
- "default": {
- "type": "BM25",
- "k1": "0.9",
- "b": "0.4"
- }
- }
- }
- }
-}
diff --git a/src/main/resources/elasticsearch/index-config.msmarco-passage.json b/src/main/resources/elasticsearch/index-config.msmarco-passage.json
deleted file mode 100644
index ad33344097..0000000000
--- a/src/main/resources/elasticsearch/index-config.msmarco-passage.json
+++ /dev/null
@@ -1,32 +0,0 @@
-{
- "mappings": {
- "properties": {
- "id": {
- "type": "keyword"
- },
- "contents": {
- "type": "text",
- "store": false,
- "index": true,
- "analyzer": "english"
- },
- "raw": {
- "type": "text",
- "store": true,
- "index": false
- }
- }
- },
- "settings": {
- "index": {
- "refresh_interval": "60s",
- "similarity": {
- "default": {
- "type": "BM25",
- "k1": "0.82",
- "b": "0.68"
- }
- }
- }
- }
-}
diff --git a/src/main/resources/elasticsearch/index-config.robust04.json b/src/main/resources/elasticsearch/index-config.robust04.json
deleted file mode 100644
index c7c08e4610..0000000000
--- a/src/main/resources/elasticsearch/index-config.robust04.json
+++ /dev/null
@@ -1,32 +0,0 @@
-{
- "mappings": {
- "properties": {
- "id": {
- "type": "keyword"
- },
- "contents": {
- "type": "text",
- "store": false,
- "index": true,
- "analyzer": "english"
- },
- "raw": {
- "type": "text",
- "store": true,
- "index": false
- }
- }
- },
- "settings": {
- "index": {
- "refresh_interval": "60s",
- "similarity": {
- "default": {
- "type": "BM25",
- "k1": "0.9",
- "b": "0.4"
- }
- }
- }
- }
-}
diff --git a/src/main/resources/solr/anserini-twitter/conf/managed-schema b/src/main/resources/solr/anserini-twitter/conf/managed-schema
deleted file mode 100644
index 08e1f08be5..0000000000
--- a/src/main/resources/solr/anserini-twitter/conf/managed-schema
+++ /dev/null
@@ -1,216 +0,0 @@
-<!-- managed-schema for the anserini-twitter configset: declares the "id" uniqueKey, the
-     field and field-type definitions, and a BM25 similarity with k1=0.9 and b=0.4. -->
diff --git a/src/main/resources/solr/anserini-twitter/conf/params.json b/src/main/resources/solr/anserini-twitter/conf/params.json
deleted file mode 100644
index 06114ef257..0000000000
--- a/src/main/resources/solr/anserini-twitter/conf/params.json
+++ /dev/null
@@ -1,20 +0,0 @@
-{"params":{
- "query":{
- "defType":"edismax",
- "q.alt":"*:*",
- "rows":"10",
- "fl":"*,score",
- "":{"v":0}
- },
- "facets":{
- "facet":"on",
- "facet.mincount": "1",
- "":{"v":0}
- },
- "velocity":{
- "wt": "velocity",
- "v.template":"browse",
- "v.layout": "layout",
- "":{"v":0}
- }
-}}
\ No newline at end of file
diff --git a/src/main/resources/solr/anserini-twitter/conf/protwords.txt b/src/main/resources/solr/anserini-twitter/conf/protwords.txt
deleted file mode 100644
index 1dfc0abecb..0000000000
--- a/src/main/resources/solr/anserini-twitter/conf/protwords.txt
+++ /dev/null
@@ -1,21 +0,0 @@
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-#-----------------------------------------------------------------------
-# Use a protected word file to protect against the stemmer reducing two
-# unrelated words to the same base word.
-
-# Some non-words that normally won't be encountered,
-# just to test that they won't be stemmed.
-dontstems
-zwhacky
-
diff --git a/src/main/resources/solr/anserini-twitter/conf/solrconfig.xml b/src/main/resources/solr/anserini-twitter/conf/solrconfig.xml
deleted file mode 100644
index 5f3e4208ef..0000000000
--- a/src/main/resources/solr/anserini-twitter/conf/solrconfig.xml
+++ /dev/null
@@ -1,1341 +0,0 @@
-<!-- solrconfig.xml for the anserini-twitter configset: sets luceneMatchVersion 9.0.0 and
-     otherwise follows the stock Solr example configuration (data directory, caches, update
-     log, autoCommit/autoSoftCommit, standard request handlers, spellcheck via
-     solr.DirectSolrSpellChecker, term vector/terms/elevator components, highlighting,
-     date-parsing update processors, and the Velocity response writer). -->
diff --git a/src/main/resources/solr/anserini-twitter/conf/stopwords_en.txt b/src/main/resources/solr/anserini-twitter/conf/stopwords_en.txt
deleted file mode 100644
index e11bbd5670..0000000000
--- a/src/main/resources/solr/anserini-twitter/conf/stopwords_en.txt
+++ /dev/null
@@ -1,49 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Standard english stop words taken from Lucene's StopAnalyzer
-a
-an
-and
-are
-as
-at
-be
-but
-by
-for
-if
-in
-into
-is
-it
-no
-not
-of
-on
-or
-such
-that
-the
-their
-then
-there
-these
-they
-this
-to
-was
-will
-with
diff --git a/src/main/resources/solr/anserini-twitter/conf/synonyms.txt b/src/main/resources/solr/anserini-twitter/conf/synonyms.txt
deleted file mode 100644
index eab4ee8753..0000000000
--- a/src/main/resources/solr/anserini-twitter/conf/synonyms.txt
+++ /dev/null
@@ -1,29 +0,0 @@
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-#-----------------------------------------------------------------------
-#some test synonym mappings unlikely to appear in real input text
-aaafoo => aaabar
-bbbfoo => bbbfoo bbbbar
-cccfoo => cccbar cccbaz
-fooaaa,baraaa,bazaaa
-
-# Some synonym groups specific to this example
-GB,gib,gigabyte,gigabytes
-MB,mib,megabyte,megabytes
-Television, Televisions, TV, TVs
-#notice we use "gib" instead of "GiB" so any WordDelimiterGraphFilter coming
-#after us won't split it into two words.
-
-# Synonym mappings can be used for spelling correction too
-pixima => pixma
-
diff --git a/src/main/resources/solr/anserini/conf/managed-schema b/src/main/resources/solr/anserini/conf/managed-schema
deleted file mode 100644
index 08e1f08be5..0000000000
--- a/src/main/resources/solr/anserini/conf/managed-schema
+++ /dev/null
@@ -1,216 +0,0 @@
-<!-- managed-schema for the anserini configset: declares the "id" uniqueKey, the field and
-     field-type definitions, and a BM25 similarity with k1=0.9 and b=0.4. -->
diff --git a/src/main/resources/solr/anserini/conf/params.json b/src/main/resources/solr/anserini/conf/params.json
deleted file mode 100644
index 06114ef257..0000000000
--- a/src/main/resources/solr/anserini/conf/params.json
+++ /dev/null
@@ -1,20 +0,0 @@
-{"params":{
- "query":{
- "defType":"edismax",
- "q.alt":"*:*",
- "rows":"10",
- "fl":"*,score",
- "":{"v":0}
- },
- "facets":{
- "facet":"on",
- "facet.mincount": "1",
- "":{"v":0}
- },
- "velocity":{
- "wt": "velocity",
- "v.template":"browse",
- "v.layout": "layout",
- "":{"v":0}
- }
-}}
\ No newline at end of file
diff --git a/src/main/resources/solr/anserini/conf/protwords.txt b/src/main/resources/solr/anserini/conf/protwords.txt
deleted file mode 100644
index 1dfc0abecb..0000000000
--- a/src/main/resources/solr/anserini/conf/protwords.txt
+++ /dev/null
@@ -1,21 +0,0 @@
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-#-----------------------------------------------------------------------
-# Use a protected word file to protect against the stemmer reducing two
-# unrelated words to the same base word.
-
-# Some non-words that normally won't be encountered,
-# just to test that they won't be stemmed.
-dontstems
-zwhacky
-
diff --git a/src/main/resources/solr/anserini/conf/solrconfig.xml b/src/main/resources/solr/anserini/conf/solrconfig.xml
deleted file mode 100644
index b00368515b..0000000000
--- a/src/main/resources/solr/anserini/conf/solrconfig.xml
+++ /dev/null
@@ -1,1343 +0,0 @@
-<!-- solrconfig.xml for the anserini configset: sets luceneMatchVersion 9.0.0 and otherwise
-     follows the stock Solr example configuration (data directory, caches, update log,
-     autoCommit/autoSoftCommit, standard request handlers, spellcheck via
-     solr.DirectSolrSpellChecker, term vector/terms/elevator components, highlighting,
-     date-parsing update processors, and the Velocity response writer). -->
diff --git a/src/main/resources/solr/anserini/conf/stopwords_en.txt b/src/main/resources/solr/anserini/conf/stopwords_en.txt
deleted file mode 100644
index e11bbd5670..0000000000
--- a/src/main/resources/solr/anserini/conf/stopwords_en.txt
+++ /dev/null
@@ -1,49 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Standard english stop words taken from Lucene's StopAnalyzer
-a
-an
-and
-are
-as
-at
-be
-but
-by
-for
-if
-in
-into
-is
-it
-no
-not
-of
-on
-or
-such
-that
-the
-their
-then
-there
-these
-they
-this
-to
-was
-will
-with
diff --git a/src/main/resources/solr/anserini/conf/synonyms.txt b/src/main/resources/solr/anserini/conf/synonyms.txt
deleted file mode 100644
index eab4ee8753..0000000000
--- a/src/main/resources/solr/anserini/conf/synonyms.txt
+++ /dev/null
@@ -1,29 +0,0 @@
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-#-----------------------------------------------------------------------
-#some test synonym mappings unlikely to appear in real input text
-aaafoo => aaabar
-bbbfoo => bbbfoo bbbbar
-cccfoo => cccbar cccbaz
-fooaaa,baraaa,bazaaa
-
-# Some synonym groups specific to this example
-GB,gib,gigabyte,gigabytes
-MB,mib,megabyte,megabytes
-Television, Televisions, TV, TVs
-#notice we use "gib" instead of "GiB" so any WordDelimiterGraphFilter coming
-#after us won't split it into two words.
-
-# Synonym mappings can be used for spelling correction too
-pixima => pixma
-
diff --git a/src/main/resources/solr/schemas/acl-anthology.json b/src/main/resources/solr/schemas/acl-anthology.json
deleted file mode 100644
index e358861e83..0000000000
--- a/src/main/resources/solr/schemas/acl-anthology.json
+++ /dev/null
@@ -1,26 +0,0 @@
-{
- "add-field": {
- "name":"authors",
- "type":"string",
- "stored":true,
- "multiValued":true
- },
- "add-field": {
- "name":"sigs",
- "type":"string",
- "stored":true,
- "multiValued":true
- },
- "add-field": {
- "name":"venues",
- "type":"string",
- "stored":true,
- "multiValued":true
- },
- "add-field": {
- "name":"pages",
- "type":"string",
- "stored":true,
- "docValues": false
- }
-}
\ No newline at end of file
diff --git a/src/main/resources/solr/schemas/cord19.json b/src/main/resources/solr/schemas/cord19.json
deleted file mode 100644
index 8a9d305b9b..0000000000
--- a/src/main/resources/solr/schemas/cord19.json
+++ /dev/null
@@ -1,86 +0,0 @@
-{
- "add-field": {
- "name":"authors",
- "type":"string",
- "stored":true,
- "multiValued":true
- },
- "add-field": {
- "name":"source_x",
- "type":"string",
- "stored":true,
- "multiValued": true
- },
- "add-field": {
- "name":"pmcid",
- "type":"string",
- "stored":true,
- "docValues": true
- },
- "add-field": {
- "name":"pubmed_id",
- "type":"string",
- "stored":true,
- "docValues": true
- },
- "add-field": {
- "name":"publish_time",
- "type":"string",
- "stored":true,
- "docValues":true
- },
- "add-field": {
- "name":"doi",
- "type":"string",
- "stored":true,
- "docValues":true
- },
- "add-field": {
- "name":"journal",
- "type":"string",
- "stored":true,
- "docValues":true
- },
- "add-field": {
- "name":"license",
- "type":"string",
- "stored":true,
- "docValues":true
- },
- "add-field": {
- "name":"sha",
- "type":"string",
- "stored":true,
- "docValues":true
- },
- "add-field": {
- "name":"url",
- "type":"string",
- "stored":true,
- "docValues":true
- },
- "add-field": {
- "name":"year",
- "type":"pint",
- "stored":true,
- "docValues":true
- },
- "add-field": {
- "name":"outcomes_vocab",
- "type":"string",
- "stored":true,
- "multiValued":true
- },
- "add-field": {
- "name":"population_vocab",
- "type":"string",
- "stored":true,
- "multiValued":true
- },
- "add-field": {
- "name":"interventions_vocab",
- "type":"string",
- "stored":true,
- "multiValued":true
- }
-}
diff --git a/src/main/resources/solr/schemas/core.json b/src/main/resources/solr/schemas/core.json
deleted file mode 100644
index f6c205539b..0000000000
--- a/src/main/resources/solr/schemas/core.json
+++ /dev/null
@@ -1,56 +0,0 @@
-{
- "add-field": {
- "name":"authors",
- "type":"string",
- "stored":true,
- "multiValued":true,
- "docValues":true
- },
- "add-field": {
- "name":"contributors",
- "type":"string",
- "stored":true,
- "multiValued":true,
- "docValues":true
- },
- "add-field": {
- "name":"identifiers",
- "type":"string",
- "stored":true,
- "multiValued":true,
- "docValues":true
- },
- "add-field": {
- "name":"journals",
- "type":"string",
- "stored":true,
- "multiValued":true,
- "docValues":false
- },
- "add-field": {
- "name":"relations",
- "type":"string",
- "stored":true,
- "multiValued":true,
- "docValues":true
- },
- "add-field": {
- "name":"subjects",
- "type":"string",
- "stored":true,
- "multiValued":true,
- "docValues":true
- },
- "add-field": {
- "name":"topics",
- "type":"string",
- "stored":true,
- "multiValued":true,
- "docValues":true
- },
- "add-field": {
- "name":"datePublished",
- "type":"string",
- "stored":true
- }
-}
\ No newline at end of file
diff --git a/src/main/resources/solr/schemas/covid.json b/src/main/resources/solr/schemas/covid.json
deleted file mode 100644
index f6a1f237f3..0000000000
--- a/src/main/resources/solr/schemas/covid.json
+++ /dev/null
@@ -1,86 +0,0 @@
-{
- "add-field": {
- "name":"authors",
- "type":"string",
- "stored":true,
- "multiValued":true
- },
- "add-field": {
- "name":"source_x",
- "type":"string",
- "stored":true,
- "docValues": true
- },
- "add-field": {
- "name":"pmcid",
- "type":"string",
- "stored":true,
- "docValues": true
- },
- "add-field": {
- "name":"pubmed_id",
- "type":"string",
- "stored":true,
- "docValues": true
- },
- "add-field": {
- "name":"publish_time",
- "type":"string",
- "stored":true,
- "docValues":true
- },
- "add-field": {
- "name":"doi",
- "type":"string",
- "stored":true,
- "docValues":true
- },
- "add-field": {
- "name":"journal",
- "type":"string",
- "stored":true,
- "docValues":true
- },
- "add-field": {
- "name":"license",
- "type":"string",
- "stored":true,
- "docValues":true
- },
- "add-field": {
- "name":"sha",
- "type":"string",
- "stored":true,
- "docValues":true
- },
- "add-field": {
- "name":"url",
- "type":"string",
- "stored":true,
- "docValues":true
- },
- "add-field": {
- "name":"year",
- "type":"pint",
- "stored":true,
- "docValues":true
- },
- "add-field": {
- "name":"outcomes_vocab",
- "type":"string",
- "stored":true,
- "multiValued":true
- },
- "add-field": {
- "name":"population_vocab",
- "type":"string",
- "stored":true,
- "multiValued":true
- },
- "add-field": {
- "name":"interventions_vocab",
- "type":"string",
- "stored":true,
- "multiValued":true
- }
-}
diff --git a/src/main/resources/solr/solr.sh b/src/main/resources/solr/solr.sh
deleted file mode 100755
index 194ea446d8..0000000000
--- a/src/main/resources/solr/solr.sh
+++ /dev/null
@@ -1,23 +0,0 @@
-#!/usr/bin/env sh
-
-###
-# This script assumes a single-node SolrCloud instance is running locally.
-###
-
-if [ -z "$1" ]; then
- echo "Usage: ./solr.sh <solr-install-dir> [zookeeper-url]"
- exit 1
-fi
-
-# Solr install directory
-SOLR_DIR=$1
-
-# Solr's ZooKeeper URL
-ZOOKEEPER_URL=$2
-
-# Copy anserini into lib dir
-mkdir ${SOLR_DIR}/lib && cp ../../../../target/anserini-*-fatjar.jar ${SOLR_DIR}/lib
-
-# Upload configset to Solr
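-# (The ${ZOOKEEPER_URL:-localhost:9983} expansion below falls back to localhost:9983 when no second argument is given.)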
-${SOLR_DIR}/bin/solr zk -z ${ZOOKEEPER_URL:-localhost:9983} upconfig -n anserini -d anserini
-${SOLR_DIR}/bin/solr zk -z ${ZOOKEEPER_URL:-localhost:9983} upconfig -n anserini-twitter -d anserini-twitter
diff --git a/src/test/java/io/anserini/GeoIndexerTestBase.java b/src/test/java/io/anserini/GeoIndexerTestBase.java
index 8c3c94e88d..e3ecc13edf 100644
--- a/src/test/java/io/anserini/GeoIndexerTestBase.java
+++ b/src/test/java/io/anserini/GeoIndexerTestBase.java
@@ -17,7 +17,11 @@
package io.anserini;
import io.anserini.index.IndexArgs;
-import org.apache.lucene.document.*;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.LatLonDocValuesField;
+import org.apache.lucene.document.LatLonShape;
+import org.apache.lucene.document.StringField;
import org.apache.lucene.geo.Line;
import org.apache.lucene.geo.Polygon;
import org.apache.lucene.geo.SimpleWKTShapeParser;
@@ -25,9 +29,10 @@
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
-import org.apache.lucene.util.LuceneTestCase;
-import org.junit.Before;
+import org.apache.lucene.tests.util.LuceneTestCase;
import org.junit.After;
+import org.junit.Before;
+
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
diff --git a/src/test/java/io/anserini/IndexerTestBase.java b/src/test/java/io/anserini/IndexerTestBase.java
index ffd16c0b0c..8a1410bdc6 100644
--- a/src/test/java/io/anserini/IndexerTestBase.java
+++ b/src/test/java/io/anserini/IndexerTestBase.java
@@ -30,7 +30,7 @@
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
-import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.tests.util.LuceneTestCase;
import org.apache.lucene.util.BytesRef;
import org.junit.After;
import org.junit.Before;
diff --git a/src/test/java/io/anserini/IndexerWithEmptyDocumentTestBase.java b/src/test/java/io/anserini/IndexerWithEmptyDocumentTestBase.java
index 417a0fb0ea..e4a854d2ca 100644
--- a/src/test/java/io/anserini/IndexerWithEmptyDocumentTestBase.java
+++ b/src/test/java/io/anserini/IndexerWithEmptyDocumentTestBase.java
@@ -30,7 +30,7 @@
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
-import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.tests.util.LuceneTestCase;
import org.apache.lucene.util.BytesRef;
import org.junit.After;
import org.junit.Before;
diff --git a/src/test/java/io/anserini/collection/DocumentCollectionTest.java b/src/test/java/io/anserini/collection/DocumentCollectionTest.java
index 64b2faee63..ce06003621 100644
--- a/src/test/java/io/anserini/collection/DocumentCollectionTest.java
+++ b/src/test/java/io/anserini/collection/DocumentCollectionTest.java
@@ -16,7 +16,7 @@
package io.anserini.collection;
-import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.tests.util.LuceneTestCase;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
diff --git a/src/test/java/io/anserini/integration/EndToEndTest.java b/src/test/java/io/anserini/integration/EndToEndTest.java
index 3b509702e3..d0a55efe03 100644
--- a/src/test/java/io/anserini/integration/EndToEndTest.java
+++ b/src/test/java/io/anserini/integration/EndToEndTest.java
@@ -28,8 +28,8 @@
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
-import org.apache.lucene.util.LuceneTestCase;
-import org.apache.lucene.util.TestRuleLimitSysouts;
+import org.apache.lucene.tests.util.LuceneTestCase;
+import org.apache.lucene.tests.util.TestRuleLimitSysouts;
import org.apache.lucene.util.IOUtils;
import org.junit.After;
import org.junit.Before;
diff --git a/src/test/java/io/anserini/integration/solr/AclAnthologyEndToEndTest.java b/src/test/java/io/anserini/integration/solr/AclAnthologyEndToEndTest.java
deleted file mode 100644
index a46383b484..0000000000
--- a/src/test/java/io/anserini/integration/solr/AclAnthologyEndToEndTest.java
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- * Anserini: A Lucene toolkit for reproducible information retrieval research
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package io.anserini.integration.solr;
-
-import io.anserini.collection.AclAnthology;
-import io.anserini.index.IndexArgs;
-import io.anserini.index.generator.AclAnthologyGenerator;
-import io.anserini.search.SearchSolr;
-
-public class AclAnthologyEndToEndTest extends SolrEndToEndTest {
- @Override
- protected String getCollectionName() {
- return "AclAnthology";
- }
-
- @Override
- protected String getSchemaAdjustmentFile() {
- return "solr/schemas/acl-anthology.json";
- }
-
- @Override
- public IndexArgs getIndexArgs() {
- IndexArgs indexArgs = createDefaultIndexArgs();
- indexArgs.input = "src/test/resources/sample_docs/acl";
- indexArgs.collectionClass = AclAnthology.class.getSimpleName();
- indexArgs.generatorClass = AclAnthologyGenerator.class.getSimpleName();
- return indexArgs;
- }
-
- @Override
- protected SearchSolr.Args getSearchArgs() {
- return createSearchArgs("TsvInt", "src/test/resources/sample_topics/acl_topics.tsv");
- }
-
- @Override
- protected String[] getRefRankingResult() {
- return new String[]{ // bm25
- "1 Q0 C00-1007 1 0.294000 Solrini",
- "1 Q0 E17-1003 2 0.186100 Solrini",
- "2 Q0 C00-1003 1 0.622700 Solrini"
- };
- }
-}
diff --git a/src/test/java/io/anserini/integration/solr/CoreEndToEndTest.java b/src/test/java/io/anserini/integration/solr/CoreEndToEndTest.java
deleted file mode 100644
index 761e12e537..0000000000
--- a/src/test/java/io/anserini/integration/solr/CoreEndToEndTest.java
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Anserini: A Lucene toolkit for reproducible information retrieval research
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package io.anserini.integration.solr;
-
-import io.anserini.collection.CoreCollection;
-import io.anserini.index.IndexArgs;
-import io.anserini.index.generator.CoreGenerator;
-import io.anserini.search.SearchSolr;
-
-public class CoreEndToEndTest extends SolrEndToEndTest {
- @Override
- protected String getCollectionName() {
- return "Core";
- }
-
- @Override
- protected String getSchemaAdjustmentFile() {
- return "solr/schemas/core.json";
- }
-
- @Override
- protected IndexArgs getIndexArgs() {
- IndexArgs indexArgs = createDefaultIndexArgs();
- indexArgs.input = "src/test/resources/sample_docs/core";
- indexArgs.collectionClass = CoreCollection.class.getSimpleName();
- indexArgs.generatorClass = CoreGenerator.class.getSimpleName();
- return indexArgs;
- }
-
- @Override
- protected SearchSolr.Args getSearchArgs() {
- return createSearchArgs("TsvInt", "src/test/resources/sample_topics/core_topics.tsv");
- }
-
- @Override
- protected String[] getRefRankingResult() {
- return new String[]{ // bm25
- "1 Q0 coreDoc1 1 0.243200 Solrini",
- "1 Q0 doi2 2 0.243199 Solrini",
- "2 Q0 coreDoc1 1 0.243200 Solrini",
- "2 Q0 doi2 2 0.243199 Solrini",
- "3 Q0 fullCoreDoc 1 0.534600 Solrini"
- };
- }
-}
diff --git a/src/test/java/io/anserini/integration/solr/SolrEndToEndTest.java b/src/test/java/io/anserini/integration/solr/SolrEndToEndTest.java
deleted file mode 100644
index d2529d7c6d..0000000000
--- a/src/test/java/io/anserini/integration/solr/SolrEndToEndTest.java
+++ /dev/null
@@ -1,222 +0,0 @@
-/*
- * Anserini: A Lucene toolkit for reproducible information retrieval research
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package io.anserini.integration.solr;
-
-import io.anserini.index.IndexArgs;
-import io.anserini.index.IndexCollection;
-import io.anserini.search.SearchSolr;
-import org.apache.commons.io.FileUtils;
-import org.apache.commons.pool2.BasePooledObjectFactory;
-import org.apache.commons.pool2.ObjectPool;
-import org.apache.commons.pool2.PooledObject;
-import org.apache.commons.pool2.impl.DefaultPooledObject;
-import org.apache.commons.pool2.impl.GenericObjectPool;
-import org.apache.commons.pool2.impl.GenericObjectPoolConfig;
-import org.apache.logging.log4j.LogManager;
-import org.apache.logging.log4j.Logger;
-import org.apache.lucene.util.LuceneTestCase;
-import org.apache.solr.client.solrj.SolrClient;
-import org.apache.solr.client.solrj.embedded.EmbeddedSolrServer;
-import org.apache.solr.client.solrj.request.CoreAdminRequest;
-import org.apache.solr.client.solrj.request.json.DirectJsonQueryRequest;
-import org.apache.solr.client.solrj.response.QueryResponse;
-import org.apache.solr.common.params.CommonParams;
-import org.apache.solr.common.params.ModifiableSolrParams;
-import org.apache.solr.core.NodeConfig;
-import org.apache.solr.core.SolrResourceLoader;
-import org.junit.After;
-import org.junit.Before;
-import org.junit.Test;
-
-import java.io.BufferedReader;
-import java.io.File;
-import java.io.FileReader;
-import java.lang.reflect.Field;
-import java.net.URL;
-import java.nio.file.Files;
-
-
-@LuceneTestCase.SuppressSysoutChecks(bugUrl = "None")
-public abstract class SolrEndToEndTest extends LuceneTestCase {
- private static final Logger LOG = LogManager.getLogger(SolrEndToEndTest.class);
-
- protected ObjectPool<SolrClient> stubSolrPool;
- protected final String searchOutputPrefix = "e2eTestSearch";
-
- protected EmbeddedSolrServer client;
-
- protected static File getFile(String path) {
- final URL url = SolrEndToEndTest.class.getClassLoader().getResource(path);
- if (url != null) {
- try {
- return new File(url.toURI());
- } catch (Exception e) {
- throw new RuntimeException("Resource was found on classpath, but cannot be resolved to a normal file: " + path);
- }
- }
- final File file = new File(path);
- if (file.exists()) {
- return file;
- }
- throw new RuntimeException("Cannot find resource in classpath or in file-system (relative to CWD): " + path);
- }
-
- @Before
- @Override
- public void setUp() throws Exception {
- super.setUp();
-
- final File solrHome = createTempDir().toFile();
- final File configSetBaseDir = new File(solrHome.toPath() + File.separator + "configsets");
- FileUtils.copyDirectory(getFile("solr/anserini"), new File(configSetBaseDir + File.separator + "anserini"));
-
- SolrResourceLoader loader = new SolrResourceLoader(solrHome.toPath());
- NodeConfig config = new NodeConfig.NodeConfigBuilder("embeddedSolrServerNode", loader.getInstancePath())
- .setConfigSetBaseDirectory(configSetBaseDir.getAbsolutePath()).build();
- client = new EmbeddedSolrServer(config, getCollectionName());
- LOG.info("Created Embedded Solr Server");
-
- CoreAdminRequest.Create createRequest = new CoreAdminRequest.Create();
- createRequest.setCoreName(getCollectionName());
- createRequest.setConfigSet("anserini");
- createRequest.process(client);
- client.commit();
- LOG.info("Created Solr Core: " + getCollectionName());
-
- GenericObjectPoolConfig<SolrClient> poolConfig = new GenericObjectPoolConfig<>();
- poolConfig.setMaxTotal(1); // only 1 EmbeddedSolrServer instance will be created by getSolrClient
- poolConfig.setMinIdle(1);
- stubSolrPool = new GenericObjectPool<>(new StubSolrClientFactory(client), poolConfig);
- }
-
- @After
- @Override
- public void tearDown() throws Exception {
- super.tearDown();
-
- client.deleteByQuery("*:*");
- client.commit();
- client.close();
- stubSolrPool.close();
- }
-
- protected IndexArgs createDefaultIndexArgs() {
- IndexArgs args = new IndexArgs();
-
- args.solrIndex = getCollectionName();
- args.threads = 1;
- args.storePositions = true;
- args.storeDocvectors = true;
- args.storeContents = true;
- args.storeRaw = true;
- args.optimize = true;
- args.quiet = true;
- args.solr = true;
-
- return args;
- }
-
- protected SearchSolr.Args createSearchArgs(String topicReader, String topicFile) {
- SearchSolr.Args args = new SearchSolr.Args();
-
- args.solrIndex = getCollectionName();
- args.output = searchOutputPrefix + topicReader;
- args.topicReader = topicReader;
- args.topics = new String[]{topicFile};
- args.zkUrl = "localhost"; // SearchSolr initialization workaround
-
- return args;
- }
-
- protected static class StubSolrClientFactory extends BasePooledObjectFactory<SolrClient> {
- final SolrClient client;
-
- public StubSolrClientFactory(SolrClient client) {
- this.client = client;
- }
-
- @Override
- public SolrClient create() {
- return this.client;
- }
-
- @Override
- public PooledObject<SolrClient> wrap(SolrClient solrClient) {
- return new DefaultPooledObject<>(solrClient);
- }
- }
-
- protected IndexCollection getIndexRunner(IndexArgs args) throws Exception {
- IndexCollection runner = new IndexCollection(args);
- Field f = runner.getClass().getDeclaredField("solrPool");
- f.setAccessible(true);
- f.set(runner, stubSolrPool);
- return runner;
- }
-
- protected SearchSolr getSearchRunner(SearchSolr.Args args) throws Exception {
- SearchSolr runner = new SearchSolr(args);
- Field f = runner.getClass().getDeclaredField("client");
- f.setAccessible(true);
- ((SolrClient) f.get(runner)).close(); // close the old client
- f.set(runner, client);
- return runner;
- }
-
- protected abstract String getCollectionName();
-
- protected abstract String getSchemaAdjustmentFile();
-
- protected abstract IndexArgs getIndexArgs();
-
- protected abstract SearchSolr.Args getSearchArgs();
-
- protected abstract String[] getRefRankingResult();
-
- @Test
- public void testIndexAndSearch() throws Exception {
- String schemaAdjustmentFile = getSchemaAdjustmentFile();
- if (schemaAdjustmentFile != null) {
- // update schema, much like curl -X POST -H 'Content-type:application/json' --data-binary SCHEMA_NAME.json http://localhost:8983/solr/COLLECTION_NAME/schema
- String schemaJson = Files.readString(getFile(schemaAdjustmentFile).toPath());
- ModifiableSolrParams params = new ModifiableSolrParams();
- params.add(CommonParams.QT, "/schema");
- DirectJsonQueryRequest schemaRequest = new DirectJsonQueryRequest(schemaJson, params);
- QueryResponse response = schemaRequest.process(client, getCollectionName());
- assertEquals(0, response.getStatus());
- }
-
- IndexArgs indexArgs = getIndexArgs();
- IndexCollection indexRunner = getIndexRunner(indexArgs);
- indexRunner.run();
-
- SearchSolr.Args searchArgs = getSearchArgs();
- SearchSolr searchRunner = getSearchRunner(searchArgs);
- searchRunner.runTopics();
-
- BufferedReader br = new BufferedReader(new FileReader(searchArgs.output));
- String[] ref = getRefRankingResult();
- String s;
- int cnt = 0;
- while ((s = br.readLine()) != null) {
- assertEquals(ref[cnt], s);
- cnt++;
- }
- assertEquals(cnt, ref.length);
- FileUtils.deleteQuietly(new File(searchArgs.output));
- }
-}
diff --git a/src/test/java/io/anserini/integration/solr/TrecEndToEndTest.java b/src/test/java/io/anserini/integration/solr/TrecEndToEndTest.java
deleted file mode 100644
index f9d95a9d29..0000000000
--- a/src/test/java/io/anserini/integration/solr/TrecEndToEndTest.java
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * Anserini: A Lucene toolkit for reproducible information retrieval research
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package io.anserini.integration.solr;
-
-import io.anserini.collection.TrecCollection;
-import io.anserini.index.IndexArgs;
-import io.anserini.search.SearchSolr;
-
-public class TrecEndToEndTest extends SolrEndToEndTest {
- @Override
- protected String getCollectionName() {
- return "Trec";
- }
-
- @Override
- protected String getSchemaAdjustmentFile() {
- return null; // no need to adjust schema
- }
-
- @Override
- protected IndexArgs getIndexArgs() {
- IndexArgs indexArgs = createDefaultIndexArgs();
- indexArgs.input = "src/test/resources/sample_docs/trec/collection2";
- indexArgs.collectionClass = TrecCollection.class.getSimpleName();
- return indexArgs;
- }
-
- @Override
- protected SearchSolr.Args getSearchArgs() {
- return createSearchArgs("Trec", "src/test/resources/sample_topics/Trec");
- }
-
- @Override
- protected String[] getRefRankingResult() {
- return new String[]{ // bm25
- "1 Q0 DOC222 1 0.343200 Solrini",
- "1 Q0 TREC_DOC_1 2 0.333400 Solrini",
- "1 Q0 WSJ_1 3 0.068700 Solrini"
- };
- }
-}
diff --git a/src/test/java/io/anserini/ltr/BaseFeatureExtractorTest.java b/src/test/java/io/anserini/ltr/BaseFeatureExtractorTest.java
index b93cd5b42b..bfbe194a68 100644
--- a/src/test/java/io/anserini/ltr/BaseFeatureExtractorTest.java
+++ b/src/test/java/io/anserini/ltr/BaseFeatureExtractorTest.java
@@ -31,7 +31,7 @@
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
-import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.tests.util.LuceneTestCase;
import org.junit.After;
import org.junit.Before;
diff --git a/src/test/java/io/anserini/search/query/DisjunctionMaxQueryGeneratorTest.java b/src/test/java/io/anserini/search/query/DisjunctionMaxQueryGeneratorTest.java
index 6b82cdcc29..00ce6a20a0 100644
--- a/src/test/java/io/anserini/search/query/DisjunctionMaxQueryGeneratorTest.java
+++ b/src/test/java/io/anserini/search/query/DisjunctionMaxQueryGeneratorTest.java
@@ -21,7 +21,7 @@
import org.apache.lucene.search.BoostQuery;
import org.apache.lucene.search.DisjunctionMaxQuery;
import org.apache.lucene.search.Query;
-import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.tests.util.LuceneTestCase;
import org.junit.Test;
import java.util.Map;
diff --git a/src/test/java/io/anserini/search/query/SdmQueryTest.java b/src/test/java/io/anserini/search/query/SdmQueryTest.java
index 89663ffdca..c032882834 100644
--- a/src/test/java/io/anserini/search/query/SdmQueryTest.java
+++ b/src/test/java/io/anserini/search/query/SdmQueryTest.java
@@ -36,7 +36,7 @@
import org.apache.lucene.search.similarities.BM25Similarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
-import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.tests.util.LuceneTestCase;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
diff --git a/src/test/java/io/anserini/util/FeatureVectorTest.java b/src/test/java/io/anserini/util/FeatureVectorTest.java
index c747194004..12694bd90b 100644
--- a/src/test/java/io/anserini/util/FeatureVectorTest.java
+++ b/src/test/java/io/anserini/util/FeatureVectorTest.java
@@ -16,7 +16,7 @@
package io.anserini.util;
-import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.tests.util.LuceneTestCase;
import org.junit.Test;
import java.util.Arrays;