diff --git a/pom.xml b/pom.xml index 368f29e610..ecb15d4b84 100644 --- a/pom.xml +++ b/pom.xml @@ -2,7 +2,7 @@ 4.0.0 io.anserini anserini - 0.14.5-SNAPSHOT + 0.15.0-SNAPSHOT Anserini An information retrieval toolkit built on Lucene http://anserini.io/ @@ -26,8 +26,7 @@ - 9.0.0 - 9.0.0 + 9.3.0 UTF-8 @@ -110,14 +109,6 @@ io.anserini.search.SearchCollection SearchCollection - - io.anserini.search.SearchSolr - SearchSolr - - - io.anserini.search.SearchElastic - SearchElastic - io.anserini.search.SearchMsmarco SearchMsmarco @@ -296,11 +287,31 @@ lucene-core ${lucene.version} + + org.apache.lucene + lucene-codecs + ${lucene.version} + + + org.apache.lucene + lucene-backward-codecs + ${lucene.version} + org.apache.lucene lucene-queries ${lucene.version} + + org.apache.lucene + lucene-queryparser + ${lucene.version} + + + org.apache.lucene + lucene-analysis-common + ${lucene.version} + org.apache.lucene lucene-analysis-kuromoji @@ -323,71 +334,6 @@ 4.13.2 test - - org.apache.solr - solr-solrj - ${solr.version} - - - org.apache.lucene - lucene-core - - - org.apache.lucene - lucene-analysis-common - - - org.apache.lucene - lucene-queries - - - org.slf4j - slf4j-api - - - org.apache.logging.log4j - log4j-slf4j-impl - - - - - org.apache.solr - solr-test-framework - ${solr.version} - test - - - org.apache.lucene - lucene-core - - - org.apache.lucene - lucene-analysis-common - - - org.apache.lucene - lucene-queries - - - org.slf4j - slf4j-api - - - org.apache.logging.log4j - log4j-slf4j-impl - - - - - org.apache.lucene - lucene-codecs - ${lucene.version} - - - org.elasticsearch.client - elasticsearch-rest-high-level-client - 7.0.0 - org.tukaani xz @@ -491,6 +437,11 @@ commons-csv 1.8 + + org.apache.commons + commons-text + 1.9 + org.mockito mockito-all diff --git a/src/main/java/io/anserini/analysis/TweetLowerCaseEntityPreservingFilterFactory.java b/src/main/java/io/anserini/analysis/TweetLowerCaseEntityPreservingFilterFactory.java index 14a63c6790..948911672f 100644 --- a/src/main/java/io/anserini/analysis/TweetLowerCaseEntityPreservingFilterFactory.java +++ b/src/main/java/io/anserini/analysis/TweetLowerCaseEntityPreservingFilterFactory.java @@ -17,7 +17,7 @@ package io.anserini.analysis; import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.util.TokenFilterFactory; +import org.apache.lucene.analysis.TokenFilterFactory; import java.util.Map; diff --git a/src/main/java/io/anserini/index/IndexArgs.java b/src/main/java/io/anserini/index/IndexArgs.java index 6997f95886..70957978da 100644 --- a/src/main/java/io/anserini/index/IndexArgs.java +++ b/src/main/java/io/anserini/index/IndexArgs.java @@ -69,8 +69,7 @@ public class IndexArgs { // optional arguments - @Option(name = "-index", metaVar = "[path]", forbids = {"-solr", "-es"}, - usage = "Index path.") + @Option(name = "-index", metaVar = "[path]", usage = "Index path.") public String index; @Option(name = "-fields", handler = StringArrayOptionHandler.class, @@ -160,82 +159,6 @@ public class IndexArgs { usage = "File that contains deleted tweet ids (longs), one per line; these tweets will be skipped during indexing.") public String tweetDeletedIdsFile = ""; - // Solr options - - @Option(name = "-solr", forbids = {"-index", "-es"}, - usage = "Indexes into Solr.") - public boolean solr = false; - - @Option(name = "-solr.batch", metaVar = "[n]", - usage = "Solr indexing batch size.") - public int solrBatch = 1000; - - @Option(name = "-solr.commitWithin", metaVar = "[s]", - usage = "Solr commitWithin setting (in seconds).") - 
public int solrCommitWithin = 60; - - @Option(name = "-solr.index", metaVar = "[name]", - usage = "Solr index name.") - public String solrIndex = null; - - @Option(name = "-solr.zkUrl", metaVar = "[urls]", - usage = "Solr ZooKeeper URLs (comma separated list).") - public String zkUrl = null; - - @Option(name = "-solr.zkChroot", metaVar = "[path]", - usage = "Solr ZooKeeper chroot") - public String zkChroot = "/"; - - @Option(name = "-solr.poolSize", metaVar = "[n]", - usage = "Solr client pool size.") - public int solrPoolSize = 16; - - // Elasticsearch options - - @Option(name = "-es", forbids = {"-index", "-solr"}, - usage = "Indexes into Elasticsearch.") - public boolean es = false; - - @Option(name = "-es.index", metaVar = "[name]", - usage = "Elasticsearch index name.") - public String esIndex = null; - - @Option(name = "-es.batch", metaVar = "[n]", - usage = "Elasticsearch batch index requests size.") - public int esBatch = 1000; - - @Option(name = "-es.bulk", metaVar = "[n]", - usage = "Elasticsearch max bulk requests size in bytes.") - public int esBulk = 80000000; - - @Option(name = "-es.hostname", metaVar = "[host]", - usage = "Elasticsearch host.") - public String esHostname = "localhost"; - - @Option(name = "-es.port", metaVar = "[port]", - usage = "Elasticsearch port number.") - public int esPort = 9200; - - @Option(name = "-es.user", metaVar = "[username]", - usage = "Elasticsearch user name.") - public String esUser = "elastic"; - - @Option(name = "-es.password", metaVar = "[password]", - usage = "Elasticsearch password.") - public String esPassword = "changeme"; - - @Option(name = "-es.poolSize", metaVar = "[num]", - usage = "Elasticsearch client pool size.") - public int esPoolSize = 10; - - @Option(name = "-es.connectTimeout", metaVar = "[ms]", - usage = "Elasticsearch (low level) REST client connect timeout (in ms).") - public int esConnectTimeout = TIMEOUT; - - @Option(name = "-es.socketTimeout", metaVar = "[ms]", - usage = "Elasticsearch (low level) REST client socket timeout (in ms).") - public int esSocketTimeout = TIMEOUT; - // Sharding options @Option(name = "-shard.count", metaVar = "[n]", diff --git a/src/main/java/io/anserini/index/IndexCollection.java b/src/main/java/io/anserini/index/IndexCollection.java index e768f587fe..b83403a3ae 100644 --- a/src/main/java/io/anserini/index/IndexCollection.java +++ b/src/main/java/io/anserini/index/IndexCollection.java @@ -16,8 +16,6 @@ package io.anserini.index; -import com.google.common.base.Splitter; -import com.google.common.collect.Lists; import io.anserini.analysis.DefaultEnglishAnalyzer; import io.anserini.analysis.TweetAnalyzer; import io.anserini.collection.DocumentCollection; @@ -27,22 +25,10 @@ import io.anserini.index.generator.InvalidDocumentException; import io.anserini.index.generator.LuceneDocumentGenerator; import io.anserini.index.generator.SkippedDocumentException; -import io.anserini.index.generator.WashingtonPostGenerator; import io.anserini.search.similarity.AccurateBM25Similarity; import io.anserini.search.similarity.ImpactSimilarity; import org.apache.commons.io.FileUtils; import org.apache.commons.lang3.time.DurationFormatUtils; -import org.apache.commons.pool2.BasePooledObjectFactory; -import org.apache.commons.pool2.ObjectPool; -import org.apache.commons.pool2.PooledObject; -import org.apache.commons.pool2.impl.DefaultPooledObject; -import org.apache.commons.pool2.impl.GenericObjectPool; -import org.apache.commons.pool2.impl.GenericObjectPoolConfig; -import org.apache.http.HttpHost; -import 
org.apache.http.auth.AuthScope; -import org.apache.http.auth.UsernamePasswordCredentials; -import org.apache.http.client.CredentialsProvider; -import org.apache.http.impl.client.BasicCredentialsProvider; import org.apache.logging.log4j.Level; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; @@ -72,29 +58,14 @@ import org.apache.lucene.analysis.th.ThaiAnalyzer; import org.apache.lucene.analysis.tr.TurkishAnalyzer; import org.apache.lucene.analysis.uk.UkrainianMorfologikAnalyzer; - import org.apache.lucene.document.Document; import org.apache.lucene.index.ConcurrentMergeScheduler; -import org.apache.lucene.index.DocValuesType; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; -import org.apache.lucene.index.IndexableField; import org.apache.lucene.index.Term; import org.apache.lucene.search.similarities.BM25Similarity; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; -import org.apache.solr.client.solrj.SolrClient; -import org.apache.solr.client.solrj.impl.CloudSolrClient; -import org.apache.solr.client.solrj.impl.Http2SolrClient; -import org.apache.solr.common.SolrInputDocument; -import org.elasticsearch.action.DocWriteRequest; -import org.elasticsearch.action.bulk.BulkRequest; -import org.elasticsearch.action.index.IndexRequest; -import org.elasticsearch.client.RequestOptions; -import org.elasticsearch.client.RestClient; -import org.elasticsearch.client.RestHighLevelClient; -import org.elasticsearch.common.xcontent.XContentBuilder; -import org.elasticsearch.common.xcontent.XContentFactory; import org.kohsuke.args4j.CmdLineException; import org.kohsuke.args4j.CmdLineParser; import org.kohsuke.args4j.OptionHandlerFilter; @@ -105,32 +76,21 @@ import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; -import java.util.ArrayList; import java.util.Arrays; -import java.util.Collections; import java.util.HashSet; import java.util.List; -import java.util.Optional; import java.util.Set; import java.util.concurrent.Executors; import java.util.concurrent.ThreadPoolExecutor; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicLong; -import java.util.stream.Collectors; -import java.util.stream.Stream; public final class IndexCollection { private static final Logger LOG = LogManager.getLogger(IndexCollection.class); - private static final int TIMEOUT = 600 * 1000; // This is the default analyzer used, unless another stemming algorithm or language is specified. public static final Analyzer DEFAULT_ANALYZER = DefaultEnglishAnalyzer.newDefaultInstance(); - // When duplicates of these fields are attempted to be indexed in Solr, they are ignored. This allows some fields to be multi-valued, but not others. - // Stored vs. indexed vs. doc values vs. multi-valued vs. ... are controlled via config, rather than code, in Solr. - private static final List IGNORED_DUPLICATE_FIELDS = - Lists.newArrayList(WashingtonPostGenerator.WashingtonPostField.PUBLISHED_DATE.name); - public final class Counters { /** * Counter for successfully indexed documents. 
@@ -262,361 +222,6 @@ public void run() { } } - private final class SolrIndexerThread implements Runnable { - private final Path input; - private final DocumentCollection collection; - private final List buffer = new ArrayList<>(args.solrBatch); - private FileSegment fileSegment; - - private SolrIndexerThread(DocumentCollection collection, Path input) { - this.input = input; - this.collection = collection; - } - - @Override - @SuppressWarnings("unchecked") - public void run() { - try { - LuceneDocumentGenerator generator = (LuceneDocumentGenerator) - generatorClass.getDeclaredConstructor(IndexArgs.class).newInstance(args); - - // We keep track of two separate counts: the total count of documents in this file segment (cnt), - // and the number of documents in this current "batch" (batch). We update the global counter every - // 10k documents: this is so that we get intermediate updates, which is informative if a collection - // has only one file segment; see https://github.com/castorini/anserini/issues/683 - int cnt = 0; - int batch = 0; - - @SuppressWarnings("unchecked") - FileSegment segment = (FileSegment) collection.createFileSegment(input); - // in order to call close() and clean up resources in case of exception - this.fileSegment = segment; - - for (SourceDocument sourceDocument : segment) { - if (!sourceDocument.indexable()) { - counters.unindexable.incrementAndGet(); - continue; - } - - Document document; - try { - document = generator.createDocument(sourceDocument); - } catch (EmptyDocumentException e1) { - counters.empty.incrementAndGet(); - continue; - } catch (SkippedDocumentException e2) { - counters.skipped.incrementAndGet(); - continue; - } catch (InvalidDocumentException e3) { - counters.errors.incrementAndGet(); - continue; - } - - if (whitelistDocids != null && !whitelistDocids.contains(sourceDocument.id())) { - counters.skipped.incrementAndGet(); - continue; - } - - SolrInputDocument solrDocument = new SolrInputDocument(); - - // Copy all Lucene Document fields to Solr document - for (IndexableField field : document.getFields()) { - // Skip docValues fields - this is done via Solr config. - if (field.fieldType().docValuesType() != DocValuesType.NONE) { - continue; - } - // If the field is already in the doc, skip it. - // This fixes an issue with WaPo where published_date is in the Lucene doc as LongPoint and StoredField. Solr needs one copy, more fine-grained control in config. - if (solrDocument.containsKey(field.name()) && IGNORED_DUPLICATE_FIELDS.contains(field.name())) { - continue; - } - if (field.numericValue() != null) { - solrDocument.addField(field.name(), field.numericValue()); - } else if (field.stringValue() != null) { // For some reason, id is multi-valued with null as one of the values - solrDocument.addField(field.name(), field.stringValue()); - } - } - - buffer.add(solrDocument); - if (buffer.size() == args.solrBatch) { - flush(); - } - - cnt++; - batch++; - - // And the counts from this batch, reset batch counter. - if (batch % 10000 == 0) { - counters.indexed.addAndGet(batch); - batch = 0; - } - } - - // If we have docs in the buffer, flush them. - if (!buffer.isEmpty()) { - flush(); - } - - // Add the remaining documents. - counters.indexed.addAndGet(batch); - - int skipped = segment.getSkippedCount(); - if (skipped > 0) { - // When indexing tweets, this is normal, because there are delete messages that are skipped over. 
- counters.skipped.addAndGet(skipped); - LOG.warn(input.getParent().getFileName().toString() + File.separator + - input.getFileName().toString() + ": " + skipped + " docs skipped."); - } - - if (segment.getErrorStatus()) { - counters.errors.incrementAndGet(); - LOG.error(input.getParent().getFileName().toString() + File.separator + - input.getFileName().toString() + ": error iterating through segment."); - } - - // Log at the debug level because this can be quite noisy if there are lots of file segments. - LOG.debug(input.getParent().getFileName().toString() + File.separator + - input.getFileName().toString() + ": " + cnt + " docs added."); - } catch (Exception e) { - LOG.error(Thread.currentThread().getName() + ": Unexpected Exception:", e); - } finally { - if (fileSegment != null) { - fileSegment.close(); - } - } - } - - private void flush() { - if (!buffer.isEmpty()) { - SolrClient solrClient = null; - try { - solrClient = solrPool.borrowObject(); - solrClient.add(args.solrIndex, buffer, args.solrCommitWithin * 1000); - buffer.clear(); - } catch (Exception e) { - LOG.error("Error flushing documents to Solr", e); - } finally { - if (solrClient != null) { - try { - solrPool.returnObject(solrClient); - } catch (Exception e) { - LOG.error("Error returning SolrClient to pool", e); - } - } - } - } - } - } - - private class SolrClientFactory extends BasePooledObjectFactory { - @Override - public SolrClient create() { - return new CloudSolrClient.Builder(Splitter.on(',').splitToList(args.zkUrl), Optional.of(args.zkChroot)) - .withInternalClientBuilder(new Http2SolrClient.Builder().connectionTimeout(TIMEOUT)) - .build(); - } - - @Override - public PooledObject wrap(SolrClient solrClient) { - return new DefaultPooledObject<>(solrClient); - } - - @Override - public void destroyObject(PooledObject pooled) throws Exception { - pooled.getObject().close(); - } - } - - private final class ESIndexerThread implements Runnable { - private final Path input; - private final DocumentCollection collection; - private BulkRequest bulkRequest; - private FileSegment fileSegment; - - private ESIndexerThread(DocumentCollection collection, Path input) { - this.input = input; - this.collection = collection; - this.bulkRequest = new BulkRequest(); - } - - @Override - @SuppressWarnings("unchecked") - public void run() { - try { - LuceneDocumentGenerator generator = (LuceneDocumentGenerator) - generatorClass.getDeclaredConstructor(IndexArgs.class).newInstance(args); - - // We keep track of two separate counts: the total count of documents in this file segment (cnt), - // and the number of documents in this current "batch" (batch). 
We update the global counter every - // 10k documents: this is so that we get intermediate updates, which is informative if a collection - // has only one file segment; see https://github.com/castorini/anserini/issues/683 - int cnt = 0; - int batch = 0; - - FileSegment segment = collection.createFileSegment(input); - // in order to call close() and clean up resources in case of exception - this.fileSegment = segment; - - for (SourceDocument sourceDocument : segment) { - if (!sourceDocument.indexable()) { - counters.unindexable.incrementAndGet(); - continue; - } - - Document document; - try { - document = generator.createDocument(sourceDocument); - } catch (EmptyDocumentException e1) { - counters.empty.incrementAndGet(); - continue; - } catch (SkippedDocumentException e2) { - counters.skipped.incrementAndGet(); - continue; - } catch (InvalidDocumentException e3) { - counters.errors.incrementAndGet(); - continue; - } - - if (whitelistDocids != null && !whitelistDocids.contains(sourceDocument.id())) { - counters.skipped.incrementAndGet(); - continue; - } - - // Get distinct field names - List fields = document.getFields().stream().map(field -> field.name()).distinct().collect(Collectors.toList()); - - XContentBuilder builder = XContentFactory.jsonBuilder().startObject(); - - for (String field : fields) { - - // Skip docValues fields - if (document.getField(field).fieldType().docValuesType() != DocValuesType.NONE) continue; - - // Get field objects for current field name (could be multiple, such as WaPo's fullCaption) - IndexableField[] indexableFields = document.getFields(field); - - if (field.equalsIgnoreCase("id") || indexableFields.length == 1) { - // Single value fields or "id" field - Object value = document.getField(field).stringValue() != null ? document.getField(field).stringValue() : document.getField(field).numericValue(); - builder.field(field, value); - } else { - // Multi-valued fields - Object[] values = Stream.of(indexableFields).map(f -> f.stringValue()).toArray(); - builder.array(field, values); - } - } - - builder.endObject(); - - String indexName = (args.esIndex != null) ? args.esIndex : input.getFileName().toString(); - bulkRequest.add(new IndexRequest(indexName).id(sourceDocument.id()).source(builder)); - - // sendBulkRequest when the batch size is reached OR the bulk size is reached - if (bulkRequest.numberOfActions() == args.esBatch || - bulkRequest.estimatedSizeInBytes() >= args.esBulk) { - sendBulkRequest(); - } - - cnt++; - batch++; - - // And the counts from this batch, reset batch counter. - if (batch % 10000 == 0) { - counters.indexed.addAndGet(batch); - batch = 0; - } - } - - if (bulkRequest.numberOfActions() != 0) { - sendBulkRequest(); - } - - // Add the remaining documents. - counters.indexed.addAndGet(batch); - - int skipped = segment.getSkippedCount(); - if (skipped > 0) { - // When indexing tweets, this is normal, because there are delete messages that are skipped over. - counters.skipped.addAndGet(skipped); - LOG.warn(input.getParent().getFileName().toString() + File.separator + - input.getFileName().toString() + ": " + skipped + " docs skipped."); - } - - if (segment.getErrorStatus()) { - counters.errors.incrementAndGet(); - LOG.error(input.getParent().getFileName().toString() + File.separator + - input.getFileName().toString() + ": error iterating through segment."); - } - - // Log at the debug level because this can be quite noisy if there are lots of file segments. 
- LOG.debug(input.getParent().getFileName().toString() + File.separator + - input.getFileName().toString() + ": " + cnt + " docs added."); - } catch (Exception e) { - LOG.error(Thread.currentThread().getName() + ": Unexpected Exception:", e); - } finally { - if (fileSegment != null){ - fileSegment.close(); - } - } - } - - private void sendBulkRequest() { - if (bulkRequest.numberOfActions() == 0) { - return; - } - - RestHighLevelClient esClient = null; - try { - esClient = esPool.borrowObject(); - esClient.bulk(bulkRequest, RequestOptions.DEFAULT); - bulkRequest = new BulkRequest(); - } catch (Exception e) { - LOG.error("Error sending bulk requests to Elasticsearch", e); - - // Log the 10 docs that have the largest sizes in this request - List> docs = bulkRequest.requests(); - Collections.sort(docs, (d1, d2) -> ((IndexRequest) d2).source().length() - ((IndexRequest) d1).source().length()); - - LOG.info("Error sending bulkRequest. The 10 largest docs in this request are the following cord_uid: "); - for (int i = 0; i < 10; i++) { - IndexRequest doc = (IndexRequest) docs.get(i); - LOG.info(doc.id()); - } - } finally { - if (esClient != null) { - try { - esPool.returnObject(esClient); - } catch (Exception e) { - LOG.error("Error returning ES client to pool", e); - } - } - } - } - } - - private class ESClientFactory extends BasePooledObjectFactory { - @Override - public RestHighLevelClient create() { - final CredentialsProvider credentialsProvider = new BasicCredentialsProvider(); - credentialsProvider.setCredentials(AuthScope.ANY, new UsernamePasswordCredentials(args.esUser, args.esPassword)); - return new RestHighLevelClient( - RestClient.builder(new HttpHost(args.esHostname, args.esPort, "http")) - .setHttpClientConfigCallback(builder -> builder.setDefaultCredentialsProvider(credentialsProvider)) - .setRequestConfigCallback(builder -> builder.setConnectTimeout(args.esConnectTimeout).setSocketTimeout(args.esSocketTimeout)) - ); - } - - @Override - public PooledObject wrap(RestHighLevelClient esClient) { - return new DefaultPooledObject<>(esClient); - } - - @Override - public void destroyObject(PooledObject pooled) throws Exception { - pooled.getObject().close(); - } - } - private final IndexArgs args; private final Path collectionPath; private final Set whitelistDocids; @@ -625,10 +230,6 @@ public void destroyObject(PooledObject pooled) throws Excep private final DocumentCollection collection; private final Counters counters; private Path indexPath; - private ObjectPool solrPool; - private ObjectPool esPool; - - @SuppressWarnings("unchecked") public IndexCollection(IndexArgs args) throws Exception { @@ -665,32 +266,7 @@ public IndexCollection(IndexArgs args) throws Exception { LOG.info("Optimize (merge segments)? 
" + args.optimize); LOG.info("Whitelist: " + args.whitelist); LOG.info("Pretokenized?: " + args.pretokenized); - - if (args.solr) { - LOG.info("Indexing into Solr..."); - LOG.info("Solr batch size: " + args.solrBatch); - LOG.info("Solr commitWithin: " + args.solrCommitWithin); - LOG.info("Solr index: " + args.solrIndex); - LOG.info("Solr ZooKeeper URL: " + args.zkUrl); - LOG.info("SolrClient pool size: " + args.solrPoolSize); - } else if (args.es) { - LOG.info("Indexing into Elasticsearch..."); - LOG.info("Elasticsearch batch size: " + args.esBatch); - LOG.info("Elasticsearch index: " + args.esIndex); - LOG.info("Elasticsearch hostname: " + args.esHostname); - LOG.info("Elasticsearch host port: " + args.esPort); - LOG.info("Elasticsearch client connect timeout (in ms): " + args.esConnectTimeout); - LOG.info("Elasticsearch client socket timeout (in ms): " + args.esSocketTimeout); - LOG.info("Elasticsearch pool size: " + args.esPoolSize); - LOG.info("Elasticsearch user: " + args.esUser); - } else { - LOG.info("Directly building Lucene indexes..."); - LOG.info("Index path: " + args.index); - } - - if (args.index == null && !args.solr && !args.es) { - throw new IllegalArgumentException("Must specify one of -index, -solr, or -es"); - } + LOG.info("Index path: " + args.index); if (args.index != null) { this.indexPath = Paths.get(args.index); @@ -723,18 +299,6 @@ public IndexCollection(IndexArgs args) throws Exception { this.whitelistDocids = null; } - if (args.solr) { - GenericObjectPoolConfig config = new GenericObjectPoolConfig<>(); - config.setMaxTotal(args.solrPoolSize); - config.setMinIdle(args.solrPoolSize); // To guard against premature discarding of solrClients - this.solrPool = new GenericObjectPool<>(new SolrClientFactory(), config); - } else if (args.es) { - GenericObjectPoolConfig config = new GenericObjectPoolConfig<>(); - config.setMaxTotal(args.esPoolSize); - config.setMinIdle(args.esPoolSize); - this.esPool = new GenericObjectPool<>(new ESClientFactory(), config); - } - this.counters = new Counters(); } @@ -865,13 +429,7 @@ public Counters run() throws IOException { LOG.info("Starting to index..."); for (int i = 0; i < segmentCnt; i++) { - if (args.solr) { - executor.execute(new SolrIndexerThread(collection, (Path) segmentPaths.get(i))); - } else if (args.es) { - executor.execute(new ESIndexerThread(collection, (Path) segmentPaths.get(i))); - } else { - executor.execute(new LocalIndexerThread(writer, collection, (Path) segmentPaths.get(i))); - } + executor.execute(new LocalIndexerThread(writer, collection, (Path) segmentPaths.get(i))); } executor.shutdown(); @@ -898,31 +456,9 @@ public Counters run() throws IOException { " is not equal to completedTaskCount = " + executor.getCompletedTaskCount()); } - long numIndexed; - - if (args.solr || args.es) { - numIndexed = counters.indexed.get(); - } else { - numIndexed = writer.getDocStats().maxDoc; - } + long numIndexed = writer.getDocStats().maxDoc; // Do a final commit - if (args.solr) { - try { - SolrClient client = solrPool.borrowObject(); - client.commit(args.solrIndex); - // Needed for orderly shutdown so the SolrClient executor does not delay main thread exit - solrPool.returnObject(client); - solrPool.close(); - } catch (Exception e) { - LOG.error("Exception during final Solr commit: ", e); - } - } - - if (args.es) { - esPool.close(); - } - try { if (writer != null) { writer.commit(); diff --git a/src/main/java/io/anserini/rerank/ScoredDocuments.java b/src/main/java/io/anserini/rerank/ScoredDocuments.java index 
c215927a31..e4eb692873 100644 --- a/src/main/java/io/anserini/rerank/ScoredDocuments.java +++ b/src/main/java/io/anserini/rerank/ScoredDocuments.java @@ -17,30 +17,21 @@ package io.anserini.rerank; import io.anserini.index.IndexArgs; -import org.apache.lucene.document.BinaryDocValuesField; +import org.apache.commons.lang3.ArrayUtils; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.apache.lucene.document.Document; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; -import org.apache.lucene.document.Document; -import org.apache.lucene.document.Field; -import org.apache.lucene.document.StringField; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TopDocs; -import org.apache.lucene.util.BytesRef; -import org.apache.solr.common.SolrDocument; -import org.apache.solr.common.SolrDocumentList; -import org.apache.commons.lang3.ArrayUtils; -import org.apache.logging.log4j.LogManager; -import org.apache.logging.log4j.Logger; -import org.elasticsearch.search.SearchHit; -import org.elasticsearch.search.SearchHits; - -import java.util.List; +import java.io.IOException; import java.util.ArrayList; +import java.util.List; import java.util.Map; -import java.io.IOException; /** * ScoredDocuments object that converts TopDocs from the searcher into an Anserini format @@ -74,71 +65,6 @@ public static ScoredDocuments fromTopDocs(TopDocs rs, IndexSearcher searcher) { return scoredDocs; } - public static ScoredDocuments fromSolrDocs(SolrDocumentList rs) { - - ScoredDocuments scoredDocs = new ScoredDocuments(); - - int length = rs.size(); - scoredDocs.documents = new Document[length]; - scoredDocs.ids = new int[length]; - scoredDocs.scores = new float[length]; - - for (int i = 0; i < length; i++) { - - SolrDocument d = rs.get(i); - - // Create placeholder copies of Lucene Documents - // Intention is for compatibility with ScoreTiesAdjusterReranker without disturbing other aspects of reranker code - - Document document = new Document(); - String id = d.getFieldValue("id").toString(); - float score = (float) d.getFieldValue("score"); - - // Store the collection docid. - document.add(new StringField(IndexArgs.ID, id, Field.Store.YES)); - // This is needed to break score ties by docid. - document.add(new BinaryDocValuesField(IndexArgs.ID, new BytesRef(id))); - scoredDocs.documents[i] = document; - scoredDocs.scores[i] = score; - scoredDocs.ids[i] = i; // no internal Lucene ID available, use index as placeholder - } - - return scoredDocs; - } - - public static ScoredDocuments fromESDocs(SearchHits rs) { - - ScoredDocuments scoredDocs = new ScoredDocuments(); - SearchHit[] searchHits = rs.getHits(); - - int length = searchHits.length; - scoredDocs.documents = new Document[length]; - scoredDocs.ids = new int[length]; - scoredDocs.scores = new float[length]; - - for (int i = 0; i < length; i++) { - - SearchHit hit = searchHits[i]; - - // Create placeholder copies of Lucene Documents - // Intention is for compatibility with ScoreTiesAdjusterReranker without disturbing other aspects of reranker code - - Document document = new Document(); - String id = hit.getId(); - float score = hit.getScore(); - - // Store the collection docid. - document.add(new StringField(IndexArgs.ID, id, Field.Store.YES)); - // This is needed to break score ties by docid. 
- document.add(new BinaryDocValuesField(IndexArgs.ID, new BytesRef(id))); - scoredDocs.documents[i] = document; - scoredDocs.scores[i] = score; - scoredDocs.ids[i] = i; // no internal Lucene ID available, use index as placeholder - } - - return scoredDocs; - } - public static ScoredDocuments fromQrels(Map qrels, IndexReader reader) throws IOException { ScoredDocuments scoredDocs = new ScoredDocuments(); diff --git a/src/main/java/io/anserini/search/SearchCollection.java b/src/main/java/io/anserini/search/SearchCollection.java index 286efe8e3c..1a7d1776ff 100644 --- a/src/main/java/io/anserini/search/SearchCollection.java +++ b/src/main/java/io/anserini/search/SearchCollection.java @@ -69,7 +69,6 @@ import org.apache.lucene.analysis.th.ThaiAnalyzer; import org.apache.lucene.analysis.tr.TurkishAnalyzer; import org.apache.lucene.analysis.uk.UkrainianMorfologikAnalyzer; - import org.apache.lucene.document.LongPoint; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; @@ -114,7 +113,6 @@ import java.io.InputStreamReader; import java.io.PrintWriter; import java.nio.charset.StandardCharsets; -import java.nio.file.AtomicMoveNotSupportedException; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; @@ -130,13 +128,11 @@ import java.util.SortedMap; import java.util.TreeMap; import java.util.concurrent.CompletionException; -import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentSkipListMap; import java.util.concurrent.Executors; import java.util.concurrent.ThreadPoolExecutor; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; -import java.util.zip.GZIPInputStream; /** * Main entry point for search. diff --git a/src/main/java/io/anserini/search/SearchElastic.java b/src/main/java/io/anserini/search/SearchElastic.java deleted file mode 100644 index fdc01e387f..0000000000 --- a/src/main/java/io/anserini/search/SearchElastic.java +++ /dev/null @@ -1,334 +0,0 @@ -/* - * Anserini: A Lucene toolkit for reproducible information retrieval research - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package io.anserini.search; - -import io.anserini.index.IndexArgs; -import io.anserini.index.generator.TweetGenerator; -import io.anserini.rerank.ScoredDocuments; -import io.anserini.rerank.lib.ScoreTiesAdjusterReranker; -import io.anserini.search.topicreader.TopicReader; -import org.apache.commons.lang3.time.DurationFormatUtils; -import org.apache.http.HttpHost; -import org.apache.http.auth.AuthScope; -import org.apache.http.auth.UsernamePasswordCredentials; -import org.apache.http.client.CredentialsProvider; -import org.apache.http.impl.client.BasicCredentialsProvider; -import org.apache.logging.log4j.LogManager; -import org.apache.logging.log4j.Logger; -import org.elasticsearch.action.search.SearchRequest; -import org.elasticsearch.action.search.SearchResponse; -import org.elasticsearch.client.HttpAsyncResponseConsumerFactory; -import org.elasticsearch.client.RequestOptions; -import org.elasticsearch.client.RestClient; -import org.elasticsearch.client.RestHighLevelClient; -import org.elasticsearch.index.query.BoolQueryBuilder; -import org.elasticsearch.index.query.QueryBuilders; -import org.elasticsearch.index.query.QueryStringQueryBuilder; -import org.elasticsearch.index.query.RangeQueryBuilder; -import org.elasticsearch.search.SearchHits; -import org.elasticsearch.search.builder.SearchSourceBuilder; -import org.elasticsearch.search.sort.FieldSortBuilder; -import org.elasticsearch.search.sort.ScoreSortBuilder; -import org.elasticsearch.search.sort.SortOrder; -import org.kohsuke.args4j.CmdLineException; -import org.kohsuke.args4j.CmdLineParser; -import org.kohsuke.args4j.Option; -import org.kohsuke.args4j.OptionHandlerFilter; -import org.kohsuke.args4j.ParserProperties; -import org.kohsuke.args4j.spi.StringArrayOptionHandler; - -import java.io.Closeable; -import java.io.IOException; -import java.io.PrintWriter; -import java.nio.charset.StandardCharsets; -import java.nio.file.Files; -import java.nio.file.Path; -import java.nio.file.Paths; -import java.util.Locale; -import java.util.Map; -import java.util.SortedMap; -import java.util.TreeMap; -import java.util.concurrent.TimeUnit; - -/* -* Entry point of the Retrieval. 
- */ -public final class SearchElastic implements Closeable { - - private static final Logger LOG = LogManager.getLogger(SearchCollection.class); - private static final int TIMEOUT = 600 * 1000; - private final Args args; - private RestHighLevelClient client; - - private static final RequestOptions COMMON_OPTIONS; - static { - RequestOptions.Builder builder = RequestOptions.DEFAULT.toBuilder(); - builder.setHttpAsyncResponseConsumerFactory( - new HttpAsyncResponseConsumerFactory.HeapBufferedResponseConsumerFactory(1024 * 1024 * 1024)); - COMMON_OPTIONS = builder.build(); - } - - public static final class Args { - - // required arguments - - @Option(name = "-topics", metaVar = "[file]", handler = StringArrayOptionHandler.class, required = true, usage = "topics file") - public String[] topics; - - @Option(name = "-output", metaVar = "[file]", required = true, usage = "output file") - public String output; - - @Option(name = "-topicreader", required = true, usage = "define how to read the topic(query) file: one of [Trec|Webxml]") - public String topicReader; - - @Option(name = "-es.index", usage = "the name of the index in Elasticsearch") - public String esIndex = null; - - @Option(name = "-es.hostname", usage = "the name of Elasticsearch HTTP host") - public String esHostname = "localhost"; - - @Option(name = "-es.port", usage = "the port for Elasticsearch HTTP host") - public int esPort = 9200; - - /** - * The user and password are defaulted to those pre-configured for docker-elk - */ - @Option(name = "-es.user", usage = "the user of the ELK stack") - public String esUser = "elastic"; - - @Option(name = "-es.password", usage = "the password for the ELK stack") - public String esPassword = "changeme"; - - // optional arguments - @Option(name = "-topicfield", usage = "Which field of the query should be used, default \"title\"." + - " For TREC ad hoc topics, description or narrative can be used.") - public String topicfield = "title"; - - @Option(name = "-searchtweets", usage = "Whether the search is against a tweet " + - "index created by IndexCollection -collection TweetCollection") - public Boolean searchtweets = false; - - @Option(name = "-hits", metaVar = "[number]", required = false, usage = "max number of hits to return") - public int hits = 1000; - - @Option(name = "-runtag", metaVar = "[tag]", required = false, usage = "runtag") - public String runtag = null; - - } - - private final class ESSearcherThread extends Thread { - - final private SortedMap> topics; - final private String outputPath; - final private String runTag; - - private ESSearcherThread(SortedMap> topics, String outputPath, String runTag){ - - this.topics = topics; - this.runTag = runTag; - this.outputPath = outputPath; - setName(outputPath); - } - - @Override - public void run() { - try { - LOG.info("[Start] Retrieval with Elasticsearch collection: " + args.esIndex); - final long start = System.nanoTime(); - PrintWriter out = new PrintWriter(Files.newBufferedWriter(Paths.get(outputPath), StandardCharsets.US_ASCII)); - - for (Map.Entry> entry : topics.entrySet()) { - K qid = entry.getKey(); - String queryString = entry.getValue().get(args.topicfield); - ScoredDocuments docs; - if (args.searchtweets) { - docs = searchTweets(queryString, Long.parseLong(entry.getValue().get("time"))); - } else { - docs = search(queryString); - } - - /** - * the first column is the topic number. - * the second column is currently unused and should always be "Q0". 
- * the third column is the official document identifier of the retrieved document. - * the fourth column is the rank the document is retrieved. - * the fifth column shows the score (integer or floating point) that generated the ranking. - * the sixth column is called the "run tag" and should be a unique identifier for your - */ - for (int i = 0; i < docs.documents.length; i++) { - out.println(String.format(Locale.US, "%s Q0 %s %d %f %s", qid, - docs.documents[i].getField(IndexArgs.ID).stringValue(), (i + 1), docs.scores[i], runTag)); - } - } - out.flush(); - out.close(); - - final long durationMillis = TimeUnit.MILLISECONDS.convert(System.nanoTime() - start, TimeUnit.NANOSECONDS); - LOG.info("[Finished] Run " + topics.size() + " topics searched in " - + DurationFormatUtils.formatDuration(durationMillis, "HH:mm:ss")); - } catch (Exception e) { - LOG.error(Thread.currentThread().getName() + ": Unexpected Exception:", e); - } - } - } - - public SearchElastic(Args args) { - this.args = args; - LOG.info("Elasticsearch index: " + args.esIndex); - LOG.info("Elasticsearch hostname: " + args.esHostname); - LOG.info("Elasticsearch host port: " + args.esPort); - - final CredentialsProvider credentialsProvider = new BasicCredentialsProvider(); - credentialsProvider.setCredentials(AuthScope.ANY, new UsernamePasswordCredentials(args.esUser, args.esPassword)); - - this.client = new RestHighLevelClient( - RestClient.builder(new HttpHost(args.esHostname, args.esPort, "http")) - .setHttpClientConfigCallback(builder -> builder.setDefaultCredentialsProvider(credentialsProvider)) - .setRequestConfigCallback(builder -> builder.setConnectTimeout(TIMEOUT).setSocketTimeout(TIMEOUT))); - } - - @SuppressWarnings("unchecked") - public void runTopics() throws IOException { - TopicReader tr; - SortedMap> topics = new TreeMap<>(); - for (String singleTopicsFile : args.topics) { - Path topicsFilePath = Paths.get(singleTopicsFile); - if (!Files.exists(topicsFilePath) || !Files.isRegularFile(topicsFilePath) || !Files.isReadable(topicsFilePath)) { - throw new IllegalArgumentException("Topics file : " + topicsFilePath + " does not exist or is not a (readable) file."); - } - try { - tr = (TopicReader) Class.forName("io.anserini.search.topicreader." + args.topicReader + "TopicReader") - .getConstructor(Path.class).newInstance(topicsFilePath); - topics.putAll(tr.read()); - } catch (Exception e) { - throw new IllegalArgumentException("Unable to load topic reader: " + args.topicReader); - } - } - - final String runTag = args.runtag == null ? 
"Elastirini" : args.runtag; - ESSearcherThread esThread = new ESSearcherThread(topics, args.output, runTag); - esThread.run(); - } - - public ScoredDocuments search(String queryString){ - - SearchHits results = null; - - String specials = "+-=&|> ScoredDocuments searchTweets(String queryString, long t){ - - SearchHits results = null; - - String specials = "+-=&|> tag contains the timestamp of the query in terms of the - // chronologically nearest tweet id within the corpus - RangeQueryBuilder queryTweetTime = QueryBuilders - .rangeQuery(TweetGenerator.TweetField.ID_LONG.name) - .from(0L) - .to(t); - - QueryStringQueryBuilder queryTerms = QueryBuilders - .queryStringQuery(queryString) - .defaultField("contents") - .analyzer("english"); - - BoolQueryBuilder query = QueryBuilders.boolQuery() - .filter(queryTweetTime) - .should(queryTerms); - - SearchRequest searchRequest = new SearchRequest(args.esIndex); - SearchSourceBuilder sourceBuilder = new SearchSourceBuilder(); - sourceBuilder.query(query); - sourceBuilder.size(args.hits); - sourceBuilder.sort(new ScoreSortBuilder().order(SortOrder.DESC)); - sourceBuilder.sort(new FieldSortBuilder(TweetGenerator.TweetField.ID_LONG.name).order(SortOrder.DESC)); - searchRequest.source(sourceBuilder); - - try { - SearchResponse searchResponse = client.search(searchRequest, COMMON_OPTIONS); - results = searchResponse.getHits(); - } catch (Exception e) { - LOG.error("Exception during ES query: ", e); - } - - ScoreTiesAdjusterReranker reranker = new ScoreTiesAdjusterReranker(); - return reranker.rerank(ScoredDocuments.fromESDocs(results), null); - } - - @Override - public void close() throws IOException { - client.close(); - } - - public static void main(String[] args) throws Exception { - Args searchElasticArgs = new Args(); - CmdLineParser parser = new CmdLineParser(searchElasticArgs, ParserProperties.defaults().withUsageWidth(90)); - - try { - parser.parseArgument(args); - } catch (CmdLineException e) { - System.err.println(e.getMessage()); - parser.printUsage(System.err); - System.err.println("Example: SearchElastic" + parser.printExample(OptionHandlerFilter.REQUIRED)); - return; - } - - final long start = System.nanoTime(); - SearchElastic searcher = new SearchElastic(searchElasticArgs); - searcher.runTopics(); - searcher.close(); - final long durationMillis = TimeUnit.MILLISECONDS.convert(System.nanoTime() - start, TimeUnit.NANOSECONDS); - LOG.info("Total run time: " + DurationFormatUtils.formatDuration(durationMillis, "HH:mm:ss")); - } -} \ No newline at end of file diff --git a/src/main/java/io/anserini/search/SearchSolr.java b/src/main/java/io/anserini/search/SearchSolr.java deleted file mode 100644 index 9b01661aa4..0000000000 --- a/src/main/java/io/anserini/search/SearchSolr.java +++ /dev/null @@ -1,275 +0,0 @@ -/* - * Anserini: A Lucene toolkit for reproducible information retrieval research - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package io.anserini.search; - -import com.google.common.base.Splitter; -import io.anserini.index.IndexArgs; -import io.anserini.index.generator.TweetGenerator; -import io.anserini.rerank.ScoredDocuments; -import io.anserini.rerank.lib.ScoreTiesAdjusterReranker; -import io.anserini.search.topicreader.TopicReader; -import org.apache.commons.lang3.time.DurationFormatUtils; -import org.apache.logging.log4j.LogManager; -import org.apache.logging.log4j.Logger; -import org.apache.lucene.document.LongPoint; -import org.apache.lucene.search.Query; -import org.apache.solr.client.solrj.SolrClient; -import org.apache.solr.client.solrj.SolrQuery; -import org.apache.solr.client.solrj.SolrQuery.SortClause; -import org.apache.solr.client.solrj.impl.CloudSolrClient; -import org.apache.solr.client.solrj.impl.Http2SolrClient; -import org.apache.solr.client.solrj.response.QueryResponse; -import org.apache.solr.common.SolrDocumentList; -import org.kohsuke.args4j.CmdLineException; -import org.kohsuke.args4j.CmdLineParser; -import org.kohsuke.args4j.Option; -import org.kohsuke.args4j.OptionHandlerFilter; -import org.kohsuke.args4j.ParserProperties; -import org.kohsuke.args4j.spi.StringArrayOptionHandler; - -import java.io.Closeable; -import java.io.IOException; -import java.io.PrintWriter; -import java.nio.charset.StandardCharsets; -import java.nio.file.Files; -import java.nio.file.Path; -import java.nio.file.Paths; -import java.util.Locale; -import java.util.Map; -import java.util.Optional; -import java.util.SortedMap; -import java.util.TreeMap; -import java.util.concurrent.TimeUnit; - -/* -* Entry point of the Retrieval. - */ -public final class SearchSolr implements Closeable { - - private static final Logger LOG = LogManager.getLogger(SearchCollection.class); - private static final int TIMEOUT = 600 * 1000; - private final Args args; - private SolrClient client; - - public static final class Args { - - // required arguments - - @Option(name = "-topics", metaVar = "[file]", handler = StringArrayOptionHandler.class, required = true, usage = "topics file") - public String[] topics; - - @Option(name = "-output", metaVar = "[file]", required = true, usage = "output file") - public String output; - - @Option(name = "-topicreader", required = true, usage = "define how to read the topic(query) file: one of [Trec|Webxml]") - public String topicReader; - - @Option(name = "-solr.index", usage = "the name of the index in Solr") - public String solrIndex = null; - - @Option(name = "-solr.zkUrl", usage = "the URL of Solr's ZooKeeper (comma separated list of using ensemble)") - public String zkUrl = null; - - @Option(name = "-solr.zkChroot", usage = "the ZooKeeper chroot") - public String zkChroot = "/"; - - // optional arguments - @Option(name = "-topicfield", usage = "Which field of the query should be used, default \"title\"." 
+ - " For TREC ad hoc topics, description or narrative can be used.") - public String topicfield = "title"; - - @Option(name = "-searchtweets", usage = "Whether the search is against a tweet " + - "index created by IndexCollection -collection TweetCollection") - public Boolean searchtweets = false; - - @Option(name = "-hits", metaVar = "[number]", required = false, usage = "max number of hits to return") - public int hits = 1000; - - @Option(name = "-runtag", metaVar = "[tag]", required = false, usage = "runtag") - public String runtag = null; - - } - - private final class SolrSearcherThread extends Thread { - - final private SortedMap> topics; - final private String outputPath; - final private String runTag; - - private SolrSearcherThread(SortedMap> topics, String outputPath, String runTag){ - - this.topics = topics; - this.runTag = runTag; - this.outputPath = outputPath; - setName(outputPath); - } - - @Override - public void run() { - try { - LOG.info("[Start] Retrieval with Solr collection: " + args.solrIndex); - final long start = System.nanoTime(); - PrintWriter out = new PrintWriter(Files.newBufferedWriter(Paths.get(outputPath), StandardCharsets.US_ASCII)); - - for (Map.Entry> entry : topics.entrySet()) { - K qid = entry.getKey(); - String queryString = entry.getValue().get(args.topicfield); - ScoredDocuments docs; - if (args.searchtweets) { - docs = searchTweets(queryString, Long.parseLong(entry.getValue().get("time"))); - } else { - docs = search(queryString); - } - - /** - * the first column is the topic number. - * the second column is currently unused and should always be "Q0". - * the third column is the official document identifier of the retrieved document. - * the fourth column is the rank the document is retrieved. - * the fifth column shows the score (integer or floating point) that generated the ranking. - * the sixth column is called the "run tag" and should be a unique identifier for your - */ - for (int i = 0; i < docs.documents.length; i++) { - out.println(String.format(Locale.US, "%s Q0 %s %d %f %s", qid, - docs.documents[i].getField(IndexArgs.ID).stringValue(), (i + 1), docs.scores[i], runTag)); - } - } - out.flush(); - out.close(); - - final long durationMillis = TimeUnit.MILLISECONDS.convert(System.nanoTime() - start, TimeUnit.NANOSECONDS); - LOG.info("[Finished] Run " + topics.size() + " topics searched in " - + DurationFormatUtils.formatDuration(durationMillis, "HH:mm:ss")); - } catch (Exception e) { - LOG.error(Thread.currentThread().getName() + ": Unexpected Exception:", e); - } - } - } - - public SearchSolr(Args args) throws IOException { - this.args = args; - LOG.info("Solr index: " + args.solrIndex); - LOG.info("Solr ZooKeeper URL: " + args.zkUrl); - this.client = new CloudSolrClient.Builder(Splitter.on(',') - .splitToList(args.zkUrl), Optional.of(args.zkChroot)) - .withInternalClientBuilder(new Http2SolrClient.Builder().connectionTimeout(TIMEOUT)) - .build(); - } - - @SuppressWarnings("unchecked") - public void runTopics() throws IOException { - TopicReader tr; - SortedMap> topics = new TreeMap<>(); - for (String singleTopicsFile : args.topics) { - Path topicsFilePath = Paths.get(singleTopicsFile); - if (!Files.exists(topicsFilePath) || !Files.isRegularFile(topicsFilePath) || !Files.isReadable(topicsFilePath)) { - throw new IllegalArgumentException("Topics file : " + topicsFilePath + " does not exist or is not a (readable) file."); - } - try { - tr = (TopicReader) Class.forName("io.anserini.search.topicreader." 
+ args.topicReader + "TopicReader") - .getConstructor(Path.class).newInstance(topicsFilePath); - topics.putAll(tr.read()); - } catch (Exception e) { - throw new IllegalArgumentException("Unable to load topic reader: " + args.topicReader); - } - } - - final String runTag = args.runtag == null ? "Solrini" : args.runtag; - SolrSearcherThread solrThread = new SolrSearcherThread(topics, args.output, runTag); - solrThread.run(); - } - - public ScoredDocuments search(String queryString){ - - SolrDocumentList results = null; - - SolrQuery solrq = new SolrQuery(); - solrq.set("df", "contents"); - solrq.set("fl", "* score"); - // Remove some characters in query which are special syntax in Solr query parser - solrq.setQuery(queryString.replaceAll("[+=&|<>!(){}~*?:/\"\\^\\-\\[\\]\\\\]", " ")); - solrq.setRows(args.hits); - solrq.setSort(SortClause.desc("score")); - solrq.addSort(SortClause.asc(IndexArgs.ID)); - - try { - QueryResponse response = client.query(args.solrIndex, solrq); - results = response.getResults(); - } catch (Exception e) { - LOG.error("Exception during Solr query: ", e); - } - - ScoreTiesAdjusterReranker reranker = new ScoreTiesAdjusterReranker(); - return reranker.rerank(ScoredDocuments.fromSolrDocs(results), null); - } - - public ScoredDocuments searchTweets(String queryString, long t){ - - SolrDocumentList results = null; - - SolrQuery solrq = new SolrQuery(); - solrq.set("df", "contents"); - solrq.set("fl", "* score"); - // Remove double quotes in query since they are special syntax in Solr query parser - solrq.setQuery(queryString.replace("\"", "")); - solrq.setRows(args.hits); - solrq.setSort(SortClause.desc("score")); - solrq.addSort(SortClause.desc(TweetGenerator.TweetField.ID_LONG.name)); - - // Do not consider the tweets with tweet ids that are beyond the queryTweetTime - // tag contains the timestamp of the query in terms of the - // chronologically nearest tweet id within the corpus - Query filter = LongPoint.newRangeQuery(TweetGenerator.TweetField.ID_LONG.name, 0L, t); - solrq.set("fq", filter.toString()); - - try { - QueryResponse response = client.query(args.solrIndex, solrq); - results = response.getResults(); - } catch (Exception e) { - LOG.error("Exception during Solr query: ", e); - } - - ScoreTiesAdjusterReranker reranker = new ScoreTiesAdjusterReranker(); - return reranker.rerank(ScoredDocuments.fromSolrDocs(results), null); - } - - @Override - public void close() throws IOException { - client.close(); - } - - public static void main(String[] args) throws Exception { - Args searchSolrArgs = new Args(); - CmdLineParser parser = new CmdLineParser(searchSolrArgs, ParserProperties.defaults().withUsageWidth(90)); - - try { - parser.parseArgument(args); - } catch (CmdLineException e) { - System.err.println(e.getMessage()); - parser.printUsage(System.err); - System.err.println("Example: SearchSolr" + parser.printExample(OptionHandlerFilter.REQUIRED)); - return; - } - - final long start = System.nanoTime(); - SearchSolr searcher = new SearchSolr(searchSolrArgs); - searcher.runTopics(); - searcher.close(); - final long durationMillis = TimeUnit.MILLISECONDS.convert(System.nanoTime() - start, TimeUnit.NANOSECONDS); - LOG.info("Total run time: " + DurationFormatUtils.formatDuration(durationMillis, "HH:mm:ss")); - } -} diff --git a/src/main/python/run_es_regression.py b/src/main/python/run_es_regression.py deleted file mode 100644 index b2084de21d..0000000000 --- a/src/main/python/run_es_regression.py +++ /dev/null @@ -1,256 +0,0 @@ -# -# Pyserini: Python interface to the 
Anserini IR toolkit built on Lucene -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import argparse -import logging -import math -import os -import requests -import time - -import regression_utils - -# Note that this class is specifically written with REST API requests instead of the -# Elasticsearch client eliminate an additional dependency - -logger = logging.getLogger('run_es_regression') -ch = logging.StreamHandler() -ch.setFormatter(logging.Formatter('%(asctime)s %(levelname)s - %(message)s')) -logger.addHandler(ch) -logger.setLevel(logging.INFO) - - -class ElasticsearchClient: - def __init__(self): - pass - - @staticmethod - def is_alive(): - try: - response = requests.get('http://localhost:9200/') - response.raise_for_status() - except requests.exceptions.RequestException: - return False - else: - return True - - def does_index_exist(self, collection): - # Make sure ES is alive: - if self.is_alive(): - try: - response = requests.get('http://localhost:9200/{}'.format(collection)) - response.raise_for_status() - except requests.exceptions.RequestException: - return False - else: - return True - else: - raise Exception('ES does not appear to be alive!') - - def delete_index(self, collection): - logger.info('Deleting index {}...'.format(collection)) - # Make sure the index exists: - if self.does_index_exist(collection): - try: - response = requests.request('DELETE', url='http://localhost:9200/{}'.format(collection)) - response.raise_for_status() - except requests.exceptions.RequestException: - return False - else: - return True - else: - raise Exception('The index {} does not exist!'.format(collection)) - - def create_index(self, collection): - logger.info('Creating index {}...'.format(collection)) - # Make sure the index does not exist: - if not self.does_index_exist(collection): - filename = 'src/main/resources/elasticsearch/index-config.{}.json'.format(collection) - if not os.path.exists(filename): - raise Exception('No config found in src/main/resources/elasticsearch/ for {}!'.format(collection)) - logger.info('Using index config for {} at {}'.format(collection, filename)) - with open(filename, mode='r') as file: - json = file.read() - response = '' - try: - response = requests.request('PUT', url='http://localhost:9200/{}'.format(collection), - data=json, headers={'Content-type': 'application/json'}) - response.raise_for_status() - except requests.exceptions.RequestException: - logger.info(response) - return False - else: - return True - else: - raise Exception('The index {} already exists!'.format(collection)) - - def insert_docs(self, collection, path): - logger.info('Inserting documents from {} into {}... '.format(path, collection)) - if not os.path.exists(args.input): - raise Exception('{} does not exist!'.format(args.input)) - if not self.does_index_exist(collection): - raise Exception('The index {} does not exist!'.format(collection)) - # TODO: abstract this into an external config instead of hard-coded. 
- if collection == 'robust04': - command = 'sh target/appassembler/bin/IndexCollection -collection TrecCollection ' + \ - '-generator DefaultLuceneDocumentGenerator -es -es.index robust04 -threads 8 -input ' + \ - path + ' -storePositions -storeDocvectors -storeRaw' - elif collection == 'msmarco-passage': - command = 'sh target/appassembler/bin/IndexCollection -collection JsonCollection ' + \ - '-generator DefaultLuceneDocumentGenerator -es -es.index msmarco-passage -threads 8 -input ' + \ - path + ' -storePositions -storeDocvectors -storeRaw' - elif collection == 'core18': - command = 'sh target/appassembler/bin/IndexCollection -collection WashingtonPostCollection ' + \ - '-generator WashingtonPostGenerator -es -es.index core18 -threads 8 -input ' + \ - path + ' -storePositions -storeDocvectors -storeContents' - elif collection == 'msmarco-doc': - command = 'sh target/appassembler/bin/IndexCollection -collection JsonCollection ' + \ - '-generator DefaultLuceneDocumentGenerator -es -es.index msmarco-doc -threads 8 -input ' + \ - path + ' -storePositions -storeDocvectors -storeRaw' - else: - raise Exception('Unknown collection: {}'.format(collection)) - logger.info('Running indexing command: ' + command) - return regression_utils.run_shell_command(command, logger, echo=True) - - def evaluate(self, collection): - if not self.does_index_exist(collection): - raise Exception('The index {} does not exist!'.format(collection)) - # TODO: abstract this into an external config instead of hard-coded. - if collection == 'robust04': - command = 'sh target/appassembler/bin/SearchElastic -topicreader Trec -es.index robust04 ' + \ - '-topics src/main/resources/topics-and-qrels/topics.robust04.txt ' + \ - '-output runs/run.es.robust04.bm25.topics.robust04.txt' - elif collection == 'msmarco-passage': - command = 'sh target/appassembler/bin/SearchElastic -topicreader TsvString -es.index msmarco-passage ' + \ - '-topics src/main/resources/topics-and-qrels/topics.msmarco-passage.dev-subset.txt ' + \ - '-output runs/run.es.msmarco-passage.txt' - elif collection == 'core18': - command = 'sh target/appassembler/bin/SearchElastic -topicreader Trec -es.index core18 ' + \ - '-topics src/main/resources/topics-and-qrels/topics.core18.txt ' + \ - '-output runs/run.es.core18.bm25.topics.core18.txt' - elif collection == 'msmarco-doc': - command = 'sh target/appassembler/bin/SearchElastic -topicreader TsvInt -es.index msmarco-doc ' + \ - '-topics src/main/resources/topics-and-qrels/topics.msmarco-doc.dev.txt ' + \ - '-output runs/run.es.msmarco-doc.txt' - else: - raise Exception('Unknown collection: {}'.format(collection)) - - logger.info('Retrieval command: ' + command) - regression_utils.run_shell_command(command, logger, echo=True) - logger.info('Retrieval complete!') - - if collection == 'robust04': - command = 'tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 ' + \ - 'src/main/resources/topics-and-qrels/qrels.robust04.txt ' + \ - 'runs/run.es.robust04.bm25.topics.robust04.txt' - elif collection == 'msmarco-passage': - command = 'tools/eval/trec_eval.9.0.4/trec_eval -c -mrecall.1000 -mmap ' + \ - 'src/main/resources/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt ' + \ - 'runs/run.es.msmarco-passage.txt' - elif collection == 'core18': - command = 'tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 ' + \ - 'src/main/resources/topics-and-qrels/qrels.core18.txt runs/run.es.core18.bm25.topics.core18.txt' - elif collection == 'msmarco-doc': - command = 'tools/eval/trec_eval.9.0.4/trec_eval -c -mrecall.1000 -mmap 
' + \ - 'src/main/resources/topics-and-qrels/qrels.msmarco-doc.dev.txt runs/run.es.msmarco-doc.txt' - else: - raise Exception('Unknown collection: {}'.format(collection)) - - logger.info('Evaluation command: ' + command) - output = regression_utils.run_shell_command(command, logger, capture=True) - ap = float(output[0].split('\t')[2]) - - if collection == 'robust04': - expected = 0.2531 - elif collection == 'msmarco-passage': - expected = 0.1956 - elif collection == 'core18': - expected = 0.2496 - elif collection == 'msmarco-doc': - expected = 0.2307 - else: - raise Exception('Unknown collection: {}'.format(collection)) - - if math.isclose(ap, expected): - logger.info('[SUCESS] {} MAP verified as expected!'.format(ap)) - else: - logger.info('[FAILED] {} MAP, expected {} MAP!'.format(ap, expected)) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='Program for running Elasticsearch regressions.') - parser.add_argument('--ping', action='store_true', default=False, help='Ping ES and exit.') - parser.add_argument('--check-index-exists', default='', type=str, metavar='collection', - help='Check if index exists.') - parser.add_argument('--delete-index', default='', type=str, metavar='collection', help='Delete index.') - parser.add_argument('--create-index', default='', type=str, metavar='collection', help='Create index.') - parser.add_argument('--insert-docs', default='', type=str, metavar='collection', - help='Insert documents into index.') - parser.add_argument('--input', default='', type=str, metavar='directory', - help='Location of documents to insert into index.') - parser.add_argument('--evaluate', default='', type=str, metavar='collection', - help='Search and evaluate on collection.') - parser.add_argument('--regression', default='', type=str, metavar='collection', help='Run end-to-end regression.') - - args = parser.parse_args() - es = ElasticsearchClient() - - if args.ping: - logger.info('Pinging Elasticsearch instance...') - if es.is_alive(): - logger.info('... appears to alive! :)') - else: - logger.info('... appears to dead! :(') - elif args.check_index_exists: - logger.info('Checking if index {} exists...'.format(args.check_index_exists)) - if es.does_index_exist(args.check_index_exists): - logger.info('... yes indeed!') - else: - logger.info('... appears not.') - elif args.delete_index: - if es.delete_index(args.delete_index): - logger.info('... successful!') - else: - logger.info('... failed!') - elif args.create_index: - if es.create_index(args.create_index): - logger.info('... successful!') - else: - logger.info('... failed!') - elif args.insert_docs: - if not args.input: - raise Exception('Location of corpus not specified (use --input)!') - else: - es.insert_docs(args.insert_docs, args.input) - elif args.evaluate: - es.evaluate(args.evaluate) - elif args.regression: - logger.info('Running BM25 regression on {}...'.format(args.regression)) - if not args.input: - raise Exception('Location of corpus not specified (use --input)!') - if not es.is_alive(): - raise Exception('Elasticsearch does not appear to be alive!') - if es.does_index_exist(args.regression): - logger.info('Index {} already exists: deleting and recreating.'.format(args.regression)) - es.delete_index(args.regression) - es.create_index(args.regression) - es.insert_docs(args.regression, args.input) - # Documents ingested into ES are not immediately searchable. There are lots of 'refresh' options - # to control the visibility behavior, but the simplest solution is just to wait for a bit... 
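The usual alternative to sleeping is an explicit refresh; a minimal sketch against the same local instance (the _refresh endpoint is part of the standard Elasticsearch REST API, but this call is not something the script itself makes):

    import requests

    def refresh_index(collection):
        # Force newly ingested documents to become searchable without waiting
        # for the index's refresh_interval to elapse.
        response = requests.post('http://localhost:9200/{}/_refresh'.format(collection))
        response.raise_for_status()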
- logger.info('Document ingestion complete. Sleeping now for 120s...') - time.sleep(120) - logger.info('Waking up!') - es.evaluate(args.regression) diff --git a/src/main/python/run_solr_regression.py b/src/main/python/run_solr_regression.py deleted file mode 100644 index 3fa8486a4b..0000000000 --- a/src/main/python/run_solr_regression.py +++ /dev/null @@ -1,247 +0,0 @@ -# -# Pyserini: Python interface to the Anserini IR toolkit built on Lucene -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import argparse -import logging -import math -import os -import requests - -import regression_utils - -logger = logging.getLogger('run_solr_regression') -ch = logging.StreamHandler() -ch.setFormatter(logging.Formatter('%(asctime)s %(levelname)s - %(message)s')) -logger.addHandler(ch) -logger.setLevel(logging.INFO) - - -class SolrClient: - def __init__(self): - pass - - @staticmethod - def is_alive(): - try: - response = requests.get('http://localhost:8983/') - response.raise_for_status() - except requests.exceptions.RequestException: - return False - else: - return True - - def does_index_exist(self, collection): - # Make sure Solr is alive: - if self.is_alive(): - try: - response = requests.get('http://localhost:8983/solr/admin/collections?action=LIST') - response.raise_for_status() - except requests.exceptions.RequestException: - return False - else: - return collection in response.json()['collections'] - else: - raise Exception('Solr does not appear to be alive!') - - def delete_index(self, collection): - # Make sure the index exists: - if self.does_index_exist(collection): - command = 'solrini/bin/solr delete -c {}'.format(collection) - logger.info('Deleting index {} command: {}'.format(collection, command)) - regression_utils.run_shell_command(command, logger, echo=True) - return not self.does_index_exist(collection) - else: - raise Exception('The index {} does not exist!'.format(collection)) - - def create_index(self, collection): - # Make sure the index does not exist: - if not self.does_index_exist(collection): - # Re-upload configsets to Solr's internal Zookeeper - self.upload_configs() - command = 'solrini/bin/solr create -n anserini -c {}'.format(collection) - logger.info('Creating index {} command: {}'.format(collection, command)) - regression_utils.run_shell_command(command, logger, echo=True) - return self.does_index_exist(collection) - else: - raise Exception('The index {} already exists!'.format(collection)) - - def insert_docs(self, collection, path): - logger.info('Inserting documents from {} into {}... 
'.format(path, collection)) - if not os.path.exists(args.input): - raise Exception('{} does not exist!'.format(args.input)) - if not self.does_index_exist(collection): - raise Exception('The index {} does not exist!'.format(collection)) - if collection == 'core18': - command = 'sh target/appassembler/bin/IndexCollection -collection WashingtonPostCollection ' + \ - '-generator WashingtonPostGenerator -solr -solr.index core18 -solr.zkUrl localhost:9983 ' + \ - '-threads 8 -input ' + path + ' -storePositions -storeDocvectors -storeContents' - elif collection == 'robust04': - command = 'sh target/appassembler/bin/IndexCollection -collection TrecCollection ' + \ - '-generator DefaultLuceneDocumentGenerator ' + \ - '-solr -solr.index robust04 -solr.zkUrl localhost:9983 ' + \ - '-threads 8 -input ' + path + ' -storePositions -storeDocvectors -storeRaw' - elif collection == 'msmarco-passage': - command = 'sh target/appassembler/bin/IndexCollection -collection JsonCollection ' + \ - '-generator DefaultLuceneDocumentGenerator ' + \ - '-solr -solr.index msmarco-passage -solr.zkUrl localhost:9983 ' + \ - '-threads 8 -input ' + path + ' -storePositions -storeDocvectors -storeRaw' - elif collection == 'msmarco-doc': - command = 'sh target/appassembler/bin/IndexCollection -collection JsonCollection ' + \ - '-generator DefaultLuceneDocumentGenerator ' + \ - '-solr -solr.index msmarco-doc -solr.zkUrl localhost:9983 ' + \ - '-threads 8 -input ' + path + ' -storePositions -storeDocvectors -storeRaw' - else: - raise Exception('Unknown collection: {}'.format(collection)) - logger.info('Running indexing command: ' + command) - return regression_utils.run_shell_command(command, logger, echo=True) - - @staticmethod - def upload_configs(): - os.chdir('src/main/resources/solr') - command = 'rm -rf anserini/conf/lang anserini-twitter/conf/lang' - logger.info('Deleting existed configs command: ' + command) - regression_utils.run_shell_command(command, logger, echo=True) - command = './solr.sh ../../../../solrini localhost:9983' - logger.info('Uploading configs command: ' + command) - regression_utils.run_shell_command(command, logger, echo=True) - os.chdir('../../../..') - logger.info('Uploading complete!') - - def evaluate(self, collection): - if not self.does_index_exist(collection): - raise Exception('The index {} does not exist!'.format(collection)) - if collection == 'core18': - command = 'sh target/appassembler/bin/SearchSolr -topicreader Trec -solr.index core18 ' + \ - '-solr.zkUrl localhost:9983 -topics src/main/resources/topics-and-qrels/topics.core18.txt ' + \ - '-output runs/run.solr.core18.bm25.topics.core18.txt' - elif collection == 'robust04': - command = 'sh target/appassembler/bin/SearchSolr -topicreader Trec -solr.index robust04 ' + \ - '-solr.zkUrl localhost:9983 -topics src/main/resources/topics-and-qrels/topics.robust04.txt ' + \ - '-output runs/run.solr.robust04.bm25.topics.robust04.txt' - elif collection == 'msmarco-passage': - command = 'sh target/appassembler/bin/SearchSolr -topicreader TsvString -solr.index msmarco-passage ' + \ - '-solr.zkUrl localhost:9983 ' + \ - '-topics src/main/resources/topics-and-qrels/topics.msmarco-passage.dev-subset.txt ' + \ - '-output runs/run.solr.msmarco-passage.txt' - elif collection == 'msmarco-doc': - command = 'sh target/appassembler/bin/SearchSolr -topicreader TsvInt -solr.index msmarco-doc ' + \ - '-solr.zkUrl localhost:9983 ' + \ - '-topics src/main/resources/topics-and-qrels/topics.msmarco-doc.dev.txt ' + \ - '-output runs/run.solr.msmarco-doc.txt ' - 
else: - raise Exception('Unknown collection: {}'.format(collection)) - - logger.info('Retrieval command: ' + command) - regression_utils.run_shell_command(command, logger, echo=True) - logger.info('Retrieval complete!') - - if collection == 'core18': - command = 'tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 ' + \ - 'src/main/resources/topics-and-qrels/qrels.core18.txt runs/run.solr.core18.bm25.topics.core18.txt' - elif collection == 'robust04': - command = 'tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 ' + \ - 'src/main/resources/topics-and-qrels/qrels.robust04.txt ' + \ - 'runs/run.solr.robust04.bm25.topics.robust04.txt' - elif collection == 'msmarco-passage': - command = 'tools/eval/trec_eval.9.0.4/trec_eval -c -mrecall.1000 -mmap ' + \ - 'src/main/resources/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt ' + \ - 'runs/run.solr.msmarco-passage.txt' - elif collection == 'msmarco-doc': - command = 'tools/eval/trec_eval.9.0.4/trec_eval -c -mrecall.1000 -mmap ' + \ - 'src/main/resources/topics-and-qrels/qrels.msmarco-doc.dev.txt runs/run.solr.msmarco-doc.txt' - else: - raise Exception('Unknown collection: {}'.format(collection)) - - logger.info('Evaluation command: ' + command) - output = regression_utils.run_shell_command(command, logger, capture=True) - ap = float(output[0].split('\t')[2]) - - if collection == 'core18': - expected = 0.2496 - elif collection == 'robust04': - expected = 0.2531 - elif collection == 'msmarco-passage': - expected = 0.1926 - elif collection == 'msmarco-doc': - expected = 0.2305 - else: - raise Exception('Unknown collection: {}'.format(collection)) - - if math.isclose(ap, expected): - logger.info('[SUCESS] {} MAP verified as expected!'.format(ap)) - else: - logger.info('[FAILED] {} MAP, expected {} MAP!'.format(ap, expected)) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='Program for running Solr regressions.') - parser.add_argument('--ping', action='store_true', default=False, help='ping Solr and exit') - parser.add_argument('--check-index-exists', default='', type=str, metavar='collection', - help='Check if index exists.') - parser.add_argument('--delete-index', default='', type=str, metavar='collection', help='Delete index.') - parser.add_argument('--create-index', default='', type=str, metavar='collection', help='Create index.') - parser.add_argument('--insert-docs', default='', type=str, metavar='collection', - help='Insert documents into index.') - parser.add_argument('--input', default='', type=str, metavar='directory', - help='Location of documents to insert into index.') - parser.add_argument('--evaluate', default='', type=str, metavar='collection', - help='Search and evaluate on collection.') - parser.add_argument('--regression', default='', type=str, metavar='collection', help='Run end-to-end regression.') - - args = parser.parse_args() - solr = SolrClient() - - if args.ping: - logger.info('Pinging Solr instance...') - if solr.is_alive(): - logger.info('... appears to alive! :)') - else: - logger.info('... appears to dead! :(') - elif args.check_index_exists: - logger.info('Checking if index {} exists...'.format(args.check_index_exists)) - if solr.does_index_exist(args.check_index_exists): - logger.info('... yes indeed!') - else: - logger.info('... appears not.') - elif args.delete_index: - if solr.delete_index(args.delete_index): - logger.info('... successful!') - else: - logger.info('... failed!') - elif args.create_index: - if solr.create_index(args.create_index): - logger.info('... 
successful!') - else: - logger.info('... failed!') - elif args.insert_docs: - if not args.input: - raise Exception('Location of corpus not specified (use --input)!') - else: - solr.insert_docs(args.insert_docs, args.input) - elif args.evaluate: - solr.evaluate(args.evaluate) - elif args.regression: - logger.info('Running BM25 regression on {}...'.format(args.regression)) - if not args.input: - raise Exception('Location of corpus not specified (use --input)!') - if not solr.is_alive(): - raise Exception('Solr does not appear to be alive!') - if solr.does_index_exist(args.regression): - logger.info('Index {} already exists: deleting and recreating.'.format(args.regression)) - solr.delete_index(args.regression) - solr.create_index(args.regression) - solr.insert_docs(args.regression, args.input) - solr.evaluate(args.regression) diff --git a/src/main/resources/elasticsearch/index-config.cord19.json b/src/main/resources/elasticsearch/index-config.cord19.json deleted file mode 100644 index c7c08e4610..0000000000 --- a/src/main/resources/elasticsearch/index-config.cord19.json +++ /dev/null @@ -1,32 +0,0 @@ -{ - "mappings": { - "properties": { - "id": { - "type": "keyword" - }, - "contents": { - "type": "text", - "store": false, - "index": true, - "analyzer": "english" - }, - "raw": { - "type": "text", - "store": true, - "index": false - } - } - }, - "settings": { - "index": { - "refresh_interval": "60s", - "similarity": { - "default": { - "type": "BM25", - "k1": "0.9", - "b": "0.4" - } - } - } - } -} diff --git a/src/main/resources/elasticsearch/index-config.core18.json b/src/main/resources/elasticsearch/index-config.core18.json deleted file mode 100644 index c7c08e4610..0000000000 --- a/src/main/resources/elasticsearch/index-config.core18.json +++ /dev/null @@ -1,32 +0,0 @@ -{ - "mappings": { - "properties": { - "id": { - "type": "keyword" - }, - "contents": { - "type": "text", - "store": false, - "index": true, - "analyzer": "english" - }, - "raw": { - "type": "text", - "store": true, - "index": false - } - } - }, - "settings": { - "index": { - "refresh_interval": "60s", - "similarity": { - "default": { - "type": "BM25", - "k1": "0.9", - "b": "0.4" - } - } - } - } -} diff --git a/src/main/resources/elasticsearch/index-config.msmarco-doc.json b/src/main/resources/elasticsearch/index-config.msmarco-doc.json deleted file mode 100644 index c7c08e4610..0000000000 --- a/src/main/resources/elasticsearch/index-config.msmarco-doc.json +++ /dev/null @@ -1,32 +0,0 @@ -{ - "mappings": { - "properties": { - "id": { - "type": "keyword" - }, - "contents": { - "type": "text", - "store": false, - "index": true, - "analyzer": "english" - }, - "raw": { - "type": "text", - "store": true, - "index": false - } - } - }, - "settings": { - "index": { - "refresh_interval": "60s", - "similarity": { - "default": { - "type": "BM25", - "k1": "0.9", - "b": "0.4" - } - } - } - } -} diff --git a/src/main/resources/elasticsearch/index-config.msmarco-passage.json b/src/main/resources/elasticsearch/index-config.msmarco-passage.json deleted file mode 100644 index ad33344097..0000000000 --- a/src/main/resources/elasticsearch/index-config.msmarco-passage.json +++ /dev/null @@ -1,32 +0,0 @@ -{ - "mappings": { - "properties": { - "id": { - "type": "keyword" - }, - "contents": { - "type": "text", - "store": false, - "index": true, - "analyzer": "english" - }, - "raw": { - "type": "text", - "store": true, - "index": false - } - } - }, - "settings": { - "index": { - "refresh_interval": "60s", - "similarity": { - "default": { - "type": 
"BM25", - "k1": "0.82", - "b": "0.68" - } - } - } - } -} diff --git a/src/main/resources/elasticsearch/index-config.robust04.json b/src/main/resources/elasticsearch/index-config.robust04.json deleted file mode 100644 index c7c08e4610..0000000000 --- a/src/main/resources/elasticsearch/index-config.robust04.json +++ /dev/null @@ -1,32 +0,0 @@ -{ - "mappings": { - "properties": { - "id": { - "type": "keyword" - }, - "contents": { - "type": "text", - "store": false, - "index": true, - "analyzer": "english" - }, - "raw": { - "type": "text", - "store": true, - "index": false - } - } - }, - "settings": { - "index": { - "refresh_interval": "60s", - "similarity": { - "default": { - "type": "BM25", - "k1": "0.9", - "b": "0.4" - } - } - } - } -} diff --git a/src/main/resources/solr/anserini-twitter/conf/managed-schema b/src/main/resources/solr/anserini-twitter/conf/managed-schema deleted file mode 100644 index 08e1f08be5..0000000000 --- a/src/main/resources/solr/anserini-twitter/conf/managed-schema +++ /dev/null @@ -1,216 +0,0 @@ - - - - id - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 0.9 - 0.4 - - - diff --git a/src/main/resources/solr/anserini-twitter/conf/params.json b/src/main/resources/solr/anserini-twitter/conf/params.json deleted file mode 100644 index 06114ef257..0000000000 --- a/src/main/resources/solr/anserini-twitter/conf/params.json +++ /dev/null @@ -1,20 +0,0 @@ -{"params":{ - "query":{ - "defType":"edismax", - "q.alt":"*:*", - "rows":"10", - "fl":"*,score", - "":{"v":0} - }, - "facets":{ - "facet":"on", - "facet.mincount": "1", - "":{"v":0} - }, - "velocity":{ - "wt": "velocity", - "v.template":"browse", - "v.layout": "layout", - "":{"v":0} - } -}} \ No newline at end of file diff --git a/src/main/resources/solr/anserini-twitter/conf/protwords.txt b/src/main/resources/solr/anserini-twitter/conf/protwords.txt deleted file mode 100644 index 1dfc0abecb..0000000000 --- a/src/main/resources/solr/anserini-twitter/conf/protwords.txt +++ /dev/null @@ -1,21 +0,0 @@ -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -#----------------------------------------------------------------------- -# Use a protected word file to protect against the stemmer reducing two -# unrelated words to the same base word. - -# Some non-words that normally won't be encountered, -# just to test that they won't be stemmed. 
-dontstems -zwhacky - diff --git a/src/main/resources/solr/anserini-twitter/conf/solrconfig.xml b/src/main/resources/solr/anserini-twitter/conf/solrconfig.xml deleted file mode 100644 index 5f3e4208ef..0000000000 --- a/src/main/resources/solr/anserini-twitter/conf/solrconfig.xml +++ /dev/null @@ -1,1341 +0,0 @@ - - - - - - - - - 9.0.0 - - - - - - - - - ${solr.data.dir:} - - - - - - - - - - - - - - - - - - - 2048 - - - - - - - - - - ${solr.lock.type:native} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ${solr.ulog.dir:} - ${solr.ulog.numVersionBuckets:65536} - - - - - ${solr.autoCommit.maxTime:15000} - false - - - - - - ${solr.autoSoftCommit.maxTime:-1} - - - - - - - - - - - - - - 1024 - - - - - - - - - - - - - - - - - - - - - - - - true - - - - - - 20 - - - 200 - - - - - - - - - - - - - - - - false - - - - - - - - - - - - - - - - - - - - - - explicit - 10 - - - - - - - - - - - - - - - - explicit - json - true - - - - - - - - explicit - - - - - - _text_ - - - - - - - true - ignored_ - _text_ - - - - - - - - - text_general - - - - - - default - _text_ - solr.DirectSolrSpellChecker - - internal - - 0.5 - - 2 - - 1 - - 5 - - 4 - - 0.01 - - - - - - - - - - - - default - on - true - 10 - 5 - 5 - true - true - 10 - 5 - - - spellcheck - - - - - - - - - - true - - - tvComponent - - - - - - - - - - - - true - false - - - terms - - - - - - - - string - - - - - - explicit - - - elevator - - - - - - - - - - - 100 - - - - - - - - 70 - - 0.5 - - [-\w ,/\n\"']{20,200} - - - - - - - ]]> - ]]> - - - - - - - - - - - - - - - - - - - - - - - - ,, - ,, - ,, - ,, - ,]]> - ]]> - - - - - - 10 - .,!? - - - - - - - WORD - - - en - US - - - - - - - - - - - - [^\w-\.] - _ - - - - - - - yyyy-MM-dd'T'HH:mm:ss.SSSZ - yyyy-MM-dd'T'HH:mm:ss,SSSZ - yyyy-MM-dd'T'HH:mm:ss.SSS - yyyy-MM-dd'T'HH:mm:ss,SSS - yyyy-MM-dd'T'HH:mm:ssZ - yyyy-MM-dd'T'HH:mm:ss - yyyy-MM-dd'T'HH:mmZ - yyyy-MM-dd'T'HH:mm - yyyy-MM-dd HH:mm:ss.SSSZ - yyyy-MM-dd HH:mm:ss,SSSZ - yyyy-MM-dd HH:mm:ss.SSS - yyyy-MM-dd HH:mm:ss,SSS - yyyy-MM-dd HH:mm:ssZ - yyyy-MM-dd HH:mm:ss - yyyy-MM-dd HH:mmZ - yyyy-MM-dd HH:mm - yyyy-MM-dd - - - - - java.lang.String - text_general - - *_str - 256 - - - true - - - java.lang.Boolean - booleans - - - java.util.Date - pdates - - - java.lang.Long - java.lang.Integer - plongs - - - java.lang.Number - pdoubles - - - - - - - - - - - - - - - - - - - - - - - - - - text/plain; charset=UTF-8 - - - - - ${velocity.template.base.dir:} - ${velocity.solr.resource.loader.enabled:true} - ${velocity.params.resource.loader.enabled:false} - - - - - - - - - - - - - - diff --git a/src/main/resources/solr/anserini-twitter/conf/stopwords_en.txt b/src/main/resources/solr/anserini-twitter/conf/stopwords_en.txt deleted file mode 100644 index e11bbd5670..0000000000 --- a/src/main/resources/solr/anserini-twitter/conf/stopwords_en.txt +++ /dev/null @@ -1,49 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -# Standard english stop words taken from Lucene's StopAnalyzer -a -an -and -are -as -at -be -but -by -for -if -in -into -is -it -no -not -of -on -or -such -that -the -their -then -there -these -they -this -to -was -will -with diff --git a/src/main/resources/solr/anserini-twitter/conf/synonyms.txt b/src/main/resources/solr/anserini-twitter/conf/synonyms.txt deleted file mode 100644 index eab4ee8753..0000000000 --- a/src/main/resources/solr/anserini-twitter/conf/synonyms.txt +++ /dev/null @@ -1,29 +0,0 @@ -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -#----------------------------------------------------------------------- -#some test synonym mappings unlikely to appear in real input text -aaafoo => aaabar -bbbfoo => bbbfoo bbbbar -cccfoo => cccbar cccbaz -fooaaa,baraaa,bazaaa - -# Some synonym groups specific to this example -GB,gib,gigabyte,gigabytes -MB,mib,megabyte,megabytes -Television, Televisions, TV, TVs -#notice we use "gib" instead of "GiB" so any WordDelimiterGraphFilter coming -#after us won't split it into two words. - -# Synonym mappings can be used for spelling correction too -pixima => pixma - diff --git a/src/main/resources/solr/anserini/conf/managed-schema b/src/main/resources/solr/anserini/conf/managed-schema deleted file mode 100644 index 08e1f08be5..0000000000 --- a/src/main/resources/solr/anserini/conf/managed-schema +++ /dev/null @@ -1,216 +0,0 @@ - - - - id - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 0.9 - 0.4 - - - diff --git a/src/main/resources/solr/anserini/conf/params.json b/src/main/resources/solr/anserini/conf/params.json deleted file mode 100644 index 06114ef257..0000000000 --- a/src/main/resources/solr/anserini/conf/params.json +++ /dev/null @@ -1,20 +0,0 @@ -{"params":{ - "query":{ - "defType":"edismax", - "q.alt":"*:*", - "rows":"10", - "fl":"*,score", - "":{"v":0} - }, - "facets":{ - "facet":"on", - "facet.mincount": "1", - "":{"v":0} - }, - "velocity":{ - "wt": "velocity", - "v.template":"browse", - "v.layout": "layout", - "":{"v":0} - } -}} \ No newline at end of file diff --git a/src/main/resources/solr/anserini/conf/protwords.txt b/src/main/resources/solr/anserini/conf/protwords.txt deleted file mode 100644 index 1dfc0abecb..0000000000 --- a/src/main/resources/solr/anserini/conf/protwords.txt +++ /dev/null @@ -1,21 +0,0 @@ -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -#----------------------------------------------------------------------- -# Use a protected word file to protect against the stemmer reducing two -# unrelated words to the same base word. - -# Some non-words that normally won't be encountered, -# just to test that they won't be stemmed. -dontstems -zwhacky - diff --git a/src/main/resources/solr/anserini/conf/solrconfig.xml b/src/main/resources/solr/anserini/conf/solrconfig.xml deleted file mode 100644 index b00368515b..0000000000 --- a/src/main/resources/solr/anserini/conf/solrconfig.xml +++ /dev/null @@ -1,1343 +0,0 @@ - - - - - - - - - 9.0.0 - - - - - - - - - - ${solr.data.dir:} - - - - - - - - - - - - - - - - - - - 2048 - - - - - - - - - - ${solr.lock.type:native} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ${solr.ulog.dir:} - ${solr.ulog.numVersionBuckets:65536} - - - - - ${solr.autoCommit.maxTime:15000} - false - - - - - - ${solr.autoSoftCommit.maxTime:-1} - - - - - - - - - - - - - - 1024 - - - - - - - - - - - - - - - - - - - - - - - - true - - - - - - 20 - - - 200 - - - - - - - - - - - - - - - - false - - - - - - - - - - - - - - - - - - - - - - explicit - 10 - - - - - - - - - - - - - - - - explicit - json - true - - - - - - - - explicit - - - - - - _text_ - - - - - - - true - ignored_ - _text_ - - - - - - - - - text_general - - - - - - default - _text_ - solr.DirectSolrSpellChecker - - internal - - 0.5 - - 2 - - 1 - - 5 - - 4 - - 0.01 - - - - - - - - - - - - default - on - true - 10 - 5 - 5 - true - true - 10 - 5 - - - spellcheck - - - - - - - - - - true - - - tvComponent - - - - - - - - - - - - true - false - - - terms - - - - - - - - string - - - - - - explicit - - - elevator - - - - - - - - - - - 100 - - - - - - - - 70 - - 0.5 - - [-\w ,/\n\"']{20,200} - - - - - - - ]]> - ]]> - - - - - - - - - - - - - - - - - - - - - - - - ,, - ,, - ,, - ,, - ,]]> - ]]> - - - - - - 10 - .,!? - - - - - - - WORD - - - en - US - - - - - - - - - - - - [^\w-\.] 
- _ - - - - - - - yyyy-MM-dd'T'HH:mm:ss.SSSZ - yyyy-MM-dd'T'HH:mm:ss,SSSZ - yyyy-MM-dd'T'HH:mm:ss.SSS - yyyy-MM-dd'T'HH:mm:ss,SSS - yyyy-MM-dd'T'HH:mm:ssZ - yyyy-MM-dd'T'HH:mm:ss - yyyy-MM-dd'T'HH:mmZ - yyyy-MM-dd'T'HH:mm - yyyy-MM-dd HH:mm:ss.SSSZ - yyyy-MM-dd HH:mm:ss,SSSZ - yyyy-MM-dd HH:mm:ss.SSS - yyyy-MM-dd HH:mm:ss,SSS - yyyy-MM-dd HH:mm:ssZ - yyyy-MM-dd HH:mm:ss - yyyy-MM-dd HH:mmZ - yyyy-MM-dd HH:mm - yyyy-MM-dd - - - - - java.lang.String - text_general - - *_str - 256 - - - true - - - java.lang.Boolean - booleans - - - java.util.Date - pdates - - - java.lang.Long - java.lang.Integer - plongs - - - java.lang.Number - pdoubles - - - - - - - - - - - - - - - - - - - - - - - - - - text/plain; charset=UTF-8 - - - - - ${velocity.template.base.dir:} - ${velocity.solr.resource.loader.enabled:true} - ${velocity.params.resource.loader.enabled:false} - - - - - - - - - - - - - - - diff --git a/src/main/resources/solr/anserini/conf/stopwords_en.txt b/src/main/resources/solr/anserini/conf/stopwords_en.txt deleted file mode 100644 index e11bbd5670..0000000000 --- a/src/main/resources/solr/anserini/conf/stopwords_en.txt +++ /dev/null @@ -1,49 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Standard english stop words taken from Lucene's StopAnalyzer -a -an -and -are -as -at -be -but -by -for -if -in -into -is -it -no -not -of -on -or -such -that -the -their -then -there -these -they -this -to -was -will -with diff --git a/src/main/resources/solr/anserini/conf/synonyms.txt b/src/main/resources/solr/anserini/conf/synonyms.txt deleted file mode 100644 index eab4ee8753..0000000000 --- a/src/main/resources/solr/anserini/conf/synonyms.txt +++ /dev/null @@ -1,29 +0,0 @@ -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -#----------------------------------------------------------------------- -#some test synonym mappings unlikely to appear in real input text -aaafoo => aaabar -bbbfoo => bbbfoo bbbbar -cccfoo => cccbar cccbaz -fooaaa,baraaa,bazaaa - -# Some synonym groups specific to this example -GB,gib,gigabyte,gigabytes -MB,mib,megabyte,megabytes -Television, Televisions, TV, TVs -#notice we use "gib" instead of "GiB" so any WordDelimiterGraphFilter coming -#after us won't split it into two words. 
- -# Synonym mappings can be used for spelling correction too -pixima => pixma - diff --git a/src/main/resources/solr/schemas/acl-anthology.json b/src/main/resources/solr/schemas/acl-anthology.json deleted file mode 100644 index e358861e83..0000000000 --- a/src/main/resources/solr/schemas/acl-anthology.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "add-field": { - "name":"authors", - "type":"string", - "stored":true, - "multiValued":true - }, - "add-field": { - "name":"sigs", - "type":"string", - "stored":true, - "multiValued":true - }, - "add-field": { - "name":"venues", - "type":"string", - "stored":true, - "multiValued":true - }, - "add-field": { - "name":"pages", - "type":"string", - "stored":true, - "docValues": false - } -} \ No newline at end of file diff --git a/src/main/resources/solr/schemas/cord19.json b/src/main/resources/solr/schemas/cord19.json deleted file mode 100644 index 8a9d305b9b..0000000000 --- a/src/main/resources/solr/schemas/cord19.json +++ /dev/null @@ -1,86 +0,0 @@ -{ - "add-field": { - "name":"authors", - "type":"string", - "stored":true, - "multiValued":true - }, - "add-field": { - "name":"source_x", - "type":"string", - "stored":true, - "multiValued": true - }, - "add-field": { - "name":"pmcid", - "type":"string", - "stored":true, - "docValues": true - }, - "add-field": { - "name":"pubmed_id", - "type":"string", - "stored":true, - "docValues": true - }, - "add-field": { - "name":"publish_time", - "type":"string", - "stored":true, - "docValues":true - }, - "add-field": { - "name":"doi", - "type":"string", - "stored":true, - "docValues":true - }, - "add-field": { - "name":"journal", - "type":"string", - "stored":true, - "docValues":true - }, - "add-field": { - "name":"license", - "type":"string", - "stored":true, - "docValues":true - }, - "add-field": { - "name":"sha", - "type":"string", - "stored":true, - "docValues":true - }, - "add-field": { - "name":"url", - "type":"string", - "stored":true, - "docValues":true - }, - "add-field": { - "name":"year", - "type":"pint", - "stored":true, - "docValues":true - }, - "add-field": { - "name":"outcomes_vocab", - "type":"string", - "stored":true, - "multiValued":true - }, - "add-field": { - "name":"population_vocab", - "type":"string", - "stored":true, - "multiValued":true - }, - "add-field": { - "name":"interventions_vocab", - "type":"string", - "stored":true, - "multiValued":true - } -} diff --git a/src/main/resources/solr/schemas/core.json b/src/main/resources/solr/schemas/core.json deleted file mode 100644 index f6c205539b..0000000000 --- a/src/main/resources/solr/schemas/core.json +++ /dev/null @@ -1,56 +0,0 @@ -{ - "add-field": { - "name":"authors", - "type":"string", - "stored":true, - "multiValued":true, - "docValues":true - }, - "add-field": { - "name":"contributors", - "type":"string", - "stored":true, - "multiValued":true, - "docValues":true - }, - "add-field": { - "name":"identifiers", - "type":"string", - "stored":true, - "multiValued":true, - "docValues":true - }, - "add-field": { - "name":"journals", - "type":"string", - "stored":true, - "multiValued":true, - "docValues":false - }, - "add-field": { - "name":"relations", - "type":"string", - "stored":true, - "multiValued":true, - "docValues":true - }, - "add-field": { - "name":"subjects", - "type":"string", - "stored":true, - "multiValued":true, - "docValues":true - }, - "add-field": { - "name":"topics", - "type":"string", - "stored":true, - "multiValued":true, - "docValues":true - }, - "add-field": { - "name":"datePublished", - "type":"string", - "stored":true - 
} -} \ No newline at end of file diff --git a/src/main/resources/solr/schemas/covid.json b/src/main/resources/solr/schemas/covid.json deleted file mode 100644 index f6a1f237f3..0000000000 --- a/src/main/resources/solr/schemas/covid.json +++ /dev/null @@ -1,86 +0,0 @@ -{ - "add-field": { - "name":"authors", - "type":"string", - "stored":true, - "multiValued":true - }, - "add-field": { - "name":"source_x", - "type":"string", - "stored":true, - "docValues": true - }, - "add-field": { - "name":"pmcid", - "type":"string", - "stored":true, - "docValues": true - }, - "add-field": { - "name":"pubmed_id", - "type":"string", - "stored":true, - "docValues": true - }, - "add-field": { - "name":"publish_time", - "type":"string", - "stored":true, - "docValues":true - }, - "add-field": { - "name":"doi", - "type":"string", - "stored":true, - "docValues":true - }, - "add-field": { - "name":"journal", - "type":"string", - "stored":true, - "docValues":true - }, - "add-field": { - "name":"license", - "type":"string", - "stored":true, - "docValues":true - }, - "add-field": { - "name":"sha", - "type":"string", - "stored":true, - "docValues":true - }, - "add-field": { - "name":"url", - "type":"string", - "stored":true, - "docValues":true - }, - "add-field": { - "name":"year", - "type":"pint", - "stored":true, - "docValues":true - }, - "add-field": { - "name":"outcomes_vocab", - "type":"string", - "stored":true, - "multiValued":true - }, - "add-field": { - "name":"population_vocab", - "type":"string", - "stored":true, - "multiValued":true - }, - "add-field": { - "name":"interventions_vocab", - "type":"string", - "stored":true, - "multiValued":true - } -} diff --git a/src/main/resources/solr/solr.sh b/src/main/resources/solr/solr.sh deleted file mode 100755 index 194ea446d8..0000000000 --- a/src/main/resources/solr/solr.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/usr/bin/env sh - -### -# This script assumes a single-node SolrCloud instance is running locally. 
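For reference, run_solr_regression.py drives this script from upload_configs(); the equivalent call, shown here only as a usage sketch that assumes that script's own logger and regression_utils helper:

    # Run from src/main/resources/solr, pointing at the solrini install directory and
    # the embedded ZooKeeper that the regression script assumes (localhost:9983).
    regression_utils.run_shell_command('./solr.sh ../../../../solrini localhost:9983', logger, echo=True)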
-### - -if [[ -z "$1" ]]; then - echo "Usage: ./solr.sh " - exit 1 -fi - -# Solr install directory -SOLR_DIR=$1 - -# Solr's ZooKeeper URL -ZOOKEEPER_URL=$2 - -# Copy anserini into lib dir -mkdir ${SOLR_DIR}/lib && cp ../../../../target/anserini-*-fatjar.jar ${SOLR_DIR}/lib - -# Upload configset to Solr -${SOLR_DIR}/bin/solr zk -z ${ZOOKEEPER_URL:-localhost:9983} upconfig -n anserini -d anserini -${SOLR_DIR}/bin/solr zk -z ${ZOOKEEPER_URL:-localhost:9983} upconfig -n anserini-twitter -d anserini-twitter diff --git a/src/test/java/io/anserini/GeoIndexerTestBase.java b/src/test/java/io/anserini/GeoIndexerTestBase.java index 8c3c94e88d..e3ecc13edf 100644 --- a/src/test/java/io/anserini/GeoIndexerTestBase.java +++ b/src/test/java/io/anserini/GeoIndexerTestBase.java @@ -17,7 +17,11 @@ package io.anserini; import io.anserini.index.IndexArgs; -import org.apache.lucene.document.*; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.LatLonDocValuesField; +import org.apache.lucene.document.LatLonShape; +import org.apache.lucene.document.StringField; import org.apache.lucene.geo.Line; import org.apache.lucene.geo.Polygon; import org.apache.lucene.geo.SimpleWKTShapeParser; @@ -25,9 +29,10 @@ import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; -import org.apache.lucene.util.LuceneTestCase; -import org.junit.Before; +import org.apache.lucene.tests.util.LuceneTestCase; import org.junit.After; +import org.junit.Before; + import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; diff --git a/src/test/java/io/anserini/IndexerTestBase.java b/src/test/java/io/anserini/IndexerTestBase.java index ffd16c0b0c..8a1410bdc6 100644 --- a/src/test/java/io/anserini/IndexerTestBase.java +++ b/src/test/java/io/anserini/IndexerTestBase.java @@ -30,7 +30,7 @@ import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; -import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.tests.util.LuceneTestCase; import org.apache.lucene.util.BytesRef; import org.junit.After; import org.junit.Before; diff --git a/src/test/java/io/anserini/IndexerWithEmptyDocumentTestBase.java b/src/test/java/io/anserini/IndexerWithEmptyDocumentTestBase.java index 417a0fb0ea..e4a854d2ca 100644 --- a/src/test/java/io/anserini/IndexerWithEmptyDocumentTestBase.java +++ b/src/test/java/io/anserini/IndexerWithEmptyDocumentTestBase.java @@ -30,7 +30,7 @@ import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; -import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.tests.util.LuceneTestCase; import org.apache.lucene.util.BytesRef; import org.junit.After; import org.junit.Before; diff --git a/src/test/java/io/anserini/collection/DocumentCollectionTest.java b/src/test/java/io/anserini/collection/DocumentCollectionTest.java index 64b2faee63..ce06003621 100644 --- a/src/test/java/io/anserini/collection/DocumentCollectionTest.java +++ b/src/test/java/io/anserini/collection/DocumentCollectionTest.java @@ -16,7 +16,7 @@ package io.anserini.collection; -import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.tests.util.LuceneTestCase; import org.junit.After; import org.junit.Before; import org.junit.Test; diff --git a/src/test/java/io/anserini/integration/EndToEndTest.java 
b/src/test/java/io/anserini/integration/EndToEndTest.java index 3b509702e3..d0a55efe03 100644 --- a/src/test/java/io/anserini/integration/EndToEndTest.java +++ b/src/test/java/io/anserini/integration/EndToEndTest.java @@ -28,8 +28,8 @@ import org.apache.lucene.index.IndexReader; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; -import org.apache.lucene.util.LuceneTestCase; -import org.apache.lucene.util.TestRuleLimitSysouts; +import org.apache.lucene.tests.util.LuceneTestCase; +import org.apache.lucene.tests.util.TestRuleLimitSysouts; import org.apache.lucene.util.IOUtils; import org.junit.After; import org.junit.Before; diff --git a/src/test/java/io/anserini/integration/solr/AclAnthologyEndToEndTest.java b/src/test/java/io/anserini/integration/solr/AclAnthologyEndToEndTest.java deleted file mode 100644 index a46383b484..0000000000 --- a/src/test/java/io/anserini/integration/solr/AclAnthologyEndToEndTest.java +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Anserini: A Lucene toolkit for reproducible information retrieval research - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package io.anserini.integration.solr; - -import io.anserini.collection.AclAnthology; -import io.anserini.index.IndexArgs; -import io.anserini.index.generator.AclAnthologyGenerator; -import io.anserini.search.SearchSolr; - -public class AclAnthologyEndToEndTest extends SolrEndToEndTest { - @Override - protected String getCollectionName() { - return "AclAnthology"; - } - - @Override - protected String getSchemaAdjustmentFile() { - return "solr/schemas/acl-anthology.json"; - } - - @Override - public IndexArgs getIndexArgs() { - IndexArgs indexArgs = createDefaultIndexArgs(); - indexArgs.input = "src/test/resources/sample_docs/acl"; - indexArgs.collectionClass = AclAnthology.class.getSimpleName(); - indexArgs.generatorClass = AclAnthologyGenerator.class.getSimpleName(); - return indexArgs; - } - - @Override - protected SearchSolr.Args getSearchArgs() { - return createSearchArgs("TsvInt", "src/test/resources/sample_topics/acl_topics.tsv"); - } - - @Override - protected String[] getRefRankingResult() { - return new String[]{ // bm25 - "1 Q0 C00-1007 1 0.294000 Solrini", - "1 Q0 E17-1003 2 0.186100 Solrini", - "2 Q0 C00-1003 1 0.622700 Solrini" - }; - } -} diff --git a/src/test/java/io/anserini/integration/solr/CoreEndToEndTest.java b/src/test/java/io/anserini/integration/solr/CoreEndToEndTest.java deleted file mode 100644 index 761e12e537..0000000000 --- a/src/test/java/io/anserini/integration/solr/CoreEndToEndTest.java +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Anserini: A Lucene toolkit for reproducible information retrieval research - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package io.anserini.integration.solr; - -import io.anserini.collection.CoreCollection; -import io.anserini.index.IndexArgs; -import io.anserini.index.generator.CoreGenerator; -import io.anserini.search.SearchSolr; - -public class CoreEndToEndTest extends SolrEndToEndTest { - @Override - protected String getCollectionName() { - return "Core"; - } - - @Override - protected String getSchemaAdjustmentFile() { - return "solr/schemas/core.json"; - } - - @Override - protected IndexArgs getIndexArgs() { - IndexArgs indexArgs = createDefaultIndexArgs(); - indexArgs.input = "src/test/resources/sample_docs/core"; - indexArgs.collectionClass = CoreCollection.class.getSimpleName(); - indexArgs.generatorClass = CoreGenerator.class.getSimpleName(); - return indexArgs; - } - - @Override - protected SearchSolr.Args getSearchArgs() { - return createSearchArgs("TsvInt", "src/test/resources/sample_topics/core_topics.tsv"); - } - - @Override - protected String[] getRefRankingResult() { - return new String[]{ // bm25 - "1 Q0 coreDoc1 1 0.243200 Solrini", - "1 Q0 doi2 2 0.243199 Solrini", - "2 Q0 coreDoc1 1 0.243200 Solrini", - "2 Q0 doi2 2 0.243199 Solrini", - "3 Q0 fullCoreDoc 1 0.534600 Solrini" - }; - } -} diff --git a/src/test/java/io/anserini/integration/solr/SolrEndToEndTest.java b/src/test/java/io/anserini/integration/solr/SolrEndToEndTest.java deleted file mode 100644 index d2529d7c6d..0000000000 --- a/src/test/java/io/anserini/integration/solr/SolrEndToEndTest.java +++ /dev/null @@ -1,222 +0,0 @@ -/* - * Anserini: A Lucene toolkit for reproducible information retrieval research - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package io.anserini.integration.solr; - -import io.anserini.index.IndexArgs; -import io.anserini.index.IndexCollection; -import io.anserini.search.SearchSolr; -import org.apache.commons.io.FileUtils; -import org.apache.commons.pool2.BasePooledObjectFactory; -import org.apache.commons.pool2.ObjectPool; -import org.apache.commons.pool2.PooledObject; -import org.apache.commons.pool2.impl.DefaultPooledObject; -import org.apache.commons.pool2.impl.GenericObjectPool; -import org.apache.commons.pool2.impl.GenericObjectPoolConfig; -import org.apache.logging.log4j.LogManager; -import org.apache.logging.log4j.Logger; -import org.apache.lucene.util.LuceneTestCase; -import org.apache.solr.client.solrj.SolrClient; -import org.apache.solr.client.solrj.embedded.EmbeddedSolrServer; -import org.apache.solr.client.solrj.request.CoreAdminRequest; -import org.apache.solr.client.solrj.request.json.DirectJsonQueryRequest; -import org.apache.solr.client.solrj.response.QueryResponse; -import org.apache.solr.common.params.CommonParams; -import org.apache.solr.common.params.ModifiableSolrParams; -import org.apache.solr.core.NodeConfig; -import org.apache.solr.core.SolrResourceLoader; -import org.junit.After; -import org.junit.Before; -import org.junit.Test; - -import java.io.BufferedReader; -import java.io.File; -import java.io.FileReader; -import java.lang.reflect.Field; -import java.net.URL; -import java.nio.file.Files; - - -@LuceneTestCase.SuppressSysoutChecks(bugUrl = "None") -public abstract class SolrEndToEndTest extends LuceneTestCase { - private static final Logger LOG = LogManager.getLogger(SolrEndToEndTest.class); - - protected ObjectPool stubSolrPool; - protected final String searchOutputPrefix = "e2eTestSearch"; - - protected EmbeddedSolrServer client; - - protected static File getFile(String path) { - final URL url = SolrEndToEndTest.class.getClassLoader().getResource(path); - if (url != null) { - try { - return new File(url.toURI()); - } catch (Exception e) { - throw new RuntimeException("Resource was found on classpath, but cannot be resolved to a normal file: " + path); - } - } - final File file = new File(path); - if (file.exists()) { - return file; - } - throw new RuntimeException("Cannot find resource in classpath or in file-system (relative to CWD): " + path); - } - - @Before - @Override - public void setUp() throws Exception { - super.setUp(); - - final File solrHome = createTempDir().toFile(); - final File configSetBaseDir = new File(solrHome.toPath() + File.separator + "configsets"); - FileUtils.copyDirectory(getFile("solr/anserini"), new File(configSetBaseDir + File.separator + "anserini")); - - SolrResourceLoader loader = new SolrResourceLoader(solrHome.toPath()); - NodeConfig config = new NodeConfig.NodeConfigBuilder("embeddedSolrServerNode", loader.getInstancePath()) - .setConfigSetBaseDirectory(configSetBaseDir.getAbsolutePath()).build(); - client = new EmbeddedSolrServer(config, getCollectionName()); - LOG.info("Created Embedded Solr Server"); - - CoreAdminRequest.Create createRequest = new CoreAdminRequest.Create(); - createRequest.setCoreName(getCollectionName()); - createRequest.setConfigSet("anserini"); - createRequest.process(client); - client.commit(); - LOG.info("Created Solr Core: " + getCollectionName()); - - GenericObjectPoolConfig poolConfig = new GenericObjectPoolConfig<>(); - poolConfig.setMaxTotal(1); // only 1 EmbeddedSolrServer instance will be created by getSolrClient - poolConfig.setMinIdle(1); - stubSolrPool = new GenericObjectPool<>(new 
StubSolrClientFactory(client), poolConfig); - } - - @After - @Override - public void tearDown() throws Exception { - super.tearDown(); - - client.deleteByQuery("*:*"); - client.commit(); - client.close(); - stubSolrPool.close(); - } - - protected IndexArgs createDefaultIndexArgs() { - IndexArgs args = new IndexArgs(); - - args.solrIndex = getCollectionName(); - args.threads = 1; - args.storePositions = true; - args.storeDocvectors = true; - args.storeContents = true; - args.storeRaw = true; - args.optimize = true; - args.quiet = true; - args.solr = true; - - return args; - } - - protected SearchSolr.Args createSearchArgs(String topicReader, String topicFile) { - SearchSolr.Args args = new SearchSolr.Args(); - - args.solrIndex = getCollectionName(); - args.output = searchOutputPrefix + topicReader; - args.topicReader = topicReader; - args.topics = new String[]{topicFile}; - args.zkUrl = "localhost"; // SearchSolr initialization workaround - - return args; - } - - protected static class StubSolrClientFactory extends BasePooledObjectFactory { - final SolrClient client; - - public StubSolrClientFactory(SolrClient client) { - this.client = client; - } - - @Override - public SolrClient create() { - return this.client; - } - - @Override - public PooledObject wrap(SolrClient solrClient) { - return new DefaultPooledObject<>(solrClient); - } - } - - protected IndexCollection getIndexRunner(IndexArgs args) throws Exception { - IndexCollection runner = new IndexCollection(args); - Field f = runner.getClass().getDeclaredField("solrPool"); - f.setAccessible(true); - f.set(runner, stubSolrPool); - return runner; - } - - protected SearchSolr getSearchRunner(SearchSolr.Args args) throws Exception { - SearchSolr runner = new SearchSolr(args); - Field f = runner.getClass().getDeclaredField("client"); - f.setAccessible(true); - ((SolrClient) f.get(runner)).close(); // close the old client - f.set(runner, client); - return runner; - } - - protected abstract String getCollectionName(); - - protected abstract String getSchemaAdjustmentFile(); - - protected abstract IndexArgs getIndexArgs(); - - protected abstract SearchSolr.Args getSearchArgs(); - - protected abstract String[] getRefRankingResult(); - - @Test - public void testIndexAndSearch() throws Exception { - String schemaAdjustmentFile = getSchemaAdjustmentFile(); - if (schemaAdjustmentFile != null) { - // update schema, much like curl -X POST -H 'Content-type:application/json' --data-binary SCHEMA_NAME.json http://localhost:8983/solr/COLLECTION_NAME/schema - String schemaJson = Files.readString(getFile(schemaAdjustmentFile).toPath()); - ModifiableSolrParams params = new ModifiableSolrParams(); - params.add(CommonParams.QT, "/schema"); - DirectJsonQueryRequest schemaRequest = new DirectJsonQueryRequest(schemaJson, params); - QueryResponse response = schemaRequest.process(client, getCollectionName()); - assertEquals(0, response.getStatus()); - } - - IndexArgs indexArgs = getIndexArgs(); - IndexCollection indexRunner = getIndexRunner(indexArgs); - indexRunner.run(); - - SearchSolr.Args searchArgs = getSearchArgs(); - SearchSolr searchRunner = getSearchRunner(searchArgs); - searchRunner.runTopics(); - - BufferedReader br = new BufferedReader(new FileReader(searchArgs.output)); - String[] ref = getRefRankingResult(); - String s; - int cnt = 0; - while ((s = br.readLine()) != null) { - assertEquals(ref[cnt], s); - cnt++; - } - assertEquals(cnt, ref.length); - FileUtils.deleteQuietly(new File(searchArgs.output)); - } -} diff --git 
diff --git a/src/test/java/io/anserini/integration/solr/TrecEndToEndTest.java b/src/test/java/io/anserini/integration/solr/TrecEndToEndTest.java
deleted file mode 100644
index f9d95a9d29..0000000000
--- a/src/test/java/io/anserini/integration/solr/TrecEndToEndTest.java
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * Anserini: A Lucene toolkit for reproducible information retrieval research
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package io.anserini.integration.solr;
-
-import io.anserini.collection.TrecCollection;
-import io.anserini.index.IndexArgs;
-import io.anserini.search.SearchSolr;
-
-public class TrecEndToEndTest extends SolrEndToEndTest {
-  @Override
-  protected String getCollectionName() {
-    return "Trec";
-  }
-
-  @Override
-  protected String getSchemaAdjustmentFile() {
-    return null; // no need to adjust schema
-  }
-
-  @Override
-  protected IndexArgs getIndexArgs() {
-    IndexArgs indexArgs = createDefaultIndexArgs();
-    indexArgs.input = "src/test/resources/sample_docs/trec/collection2";
-    indexArgs.collectionClass = TrecCollection.class.getSimpleName();
-    return indexArgs;
-  }
-
-  @Override
-  protected SearchSolr.Args getSearchArgs() {
-    return createSearchArgs("Trec", "src/test/resources/sample_topics/Trec");
-  }
-
-  @Override
-  protected String[] getRefRankingResult() {
-    return new String[]{ // bm25
-        "1 Q0 DOC222 1 0.343200 Solrini",
-        "1 Q0 TREC_DOC_1 2 0.333400 Solrini",
-        "1 Q0 WSJ_1 3 0.068700 Solrini"
-    };
-  }
-}
diff --git a/src/test/java/io/anserini/ltr/BaseFeatureExtractorTest.java b/src/test/java/io/anserini/ltr/BaseFeatureExtractorTest.java
index b93cd5b42b..bfbe194a68 100644
--- a/src/test/java/io/anserini/ltr/BaseFeatureExtractorTest.java
+++ b/src/test/java/io/anserini/ltr/BaseFeatureExtractorTest.java
@@ -31,7 +31,7 @@
 import org.apache.lucene.index.IndexWriterConfig;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.FSDirectory;
-import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.tests.util.LuceneTestCase;
 import org.junit.After;
 import org.junit.Before;
 
diff --git a/src/test/java/io/anserini/search/query/DisjunctionMaxQueryGeneratorTest.java b/src/test/java/io/anserini/search/query/DisjunctionMaxQueryGeneratorTest.java
index 6b82cdcc29..00ce6a20a0 100644
--- a/src/test/java/io/anserini/search/query/DisjunctionMaxQueryGeneratorTest.java
+++ b/src/test/java/io/anserini/search/query/DisjunctionMaxQueryGeneratorTest.java
@@ -21,7 +21,7 @@
 import org.apache.lucene.search.BoostQuery;
 import org.apache.lucene.search.DisjunctionMaxQuery;
 import org.apache.lucene.search.Query;
-import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.tests.util.LuceneTestCase;
 import org.junit.Test;
 
 import java.util.Map;
diff --git a/src/test/java/io/anserini/search/query/SdmQueryTest.java b/src/test/java/io/anserini/search/query/SdmQueryTest.java
index 89663ffdca..c032882834 100644
--- a/src/test/java/io/anserini/search/query/SdmQueryTest.java
+++ b/src/test/java/io/anserini/search/query/SdmQueryTest.java
@@ -36,7 +36,7 @@
 import org.apache.lucene.search.similarities.BM25Similarity;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.FSDirectory;
-import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.tests.util.LuceneTestCase;
 import org.junit.After;
 import org.junit.Before;
 import org.junit.Test;
diff --git a/src/test/java/io/anserini/util/FeatureVectorTest.java b/src/test/java/io/anserini/util/FeatureVectorTest.java
index c747194004..12694bd90b 100644
--- a/src/test/java/io/anserini/util/FeatureVectorTest.java
+++ b/src/test/java/io/anserini/util/FeatureVectorTest.java
@@ -16,7 +16,7 @@
 
 package io.anserini.util;
 
-import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.tests.util.LuceneTestCase;
 import org.junit.Test;
 
 import java.util.Arrays;
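
Note on the test hunks above: each makes the same one-line change, swapping org.apache.lucene.util.LuceneTestCase for org.apache.lucene.tests.util.LuceneTestCase, because the Lucene 9.x test framework lives under the org.apache.lucene.tests.* packages (the lucene-test-framework artifact). The sketch below is illustrative only and not part of the patch; the class and method names are invented, and it assumes lucene-test-framework 9.x is on the test classpath.

// Minimal sketch: a test built on the relocated base class. Only the import changes;
// the helpers inherited from LuceneTestCase (random(), createTempDir(), assertions) are unchanged.
import org.apache.lucene.tests.util.LuceneTestCase;
import org.junit.Test;

public class PackageRelocationExampleTest extends LuceneTestCase {
  @Test
  public void testRelocatedBaseClass() {
    // random() and createTempDir() come from LuceneTestCase, same as before the package move.
    assertNotNull(random());
    assertNotNull(createTempDir());
  }
}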