Multilingual Retrieval (#901)

+ Add analyzer for different languages. + Add documents and regression test for TREC2002 Arabic, CLEF2006 French, FIRE2012 English, Bengali and Hindi.
castorini · Nov 27, 2019 · 4116188 · 4116188
1 parent fb9ecf4
commit 4116188
Show file tree

Hide file tree

Showing 41 changed files with 179,933 additions and 0 deletions.
diff --git a/docs/regressions-clef06-fr.md b/docs/regressions-clef06-fr.md
@@ -0,0 +1,56 @@
+# Anserini: Regressions for [CLEF2006 Monolingual French](http://www.clef-initiative.eu/edition/clef2006)
+
+This page documents regression experiments for [CLEF2006 monolingual French topics](http://www.clef-initiative.eu/edition/clef2006).
+The description of the document collection can be found in the [CLEF corpus page](http://www.clef-initiative.eu/dataset/corpus).
+
+The exact configurations for these regressions are stored in [this YAML file](../src/main/resources/regression/clef06-fr.yaml).
+Note that this page is automatically generated from [this template](../src/main/resources/docgen/templates/clef06-fr.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead.
+
+## Indexing
+
+Typical indexing command:
+
+```
+nohup sh target/appassembler/bin/IndexCollection -collection JsonCollection \
+-generator LuceneDocumentGenerator -threads 16 -input /path/to/clef06-fr -index \
+lucene-index.clef06-fr.pos+docvectors+rawdocs -storePositions -storeDocvectors \
+-storeRawDocs -language fr >& log.clef06-fr.pos+docvectors+rawdocs &
+```
+
+The directory `/path/to/clef06-fr/` should be a directory containing the collection in JSON Lines format.
+
+For additional details, see explanation of [common indexing options](common-indexing-options.md).
+
+## Retrieval
+
+Topics and qrels are stored in [`src/main/resources/topics-and-qrels/`](../src/main/resources/topics-and-qrels/).
+The regression experiments here evaluate on the 49 questions.
+
+After indexing has completed, you should be able to perform retrieval as follows:
+
+```
+nohup target/appassembler/bin/SearchCollection -topicreader TsvString -index lucene-index.clef06-fr.pos+docvectors+rawdocs -topics src/main/resources/topics-and-qrels/topics.clef06fr.mono.fr.txt -output run.clef06-fr.bm25.topics.clef06fr.mono.fr.txt -language fr -bm25 &
+
+```
+
+Evaluation can be performed using `trec_eval`:
+
+```
+eval/trec_eval.9.0.4/trec_eval -m map -m P.30 src/main/resources/topics-and-qrels/qrels.clef06fr.txt run.clef06-fr.bm25.topics.clef06fr.mono.fr.txt
+
+```
+
+## Effectiveness
+
+With the above commands, you should be able to replicate the following results:
+
+MAP                                     | BM25      |
+:---------------------------------------|-----------|
+[CLEF2006 (French monolingual)](http://www.clef-initiative.eu/edition/clef2006)| 0.3111    |
+
+
+P30                                     | BM25      |
+:---------------------------------------|-----------|
+[CLEF2006 (French monolingual)](http://www.clef-initiative.eu/edition/clef2006)| 0.2735    |
+
+
diff --git a/docs/regressions-fire12-bn.md b/docs/regressions-fire12-bn.md
@@ -0,0 +1,56 @@
+# Anserini: Regressions for [FIRE 2012 Monolingual Bengali](http://isical.ac.in/~fire/2012/adhoc.html)
+
+This page documents regression experiments for [FIRE 2012 Ad-hoc retrieval (Monolingual Bengali topic)](http://isical.ac.in/~fire/2012/adhoc.html).
+The document collection can be found in [FIRE 2012 data page](http://fire.irsi.res.in/fire/static/data).
+
+The exact configurations for these regressions are stored in [this YAML file](../src/main/resources/regression/fire12-bn.yaml).
+Note that this page is automatically generated from [this template](../src/main/resources/docgen/templates/fire12-bn.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead.
+
+## Indexing
+
+Typical indexing command:
+
+```
+nohup sh target/appassembler/bin/IndexCollection -collection TrecCollection \
+-generator LuceneDocumentGenerator -threads 16 -input /path/to/fire12-bn -index \
+lucene-index.fire12-bn.pos+docvectors+rawdocs -storePositions -storeDocvectors \
+-storeRawDocs -language bn >& log.fire12-bn.pos+docvectors+rawdocs &
+```
+
+The directory `/path/to/fire12-bn/` should contain the collection, i.e., the `bn_ABP` and `bn_BDNews24` directories.
+
+For additional details, see explanation of [common indexing options](common-indexing-options.md).
+
+## Retrieval
+
+Topics and qrels are stored in [`src/main/resources/topics-and-qrels/`](../src/main/resources/topics-and-qrels/).
+The regression experiments here evaluate on the 50 questions.
+
+After indexing has completed, you should be able to perform retrieval as follows:
+
+```
+nohup target/appassembler/bin/SearchCollection -topicreader TsvString -index lucene-index.fire12-bn.pos+docvectors+rawdocs -topics src/main/resources/topics-and-qrels/topics.fire12bn.176-225.txt -output run.fire12-bn.bm25.topics.fire12bn.176-225.txt -language bn -bm25 &
+
+```
+
+Evaluation can be performed using `trec_eval`:
+
+```
+eval/trec_eval.9.0.4/trec_eval -m map -m P.30 src/main/resources/topics-and-qrels/qrels.fire12bn.176-225.txt run.fire12-bn.bm25.topics.fire12bn.176-225.txt
+
+```
+
+## Effectiveness
+
+With the above commands, you should be able to replicate the following results:
+
+MAP                                     | BM25      |
+:---------------------------------------|-----------|
+[FIRE2012 (Bengali monolingual)](http://isical.ac.in/~fire/2012/adhoc.html)| 0.2881    |
+
+
+P30                                     | BM25      |
+:---------------------------------------|-----------|
+[FIRE2012 (Bengali monolingual)](http://isical.ac.in/~fire/2012/adhoc.html)| 0.3360    |
+
+
diff --git a/docs/regressions-fire12-en.md b/docs/regressions-fire12-en.md
@@ -0,0 +1,56 @@
+# Anserini: Regressions for [FIRE 2012 Monolingual English](http://isical.ac.in/~fire/2012/adhoc.html)
+
+This page documents regression experiments for [FIRE 2012 Ad-hoc retrieval (Monolingual English topic)](http://isical.ac.in/~fire/2012/adhoc.html).
+The document collection can be found in [FIRE 2012 data page](http://fire.irsi.res.in/fire/static/data).
+
+The exact configurations for these regressions are stored in [this YAML file](../src/main/resources/regression/fire12-en.yaml).
+Note that this page is automatically generated from [this template](../src/main/resources/docgen/templates/fire12-en.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead.
+
+## Indexing
+
+Typical indexing command:
+
+```
+nohup sh target/appassembler/bin/IndexCollection -collection TrecCollection \
+-generator LuceneDocumentGenerator -threads 16 -input /path/to/fire12-en -index \
+lucene-index.fire12-en.pos+docvectors+rawdocs -storePositions -storeDocvectors \
+-storeRawDocs -language en >& log.fire12-en.pos+docvectors+rawdocs &
+```
+
+The directory `/path/to/fire12-en/` should contain the collection, i.e., the `en_BDNews24` and `en_TheTelegraph_2001-2010` directories.
+
+For additional details, see explanation of [common indexing options](common-indexing-options.md).
+
+## Retrieval
+
+Topics and qrels are stored in [`src/main/resources/topics-and-qrels/`](../src/main/resources/topics-and-qrels/).
+The regression experiments here evaluate on the 50 questions.
+
+After indexing has completed, you should be able to perform retrieval as follows:
+
+```
+nohup target/appassembler/bin/SearchCollection -topicreader TsvString -index lucene-index.fire12-en.pos+docvectors+rawdocs -topics src/main/resources/topics-and-qrels/topics.fire12en.176-225.txt -output run.fire12-en.bm25.topics.fire12en.176-225.txt -language en -bm25 &
+
+```
+
+Evaluation can be performed using `trec_eval`:
+
+```
+eval/trec_eval.9.0.4/trec_eval -m map -m P.30 src/main/resources/topics-and-qrels/qrels.fire12en.176-225.txt run.fire12-en.bm25.topics.fire12en.176-225.txt
+
+```
+
+## Effectiveness
+
+With the above commands, you should be able to replicate the following results:
+
+MAP                                     | BM25      |
+:---------------------------------------|-----------|
+[FIRE2012 (English monolingual)](http://isical.ac.in/~fire/2012/adhoc.html)| 0.3867    |
+
+
+P30                                     | BM25      |
+:---------------------------------------|-----------|
+[FIRE2012 (English monolingual)](http://isical.ac.in/~fire/2012/adhoc.html)| 0.3920    |
+
+
diff --git a/docs/regressions-fire12-hi.md b/docs/regressions-fire12-hi.md
@@ -0,0 +1,56 @@
+# Anserini: Regressions for [FIRE 2012 Monolingual Hindi](http://isical.ac.in/~fire/2012/adhoc.html)
+
+This page documents regression experiments for [FIRE 2012 Ad-hoc retrieval (Monolingual Hindi topic)](http://isical.ac.in/~fire/2012/adhoc.html).
+The document collection can be found in [FIRE 2012 data page](http://fire.irsi.res.in/fire/static/data).
+
+The exact configurations for these regressions are stored in [this YAML file](../src/main/resources/regression/fire12-hi.yaml).
+Note that this page is automatically generated from [this template](../src/main/resources/docgen/templates/fire12-hi.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead.
+
+## Indexing
+
+Typical indexing command:
+
+```
+nohup sh target/appassembler/bin/IndexCollection -collection TrecCollection \
+-generator LuceneDocumentGenerator -threads 16 -input /path/to/fire12-hi -index \
+lucene-index.fire12-hi.pos+docvectors+rawdocs -storePositions -storeDocvectors \
+-storeRawDocs -language hi >& log.fire12-hi.pos+docvectors+rawdocs &
+```
+
+The directory `/path/to/fire12-hi/` should contain the collection, i.e., the `hi_AmarUjala` and `hi_NavbharatTimes` directories.
+
+For additional details, see explanation of [common indexing options](common-indexing-options.md).
+
+## Retrieval
+
+Topics and qrels are stored in [`src/main/resources/topics-and-qrels/`](../src/main/resources/topics-and-qrels/).
+The regression experiments here evaluate on the 50 questions.
+
+After indexing has completed, you should be able to perform retrieval as follows:
+
+```
+nohup target/appassembler/bin/SearchCollection -topicreader TsvString -index lucene-index.fire12-hi.pos+docvectors+rawdocs -topics src/main/resources/topics-and-qrels/topics.fire12hi.176-225.txt -output run.fire12-hi.bm25.topics.fire12hi.176-225.txt -language hi -bm25 &
+
+```
+
+Evaluation can be performed using `trec_eval`:
+
+```
+eval/trec_eval.9.0.4/trec_eval -m map -m P.30 src/main/resources/topics-and-qrels/qrels.fire12hi.176-225.txt run.fire12-hi.bm25.topics.fire12hi.176-225.txt
+
+```
+
+## Effectiveness
+
+With the above commands, you should be able to replicate the following results:
+
+MAP                                     | BM25      |
+:---------------------------------------|-----------|
+[FIRE2012 (Hindi monolingual)](http://isical.ac.in/~fire/2012/adhoc.html)| 0.3867    |
+
+
+P30                                     | BM25      |
+:---------------------------------------|-----------|
+[FIRE2012 (Hindi monolingual)](http://isical.ac.in/~fire/2012/adhoc.html)| 0.3920    |
+
+
diff --git a/docs/regressions-trec02-ar.md b/docs/regressions-trec02-ar.md
@@ -0,0 +1,56 @@
+# Anserini: Regressions for [TREC2002 Monolingual Arabic](https://trec.nist.gov/pubs/trec11/t11_proceedings.html)
+
+This page documents regression experiments for [TREC2002 Arabic monolingual topics](https://trec.nist.gov/pubs/trec11/t11_proceedings.html).
+The description of the document collection can be found in the [TREC data page](https://trec.nist.gov/data/docs_noneng.html): Agence France Presse (AFP) Arabic newswire, from [LDC2001T55 (Arabic Newswire Part 1)](https://catalog.ldc.upenn.edu/LDC2001T55).
+
+The exact configurations for these regressions are stored in [this YAML file](../src/main/resources/regression/trec02-ar.yaml).
+Note that this page is automatically generated from [this template](../src/main/resources/docgen/templates/trec02-ar.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead.
+
+## Indexing
+
+Typical indexing command:
+
+```
+nohup sh target/appassembler/bin/IndexCollection -collection JsonCollection \
+-generator LuceneDocumentGenerator -threads 16 -input /path/to/trec02-ar -index \
+lucene-index.trec02-ar.pos+docvectors+rawdocs -storePositions -storeDocvectors \
+-storeRawDocs -language ar >& log.trec02-ar.pos+docvectors+rawdocs &
+```
+
+The directory `/path/to/trec02-ar/` should be a directory containing the collection, 2337 gzipped files from LDC2007T38.
+
+For additional details, see explanation of [common indexing options](common-indexing-options.md).
+
+## Retrieval
+
+Topics and qrels are stored in [`src/main/resources/topics-and-qrels/`](../src/main/resources/topics-and-qrels/).
+The regression experiments here evaluate on the 50 questions.
+
+After indexing has completed, you should be able to perform retrieval as follows:
+
+```
+nohup target/appassembler/bin/SearchCollection -topicreader TsvString -index lucene-index.trec02-ar.pos+docvectors+rawdocs -topics src/main/resources/topics-and-qrels/topics.trec02ar.mono.ar.txt -output run.trec02-ar.bm25.topics.trec02ar.mono.ar.txt -language ar -bm25 &
+
+```
+
+Evaluation can be performed using `trec_eval`:
+
+```
+eval/trec_eval.9.0.4/trec_eval -m map -m P.30 src/main/resources/topics-and-qrels/qrels.trec02ar.txt run.trec02-ar.bm25.topics.trec02ar.mono.ar.txt
+
+```
+
+## Effectiveness
+
+With the above commands, you should be able to replicate the following results:
+
+MAP                                     | BM25      |
+:---------------------------------------|-----------|
+[TREC2002 (Arabic monolingual)](../src/main/resources/topics-and-qrels/topics.trec02ar.mono.ar.txt)| 0.2932    |
+
+
+P30                                     | BM25      |
+:---------------------------------------|-----------|
+[TREC2002 (Arabic monolingual)](../src/main/resources/topics-and-qrels/topics.trec02ar.mono.ar.txt)| 0.3313    |
+
+
diff --git a/src/main/java/io/anserini/index/IndexCollection.java b/src/main/java/io/anserini/index/IndexCollection.java
@@ -45,8 +45,12 @@
 import org.apache.logging.log4j.Logger;
 import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.ar.ArabicAnalyzer;
+import org.apache.lucene.analysis.bn.BengaliAnalyzer;
 import org.apache.lucene.analysis.cjk.CJKAnalyzer;
 import org.apache.lucene.analysis.fr.FrenchAnalyzer;
+import org.apache.lucene.analysis.hi.HindiAnalyzer;
+import org.apache.lucene.analysis.es.SpanishAnalyzer;
+import org.apache.lucene.analysis.de.GermanAnalyzer;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.index.*;
 import org.apache.lucene.search.similarities.BM25Similarity;
@@ -689,6 +693,10 @@ public void run() throws IOException {
       final CJKAnalyzer chineseAnalyzer = new CJKAnalyzer();
       final ArabicAnalyzer arabicAnalyzer = new ArabicAnalyzer();
       final FrenchAnalyzer frenchAnalyzer = new FrenchAnalyzer();
+      final HindiAnalyzer hindiAnalyzer = new HindiAnalyzer();
+      final BengaliAnalyzer bengaliAnalyzer = new BengaliAnalyzer();
+      final GermanAnalyzer germanAnalyzer = new GermanAnalyzer();
+      final SpanishAnalyzer spanishAnalyzer = new SpanishAnalyzer();
       final EnglishStemmingAnalyzer analyzer = args.keepStopwords ?
           new EnglishStemmingAnalyzer(args.stemmer, CharArraySet.EMPTY_SET) : new EnglishStemmingAnalyzer(args.stemmer);
       final TweetAnalyzer tweetAnalyzer = new TweetAnalyzer(args.tweetStemming);
@@ -701,6 +709,14 @@ public void run() throws IOException {
         config = new IndexWriterConfig(arabicAnalyzer);
       } else if (args.language.equals("fr")) {
         config = new IndexWriterConfig(frenchAnalyzer);
+      } else if (args.language.equals("hi")) {
+        config = new IndexWriterConfig(hindiAnalyzer);
+      } else if (args.language.equals("bn")) {
+        config = new IndexWriterConfig(bengaliAnalyzer);
+      } else if (args.language.equals("de")) {
+        config = new IndexWriterConfig(germanAnalyzer);
+      } else if (args.language.equals("es")) {
+        config = new IndexWriterConfig(spanishAnalyzer);
       } else {
         config = new IndexWriterConfig(analyzer);
       }

diff --git a/src/main/java/io/anserini/search/SearchCollection.java b/src/main/java/io/anserini/search/SearchCollection.java
@@ -41,8 +41,12 @@
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.ar.ArabicAnalyzer;
+import org.apache.lucene.analysis.bn.BengaliAnalyzer;
 import org.apache.lucene.analysis.cjk.CJKAnalyzer;
 import org.apache.lucene.analysis.fr.FrenchAnalyzer;
+import org.apache.lucene.analysis.hi.HindiAnalyzer;
+import org.apache.lucene.analysis.de.GermanAnalyzer;
+import org.apache.lucene.analysis.es.SpanishAnalyzer;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.LongPoint;
 import org.apache.lucene.index.DirectoryReader;
@@ -226,6 +230,14 @@ public SearchCollection(SearchArgs args) throws IOException {
       analyzer = new ArabicAnalyzer();
     } else if (args.language.equals("fr")) {
       analyzer = new FrenchAnalyzer();
+    } else if (args.language.equals("hi")) {
+      analyzer = new HindiAnalyzer();
+    } else if (args.language.equals("bn")) {
+      analyzer = new BengaliAnalyzer();
+    } else if (args.language.equals("de")) {
+      analyzer = new GermanAnalyzer();
+    } else if (args.language.equals("es")) {
+      analyzer = new SpanishAnalyzer();
     } else {
       // Default to English
       analyzer = args.keepstop ?

diff --git a/src/main/java/io/anserini/search/SimpleSearcher.java b/src/main/java/io/anserini/search/SimpleSearcher.java
@@ -30,9 +30,13 @@
 import org.apache.logging.log4j.Logger;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.ar.ArabicAnalyzer;
+import org.apache.lucene.analysis.bn.BengaliAnalyzer;
 import org.apache.lucene.analysis.cjk.CJKAnalyzer;
 import org.apache.lucene.analysis.fr.FrenchAnalyzer;
 import org.apache.lucene.analysis.en.EnglishAnalyzer;
+import org.apache.lucene.analysis.hi.HindiAnalyzer;
+import org.apache.lucene.analysis.de.GermanAnalyzer;
+import org.apache.lucene.analysis.es.SpanishAnalyzer;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.LongPoint;
 import org.apache.lucene.index.DirectoryReader;
@@ -140,6 +144,14 @@ public void setLanguage(String language) {
       this.analyzer = new ArabicAnalyzer();
     } else if (language.equals("fr")) {
       this.analyzer = new FrenchAnalyzer();
+    } else if (language.equals("hi")) {
+      this.analyzer = new HindiAnalyzer();
+    } else if (language.equals("bn")) {
+      this.analyzer = new BengaliAnalyzer();
+    } else if (language.equals("de")) {
+      this.analyzer = new GermanAnalyzer();
+    } else if (language.equals("es")) {
+      this.analyzer = new SpanishAnalyzer();
     }
   }