From bdeae1762d555cafffd8242bff2f32e8a665cc82 Mon Sep 17 00:00:00 2001 From: Ryan Clancy Date: Mon, 16 Sep 2019 15:40:42 -0400 Subject: [PATCH 1/2] Fix issue with duplicate field for WaPo in Solr. --- src/main/java/io/anserini/index/IndexCollection.java | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/main/java/io/anserini/index/IndexCollection.java b/src/main/java/io/anserini/index/IndexCollection.java index 438a132528..825acef909 100644 --- a/src/main/java/io/anserini/index/IndexCollection.java +++ b/src/main/java/io/anserini/index/IndexCollection.java @@ -281,6 +281,11 @@ public void run() { if (field.fieldType().docValuesType() != DocValuesType.NONE) { continue; } + // If the field is already in the doc, skip it. + // This fixes an issue with WaPo where published_date is in the Lucene doc as LongPoint and StoredField. Solr needs one copy, more fine-grained control in config. + if (solrDocument.containsKey(field.name())) { + continue; + } if (field.stringValue() != null) { // For some reason, id is multi-valued with null as one of the values solrDocument.addField(field.name(), field.stringValue()); } else if (field.numericValue() != null) { From ef3121c95ac3d7191fbff865943eed597afdc7bc Mon Sep 17 00:00:00 2001 From: Ryan Clancy Date: Mon, 16 Sep 2019 17:16:44 -0400 Subject: [PATCH 2/2] Add known issues for Solr in v0.6.0 --- README.md | 2 +- docs/known-issues/known-issues-v0.6.0.md | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) create mode 100644 docs/known-issues/known-issues-v0.6.0.md diff --git a/README.md b/README.md index eaa4082025..219c4a7b91 100644 --- a/README.md +++ b/README.md @@ -98,7 +98,7 @@ See [this page](docs/additional.md) for additional documentation. ## Release History -+ v0.6.0: September 6, 2019 [[Release Notes](docs/release-notes/release-notes-v0.6.0.md)] ++ v0.6.0: September 6, 2019 [[Release Notes](docs/release-notes/release-notes-v0.6.0.md)][[Known Issues](docs/known-issues/known-issues-v0.6.0.md)] + v0.5.1: June 11, 2019 [[Release Notes](docs/release-notes/release-notes-v0.5.1.md)] + v0.5.0: June 5, 2019 [[Release Notes](docs/release-notes/release-notes-v0.5.0.md)] + v0.4.0: March 4, 2019 [[Release Notes](docs/release-notes/release-notes-v0.4.0.md)] diff --git a/docs/known-issues/known-issues-v0.6.0.md b/docs/known-issues/known-issues-v0.6.0.md new file mode 100644 index 0000000000..8e6f963458 --- /dev/null +++ b/docs/known-issues/known-issues-v0.6.0.md @@ -0,0 +1,3 @@ +# Anserini Known Issues (v0.6.0) + ++ Solr indexing for Washington Post broke due to [417ac12](https://github.com/castorini/anserini/commit/c5ee9af442c500ec43fc28808903cfca2417ac12) and has been fixed in [#807](https://github.com/castorini/anserini/pull/807). \ No newline at end of file