From 1649d31331c0dc37f7b0efb2907e01088eea9e2f Mon Sep 17 00:00:00 2001 From: Ryan Clancy Date: Mon, 16 Sep 2019 17:26:18 -0400 Subject: [PATCH] Fix issue with duplicate field for WaPo in Solr. (#807) * Fix issue with duplicate field for WaPo in Solr. * Add known issues for Solr in v0.6.0 --- README.md | 2 +- docs/known-issues/known-issues-v0.6.0.md | 3 +++ src/main/java/io/anserini/index/IndexCollection.java | 5 +++++ 3 files changed, 9 insertions(+), 1 deletion(-) create mode 100644 docs/known-issues/known-issues-v0.6.0.md diff --git a/README.md b/README.md index eaa4082025..219c4a7b91 100644 --- a/README.md +++ b/README.md @@ -98,7 +98,7 @@ See [this page](docs/additional.md) for additional documentation. ## Release History -+ v0.6.0: September 6, 2019 [[Release Notes](docs/release-notes/release-notes-v0.6.0.md)] ++ v0.6.0: September 6, 2019 [[Release Notes](docs/release-notes/release-notes-v0.6.0.md)][[Known Issues](docs/known-issues/known-issues-v0.6.0.md)] + v0.5.1: June 11, 2019 [[Release Notes](docs/release-notes/release-notes-v0.5.1.md)] + v0.5.0: June 5, 2019 [[Release Notes](docs/release-notes/release-notes-v0.5.0.md)] + v0.4.0: March 4, 2019 [[Release Notes](docs/release-notes/release-notes-v0.4.0.md)] diff --git a/docs/known-issues/known-issues-v0.6.0.md b/docs/known-issues/known-issues-v0.6.0.md new file mode 100644 index 0000000000..8e6f963458 --- /dev/null +++ b/docs/known-issues/known-issues-v0.6.0.md @@ -0,0 +1,3 @@ +# Anserini Known Issues (v0.6.0) + ++ Solr indexing for Washington Post broke due to [417ac12](https://github.com/castorini/anserini/commit/c5ee9af442c500ec43fc28808903cfca2417ac12) and has been fixed in [#807](https://github.com/castorini/anserini/pull/807). 
\ No newline at end of file diff --git a/src/main/java/io/anserini/index/IndexCollection.java b/src/main/java/io/anserini/index/IndexCollection.java index 438a132528..825acef909 100644 --- a/src/main/java/io/anserini/index/IndexCollection.java +++ b/src/main/java/io/anserini/index/IndexCollection.java @@ -281,6 +281,11 @@ public void run() { if (field.fieldType().docValuesType() != DocValuesType.NONE) { continue; } + // If the field is already in the doc, skip it. + // This fixes an issue with WaPo where published_date is in the Lucene doc as both a LongPoint and a StoredField. Solr needs only one copy; finer-grained control is left to the Solr config. + if (solrDocument.containsKey(field.name())) { + continue; + } if (field.stringValue() != null) { // For some reason, id is multi-valued with null as one of the values solrDocument.addField(field.name(), field.stringValue()); } else if (field.numericValue() != null) {