From b983fa61d77d3d13806783faeeca17c8b89a6c5d Mon Sep 17 00:00:00 2001 From: codeforkjeff Date: Fri, 12 May 2017 00:07:56 -0400 Subject: [PATCH] add support for parsing multivalued 'name' fields in Solr (issue #5) --- conciliator.properties | 24 +-- .../refine/solr/MultiValueFieldStrategy.java | 5 + .../java/com/codefork/refine/solr/Solr.java | 11 ++ .../codefork/refine/solr/SolrParseState.java | 2 + .../com/codefork/refine/solr/SolrParser.java | 149 ++++++++++++------ .../codefork/refine/solr/SolrParserTest.java | 59 ++++++- 6 files changed, 192 insertions(+), 58 deletions(-) create mode 100644 src/main/java/com/codefork/refine/solr/MultiValueFieldStrategy.java diff --git a/conciliator.properties b/conciliator.properties index cd32e34..e5c3dfa 100644 --- a/conciliator.properties +++ b/conciliator.properties @@ -4,15 +4,19 @@ cache.lifetime=1800 cache.max_size=10000 # example Solr data source. This service would be accessible at /reconcile/solr -#datasource.solr=com.codefork.refine.solr.Solr -#datasource.solr.name=A Solr Collection of Books -#datasource.solr.nametype.id=/book/book -#datasource.solr.nametype.name=Book -#datasource.solr.url.query=http://localhost:8983/solr/test-core/select?wt=xml&q={{QUERY}}&rows={{ROWS}} -#datasource.solr.url.document=http://localhost:8983/solr/test-core/get?id={{id}} -#datasource.solr.field.id=id -#datasource.solr.field.name=title_display + +# datasource.solr=com.codefork.refine.solr.Solr +# datasource.solr.name=A Solr Collection of Books +# datasource.solr.nametype.id=/book/book +# datasource.solr.nametype.name=Book +# datasource.solr.url.query=http://localhost:8983/solr/test-core/select?wt=xml&q={{QUERY}}&rows={{ROWS}} +# datasource.solr.url.document=http://localhost:8983/solr/test-core/get?id={{id}} +# datasource.solr.field.id=id +# datasource.solr.field.name=title_display +# # can be 'concat' or 'first'. 
defaults to 'concat' +# datasource.solr.field.name.multivalue.strategy=first +# datasource.solr.field.name.multivalue.delimiter=, # example data source -#datasource.mysource=com.company.MySource -#datasource.mysource.name=My DataSource +# datasource.mysource=com.company.MySource +# datasource.mysource.name=My DataSource diff --git a/src/main/java/com/codefork/refine/solr/MultiValueFieldStrategy.java b/src/main/java/com/codefork/refine/solr/MultiValueFieldStrategy.java new file mode 100644 index 0000000..4a930c6 --- /dev/null +++ b/src/main/java/com/codefork/refine/solr/MultiValueFieldStrategy.java @@ -0,0 +1,5 @@ +package com.codefork.refine.solr; + +public enum MultiValueFieldStrategy { + CONCAT, FIRST +} diff --git a/src/main/java/com/codefork/refine/solr/Solr.java b/src/main/java/com/codefork/refine/solr/Solr.java index 08be9be..f8808b8 100644 --- a/src/main/java/com/codefork/refine/solr/Solr.java +++ b/src/main/java/com/codefork/refine/solr/Solr.java @@ -27,6 +27,8 @@ public class Solr extends WebServiceDataSource { public static final String PROP_URL_QUERY = "url.query"; public static final String PROP_FIELD_ID = "field.id"; public static final String PROP_FIELD_NAME = "field.name"; + public static final String PROP_FIELD_NAME_MULTIVALUE_STRATEGY = "field.name.multivalue.strategy"; + public static final String PROP_FIELD_NAME_MULTIVALUE_DELIMITER = "field.name.multivalue.delimiter"; public static final String PROP_NAMETYPE_ID = "nametype.id"; public static final String PROP_NAMETYPE_NAME = "nametype.name"; @@ -53,10 +55,19 @@ public List search(SearchQuery query) throws Exception { InputStream response = conn.getInputStream(); + MultiValueFieldStrategy multiValueFieldStrategy = MultiValueFieldStrategy.CONCAT; /* fallback: stays CONCAT when the property is missing or equals neither lowercased enum name; NOTE(review): toLowerCase() without a Locale follows the default locale (a Turkish default locale lowercases 'I' to a dotless i, so "first" would stop matching) - confirm whether Locale.ROOT is wanted */ + if(MultiValueFieldStrategy.CONCAT.toString().toLowerCase().equals(getConfigProperties().getProperty(PROP_FIELD_NAME_MULTIVALUE_STRATEGY))) { + multiValueFieldStrategy = MultiValueFieldStrategy.CONCAT; + } else 
if(MultiValueFieldStrategy.FIRST.toString().toLowerCase().equals(getConfigProperties().getProperty(PROP_FIELD_NAME_MULTIVALUE_STRATEGY))) { + multiValueFieldStrategy = MultiValueFieldStrategy.FIRST; + } + SAXParser parser = spf.newSAXParser(); SolrParser solrParser = new SolrParser( getConfigProperties().getProperty(PROP_FIELD_ID), getConfigProperties().getProperty(PROP_FIELD_NAME), + multiValueFieldStrategy, + getConfigProperties().getProperty(PROP_FIELD_NAME_MULTIVALUE_DELIMITER, ", "), /* ", " is the delimiter used when no delimiter property is configured */ new NameType(getConfigProperties().getProperty(PROP_NAMETYPE_ID), getConfigProperties().getProperty(PROP_NAMETYPE_NAME))); diff --git a/src/main/java/com/codefork/refine/solr/SolrParseState.java b/src/main/java/com/codefork/refine/solr/SolrParseState.java index 77d85f1..46b7c64 100644 --- a/src/main/java/com/codefork/refine/solr/SolrParseState.java +++ b/src/main/java/com/codefork/refine/solr/SolrParseState.java @@ -12,6 +12,8 @@ enum Field { ID, NAME } Field fieldBeingCaptured; + List multipleValues = new ArrayList(); + // we don't yet support multiple name types for Solr records // so this is the list we use for every result. 
List nameTypes = new ArrayList(); diff --git a/src/main/java/com/codefork/refine/solr/SolrParser.java b/src/main/java/com/codefork/refine/solr/SolrParser.java index 9815c3a..b1c6493 100644 --- a/src/main/java/com/codefork/refine/solr/SolrParser.java +++ b/src/main/java/com/codefork/refine/solr/SolrParser.java @@ -1,23 +1,82 @@ package com.codefork.refine.solr; +import com.codefork.refine.parsers.xml.EndElementHandler; +import com.codefork.refine.parsers.xml.StartElementHandler; import com.codefork.refine.parsers.xml.XMLParser; import com.codefork.refine.resources.NameType; import com.codefork.refine.resources.Result; -import com.codefork.refine.parsers.xml.EndElementHandler; -import com.codefork.refine.parsers.xml.StartElementHandler; import org.xml.sax.Attributes; -import java.util.ArrayList; import java.util.HashMap; -import java.util.List; import java.util.Map; public class SolrParser extends XMLParser { - private final static Map> staticStartElementHandlers = new HashMap>(); - private final static Map> staticEndElementHandlers = new HashMap>(); + private final Map> staticStartElementHandlers = new HashMap>(); /* NOTE(review): this map and the one below are now instance fields; the "static" prefix in their names is a leftover from the removed static declarations */ + private final Map> staticEndElementHandlers = new HashMap>(); + + private String fieldId; + private String fieldName; + private MultiValueFieldStrategy multiValueFieldStrategy; + private String multiValueFieldDelimiter; + + /** + * @param fieldId solr fieldname to use for 'id' field in reconciliation result + * @param fieldName solr fieldname to use for 'name' field in reconciliation result + * @param nameType all records parsed from Solr will have this nameType + */ + public SolrParser(String fieldId, String fieldName, final MultiValueFieldStrategy multiValueFieldStrategy, String multiValueFieldDelimiter, NameType nameType) { /* multiValueFieldStrategy decides how a multi-valued ('arr') name field becomes one name: CONCAT joins every value with multiValueFieldDelimiter, FIRST keeps only the first value */ + super(); + this.startElementHandlers = staticStartElementHandlers; + this.endElementHandlers = staticEndElementHandlers; + this.fieldId = fieldId; + this.fieldName = fieldName; + this.multiValueFieldStrategy = 
multiValueFieldStrategy; + this.multiValueFieldDelimiter = multiValueFieldDelimiter; + this.getParseState().nameTypes.add(nameType); + + this.startElementHandlers.put("response/result/doc/arr", + new StartElementHandler() { + public void handle(SolrParseState parseState, String uri, String localName, String qName, Attributes attributes) { + if (SolrParser.this.fieldId.equals(attributes.getValue("name"))) { + parseState.fieldBeingCaptured = SolrParseState.Field.ID; /* NOTE(review): the doc/arr/str and doc/arr end handlers in this patch only collect and assign NAME values, so a multi-valued id appears to be captured but never stored - confirm */ + } else if (SolrParser.this.fieldName.equals(attributes.getValue("name"))) { + parseState.fieldBeingCaptured = SolrParseState.Field.NAME; + } + } + }); + + this.startElementHandlers.put("response/result/doc/arr/str", + new StartElementHandler() { + public void handle(SolrParseState parseState, String uri, String localName, String qName, Attributes attributes) { + if(parseState.fieldBeingCaptured != null) { + parseState.captureChars = true; + } + } + }); + + this.startElementHandlers.put("response/result/doc/str", + new StartElementHandler() { + public void handle(SolrParseState parseState, String uri, String localName, String qName, Attributes attributes) { + if (SolrParser.this.fieldId.equals(attributes.getValue("name"))) { + parseState.fieldBeingCaptured = SolrParseState.Field.ID; + } else if(SolrParser.this.fieldName.equals(attributes.getValue("name"))) { + parseState.fieldBeingCaptured = SolrParseState.Field.NAME; + } + if(parseState.fieldBeingCaptured != null) { + parseState.captureChars = true; + } + } + }); - static { + this.startElementHandlers.put("response/result/doc/float", + new StartElementHandler() { + public void handle(SolrParseState parseState, String uri, String localName, String qName, Attributes attributes) { + if ("score".equals(attributes.getValue("name"))) { + parseState.captureChars = true; + } + } + }); staticStartElementHandlers.put("response/result/doc", new StartElementHandler() { @@ -27,6 +86,42 @@ public void handle(SolrParseState parseState, String uri, String localName, 
Stri } }); + staticEndElementHandlers.put("response/result/doc/arr", + new EndElementHandler() { + public void handle(SolrParseState parseState, String uri, String localName, String qName) { + if (SolrParseState.Field.NAME.equals(parseState.fieldBeingCaptured)) { + if(MultiValueFieldStrategy.CONCAT.equals(SolrParser.this.multiValueFieldStrategy)) { + StringBuffer buf = new StringBuffer(); + String delim = ""; /* empty for the first value, then switched to the configured delimiter so it only appears between values */ + for(String s: parseState.multipleValues) { + buf.append(delim); + buf.append(s); + delim = SolrParser.this.multiValueFieldDelimiter; + } + parseState.result.setName(buf.toString()); + } else if(MultiValueFieldStrategy.FIRST.equals(SolrParser.this.multiValueFieldStrategy)) { /* a null strategy matches neither branch, so the result name is left unset by this handler */ + if(parseState.multipleValues.size() > 0) { + parseState.result.setName(parseState.multipleValues.get(0)); + } + } + } + parseState.multipleValues.clear(); + parseState.fieldBeingCaptured = null; + } + }); + + staticEndElementHandlers.put("response/result/doc/arr/str", + new EndElementHandler() { + public void handle(SolrParseState parseState, String uri, String localName, String qName) { + String s = parseState.buf.toString(); + if (SolrParseState.Field.NAME.equals(parseState.fieldBeingCaptured)) { + parseState.multipleValues.add(s); + } + parseState.buf = new StringBuilder(); + parseState.captureChars = false; + } + }); + staticEndElementHandlers.put("response/result/doc/str", new EndElementHandler() { public void handle(SolrParseState parseState, String uri, String localName, String qName) { @@ -59,46 +154,6 @@ public void handle(SolrParseState parseState, String uri, String localName, Stri parseState.result = null; } }); - } - - public String fieldId; - public String fieldName; - - /** - * @param fieldId solr fieldname to use for 'id' field in reconciliation result - * @param fieldName solr fieldname to use for 'name' field in reconciliation result - * @param nameType all records parsed from Solr will have this nameType - */ - public SolrParser(String fieldId, String fieldName, 
NameType nameType) { - super(); - this.startElementHandlers = staticStartElementHandlers; - this.endElementHandlers = staticEndElementHandlers; - this.fieldId = fieldId; - this.fieldName = fieldName; - this.getParseState().nameTypes.add(nameType); - - this.startElementHandlers.put("response/result/doc/str", - new StartElementHandler() { - public void handle(SolrParseState parseState, String uri, String localName, String qName, Attributes attributes) { - if (SolrParser.this.fieldId.equals(attributes.getValue("name"))) { - parseState.fieldBeingCaptured = SolrParseState.Field.ID; - } else if(SolrParser.this.fieldName.equals(attributes.getValue("name"))) { - parseState.fieldBeingCaptured = SolrParseState.Field.NAME; - } - if(parseState.fieldBeingCaptured != null) { - parseState.captureChars = true; - } - } - }); - - this.startElementHandlers.put("response/result/doc/float", - new StartElementHandler() { - public void handle(SolrParseState parseState, String uri, String localName, String qName, Attributes attributes) { - if ("score".equals(attributes.getValue("name"))) { - parseState.captureChars = true; - } - } - }); } diff --git a/src/test/java/com/codefork/refine/solr/SolrParserTest.java b/src/test/java/com/codefork/refine/solr/SolrParserTest.java index 8faccb2..84327a2 100644 --- a/src/test/java/com/codefork/refine/solr/SolrParserTest.java +++ b/src/test/java/com/codefork/refine/solr/SolrParserTest.java @@ -17,7 +17,7 @@ public class SolrParserTest { public void testParse() throws Exception { SAXParserFactory spf = SAXParserFactory.newInstance(); SAXParser parser = spf.newSAXParser(); - SolrParser solrParser = new SolrParser("id", "title_display", new NameType("/book/book", "Book")); + SolrParser solrParser = new SolrParser("id", "title_display", null, null, new NameType("/book/book", "Book")); /* NOTE(review): null strategy and delimiter; presumably title_display is single-valued in the fixture so the multi-value path is never taken - confirm */ InputStream is = getClass().getResourceAsStream("/solr_results.xml"); parser.parse(is, solrParser); @@ -40,4 +40,61 @@ public void testParse() throws Exception { 
assertEquals("The adventures of Sherlock Holmes", result3.getName()); assertEquals("0.2694855", String.valueOf(result3.getScore())); } + + @Test + public void testParseMultiValueFirst() throws Exception { + SAXParserFactory spf = SAXParserFactory.newInstance(); + SAXParser parser = spf.newSAXParser(); + SolrParser solrParser = new SolrParser("id", "subject_topic_facet", MultiValueFieldStrategy.FIRST, null, new NameType("/book/book", "Book")); + + InputStream is = getClass().getResourceAsStream("/solr_results.xml"); + parser.parse(is, solrParser); + + List results = solrParser.getResults(); + assertEquals(3, results.size()); + + Result result1 = results.get(0); + assertEquals("119390", result1.getId()); + assertEquals(null, result1.getName()); + assertEquals("0.33383894", String.valueOf(result1.getScore())); + + Result result2 = results.get(1); + assertEquals("274753", result2.getId()); + assertEquals("Holmes, Sherlock (Fictitious character)", result2.getName()); + assertEquals("0.26951128", String.valueOf(result2.getScore())); + + Result result3 = results.get(2); + assertEquals("25950", result3.getId()); + assertEquals("Detective and mystery stories, English", result3.getName()); + assertEquals("0.2694855", String.valueOf(result3.getScore())); + } + + @Test + public void testParseMultiValueConcat() throws Exception { + SAXParserFactory spf = SAXParserFactory.newInstance(); + SAXParser parser = spf.newSAXParser(); + SolrParser solrParser = new SolrParser("id", "subject_topic_facet", MultiValueFieldStrategy.CONCAT, ", ", new NameType("/book/book", "Book")); + + InputStream is = getClass().getResourceAsStream("/solr_results.xml"); + parser.parse(is, solrParser); + + List results = solrParser.getResults(); + assertEquals(3, results.size()); + + Result result1 = results.get(0); + assertEquals("119390", result1.getId()); + assertEquals(null, result1.getName()); + assertEquals("0.33383894", String.valueOf(result1.getScore())); + + Result result2 = results.get(1); + 
assertEquals("274753", result2.getId()); + assertEquals("Holmes, Sherlock (Fictitious character), Detective and mystery stories, English, Private investigators", result2.getName()); + assertEquals("0.26951128", String.valueOf(result2.getScore())); + + Result result3 = results.get(2); + assertEquals("25950", result3.getId()); + assertEquals("Detective and mystery stories, English", result3.getName()); + assertEquals("0.2694855", String.valueOf(result3.getScore())); + } + }