Skip to content

Commit

Permalink
add support for parsing multivalued 'name' fields in Solr (issue #5)
Browse files Browse the repository at this point in the history
  • Loading branch information
codeforkjeff committed May 12, 2017
1 parent dcac938 commit b983fa6
Show file tree
Hide file tree
Showing 6 changed files with 192 additions and 58 deletions.
24 changes: 14 additions & 10 deletions conciliator.properties
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,19 @@ cache.lifetime=1800
cache.max_size=10000

# example Solr data source. This service would be accessible at /reconcile/solr
#datasource.solr=com.codefork.refine.solr.Solr
#datasource.solr.name=A Solr Collection of Books
#datasource.solr.nametype.id=/book/book
#datasource.solr.nametype.name=Book
#datasource.solr.url.query=http://localhost:8983/solr/test-core/select?wt=xml&q={{QUERY}}&rows={{ROWS}}
#datasource.solr.url.document=http://localhost:8983/solr/test-core/get?id={{id}}
#datasource.solr.field.id=id
#datasource.solr.field.name=title_display

# datasource.solr=com.codefork.refine.solr.Solr
# datasource.solr.name=A Solr Collection of Books
# datasource.solr.nametype.id=/book/book
# datasource.solr.nametype.name=Book
# datasource.solr.url.query=http://localhost:8983/solr/test-core/select?wt=xml&q={{QUERY}}&rows={{ROWS}}
# datasource.solr.url.document=http://localhost:8983/solr/test-core/get?id={{id}}
# datasource.solr.field.id=id
# datasource.solr.field.name=title_display
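# # how a multivalued 'name' field is turned into a single name: 'concat' joins all values using the delimiter below (", " if no delimiter is set), 'first' keeps only the first value. defaults to 'concat'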
# datasource.solr.field.name.multivalue.strategy=first
# datasource.solr.field.name.multivalue.delimiter=,

# example data source
#datasource.mysource=com.company.MySource
#datasource.mysource.name=My DataSource
# datasource.mysource=com.company.MySource
# datasource.mysource.name=My DataSource
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
package com.codefork.refine.solr;

/**
 * Ways of reducing a multivalued Solr field to the single 'name' of a
 * reconciliation result.
 */
public enum MultiValueFieldStrategy {

    /** Join every value of the field into one string, separated by the configured delimiter. */
    CONCAT,

    /** Keep only the first value of the field. */
    FIRST
}
11 changes: 11 additions & 0 deletions src/main/java/com/codefork/refine/solr/Solr.java
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ public class Solr extends WebServiceDataSource {
public static final String PROP_URL_QUERY = "url.query";
public static final String PROP_FIELD_ID = "field.id";
public static final String PROP_FIELD_NAME = "field.name";
public static final String PROP_FIELD_NAME_MULTIVALUE_STRATEGY = "field.name.multivalue.strategy";
public static final String PROP_FIELD_NAME_MULTIVALUE_DELIMITER = "field.name.multivalue.delimiter";
public static final String PROP_NAMETYPE_ID = "nametype.id";
public static final String PROP_NAMETYPE_NAME = "nametype.name";

Expand All @@ -53,10 +55,19 @@ public List<Result> search(SearchQuery query) throws Exception {

InputStream response = conn.getInputStream();

// Resolve how a multivalued 'name' field is collapsed into a single name.
// CONCAT is the default; it is used when the property is missing or holds
// anything other than "first". The comparison goes through
// equalsIgnoreCase rather than toLowerCase() because toLowerCase() follows
// the JVM default locale (e.g. under Turkish, "FIRST" lowercases to
// "f\u0131rst"), which would make a configured "first" silently fall back
// to CONCAT. equalsIgnoreCase is also null-safe for an absent property.
String configuredStrategy = getConfigProperties().getProperty(PROP_FIELD_NAME_MULTIVALUE_STRATEGY);
MultiValueFieldStrategy multiValueFieldStrategy = MultiValueFieldStrategy.CONCAT;
if(MultiValueFieldStrategy.FIRST.name().equalsIgnoreCase(configuredStrategy)) {
multiValueFieldStrategy = MultiValueFieldStrategy.FIRST;
}

SAXParser parser = spf.newSAXParser();
SolrParser solrParser = new SolrParser(
getConfigProperties().getProperty(PROP_FIELD_ID),
getConfigProperties().getProperty(PROP_FIELD_NAME),
multiValueFieldStrategy,
getConfigProperties().getProperty(PROP_FIELD_NAME_MULTIVALUE_DELIMITER, ", "),
new NameType(getConfigProperties().getProperty(PROP_NAMETYPE_ID),
getConfigProperties().getProperty(PROP_NAMETYPE_NAME)));

Expand Down
2 changes: 2 additions & 0 deletions src/main/java/com/codefork/refine/solr/SolrParseState.java
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ enum Field { ID, NAME }

Field fieldBeingCaptured;

List<String> multipleValues = new ArrayList<String>();

// we don't yet support multiple name types for Solr records
// so this is the list we use for every result.
List<NameType> nameTypes = new ArrayList<NameType>();
Expand Down
149 changes: 102 additions & 47 deletions src/main/java/com/codefork/refine/solr/SolrParser.java
Original file line number Diff line number Diff line change
@@ -1,23 +1,82 @@
package com.codefork.refine.solr;

import com.codefork.refine.parsers.xml.EndElementHandler;
import com.codefork.refine.parsers.xml.StartElementHandler;
import com.codefork.refine.parsers.xml.XMLParser;
import com.codefork.refine.resources.NameType;
import com.codefork.refine.resources.Result;
import com.codefork.refine.parsers.xml.EndElementHandler;
import com.codefork.refine.parsers.xml.StartElementHandler;
import org.xml.sax.Attributes;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class SolrParser extends XMLParser<SolrParseState> {

private final static Map<String, StartElementHandler<SolrParseState>> staticStartElementHandlers = new HashMap<String, StartElementHandler<SolrParseState>>();
private final static Map<String, EndElementHandler<SolrParseState>> staticEndElementHandlers = new HashMap<String, EndElementHandler<SolrParseState>>();
private final Map<String, StartElementHandler<SolrParseState>> staticStartElementHandlers = new HashMap<String, StartElementHandler<SolrParseState>>();
private final Map<String, EndElementHandler<SolrParseState>> staticEndElementHandlers = new HashMap<String, EndElementHandler<SolrParseState>>();

private String fieldId;
private String fieldName;
private MultiValueFieldStrategy multiValueFieldStrategy;
private String multiValueFieldDelimiter;

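/**
* @param fieldId solr fieldname to use for 'id' field in reconciliation result
* @param fieldName solr fieldname to use for 'name' field in reconciliation result
* @param multiValueFieldStrategy how a multivalued 'name' field is reduced to a single name: CONCAT joins all values, FIRST keeps only the first value
* @param multiValueFieldDelimiter string placed between values when the strategy is CONCAT
* @param nameType all records parsed from Solr will have this nameType
*/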
public SolrParser(String fieldId, String fieldName, final MultiValueFieldStrategy multiValueFieldStrategy, String multiValueFieldDelimiter, NameType nameType) {
super();
this.startElementHandlers = staticStartElementHandlers;
this.endElementHandlers = staticEndElementHandlers;
this.fieldId = fieldId;
this.fieldName = fieldName;
this.multiValueFieldStrategy = multiValueFieldStrategy;
this.multiValueFieldDelimiter = multiValueFieldDelimiter;
this.getParseState().nameTypes.add(nameType);

this.startElementHandlers.put("response/result/doc/arr",
        new StartElementHandler<SolrParseState>() {
            /**
             * On opening a multivalued (arr) element, records whether it holds the
             * configured id field or the configured name field. Elements for any
             * other field leave the parse state untouched.
             */
            public void handle(SolrParseState parseState, String uri, String localName, String qName, Attributes attributes) {
                final String solrField = attributes.getValue("name");
                if (SolrParser.this.fieldId.equals(solrField)) {
                    parseState.fieldBeingCaptured = SolrParseState.Field.ID;
                    return;
                }
                if (SolrParser.this.fieldName.equals(solrField)) {
                    parseState.fieldBeingCaptured = SolrParseState.Field.NAME;
                }
            }
        });

this.startElementHandlers.put("response/result/doc/arr/str",
        new StartElementHandler<SolrParseState>() {
            /**
             * Turns on character capture for a value inside an arr element, but
             * only when the enclosing arr was flagged as a field being captured.
             */
            public void handle(SolrParseState parseState, String uri, String localName, String qName, Attributes attributes) {
                if (parseState.fieldBeingCaptured == null) {
                    // value belongs to a field we are not capturing
                    return;
                }
                parseState.captureChars = true;
            }
        });

this.startElementHandlers.put("response/result/doc/str",
        new StartElementHandler<SolrParseState>() {
            /**
             * On opening a single-valued str element, records whether it is the
             * configured id or name field, then enables character capture if a
             * field is currently marked as being captured.
             */
            public void handle(SolrParseState parseState, String uri, String localName, String qName, Attributes attributes) {
                final String solrField = attributes.getValue("name");
                if (SolrParser.this.fieldId.equals(solrField)) {
                    parseState.fieldBeingCaptured = SolrParseState.Field.ID;
                } else if (SolrParser.this.fieldName.equals(solrField)) {
                    parseState.fieldBeingCaptured = SolrParseState.Field.NAME;
                }

                final boolean capturing = parseState.fieldBeingCaptured != null;
                if (capturing) {
                    parseState.captureChars = true;
                }
            }
        });

static {
this.startElementHandlers.put("response/result/doc/float",
        new StartElementHandler<SolrParseState>() {
            /**
             * Enables character capture for the float element named "score";
             * every other float element is ignored.
             */
            public void handle(SolrParseState parseState, String uri, String localName, String qName, Attributes attributes) {
                final boolean isScore = "score".equals(attributes.getValue("name"));
                if (!isScore) {
                    return;
                }
                parseState.captureChars = true;
            }
        });

staticStartElementHandlers.put("response/result/doc",
new StartElementHandler<SolrParseState>() {
Expand All @@ -27,6 +86,42 @@ public void handle(SolrParseState parseState, String uri, String localName, Stri
}
});

staticEndElementHandlers.put("response/result/doc/arr",
        new EndElementHandler<SolrParseState>() {
            /**
             * Closes a multivalued (arr) element. If it was the name field, the
             * collected values are turned into the result's name according to
             * the parser's strategy: CONCAT joins all values with the delimiter,
             * FIRST uses the first value when there is one. The collected values
             * and the captured-field marker are always reset afterwards.
             */
            public void handle(SolrParseState parseState, String uri, String localName, String qName) {
                final boolean capturingName = SolrParseState.Field.NAME.equals(parseState.fieldBeingCaptured);
                final MultiValueFieldStrategy strategy = SolrParser.this.multiValueFieldStrategy;

                if (capturingName && MultiValueFieldStrategy.CONCAT.equals(strategy)) {
                    StringBuilder joined = new StringBuilder();
                    for (int i = 0; i < parseState.multipleValues.size(); i++) {
                        if (i > 0) {
                            // delimiter goes between values only, never before the first
                            joined.append(SolrParser.this.multiValueFieldDelimiter);
                        }
                        joined.append(parseState.multipleValues.get(i));
                    }
                    parseState.result.setName(joined.toString());
                } else if (capturingName && MultiValueFieldStrategy.FIRST.equals(strategy)) {
                    if (!parseState.multipleValues.isEmpty()) {
                        parseState.result.setName(parseState.multipleValues.get(0));
                    }
                }

                // leave a clean slate for the next field
                parseState.multipleValues.clear();
                parseState.fieldBeingCaptured = null;
            }
        });

staticEndElementHandlers.put("response/result/doc/arr/str",
        new EndElementHandler<SolrParseState>() {
            /**
             * Closes one value inside an arr element: the buffered text is added
             * to the collected values when the name field is being captured, then
             * the buffer is replaced and character capture is switched off.
             */
            public void handle(SolrParseState parseState, String uri, String localName, String qName) {
                final String value = parseState.buf.toString();
                final boolean isNameValue = SolrParseState.Field.NAME.equals(parseState.fieldBeingCaptured);
                if (isNameValue) {
                    parseState.multipleValues.add(value);
                }
                parseState.captureChars = false;
                parseState.buf = new StringBuilder();
            }
        });

staticEndElementHandlers.put("response/result/doc/str",
new EndElementHandler<SolrParseState>() {
public void handle(SolrParseState parseState, String uri, String localName, String qName) {
Expand Down Expand Up @@ -59,46 +154,6 @@ public void handle(SolrParseState parseState, String uri, String localName, Stri
parseState.result = null;
}
});
}

public String fieldId;
public String fieldName;

/**
* @param fieldId solr fieldname to use for 'id' field in reconciliation result
* @param fieldName solr fieldname to use for 'name' field in reconciliation result
* @param nameType all records parsed from Solr will have this nameType
*/
public SolrParser(String fieldId, String fieldName, NameType nameType) {
super();
this.startElementHandlers = staticStartElementHandlers;
this.endElementHandlers = staticEndElementHandlers;
this.fieldId = fieldId;
this.fieldName = fieldName;
this.getParseState().nameTypes.add(nameType);

this.startElementHandlers.put("response/result/doc/str",
new StartElementHandler<SolrParseState>() {
public void handle(SolrParseState parseState, String uri, String localName, String qName, Attributes attributes) {
if (SolrParser.this.fieldId.equals(attributes.getValue("name"))) {
parseState.fieldBeingCaptured = SolrParseState.Field.ID;
} else if(SolrParser.this.fieldName.equals(attributes.getValue("name"))) {
parseState.fieldBeingCaptured = SolrParseState.Field.NAME;
}
if(parseState.fieldBeingCaptured != null) {
parseState.captureChars = true;
}
}
});

this.startElementHandlers.put("response/result/doc/float",
new StartElementHandler<SolrParseState>() {
public void handle(SolrParseState parseState, String uri, String localName, String qName, Attributes attributes) {
if ("score".equals(attributes.getValue("name"))) {
parseState.captureChars = true;
}
}
});

}

Expand Down
59 changes: 58 additions & 1 deletion src/test/java/com/codefork/refine/solr/SolrParserTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ public class SolrParserTest {
public void testParse() throws Exception {
SAXParserFactory spf = SAXParserFactory.newInstance();
SAXParser parser = spf.newSAXParser();
SolrParser solrParser = new SolrParser("id", "title_display", new NameType("/book/book", "Book"));
SolrParser solrParser = new SolrParser("id", "title_display", null, null, new NameType("/book/book", "Book"));

InputStream is = getClass().getResourceAsStream("/solr_results.xml");
parser.parse(is, solrParser);
Expand All @@ -40,4 +40,61 @@ public void testParse() throws Exception {
assertEquals("The adventures of Sherlock Holmes", result3.getName());
assertEquals("0.2694855", String.valueOf(result3.getScore()));
}

/**
 * Parses the sample Solr response with the multivalued 'subject_topic_facet'
 * field as the name and the FIRST strategy, and checks id, name and score of
 * each of the three results.
 */
@Test
public void testParseMultiValueFirst() throws Exception {
    NameType bookType = new NameType("/book/book", "Book");
    SolrParser handler = new SolrParser("id", "subject_topic_facet", MultiValueFieldStrategy.FIRST, null, bookType);

    SAXParser saxParser = SAXParserFactory.newInstance().newSAXParser();
    InputStream xml = getClass().getResourceAsStream("/solr_results.xml");
    saxParser.parse(xml, handler);

    List<Result> hits = handler.getResults();
    assertEquals(3, hits.size());

    // first result is expected to have no name at all
    Result first = hits.get(0);
    assertEquals("119390", first.getId());
    assertEquals(null, first.getName());
    assertEquals("0.33383894", String.valueOf(first.getScore()));

    Result second = hits.get(1);
    assertEquals("274753", second.getId());
    assertEquals("Holmes, Sherlock (Fictitious character)", second.getName());
    assertEquals("0.26951128", String.valueOf(second.getScore()));

    Result third = hits.get(2);
    assertEquals("25950", third.getId());
    assertEquals("Detective and mystery stories, English", third.getName());
    assertEquals("0.2694855", String.valueOf(third.getScore()));
}

/**
 * Parses the sample Solr response with the multivalued 'subject_topic_facet'
 * field as the name, the CONCAT strategy and a ", " delimiter, and checks id,
 * name and score of each of the three results.
 */
@Test
public void testParseMultiValueConcat() throws Exception {
    NameType bookType = new NameType("/book/book", "Book");
    SolrParser handler = new SolrParser("id", "subject_topic_facet", MultiValueFieldStrategy.CONCAT, ", ", bookType);

    SAXParser saxParser = SAXParserFactory.newInstance().newSAXParser();
    InputStream xml = getClass().getResourceAsStream("/solr_results.xml");
    saxParser.parse(xml, handler);

    List<Result> hits = handler.getResults();
    assertEquals(3, hits.size());

    // first result is expected to have no name at all
    Result first = hits.get(0);
    assertEquals("119390", first.getId());
    assertEquals(null, first.getName());
    assertEquals("0.33383894", String.valueOf(first.getScore()));

    // all values joined with the ", " delimiter
    Result second = hits.get(1);
    assertEquals("274753", second.getId());
    assertEquals("Holmes, Sherlock (Fictitious character), Detective and mystery stories, English, Private investigators", second.getName());
    assertEquals("0.26951128", String.valueOf(second.getScore()));

    Result third = hits.get(2);
    assertEquals("25950", third.getId());
    assertEquals("Detective and mystery stories, English", third.getName());
    assertEquals("0.2694855", String.valueOf(third.getScore()));
}

}

0 comments on commit b983fa6

Please sign in to comment.