Skip to content

Commit

Permalink
improved solr search behavior
Browse files Browse the repository at this point in the history
Issue #554
  • Loading branch information
rsoika committed Sep 2, 2019
1 parent 49cda08 commit 565ab24
Show file tree
Hide file tree
Showing 4 changed files with 189 additions and 36 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@

import org.eclipse.microprofile.config.inject.ConfigProperty;
import org.imixs.workflow.ItemCollection;
import org.imixs.workflow.WorkflowKernel;
import org.imixs.workflow.engine.DocumentService;
import org.imixs.workflow.exceptions.QueryException;

Expand Down Expand Up @@ -199,9 +200,11 @@ void init() {

// build unique field list containing all field names
uniqueFieldList=new HashSet<String>();
uniqueFieldList.add(WorkflowKernel.UNIQUEID);
uniqueFieldList.addAll(fieldListStore);
uniqueFieldList.addAll(fieldListAnalyse);
uniqueFieldList.addAll(fieldListNoAnalyse);


}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
import java.util.Calendar;
import java.util.Date;
import java.util.List;
import java.util.function.IntPredicate;
import java.util.logging.Level;
import java.util.logging.Logger;

Expand Down Expand Up @@ -224,6 +225,7 @@ public void indexDocuments(List<ItemCollection> documents) throws RestAPIExcepti
if (logger.isLoggable(Level.FINEST)) {
logger.finest(xmlRequest);
}

String uri = host + "/solr/" + core + "/update?commit=true";
restClient.post(uri, xmlRequest, "text/xml");
}
Expand Down Expand Up @@ -311,13 +313,20 @@ public void rebuildIndex() {

/**
* This method posts a search query and returns the result.
* <p>
* The method will return the documents containing all stored or DocValues
* fields. Only if the param 'loadStubs' is false, only the field
* '$uniqueid' will be returned by the method. In that case the caller is
* responsible for loading the full document from the DocumentService.
*
*
*
* @param searchterm
* @return
* @throws QueryException
*/
public String query(String searchTerm, int pageSize, int pageIndex, SortOrder sortOrder,
DefaultOperator defaultOperator) throws QueryException {
DefaultOperator defaultOperator, boolean loadStubs) throws QueryException {

logger.fine("...search solr index: " + searchTerm + "...");

Expand Down Expand Up @@ -364,6 +373,11 @@ public String query(String searchTerm, int pageSize, int pageIndex, SortOrder so
uri.append("&start=" + (pageIndex * pageSize));
}

// if loadStubs is false, then we only request the field '$uniqueid' here.
if (!loadStubs) {
uri.append("&fl=_uniqueid");
}

// append query
uri.append("&q=" + URLEncoder.encode(searchTerm, "UTF-8"));

Expand Down Expand Up @@ -477,8 +491,15 @@ protected String buildAddDoc(List<ItemCollection> documents) {
xmlContent.append("<add overwrite=\"true\">");

for (ItemCollection document : documents) {

// if no UniqueID is defined we need to skip this document
if (document.getUniqueID().isEmpty()) {
continue;
}

xmlContent.append("<doc>");


xmlContent.append("<field name=\"id\">" + document.getUniqueID() + "</field>");

// add all content fields defined in the schema
Expand Down Expand Up @@ -514,7 +535,12 @@ protected String buildAddDoc(List<ItemCollection> documents) {
}
}
logger.finest("......add index field " + DEFAULT_SEARCH_FIELD + "=" + content);
// if XML is part of the content, the we need to add a wrapping CDATA

// remove existing CDATA...
content = stripCDATA(content);
// strip control codes..
content = stripControlCodes(content);
// We need to add a wrapping CDATA, allow xml in general..
xmlContent.append("<field name=\"" + DEFAULT_SEARCH_FIELD + "\"><![CDATA[" + content + "]]></field>");

// now add all analyzed fields...
Expand Down Expand Up @@ -591,6 +617,11 @@ private void addFieldDefinitionToUpdateSchema(StringBuffer updateSchema, String

/**
* This method adds a field value into a xml update request.
* <p>
* In case the value is a date or calendar object, then the value will be
* converted into a lucene time format.
* <p>
* The value will always be wrapped with a CDATA tag to avoid invalid XML.
*
* @param doc
* an existing lucene document
Expand Down Expand Up @@ -633,15 +664,70 @@ private void addFieldValuesToUpdateRequest(StringBuffer xmlContent, final ItemCo
convertedValue = singleValue.toString();
}

// if XML is part of the content, the we need to add a wrapping CDATA
if (convertedValue.contains("<") || convertedValue.contains("<")) {
convertedValue = "<![CDATA[" + convertedValue + "]]>";
}
// remove existing CDATA...
convertedValue = stripCDATA(convertedValue);
// strip control codes..
convertedValue = stripControlCodes(convertedValue);
// wrap value into CDATA
convertedValue = "<![CDATA[" + stripControlCodes(convertedValue) + "]]>";

xmlContent.append("<field name=\"" + itemName + "\">" + convertedValue + "</field>");
}

}

/**
 * Removes every character that is not a printable ASCII character from the
 * given string. Such characters must not be put into the XML request sent to
 * solr.
 * <p>
 * Only characters in the range 32 (space) to 126 (tilde) are kept. ASCII
 * control codes (0 to 31 and 127) as well as all extended (non-ASCII)
 * characters are dropped. Note that tab, line feed and carriage return are
 * control codes and are therefore removed too.
 *
 * @see https://rosettacode.org/wiki/Strip_control_codes_and_extended_characters_from_a_string
 *
 * @param s
 *            the text to be cleaned, must not be null
 * @return the text reduced to its printable ASCII characters
 */
protected String stripControlCodes(String s) {

    StringBuilder printable = new StringBuilder(s.length());
    for (int pos = 0; pos < s.length(); pos++) {
        char current = s.charAt(pos);
        // surrogate halves are above '~' as well, so a char based scan drops
        // supplementary code points just like a code point based scan would
        if (current >= ' ' && current <= '~') {
            printable.append(current);
        }
    }
    return printable.toString();
}

/**
 * Removes all CDATA markers from a string so that the result can be wrapped
 * safely into a new CDATA section. We can not post embedded CDATA in an
 * already existing CDATA when we post the xml to solr.
 * <p>
 * Both the opening marker {@code <![CDATA[} and the closing marker
 * {@code ]]>} are removed wherever they occur. A closing marker is removed
 * even if no opening marker is present, because a stray {@code ]]>} would
 * terminate the wrapping CDATA section early and make the request invalid.
 * The text between the markers is kept.
 *
 * @param s
 *            the text to be cleaned, must not be null
 * @return the text without any CDATA opening or closing marker
 */
protected String stripCDATA(String s) {

    String result = s;
    // Removing one marker can join the surrounding characters into a new
    // marker (e.g. "]]]]>>" becomes "]]>"), so repeat until none is left. Each
    // pass shortens the string, so the loop always terminates.
    while (result.contains("<![CDATA[") || result.contains("]]>")) {
        result = result.replace("<![CDATA[", "").replace("]]>", "");
    }
    return result;
}

/**
* This method flushes a given count of eventLogEntries. The method return true
* if no more eventLogEntries exist.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@ public List<ItemCollection> search(String searchTerm, int pageSize, int pageInde
}

// post query....
String result = solarIndexService.query(searchTerm, pageSize, pageIndex,sortOrder,defaultOperator);
String result = solarIndexService.query(searchTerm, pageSize, pageIndex,sortOrder,defaultOperator, loadStubs);
logger.finest("......Result = " + result);

if (result != null && !result.isEmpty()) {
Expand All @@ -173,6 +173,54 @@ public List<ItemCollection> search(String searchTerm, int pageSize, int pageInde
return workitems;
}

/**
 * Intended to return the total hits for a given search term from the solr
 * index without loading any document data.
 * <p>
 * The search term is first passed through
 * {@code schemaService.getExtendedSearchTerm} (presumably this adds the read
 * access restrictions of the current user - verify against SchemaService). If
 * the resulting term is null or empty, no query is posted and 0 is returned.
 * <p>
 * NOTE: the query is posted, but the hit count is not yet parsed from the
 * result. Until that is implemented the method always returns 0.
 *
 * @see search(String, int, int, Sort, Operator)
 *
 * @param _searchTerm
 *            the search term
 * @param _maxResult
 *            max search result; a value less than or equal to 0 is replaced
 *            by DEFAULT_MAX_SEARCH_RESULT
 * @param defaultOperator
 *            the default operator passed on to the index query
 * @return total hits of search result (currently always 0, see note above)
 * @throws QueryException
 *             in case the searchterm is not understandable.
 */
@Override
public int getTotalHits(final String _searchTerm, final int _maxResult, final DefaultOperator defaultOperator)
        throws QueryException {

    // fall back to the default limit if the caller did not provide a usable one
    int maxResult = _maxResult;
    if (maxResult <= 0) {
        maxResult = DEFAULT_MAX_SEARCH_RESULT;
    }

    String searchTerm = schemaService.getExtendedSearchTerm(_searchTerm);
    // test if searchterm is provided
    if (searchTerm == null || "".equals(searchTerm)) {
        return 0;
    }

    // Post the query with the effective maxResult (not the raw _maxResult, which
    // would bypass the fallback above). We query only the $uniqueid: the index
    // query restricts the field list to the unique id when loadStubs is false.
    // NOTE(review): sortOrder is passed as null - confirm that query() accepts this.
    String result = solarIndexService.query(searchTerm, maxResult, 0, null, defaultOperator, false);

    // TODO now parse the count from 'result'!!!
    //
    // logger.finest("......Result = " + result);
    //

    return 0;
}

/**
* This method extracts the docs from a Solr JSON query result
*
Expand Down Expand Up @@ -389,39 +437,11 @@ private String adaptItemName(String itemName) {
}
if (itemName.charAt(0)=='_') {
String adaptedName="$"+itemName.substring(1);
Set<String> uniqueFieldList = schemaService.getUniqueFieldList();
uniqueFieldList.add(WorkflowKernel.UNIQUEID);
if (uniqueFieldList.contains(adaptedName)) {
if (schemaService.getUniqueFieldList().contains(adaptedName)) {
return adaptedName;
}
}
return itemName;
}

/**
 * Placeholder for the total hits count of a given search term.
 * <p>
 * The method is not implemented yet: it does not evaluate any of its
 * arguments, logs a warning and always reports 0 hits.
 *
 * @see search(String, int, int, Sort, Operator)
 *
 * @param _searchTerm
 *            the search term (currently ignored)
 * @param _maxResult
 *            max search result (currently ignored)
 * @param defaultOperator
 *            the default operator (currently ignored)
 * @return always 0
 * @throws QueryException
 *             declared by the signature; not thrown by the current body
 */
@Override
public int getTotalHits(final String _searchTerm, final int _maxResult, final DefaultOperator defaultOperator)
        throws QueryException {
    // no index request is issued here, we only signal the missing implementation
    logger.warning("...TBD");
    final int noHits = 0;
    return noHits;
}

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
package org.imixs.workflow.engine.solr;

import org.imixs.workflow.exceptions.ModelException;
import org.imixs.workflow.exceptions.PluginException;
import org.junit.Before;
import org.junit.Test;

import junit.framework.Assert;

/**
* Test striping invalid characters
*
* @author rsoika
*
*/
public class TestStripInvalidCharacters {
SolrIndexService solrIndexService;
@Before
public void setUp() throws PluginException, ModelException {
solrIndexService=new SolrIndexService();
}


/**
* Test
*
*/
@Test
public void testCDATA() {

String testString = "Hello <![CDATA[<XX>....</XX>]]> Data!";



String result=solrIndexService.stripCDATA(testString);


Assert.assertEquals("Hello <XX>....</XX> Data!",result);



}

}

0 comments on commit 565ab24

Please sign in to comment.