From e4bf73e8fe118146bcb07ca40d7a6dd7d17ea76f Mon Sep 17 00:00:00 2001 From: Ralph Soika Date: Mon, 2 Sep 2019 23:56:46 +0200 Subject: [PATCH] improved solr search behavior Issue #554 --- .../workflow/engine/index/SchemaService.java | 3 + .../engine/solr/SolrIndexService.java | 98 +++++++++++++++++-- .../engine/solr/SolrSearchService.java | 80 +++++++++------ .../solr/TestStripInvalidCharacters.java | 44 +++++++++ 4 files changed, 189 insertions(+), 36 deletions(-) create mode 100644 imixs-workflow-index-solr/src/test/java/org/imixs/workflow/engine/solr/TestStripInvalidCharacters.java diff --git a/imixs-workflow-engine/src/main/java/org/imixs/workflow/engine/index/SchemaService.java b/imixs-workflow-engine/src/main/java/org/imixs/workflow/engine/index/SchemaService.java index 92c727e7f..82ea9fa02 100644 --- a/imixs-workflow-engine/src/main/java/org/imixs/workflow/engine/index/SchemaService.java +++ b/imixs-workflow-engine/src/main/java/org/imixs/workflow/engine/index/SchemaService.java @@ -41,6 +41,7 @@ import org.eclipse.microprofile.config.inject.ConfigProperty; import org.imixs.workflow.ItemCollection; +import org.imixs.workflow.WorkflowKernel; import org.imixs.workflow.engine.DocumentService; import org.imixs.workflow.exceptions.QueryException; @@ -199,9 +200,11 @@ void init() { // build unique field list containing all field names uniqueFieldList=new HashSet(); + uniqueFieldList.add(WorkflowKernel.UNIQUEID); uniqueFieldList.addAll(fieldListStore); uniqueFieldList.addAll(fieldListAnalyse); uniqueFieldList.addAll(fieldListNoAnalyse); + } diff --git a/imixs-workflow-index-solr/src/main/java/org/imixs/workflow/engine/solr/SolrIndexService.java b/imixs-workflow-index-solr/src/main/java/org/imixs/workflow/engine/solr/SolrIndexService.java index f2ad6cb9a..6e256dfce 100644 --- a/imixs-workflow-index-solr/src/main/java/org/imixs/workflow/engine/solr/SolrIndexService.java +++ b/imixs-workflow-index-solr/src/main/java/org/imixs/workflow/engine/solr/SolrIndexService.java @@ -34,6 +34,7 @@ import java.util.Calendar; import java.util.Date; import java.util.List; +import java.util.function.IntPredicate; import java.util.logging.Level; import java.util.logging.Logger; @@ -224,6 +225,7 @@ public void indexDocuments(List documents) throws RestAPIExcepti if (logger.isLoggable(Level.FINEST)) { logger.finest(xmlRequest); } + String uri = host + "/solr/" + core + "/update?commit=true"; restClient.post(uri, xmlRequest, "text/xml"); } @@ -311,13 +313,20 @@ public void rebuildIndex() { /** * This method post a search query and returns the result. + *
+	 * The method will return the documents containing all stored or DocValues
+	 * fields. Only if the param 'loadStubs' is false, then only the field
+	 * '$uniqueid' will be returned by the method. The caller is responsible for
+	 * loading the full document from the DocumentService.
+	 * <p>
+	 *
 	 *
 	 * @param searchterm
 	 * @return
 	 * @throws QueryException
 	 */
 	public String query(String searchTerm, int pageSize, int pageIndex, SortOrder sortOrder,
-			DefaultOperator defaultOperator) throws QueryException {
+			DefaultOperator defaultOperator, boolean loadStubs) throws QueryException {
 
 		logger.fine("...search solr index: " + searchTerm + "...");
 
@@ -364,6 +373,11 @@ public String query(String searchTerm, int pageSize, int pageIndex, SortOrder so
 			uri.append("&start=" + (pageIndex * pageSize));
 		}
 
+		// if loadStubs is false, then we only request the field '$uniqueid' here.
+		if (!loadStubs) {
+			uri.append("&fl=_uniqueid");
+		}
+
 		// append query
 		uri.append("&q=" + URLEncoder.encode(searchTerm, "UTF-8"));
 
@@ -477,8 +491,15 @@ protected String buildAddDoc(List<ItemCollection> documents) {
 
 		xmlContent.append("<add overwrite=\"true\">");
 
 		for (ItemCollection document : documents) {
+
+			// if no UniqueID is defined we need to skip this document
+			if (document.getUniqueID().isEmpty()) {
+				continue;
+			}
+
 			xmlContent.append("<doc>");
+
 			xmlContent.append("<field name=\"id\">" + document.getUniqueID() + "</field>");
 
 			// add all content fields defined in the schema
@@ -514,7 +535,12 @@
 			}
 		}
 		logger.finest("......add index field " + DEFAULT_SEARCH_FIELD + "=" + content);
-		// if XML is part of the content, the we need to add a wrapping CDATA
+
+		// remove existing CDATA...
+		content = stripCDATA(content);
+		// strip control codes..
+		content = stripControlCodes(content);
+		// We need to add a wrapping CDATA, allow xml in general..
 		xmlContent.append("<field name=\"" + DEFAULT_SEARCH_FIELD + "\"><![CDATA[" + content + "]]></field>");
 
 		// now add all analyzed fields...
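The new 'loadStubs' flag only changes the field list ('fl') parameter of the Solr request. A minimal, self-contained sketch of the resulting URI; the class name, host, core, endpoint and default parameters below are assumptions for illustration, the real values come from the service configuration:

import java.net.URLEncoder;

public class QueryUriSketch {
    public static void main(String[] args) throws Exception {
        // hypothetical values - the real host/core are taken from the service configuration
        String host = "http://localhost:8983";
        String core = "imixs-workflow";
        boolean loadStubs = false;

        StringBuilder uri = new StringBuilder(host + "/solr/" + core + "/query?q.op=AND&rows=30&start=0");
        if (!loadStubs) {
            // restrict the response to the id field; the caller loads the
            // full documents from the DocumentService afterwards
            uri.append("&fl=_uniqueid");
        }
        uri.append("&q=" + URLEncoder.encode("(type:\"workitem\")", "UTF-8"));
        System.out.println(uri);
        // -> .../query?q.op=AND&rows=30&start=0&fl=_uniqueid&q=%28type%3A%22workitem%22%29
    }
}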
@@ -591,6 +617,11 @@ private void addFieldDefinitionToUpdateSchema(StringBuffer updateSchema, String
 
 	/**
 	 * This method adds a field value into a xml update request.
+	 * <p>
+	 * In case the value is a date or calendar object, then the value will be
+	 * converted into a lucene time format.
+	 * <p>
+	 * The value will always be wrapped with a CDATA tag to avoid invalid XML.
 	 *
 	 * @param doc
 	 *            an existing lucene document
 	 * @param workitem
 	 *            the workitem containg the values
 	 */
@@ -633,15 +664,70 @@ private void addFieldValuesToUpdateRequest(StringBuffer xmlContent, final ItemCo
 				convertedValue = singleValue.toString();
 			}
 
-			// if XML is part of the content, the we need to add a wrapping CDATA
-			if (convertedValue.contains("<") || convertedValue.contains("<")) {
-				convertedValue = "<![CDATA[" + convertedValue + "]]>";
-			}
+			// remove existing CDATA...
+			convertedValue = stripCDATA(convertedValue);
+			// strip control codes..
+			convertedValue = stripControlCodes(convertedValue);
+			// wrap value into CDATA
+			convertedValue = "<![CDATA[" + convertedValue + "]]>";
+
 			xmlContent.append("" + convertedValue + "");
 		}
 	}
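The sanitizing chain applied to each item value above can be reproduced in isolation. The following standalone sketch (class name and sample value are made up for illustration) mirrors the three steps: strip an existing CDATA wrapper, drop control codes, then re-wrap the value:

import java.util.function.IntPredicate;

public class SanitizeSketch {

    static String stripCDATA(String s) {
        // remove an already existing CDATA wrapper
        return s.replaceAll("<!\\[CDATA\\[", "").replaceAll("\\]\\]>", "");
    }

    static String stripControlCodes(String s) {
        // keep only the printable ASCII range 32-126
        IntPredicate include = c -> c > '\u001F' && c < '\u007F';
        return s.codePoints().filter(include)
                .collect(StringBuilder::new, StringBuilder::appendCodePoint, StringBuilder::append).toString();
    }

    public static void main(String[] args) {
        String raw = "Order <![CDATA[4711]]>\u0001 approved";
        String value = stripControlCodes(stripCDATA(raw));
        // re-wrap the sanitized value as it is done for the solr update request
        System.out.println("<![CDATA[" + value + "]]>");
        // -> <![CDATA[Order 4711 approved]]>
    }
}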
+
+	/**
+	 * This helper method strips control codes and extended characters from a
+	 * string. We can not put those characters into the XML request sent to solr.
+	 * <p>
+	 * Background:
+	 * <p>
+	 * In ASCII, the control codes have decimal codes 0 through 31, and 127. On an
+	 * ASCII based system, if the control codes are stripped, the resultant string
+	 * would have all of its characters within the range of 32 to 126 decimal on the
+	 * ASCII table.
+	 * <p>
+	 * On a non-ASCII based system, we consider characters that do not have a
+	 * corresponding glyph on the ASCII table (within the ASCII range of 32 to 126
+	 * decimal) to be extended characters for the purpose of this method.
+	 * <p>
+	 *
+	 * @see https://rosettacode.org/wiki/Strip_control_codes_and_extended_characters_from_a_string
+	 *
+	 * @param s
+	 * @return the string with control codes and extended characters stripped
+	 */
+	protected String stripControlCodes(String s) {
+
+		// control codes stripped (but extended characters not stripped)
+		// IntPredicate include = c -> c > '\u001F' && c != '\u007F';
+
+		// control codes and extended characters stripped
+		IntPredicate include = c -> c > '\u001F' && c < '\u007F';
+		return s.codePoints().filter(include::test)
+				.collect(StringBuilder::new, StringBuilder::appendCodePoint, StringBuilder::append).toString();
+	}
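A companion test for this helper, written in the same style as the TestStripInvalidCharacters class added at the end of this patch, could look like the following sketch (the test method itself is not part of the patch):

	@Test
	public void testStripControlCodes() {

		// \u0007 (BEL), \u0000 (NUL) and \u007F (DEL) must be removed
		String testString = "Some\u0007 text with control\u0000codes\u007F!";

		String result = solrIndexService.stripControlCodes(testString);

		Assert.assertEquals("Some text with controlcodes!", result);
	}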
+
+	/**
+	 * This helper method strips CDATA blocks from a string. We can not post
+	 * embedded CDATA inside an already existing CDATA section when we post the
+	 * xml to solr.
+	 * <p>
+	 *
+	 * @param s
+	 * @return the string without CDATA markers
+	 */
+	protected String stripCDATA(String s) {
+
+		if (s.contains("<![CDATA[")) {
+			String result = s.replaceAll("<!\\[CDATA\\[", "");
+			result = result.replaceAll("\\]\\]>", "");
+			return result;
+		} else {
+			return s;
+		}
+	}
+
 	/**
 	 * This method flushes a given count of eventLogEntries. The method return true
 	 * if no more eventLogEntries exist.
diff --git a/imixs-workflow-index-solr/src/main/java/org/imixs/workflow/engine/solr/SolrSearchService.java b/imixs-workflow-index-solr/src/main/java/org/imixs/workflow/engine/solr/SolrSearchService.java
index bbd7ea8b3..9caaf9859 100644
--- a/imixs-workflow-index-solr/src/main/java/org/imixs/workflow/engine/solr/SolrSearchService.java
+++ b/imixs-workflow-index-solr/src/main/java/org/imixs/workflow/engine/solr/SolrSearchService.java
@@ -148,7 +148,7 @@ public List search(String searchTerm, int pageSize, int pageInde
 		}
 
 		// post query....
-		String result = solarIndexService.query(searchTerm, pageSize, pageIndex,sortOrder,defaultOperator);
+		String result = solarIndexService.query(searchTerm, pageSize, pageIndex,sortOrder,defaultOperator, loadStubs);
 		logger.finest("......Result = " + result);
 
 		if (result != null && !result.isEmpty()) {
@@ -173,6 +173,54 @@ public List search(String searchTerm, int pageSize, int pageInde
 		return workitems;
 	}
 
+	/**
+	 * Returns the total hits for a given search term from the lucene index. The
+	 * method does not load any data. The provided search term will be extended
+	 * with the user's roles to test the read access level of each workitem
+	 * matching the search term.
+	 *
+	 * The optional param 'maxResult' can be set to overwrite the
+	 * DEFAULT_MAX_SEARCH_RESULT.
+	 *
+	 * @see search(String, int, int, Sort, Operator)
+	 *
+	 * @param sSearchTerm
+	 * @param maxResult
+	 *            - max search result
+	 * @return total hits of search result
+	 * @throws QueryException
+	 *             in case the searchterm is not understandable.
+	 */
+	@Override
+	public int getTotalHits(final String _searchTerm, final int _maxResult, final DefaultOperator defaultOperator)
+			throws QueryException {
+
+		int maxResult = _maxResult;
+
+		if (maxResult <= 0) {
+			maxResult = DEFAULT_MAX_SEARCH_RESULT;
+		}
+
+		// query only the $uniqueid
+		String searchTerm = schemaService.getExtendedSearchTerm(_searchTerm);
+		// test if searchterm is provided
+		if (searchTerm == null || "".equals(searchTerm)) {
+			return 0;
+		}
+
+		// post query....
+		String result = solarIndexService.query(searchTerm, _maxResult, 0, null, defaultOperator, true);
+
+// TODO now parse the count!!!
+//
+//
+//		logger.finest("......Result = " + result);
+//
+//
+
+		return 0;
+	}
+
 	/**
 	 * This method extracts the docs from a Solr JSON query result
 	 *
@@ -389,39 +437,11 @@ private String adaptItemName(String itemName) {
 		}
 		if (itemName.charAt(0)=='_') {
 			String adaptedName="$"+itemName.substring(1);
-			Set<String> uniqueFieldList = schemaService.getUniqueFieldList();
-			uniqueFieldList.add(WorkflowKernel.UNIQUEID);
-			if (uniqueFieldList.contains(adaptedName)) {
+			if (schemaService.getUniqueFieldList().contains(adaptedName)) {
 				return adaptedName;
 			}
 		}
 		return itemName;
 	}
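The '_uniqueid' field returned by Solr is mapped back to '$uniqueid' by adaptItemName(), which now relies on the unique field list of the SchemaService (extended above to contain WorkflowKernel.UNIQUEID). A standalone sketch of that mapping with a hypothetical field set:

import java.util.HashSet;
import java.util.Set;

public class AdaptItemNameSketch {

    // stands in for SchemaService.getUniqueFieldList()
    static final Set<String> UNIQUE_FIELD_LIST = new HashSet<>();

    static String adaptItemName(String itemName) {
        // Solr field names use a leading '_' while Imixs item names use '$';
        // only fields registered in the unique field list are mapped back
        if (itemName.charAt(0) == '_') {
            String adaptedName = "$" + itemName.substring(1);
            if (UNIQUE_FIELD_LIST.contains(adaptedName)) {
                return adaptedName;
            }
        }
        return itemName;
    }

    public static void main(String[] args) {
        UNIQUE_FIELD_LIST.add("$uniqueid");
        System.out.println(adaptItemName("_uniqueid"));   // -> $uniqueid
        System.out.println(adaptItemName("_subject"));    // -> _subject (not in the unique field list)
    }
}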
 
-	/**
-	 * Returns the total hits for a given search term from the lucene index. The
-	 * method did not load any data. The provided search term will we extended with
-	 * a users roles to test the read access level of each workitem matching the
-	 * search term.
-	 *
-	 * The optional param 'maxResult' can be set to overwrite the
-	 * DEFAULT_MAX_SEARCH_RESULT.
-	 *
-	 * @see search(String, int, int, Sort, Operator)
-	 *
-	 * @param sSearchTerm
-	 * @param maxResult
-	 *            - max search result
-	 * @return total hits of search result
-	 * @throws QueryException
-	 *             in case the searchterm is not understandable.
-	 */
-	@Override
-	public int getTotalHits(final String _searchTerm, final int _maxResult, final DefaultOperator defaultOperator)
-			throws QueryException {
-
-		logger.warning("...TBD");
-		return 0;
-	}
-
 }
diff --git a/imixs-workflow-index-solr/src/test/java/org/imixs/workflow/engine/solr/TestStripInvalidCharacters.java b/imixs-workflow-index-solr/src/test/java/org/imixs/workflow/engine/solr/TestStripInvalidCharacters.java
new file mode 100644
index 000000000..cf67a87e2
--- /dev/null
+++ b/imixs-workflow-index-solr/src/test/java/org/imixs/workflow/engine/solr/TestStripInvalidCharacters.java
@@ -0,0 +1,44 @@
+package org.imixs.workflow.engine.solr;
+
+import org.imixs.workflow.exceptions.ModelException;
+import org.imixs.workflow.exceptions.PluginException;
+import org.junit.Before;
+import org.junit.Test;
+
+import junit.framework.Assert;
+
+/**
+ * Test stripping of invalid characters
+ *
+ * @author rsoika
+ *
+ */
+public class TestStripInvalidCharacters {
+	SolrIndexService solrIndexService;
+
+	@Before
+	public void setUp() throws PluginException, ModelException {
+		solrIndexService = new SolrIndexService();
+	}
+
+	/**
+	 * Test stripping an embedded CDATA block
+	 *
+	 */
+	@Test
+	public void testCDATA() {
+
+		String testString = "Hello <![CDATA[....]]> Data!";
+
+		String result = solrIndexService.stripCDATA(testString);
+
+		Assert.assertEquals("Hello .... Data!", result);
+
+	}
+
+}
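The open TODO in getTotalHits() still has to extract the hit count from the Solr JSON response. One possible approach, sketched here with the javax.json API and a made-up response snippet (this is not part of the patch), is to read the 'numFound' attribute of the 'response' object:

import java.io.StringReader;

import javax.json.Json;
import javax.json.JsonObject;
import javax.json.JsonReader;

public class NumFoundSketch {
    public static void main(String[] args) {
        // hypothetical Solr result, reduced to the relevant part
        String result = "{\"response\":{\"numFound\":42,\"start\":0,\"docs\":[]}}";

        try (JsonReader reader = Json.createReader(new StringReader(result))) {
            JsonObject response = reader.readObject().getJsonObject("response");
            int totalHits = response.getInt("numFound");
            System.out.println("total hits=" + totalHits);   // -> total hits=42
        }
    }
}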