From e4bf73e8fe118146bcb07ca40d7a6dd7d17ea76f Mon Sep 17 00:00:00 2001 From: Ralph Soika Date: Mon, 2 Sep 2019 23:56:46 +0200 Subject: [PATCH] improved solr search behavior Issue #554 --- .../workflow/engine/index/SchemaService.java | 3 + .../engine/solr/SolrIndexService.java | 98 +++++++++++++++++-- .../engine/solr/SolrSearchService.java | 80 +++++++++------ .../solr/TestStripInvalidCharacters.java | 44 +++++++++ 4 files changed, 189 insertions(+), 36 deletions(-) create mode 100644 imixs-workflow-index-solr/src/test/java/org/imixs/workflow/engine/solr/TestStripInvalidCharacters.java diff --git a/imixs-workflow-engine/src/main/java/org/imixs/workflow/engine/index/SchemaService.java b/imixs-workflow-engine/src/main/java/org/imixs/workflow/engine/index/SchemaService.java index 92c727e7f..82ea9fa02 100644 --- a/imixs-workflow-engine/src/main/java/org/imixs/workflow/engine/index/SchemaService.java +++ b/imixs-workflow-engine/src/main/java/org/imixs/workflow/engine/index/SchemaService.java @@ -41,6 +41,7 @@ import org.eclipse.microprofile.config.inject.ConfigProperty; import org.imixs.workflow.ItemCollection; +import org.imixs.workflow.WorkflowKernel; import org.imixs.workflow.engine.DocumentService; import org.imixs.workflow.exceptions.QueryException; @@ -199,9 +200,11 @@ void init() { // build unique field list containing all field names uniqueFieldList=new HashSet(); + uniqueFieldList.add(WorkflowKernel.UNIQUEID); uniqueFieldList.addAll(fieldListStore); uniqueFieldList.addAll(fieldListAnalyse); uniqueFieldList.addAll(fieldListNoAnalyse); + } diff --git a/imixs-workflow-index-solr/src/main/java/org/imixs/workflow/engine/solr/SolrIndexService.java b/imixs-workflow-index-solr/src/main/java/org/imixs/workflow/engine/solr/SolrIndexService.java index f2ad6cb9a..6e256dfce 100644 --- a/imixs-workflow-index-solr/src/main/java/org/imixs/workflow/engine/solr/SolrIndexService.java +++ b/imixs-workflow-index-solr/src/main/java/org/imixs/workflow/engine/solr/SolrIndexService.java @@ -34,6 +34,7 @@ import java.util.Calendar; import java.util.Date; import java.util.List; +import java.util.function.IntPredicate; import java.util.logging.Level; import java.util.logging.Logger; @@ -224,6 +225,7 @@ public void indexDocuments(List documents) throws RestAPIExcepti if (logger.isLoggable(Level.FINEST)) { logger.finest(xmlRequest); } + String uri = host + "/solr/" + core + "/update?commit=true"; restClient.post(uri, xmlRequest, "text/xml"); } @@ -311,13 +313,20 @@ public void rebuildIndex() { /** * This method post a search query and returns the result. + *
+	 * The method will return the documents containing all stored or DocValues
+	 * fields. Only if the param 'loadStubs' is false, then only the field
+	 * '$uniqueid' will be returned by the method. The caller is responsible for
+	 * loading the full document from the DocumentService.
+	 * <p>
+	 *
 	 *
 	 * @param searchterm
 	 * @return
 	 * @throws QueryException
 	 */
 	public String query(String searchTerm, int pageSize, int pageIndex, SortOrder sortOrder,
-			DefaultOperator defaultOperator) throws QueryException {
+			DefaultOperator defaultOperator, boolean loadStubs) throws QueryException {
 
 		logger.fine("...search solr index: " + searchTerm + "...");
 
@@ -364,6 +373,11 @@ public String query(String searchTerm, int pageSize, int pageIndex, SortOrder so
 			uri.append("&start=" + (pageIndex * pageSize));
 		}
 
+		// if loadStubs is false, then we only request the field '$uniqueid' here.
+		if (!loadStubs) {
+			uri.append("&fl=_uniqueid");
+		}
+
 		// append query
 		uri.append("&q=" + URLEncoder.encode(searchTerm, "UTF-8"));
 
@@ -477,8 +491,15 @@ protected String buildAddDoc(List<ItemCollection> documents) {
 
 		xmlContent.append("<add overwrite=\"true\">");
 
 		for (ItemCollection document : documents) {
+
+			// if no UniqueID is defined we need to skip this document
+			if (document.getUniqueID().isEmpty()) {
+				continue;
+			}
+
 			xmlContent.append("<doc>");
+
 			xmlContent.append("<field name=\"id\">" + document.getUniqueID() + "</field>");
 
 			// add all content fields defined in the schema
@@ -514,7 +535,12 @@
 			}
 		}
 		logger.finest("......add index field " + DEFAULT_SEARCH_FIELD + "=" + content);
-		// if XML is part of the content, the we need to add a wrapping CDATA
+
+		// remove existing CDATA...
+		content = stripCDATA(content);
+		// strip control codes..
+		content = stripControlCodes(content);
+		// We need to add a wrapping CDATA, allow xml in general..
 		xmlContent.append("<field name=\"" + DEFAULT_SEARCH_FIELD + "\"><![CDATA[" + content + "]]></field>");
 
 		// now add all analyzed fields...
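The new 'loadStubs' flag only changes the field list ('fl') parameter of the Solr request. A minimal, self-contained sketch of the resulting URI; the class name, host, core, endpoint and default parameters below are assumptions for illustration, the real values come from the service configuration:

import java.net.URLEncoder;

public class QueryUriSketch {
    public static void main(String[] args) throws Exception {
        // hypothetical values - the real host/core are taken from the service configuration
        String host = "http://localhost:8983";
        String core = "imixs-workflow";
        boolean loadStubs = false;

        StringBuilder uri = new StringBuilder(host + "/solr/" + core + "/query?q.op=AND&rows=30&start=0");
        if (!loadStubs) {
            // restrict the response to the id field; the caller loads the
            // full documents from the DocumentService afterwards
            uri.append("&fl=_uniqueid");
        }
        uri.append("&q=" + URLEncoder.encode("(type:\"workitem\")", "UTF-8"));
        System.out.println(uri);
        // -> .../query?q.op=AND&rows=30&start=0&fl=_uniqueid&q=%28type%3A%22workitem%22%29
    }
}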
@@ -591,6 +617,11 @@ private void addFieldDefinitionToUpdateSchema(StringBuffer updateSchema, String
 
 	/**
 	 * This method adds a field value into a xml update request.
+	 * <p>
+	 * In case the value is a date or calendar object, then the value will be
+	 * converted into a lucene time format.
+	 * <p>
+	 * The value will always be wrapped with a CDATA tag to avoid invalid XML.
 	 *
 	 * @param doc
 	 *            an existing lucene document
 	 * @param workitem
 	 *            the workitem containg the values
 	 */
@@ -633,15 +664,70 @@ private void addFieldValuesToUpdateRequest(StringBuffer xmlContent, final ItemCo
 				convertedValue = singleValue.toString();
 			}
 
-			// if XML is part of the content, the we need to add a wrapping CDATA
-			if (convertedValue.contains("<") || convertedValue.contains("<")) {
-				convertedValue = "<![CDATA[" + convertedValue + "]]>";
-			}
+			// remove existing CDATA...
+			convertedValue = stripCDATA(convertedValue);
+			// strip control codes..
+			convertedValue = stripControlCodes(convertedValue);
+			// wrap value into CDATA
+			convertedValue = "<![CDATA[" + convertedValue + "]]>";
+
 			xmlContent.append("" + convertedValue + "");
 		}
 	}
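The sanitizing chain applied to each item value above can be reproduced in isolation. The following standalone sketch (class name and sample value are made up for illustration) mirrors the three steps: strip an existing CDATA wrapper, drop control codes, then re-wrap the value:

import java.util.function.IntPredicate;

public class SanitizeSketch {

    static String stripCDATA(String s) {
        // remove an already existing CDATA wrapper
        return s.replaceAll("<!\\[CDATA\\[", "").replaceAll("\\]\\]>", "");
    }

    static String stripControlCodes(String s) {
        // keep only the printable ASCII range 32-126
        IntPredicate include = c -> c > '\u001F' && c < '\u007F';
        return s.codePoints().filter(include)
                .collect(StringBuilder::new, StringBuilder::appendCodePoint, StringBuilder::append).toString();
    }

    public static void main(String[] args) {
        String raw = "Order <![CDATA[4711]]>\u0001 approved";
        String value = stripControlCodes(stripCDATA(raw));
        // re-wrap the sanitized value as it is done for the solr update request
        System.out.println("<![CDATA[" + value + "]]>");
        // -> <![CDATA[Order 4711 approved]]>
    }
}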
+
+	/**
+	 * This helper method strips control codes and extended characters from a
+	 * string. We can not put those characters into the XML request sent to solr.
+	 * <p>
+	 * Background:
+	 * <p>
+	 * In ASCII, the control codes have decimal codes 0 through 31, and 127. On an
+	 * ASCII based system, if the control codes are stripped, the resultant string
+	 * would have all of its characters within the range of 32 to 126 decimal on the
+	 * ASCII table.
+	 * <p>
+	 * On a non-ASCII based system, we consider characters that do not have a
+	 * corresponding glyph on the ASCII table (within the ASCII range of 32 to 126
+	 * decimal) to be extended characters for the purpose of this method.
+	 * <p>
+	 *
+	 * @see https://rosettacode.org/wiki/Strip_control_codes_and_extended_characters_from_a_string
+	 *
+	 * @param s
+	 * @return the string with control codes and extended characters stripped
+	 */
+	protected String stripControlCodes(String s) {
+
+		// control codes stripped (but extended characters not stripped)
+		// IntPredicate include = c -> c > '\u001F' && c != '\u007F';
+
+		// control codes and extended characters stripped
+		IntPredicate include = c -> c > '\u001F' && c < '\u007F';
+		return s.codePoints().filter(include::test)
+				.collect(StringBuilder::new, StringBuilder::appendCodePoint, StringBuilder::append).toString();
+	}
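A companion test for this helper, written in the same style as the TestStripInvalidCharacters class added at the end of this patch, could look like the following sketch (the test method itself is not part of the patch):

	@Test
	public void testStripControlCodes() {

		// \u0007 (BEL), \u0000 (NUL) and \u007F (DEL) must be removed
		String testString = "Some\u0007 text with control\u0000codes\u007F!";

		String result = solrIndexService.stripControlCodes(testString);

		Assert.assertEquals("Some text with controlcodes!", result);
	}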
+
+	/**
+	 * This helper method strips CDATA blocks from a string. We can not post
+	 * embedded CDATA inside an already existing CDATA section when we post the
+	 * xml to solr.
+	 * <p>
+	 *
+	 * @param s
+	 * @return the string without CDATA markers
+	 */
+	protected String stripCDATA(String s) {
+
+		if (s.contains("<![CDATA[")) {
+			String result = s.replaceAll("<!\\[CDATA\\[", "");
+			result = result.replaceAll("\\]\\]>", "");
+			return result;
+		} else {
+			return s;
+		}
+	}
+
 	/**
 	 * This method flushes a given count of eventLogEntries. The method return true
 	 * if no more eventLogEntries exist.
diff --git a/imixs-workflow-index-solr/src/main/java/org/imixs/workflow/engine/solr/SolrSearchService.java b/imixs-workflow-index-solr/src/main/java/org/imixs/workflow/engine/solr/SolrSearchService.java
index bbd7ea8b3..9caaf9859 100644
--- a/imixs-workflow-index-solr/src/main/java/org/imixs/workflow/engine/solr/SolrSearchService.java
+++ b/imixs-workflow-index-solr/src/main/java/org/imixs/workflow/engine/solr/SolrSearchService.java
@@ -148,7 +148,7 @@ public List search(String searchTerm, int pageSize, int pageInde
 		}
 
 		// post query....
-		String result = solarIndexService.query(searchTerm, pageSize, pageIndex,sortOrder,defaultOperator);
+		String result = solarIndexService.query(searchTerm, pageSize, pageIndex,sortOrder,defaultOperator, loadStubs);
 		logger.finest("......Result = " + result);
 
 		if (result != null && !result.isEmpty()) {
@@ -173,6 +173,54 @@ public List search(String searchTerm, int pageSize, int pageInde
 		return workitems;
 	}
 
+	/**
+	 * Returns the total hits for a given search term from the lucene index. The
+	 * method does not load any data. The provided search term will be extended
+	 * with the user's roles to test the read access level of each workitem
+	 * matching the search term.
+	 *
+	 * The optional param 'maxResult' can be set to overwrite the
+	 * DEFAULT_MAX_SEARCH_RESULT.
+	 *
+	 * @see search(String, int, int, Sort, Operator)
+	 *
+	 * @param sSearchTerm
+	 * @param maxResult
+	 *            - max search result
+	 * @return total hits of search result
+	 * @throws QueryException
+	 *             in case the searchterm is not understandable.
+	 */
+	@Override
+	public int getTotalHits(final String _searchTerm, final int _maxResult, final DefaultOperator defaultOperator)
+			throws QueryException {
+
+		int maxResult = _maxResult;
+
+		if (maxResult <= 0) {
+			maxResult = DEFAULT_MAX_SEARCH_RESULT;
+		}
+
+		// query only the $uniqueid
+		String searchTerm = schemaService.getExtendedSearchTerm(_searchTerm);
+		// test if searchterm is provided
+		if (searchTerm == null || "".equals(searchTerm)) {
+			return 0;
+		}
+
+		// post query....
+		String result = solarIndexService.query(searchTerm, _maxResult, 0, null, defaultOperator, true);
+
+// TODO now parse the count!!!
+//
+//
+//		logger.finest("......Result = " + result);
+//
+//
+
+		return 0;
+	}
+
 	/**
 	 * This method extracts the docs from a Solr JSON query result
 	 *
@@ -389,39 +437,11 @@ private String adaptItemName(String itemName) {
 		}
 		if (itemName.charAt(0)=='_') {
 			String adaptedName="$"+itemName.substring(1);
-			Set<String> uniqueFieldList = schemaService.getUniqueFieldList();
-			uniqueFieldList.add(WorkflowKernel.UNIQUEID);
-			if (uniqueFieldList.contains(adaptedName)) {
+			if (schemaService.getUniqueFieldList().contains(adaptedName)) {
 				return adaptedName;
 			}
 		}
 		return itemName;
 	}
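The '_uniqueid' field returned by Solr is mapped back to '$uniqueid' by adaptItemName(), which now relies on the unique field list of the SchemaService (extended above to contain WorkflowKernel.UNIQUEID). A standalone sketch of that mapping with a hypothetical field set:

import java.util.HashSet;
import java.util.Set;

public class AdaptItemNameSketch {

    // stands in for SchemaService.getUniqueFieldList()
    static final Set<String> UNIQUE_FIELD_LIST = new HashSet<>();

    static String adaptItemName(String itemName) {
        // Solr field names use a leading '_' while Imixs item names use '$';
        // only fields registered in the unique field list are mapped back
        if (itemName.charAt(0) == '_') {
            String adaptedName = "$" + itemName.substring(1);
            if (UNIQUE_FIELD_LIST.contains(adaptedName)) {
                return adaptedName;
            }
        }
        return itemName;
    }

    public static void main(String[] args) {
        UNIQUE_FIELD_LIST.add("$uniqueid");
        System.out.println(adaptItemName("_uniqueid"));   // -> $uniqueid
        System.out.println(adaptItemName("_subject"));    // -> _subject (not in the unique field list)
    }
}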
 
-	/**
-	 * Returns the total hits for a given search term from the lucene index. The
-	 * method did not load any data. The provided search term will we extended with
-	 * a users roles to test the read access level of each workitem matching the
-	 * search term.
-	 *
-	 * The optional param 'maxResult' can be set to overwrite the
-	 * DEFAULT_MAX_SEARCH_RESULT.
-	 *
-	 * @see search(String, int, int, Sort, Operator)
-	 *
-	 * @param sSearchTerm
-	 * @param maxResult
-	 *            - max search result
-	 * @return total hits of search result
-	 * @throws QueryException
-	 *             in case the searchterm is not understandable.
-	 */
-	@Override
-	public int getTotalHits(final String _searchTerm, final int _maxResult, final DefaultOperator defaultOperator)
-			throws QueryException {
-
-		logger.warning("...TBD");
-		return 0;
-	}
-
 }
diff --git a/imixs-workflow-index-solr/src/test/java/org/imixs/workflow/engine/solr/TestStripInvalidCharacters.java b/imixs-workflow-index-solr/src/test/java/org/imixs/workflow/engine/solr/TestStripInvalidCharacters.java
new file mode 100644
index 000000000..cf67a87e2
--- /dev/null
+++ b/imixs-workflow-index-solr/src/test/java/org/imixs/workflow/engine/solr/TestStripInvalidCharacters.java
@@ -0,0 +1,44 @@
+package org.imixs.workflow.engine.solr;
+
+import org.imixs.workflow.exceptions.ModelException;
+import org.imixs.workflow.exceptions.PluginException;
+import org.junit.Before;
+import org.junit.Test;
+
+import junit.framework.Assert;
+
+/**
+ * Test stripping of invalid characters
+ *
+ * @author rsoika
+ *
+ */
+public class TestStripInvalidCharacters {
+	SolrIndexService solrIndexService;
+
+	@Before
+	public void setUp() throws PluginException, ModelException {
+		solrIndexService = new SolrIndexService();
+	}
+
+	/**
+	 * Test stripping an embedded CDATA block
+	 *
+	 */
+	@Test
+	public void testCDATA() {
+
+		String testString = "Hello <![CDATA[....]]> Data!";
+
+		String result = solrIndexService.stripCDATA(testString);
+
+		Assert.assertEquals("Hello .... Data!", result);
+
+	}
+
+}
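The open TODO in getTotalHits() still has to extract the hit count from the Solr JSON response. One possible approach, sketched here with the javax.json API and a made-up response snippet (this is not part of the patch), is to read the 'numFound' attribute of the 'response' object:

import java.io.StringReader;

import javax.json.Json;
import javax.json.JsonObject;
import javax.json.JsonReader;

public class NumFoundSketch {
    public static void main(String[] args) {
        // hypothetical Solr result, reduced to the relevant part
        String result = "{\"response\":{\"numFound\":42,\"start\":0,\"docs\":[]}}";

        try (JsonReader reader = Json.createReader(new StringReader(result))) {
            JsonObject response = reader.readObject().getJsonObject("response");
            int totalHits = response.getInt("numFound");
            System.out.println("total hits=" + totalHits);   // -> total hits=42
        }
    }
}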