Skip to content

Commit

Permalink
Search: strip HTML tags while indexing #1332
Browse files Browse the repository at this point in the history
  • Loading branch information
pdurbin committed Mar 5, 2015
1 parent 4cbef87 commit fa3a94f
Show file tree
Hide file tree
Showing 6 changed files with 85 additions and 10 deletions.
5 changes: 5 additions & 0 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -290,6 +290,11 @@
<version>1.0.0</version>
<type>jar</type>
</dependency>
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.8.1</version>
</dependency>
</dependencies>

<build>
Expand Down
2 changes: 1 addition & 1 deletion scripts/deploy/apitest.dataverse.org/dv-root.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
"name": "API Test",
"permissionRoot": false,
"facetRoot": true,
"description": "Welcome! This is a playground for Dataverse API users. (Data will be deleted periodically.) Please see http://guides.dataverse.org/en/latest/api to get started and http://community.dataverse.org/community-groups/api.html to join the community!",
"description": "Welcome! This is a playground for Dataverse API users. (Data will be deleted periodically.) Please see <a href=\"http://guides.dataverse.org/en/latest/api\">http://guides.dataverse.org/en/latest/api</a> to get started and <a href=\"http://community.dataverse.org/community-groups/api.html\">http://community.dataverse.org/community-groups/api.html</a> to join the community!",
"dataverseSubjects": [
"Other"
],
Expand Down
2 changes: 1 addition & 1 deletion scripts/search/data/dv-trees1.tsv
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
name alias permissionRoot subject contactEmail description affiliation
Trees trees true Other trees@trees.com A tree dataverse with some birds Trees Inc.
Spruce spruce false Other spruce@trees.com A spruce with some birds Trees Inc.
Spruce spruce false Other spruce@trees.com A <a href=\"http://en.wikipedia.org/wiki/Spruce\">spruce</a> with some birds Trees Inc.
Chestnut Trees chestnuttrees false Other chestnuttrees@trees.com A dataverse with chestnut trees and an oriole Trees Inc.
21 changes: 16 additions & 5 deletions src/main/java/edu/harvard/iq/dataverse/IndexServiceBean.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package edu.harvard.iq.dataverse;

import edu.harvard.iq.dataverse.util.StringUtil;
import edu.harvard.iq.dataverse.authorization.providers.builtin.BuiltinUserServiceBean;
import edu.harvard.iq.dataverse.search.SearchFields;
import edu.harvard.iq.dataverse.authorization.AuthenticationServiceBean;
Expand Down Expand Up @@ -168,8 +169,8 @@ public Future<String> indexDataverse(Dataverse dataverse) {
// if (dataverse.getOwner() != null) {
// solrInputDocument.addField(SearchFields.HOST_DATAVERSE, dataverse.getOwner().getName());
// }
solrInputDocument.addField(SearchFields.DESCRIPTION, dataverse.getDescription());
solrInputDocument.addField(SearchFields.DATAVERSE_DESCRIPTION, dataverse.getDescription());
solrInputDocument.addField(SearchFields.DESCRIPTION, StringUtil.html2text(dataverse.getDescription()));
solrInputDocument.addField(SearchFields.DATAVERSE_DESCRIPTION, StringUtil.html2text(dataverse.getDescription()));
// logger.info("dataverse affiliation: " + dataverse.getAffiliation());
if (dataverse.getAffiliation() != null && !dataverse.getAffiliation().isEmpty()) {
/**
Expand Down Expand Up @@ -677,9 +678,19 @@ private String addOrUpdateDataset(IndexableDataset indexableDataset) {
}
}
} else {
solrInputDocument.addField(solrFieldSearchable, dsf.getValues());
if (dsfType.getSolrField().isFacetable()) {
solrInputDocument.addField(solrFieldFacetable, dsf.getValues());
if (dsfType.getFieldType().equals(DatasetFieldType.FieldType.TEXTBOX)) {
// strip HTML
List<String> htmlFreeText = StringUtil.htmlArray2textArray(dsf.getValues());
solrInputDocument.addField(solrFieldSearchable, htmlFreeText);
if (dsfType.getSolrField().isFacetable()) {
solrInputDocument.addField(solrFieldFacetable, htmlFreeText);
}
} else {
// do not strip HTML
solrInputDocument.addField(solrFieldSearchable, dsf.getValues());
if (dsfType.getSolrField().isFacetable()) {
solrInputDocument.addField(solrFieldFacetable, dsf.getValues());
}
}
}
}
Expand Down
26 changes: 25 additions & 1 deletion src/main/java/edu/harvard/iq/dataverse/util/StringUtil.java
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
package edu.harvard.iq.dataverse.util;

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.jsoup.Jsoup;

/**
*
Expand Down Expand Up @@ -48,5 +51,26 @@ public static String truncateString(String originalString, int maxLength) {

return finalString;
}


/**
 * Converts HTML markup to plain text by parsing it with jsoup and
 * returning the text of the parsed document.
 *
 * @param html the markup to convert; may be {@code null}
 * @return the text content of {@code html}, or {@code null} when
 * {@code html} is {@code null}
 */
public static String html2text(String html) {
    return (html == null) ? null : Jsoup.parse(html).text();
}

/**
 * Strips HTML markup from every string in a list by parsing each entry
 * with jsoup and keeping only its text.
 *
 * @param htmlArray the HTML strings to clean; may be {@code null}
 * @return A list of clean strings or an empty list. The result is never
 * {@code null}; {@code null} entries in {@code htmlArray} are skipped, so
 * the result may be shorter than the input.
 */
public static List<String> htmlArray2textArray(List<String> htmlArray) {
    List<String> cleanTextArray = new ArrayList<>();
    if (htmlArray == null) {
        return cleanTextArray;
    }
    for (String html : htmlArray) {
        // Jsoup.parse does not accept null input; skip such entries rather
        // than letting one bad value abort the whole conversion.
        if (html != null) {
            cleanTextArray.add(Jsoup.parse(html).text());
        }
    }
    return cleanTextArray;
}

}
39 changes: 37 additions & 2 deletions src/test/java/edu/harvard/iq/dataverse/util/StringUtilTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@
*/
package edu.harvard.iq.dataverse.util;

import edu.harvard.iq.dataverse.util.StringUtil;
import java.util.Arrays;
import java.util.Collections;
import org.junit.After;
import org.junit.AfterClass;
import org.junit.Before;
Expand Down Expand Up @@ -77,5 +78,39 @@ public void testIsAlphaNumericChar() {
assertTrue( StringUtil.isAlphaNumericChar('Z') );
assertFalse( StringUtil.isAlphaNumericChar('@') );
}


/**
 * Verifies that StringUtil.html2text and StringUtil.htmlArray2textArray
 * strip markup, collapse whitespace in a multi-paragraph description, and
 * handle null input.
 *
 * Arguments are passed to assertEquals as (expected, actual) so that a
 * failure message reports the two values the right way round.
 */
@Test
public void testHtml2Text() {
    assertEquals("be bold!", StringUtil.html2text("be <b>bold</b>!"));
    assertEquals(null, StringUtil.html2text(null));

    // A realistic multi-paragraph description containing <p>, <b>, <br /> and <a> tags.
    String htmlDescription = "<p><b>Description:</b><br />\n"
            + "Data were taken May-June 2003 and 2005. Flux units are in mJy per 31 arcsecond beam.\n"
            + "</p>\n"
            + "\n"
            + "<p><b>Telescope Information</b><br />\n"
            + "<a href=\"http://www.submm.caltech.edu/cso/\">Caltech Submillimeter Observatory</a></p>\n"
            + "\n"
            + "<p><b>Status:</b><br />\n"
            + "Final</p>\n"
            + "\n"
            + "<p><b>Sampling:</b><br />\n"
            + "Sensitivity: Average 1 sigma rms = 10 mJy per beam.<br />\n"
            + "Waveband: 1120 microns<br />\n"
            + "Resolution: 31 arcsecond beam in 10 arcsecond pixels (diffuse large-scale structure is lost)\n"
            + "</p>\n"
            + "\n"
            + "<p><b>Areal Coverage:</b><br />\n"
            + "11 square degrees\n"
            + "</p>\n"
            + "\n"
            + "<p><b>Map Center (Galactic):</b><br />\n"
            + "NA<br />\n"
            + "NA</p>\n"
            + "\n"
            + "<p><b>Map Center (J2000):</b><br />\n"
            + "RA = 18:29:00<br />\n"
            + "Dec = +00:30:00 </p>";
    String expectedText = "Description: Data were taken May-June 2003 and 2005. Flux units are in mJy per 31 arcsecond beam. Telescope Information Caltech Submillimeter Observatory Status: Final Sampling: Sensitivity: Average 1 sigma rms = 10 mJy per beam. Waveband: 1120 microns Resolution: 31 arcsecond beam in 10 arcsecond pixels (diffuse large-scale structure is lost) Areal Coverage: 11 square degrees Map Center (Galactic): NA NA Map Center (J2000): RA = 18:29:00 Dec = +00:30:00";
    assertEquals(expectedText, StringUtil.html2text(htmlDescription));

    assertEquals(Arrays.asList("be bold!"), StringUtil.htmlArray2textArray(Arrays.asList("be <b>bold</b>!")));
    assertEquals(Collections.emptyList(), StringUtil.htmlArray2textArray(null));
}
}

0 comments on commit fa3a94f

Please sign in to comment.