From ec33a254954045160ac76739fd561c80f4ec3c8c Mon Sep 17 00:00:00 2001 From: Shinsuke Sugaya Date: Thu, 20 Jun 2024 21:01:20 +0900 Subject: [PATCH] fix #2821 Exclude X-FESS metadata from indexing and add transformation process for metadata inclusion. --- .../transformer/AbstractFessFileTransformer.java | 10 +++++----- .../codelibs/fess/mylasta/direction/FessConfig.java | 6 +++--- .../codelibs/fess/mylasta/direction/FessProp.java | 12 ++++++++++++ src/main/resources/fess_config.properties | 2 +- 4 files changed, 21 insertions(+), 9 deletions(-) diff --git a/src/main/java/org/codelibs/fess/crawler/transformer/AbstractFessFileTransformer.java b/src/main/java/org/codelibs/fess/crawler/transformer/AbstractFessFileTransformer.java index 23cabebac..2432bf7b7 100644 --- a/src/main/java/org/codelibs/fess/crawler/transformer/AbstractFessFileTransformer.java +++ b/src/main/java/org/codelibs/fess/crawler/transformer/AbstractFessFileTransformer.java @@ -136,15 +136,15 @@ protected Map generateData(final ResponseData responseData) { dataMap.put(mapping.getValue1(), Double.parseDouble(values[0])); } else if (Constants.MAPPING_TYPE_DATE.equalsIgnoreCase(mapping.getValue2()) || Constants.MAPPING_TYPE_PDF_DATE.equalsIgnoreCase(mapping.getValue2())) { - final String dateFormate; + final String dateFormat; if (StringUtil.isNotBlank(mapping.getValue3())) { - dateFormate = mapping.getValue3(); + dateFormat = mapping.getValue3(); } else if (Constants.MAPPING_TYPE_PDF_DATE.equalsIgnoreCase(mapping.getValue2())) { - dateFormate = mapping.getValue2(); + dateFormat = Constants.MAPPING_TYPE_PDF_DATE; } else { - dateFormate = Constants.DATE_OPTIONAL_TIME; + dateFormat = Constants.DATE_OPTIONAL_TIME; } - final Date dt = FessFunctions.parseDate(values[0], dateFormate); + final Date dt = FessFunctions.parseDate(values[0], dateFormat); if (dt != null) { dataMap.put(mapping.getValue1(), FessFunctions.formatDate(dt)); } else { diff --git a/src/main/java/org/codelibs/fess/mylasta/direction/FessConfig.java b/src/main/java/org/codelibs/fess/mylasta/direction/FessConfig.java index 173b38aea..e26e3d90d 100644 --- a/src/main/java/org/codelibs/fess/mylasta/direction/FessConfig.java +++ b/src/main/java/org/codelibs/fess/mylasta/direction/FessConfig.java @@ -391,7 +391,7 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction /** The key of the configuration. e.g. cpu */ String CRAWLER_HOTTHREAD_TYPE = "crawler.hotthread.type"; - /** The key of the configuration. e.g. resourceName,X-Parsed-By,Content-Encoding.*,Content-Type.*,X-TIKA.* */ + /** The key of the configuration. e.g. resourceName,X-Parsed-By,Content-Encoding.*,Content-Type.*,X-TIKA.*,X-FESS.* */ String CRAWLER_METADATA_CONTENT_EXCLUDES = "crawler.metadata.content.excludes"; /** The key of the configuration. e.g. title=title:string
@@ -2926,7 +2926,7 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction /** * Get the value for the key 'crawler.metadata.content.excludes'.
- * The value is, e.g. resourceName,X-Parsed-By,Content-Encoding.*,Content-Type.*,X-TIKA.*
+ * The value is, e.g. resourceName,X-Parsed-By,Content-Encoding.*,Content-Type.*,X-TIKA.*,X-FESS.*
* @return The value of found property. (NotNull: if not found, exception but basically no way) */ String getCrawlerMetadataContentExcludes(); @@ -10899,7 +10899,7 @@ protected java.util.Map prepareGeneratedDefaultMap() { defaultMap.put(FessConfig.CRAWLER_HOTTHREAD_TIMEOUT, "30s"); defaultMap.put(FessConfig.CRAWLER_HOTTHREAD_TYPE, "cpu"); defaultMap.put(FessConfig.CRAWLER_METADATA_CONTENT_EXCLUDES, - "resourceName,X-Parsed-By,Content-Encoding.*,Content-Type.*,X-TIKA.*"); + "resourceName,X-Parsed-By,Content-Encoding.*,Content-Type.*,X-TIKA.*,X-FESS.*"); defaultMap.put(FessConfig.CRAWLER_METADATA_NAME_MAPPING, "title=title:string\nTitle=title:string\ndc:title=title:string\n"); defaultMap.put(FessConfig.CRAWLER_DOCUMENT_HTML_CONTENT_XPATH, "//BODY"); defaultMap.put(FessConfig.CRAWLER_DOCUMENT_HTML_LANG_XPATH, "//HTML/@lang"); diff --git a/src/main/java/org/codelibs/fess/mylasta/direction/FessProp.java b/src/main/java/org/codelibs/fess/mylasta/direction/FessProp.java index c23ddeedf..3fe8a010c 100644 --- a/src/main/java/org/codelibs/fess/mylasta/direction/FessProp.java +++ b/src/main/java/org/codelibs/fess/mylasta/direction/FessProp.java @@ -975,6 +975,18 @@ default Tuple3 getCrawlerMetadataNameMapping(final Strin return params.get(name); } + default void addCrawlerMetadataNameMapping(final String name, final String fieldName, final String mappingType, + final String dateFormat) { + if (getCrawlerMetadataNameMapping(name) != null) { + return; + } + + @SuppressWarnings("unchecked") + final Map> params = + (Map>) propMap.get(CRAWLER_METADATA_NAME_MAPPING); + params.put(name, new Tuple3<>(fieldName, mappingType, dateFormat)); + } + String getSuggestPopularWordFields(); default String[] getSuggestPopularWordFieldsAsArray() { diff --git a/src/main/resources/fess_config.properties b/src/main/resources/fess_config.properties index 6abdd8647..6fecccc38 100644 --- a/src/main/resources/fess_config.properties +++ b/src/main/resources/fess_config.properties @@ -228,7 +228,7 @@ crawler.hotthread.snapshots=10 crawler.hotthread.threads=3 crawler.hotthread.timeout=30s crawler.hotthread.type=cpu -crawler.metadata.content.excludes=resourceName,X-Parsed-By,Content-Encoding.*,Content-Type.*,X-TIKA.* +crawler.metadata.content.excludes=resourceName,X-Parsed-By,Content-Encoding.*,Content-Type.*,X-TIKA.*,X-FESS.* crawler.metadata.name.mapping=\ title=title:string\n\ Title=title:string\n\