Skip to content

Commit

Permalink
fix #2826 Added support for defining field configurations via field.c…
Browse files Browse the repository at this point in the history
…onfig.fieldname in crawl settings.
  • Loading branch information
marevol committed Jul 4, 2024
1 parent 8907219 commit 5d74bd3
Show file tree
Hide file tree
Showing 7 changed files with 214 additions and 12 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
import org.codelibs.fess.crawler.extractor.impl.TikaExtractor;
import org.codelibs.fess.crawler.transformer.impl.AbstractTransformer;
import org.codelibs.fess.crawler.util.CrawlingParameterUtil;
import org.codelibs.fess.crawler.util.FieldConfigs;
import org.codelibs.fess.es.config.exentity.CrawlingConfig;
import org.codelibs.fess.es.config.exentity.CrawlingConfig.ConfigName;
import org.codelibs.fess.es.config.exentity.CrawlingConfig.Param.Config;
Expand Down Expand Up @@ -181,7 +182,7 @@ protected Map<String, Object> generateData(final ResponseData responseData) {
final String indexingTarget = crawlingConfig.getIndexingTarget(url);
url = pathMappingHelper.replaceUrl(sessionId, url);

final Map<String, String> fieldConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.FIELD);
final FieldConfigs fieldConfigs = new FieldConfigs(crawlingConfig.getConfigParameterMap(ConfigName.FIELD));

String urlEncoding;
final UrlQueue<?> urlQueue = CrawlingParameterUtil.getUrlQueue();
Expand Down Expand Up @@ -221,7 +222,7 @@ protected Map<String, Object> generateData(final ResponseData responseData) {
responseData.addMetaData(Extractor.class.getSimpleName(), extractor);
final String body = documentHelper.getContent(crawlingConfig, responseData, bodyBase, dataMap);
putResultDataBody(dataMap, fessConfig.getIndexFieldContent(), body);
if ((Constants.TRUE.equalsIgnoreCase(fieldConfigMap.get(fessConfig.getIndexFieldCache()))
if ((fieldConfigs.getConfig(fessConfig.getIndexFieldCache()).map(config -> config.isCache()).orElse(false)
|| fessConfig.isCrawlerDocumentCacheEnabled()) && fessConfig.isSupportedDocumentCacheMimetypes(mimeType)) {
if (responseData.getContentLength() > 0
&& responseData.getContentLength() <= fessConfig.getCrawlerDocumentCacheMaxSizeAsInteger().longValue()) {
Expand Down Expand Up @@ -334,7 +335,7 @@ protected Map<String, Object> generateData(final ResponseData responseData) {
putResultDataWithTemplate(dataMap, key, entry.getValue(), scriptConfigMap.get(key), scriptType);
}

return dataMap;
return processFieldConfigs(dataMap, fieldConfigs);
}

protected Date getLastModified(final Map<String, Object> dataMap, final ResponseData responseData) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map;

import org.apache.commons.lang3.StringUtils;
Expand All @@ -31,6 +32,7 @@
import org.codelibs.fess.crawler.entity.AccessResultData;
import org.codelibs.fess.crawler.entity.UrlQueue;
import org.codelibs.fess.crawler.util.CrawlingParameterUtil;
import org.codelibs.fess.crawler.util.FieldConfigs;
import org.codelibs.fess.mylasta.direction.FessConfig;
import org.codelibs.fess.util.ComponentUtil;

Expand Down Expand Up @@ -248,4 +250,17 @@ default String getParentEncoding(final String parentUrl, final String sessionId)
}
return null;
}

/**
 * Builds a copy of the given data map in which every field configured as
 * "overwrite" is reduced to a single value.
 *
 * For each entry, the field configuration is looked up by the entry key. When the
 * configuration reports overwrite and the entry value is a non-empty object array,
 * only the last element of that array is stored; in every other case the value is
 * copied unchanged. The input map is not modified and the iteration order of the
 * input is preserved in the result.
 *
 * @param dataMap the field name to value map to process
 * @param fieldConfigs the per-field configurations used to decide which fields are overwritten
 * @return a new map holding the processed values
 */
default Map<String, Object> processFieldConfigs(final Map<String, Object> dataMap, final FieldConfigs fieldConfigs) {
    final Map<String, Object> processed = new LinkedHashMap<>();
    for (final Map.Entry<String, Object> entry : dataMap.entrySet()) {
        final String fieldName = entry.getKey();
        final Object original = entry.getValue();
        // The configuration lookup happens for every entry, before the value type is inspected.
        final boolean overwrite = fieldConfigs.getConfig(fieldName).map(FieldConfigs.Config::isOverwrite).orElse(false);
        Object resolved = original;
        if (overwrite && original instanceof Object[] items && items.length > 0) {
            // Keep only the most recently added value.
            resolved = items[items.length - 1];
        }
        processed.put(fieldName, resolved);
    }
    return processed;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@
import org.codelibs.fess.crawler.exception.CrawlingAccessException;
import org.codelibs.fess.crawler.transformer.impl.XpathTransformer;
import org.codelibs.fess.crawler.util.CrawlingParameterUtil;
import org.codelibs.fess.crawler.util.FieldConfigs;
import org.codelibs.fess.es.config.exentity.CrawlingConfig;
import org.codelibs.fess.es.config.exentity.CrawlingConfig.ConfigName;
import org.codelibs.fess.es.config.exentity.CrawlingConfig.Param.Config;
Expand Down Expand Up @@ -152,7 +153,7 @@ protected void storeData(final ResponseData responseData, final ResultData resul
processMetaRobots(responseData, resultData, document);
processXRobotsTag(responseData, resultData);

final Map<String, Object> dataMap = new LinkedHashMap<>();
Map<String, Object> dataMap = new LinkedHashMap<>();
for (final Map.Entry<String, String> entry : fieldRuleMap.entrySet()) {
final String path = entry.getValue();
try {
Expand Down Expand Up @@ -184,7 +185,7 @@ protected void storeData(final ResponseData responseData, final ResultData resul
}
}

putAdditionalData(dataMap, responseData, document);
dataMap = processAdditionalData(dataMap, responseData, document);
normalizeData(responseData, dataMap);

try {
Expand Down Expand Up @@ -336,7 +337,8 @@ protected boolean isValidCanonicalUrl(final String url, final String canonicalUr
return true;
}

protected void putAdditionalData(final Map<String, Object> dataMap, final ResponseData responseData, final Document document) {
protected Map<String, Object> processAdditionalData(final Map<String, Object> dataMap, final ResponseData responseData,
final Document document) {
// canonical
final String canonicalUrl = getCanonicalUrl(responseData, document);
if (canonicalUrl != null && !canonicalUrl.equals(responseData.getUrl()) && isValidUrl(canonicalUrl)
Expand All @@ -362,7 +364,7 @@ && isValidCanonicalUrl(responseData.getUrl(), canonicalUrl)) {
url = pathMappingHelper.replaceUrl(sessionId, url);
final String mimeType = responseData.getMimeType();

final Map<String, String> fieldConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.FIELD);
final FieldConfigs fieldConfigs = new FieldConfigs(crawlingConfig.getConfigParameterMap(ConfigName.FIELD));
final Map<String, String> xpathConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.XPATH);

String urlEncoding;
Expand Down Expand Up @@ -394,7 +396,7 @@ && isValidCanonicalUrl(responseData.getUrl(), canonicalUrl)) {
prunedContent ? node -> pruneNode(node, crawlingConfig) : node -> node);
final String fileName = getFileName(url, urlEncoding);
putResultDataContent(dataMap, responseData, fessConfig, crawlingConfig, documentHelper, body, fileName);
if ((Constants.TRUE.equalsIgnoreCase(fieldConfigMap.get(fessConfig.getIndexFieldCache()))
if ((fieldConfigs.getConfig(fessConfig.getIndexFieldCache()).map(config -> config.isCache()).orElse(false)
|| fessConfig.isCrawlerDocumentCacheEnabled()) && fessConfig.isSupportedDocumentCacheMimetypes(mimeType)) {
if (responseData.getContentLength() > 0
&& responseData.getContentLength() <= fessConfig.getCrawlerDocumentCacheMaxSizeAsInteger().longValue()) {
Expand Down Expand Up @@ -499,6 +501,8 @@ && isValidCanonicalUrl(responseData.getUrl(), canonicalUrl)) {
final String value = e.getValue();
putResultDataWithTemplate(dataMap, key, value, scriptConfigMap.get(key), scriptType);
});

return processFieldConfigs(dataMap, fieldConfigs);
}

protected void putResultDataContent(final Map<String, Object> dataMap, final ResponseData responseData, final FessConfig fessConfig,
Expand Down
76 changes: 76 additions & 0 deletions src/main/java/org/codelibs/fess/crawler/util/FieldConfigs.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
/*
* Copyright 2012-2024 CodeLibs Project and the Others.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
* either express or implied. See the License for the specific language
* governing permissions and limitations under the License.
*/
package org.codelibs.fess.crawler.util;

import java.util.Map;
import java.util.regex.Pattern;

import org.codelibs.core.lang.StringUtil;
import org.codelibs.core.stream.StreamUtil;
import org.codelibs.fess.Constants;
import org.dbflute.optional.OptionalThing;

/**
 * Read-only view over per-field configuration strings, keyed by field name.
 *
 * Each configuration string is a list of flags separated by {@code |}
 * (for example {@code cache|overwrite}); see {@link Config}.
 */
public class FieldConfigs {

    /** Field name to raw configuration string; may be {@code null}, which is treated as "no configuration". */
    private final Map<String, String> params;

    /**
     * Creates a view over the given field configuration parameters.
     *
     * @param params field name to raw configuration string; the map is referenced, not copied
     */
    public FieldConfigs(final Map<String, String> params) {
        this.params = params;
    }

    /**
     * Looks up the configuration of a field.
     *
     * @param fieldName the field name used as the key into the parameters
     * @return the parsed configuration, or an empty optional when no parameters were supplied
     *         or the field has no non-blank configuration string
     */
    public OptionalThing<Config> getConfig(final String fieldName) {
        // Guard against a missing parameter map instead of failing with a NullPointerException.
        if (params == null) {
            return OptionalThing.empty();
        }
        final String value = params.get(fieldName);
        if (StringUtil.isNotBlank(value)) {
            return OptionalThing.of(new Config(value));
        }
        return OptionalThing.empty();
    }

    /**
     * Parsed flags of a single field configuration string.
     */
    public static class Config {

        /** Trimmed flags, in the order they appeared in the configuration string. */
        private final String[] values;

        /**
         * Splits the configuration string on {@code |} and trims every resulting token.
         *
         * @param value the raw configuration string
         */
        public Config(final String value) {
            values = StreamUtil.split(value, Pattern.quote("|")).get(stream -> stream.map(String::trim).toArray(String[]::new));
        }

        /**
         * Tells whether caching is requested for the field.
         *
         * @return {@code true} when one of the flags is {@code cache} (case-insensitive), or when
         *         the only flag equals {@code Constants.TRUE} (case-insensitive)
         */
        public boolean isCache() {
            if (hasFlag("cache")) {
                return true;
            }
            // backward compatibility: a lone "true"-style value used to enable the cache
            return values.length == 1 && Constants.TRUE.equalsIgnoreCase(values[0]);
        }

        /**
         * Tells whether the overwrite flag is set for the field.
         *
         * @return {@code true} when one of the flags is {@code overwrite} (case-insensitive)
         */
        public boolean isOverwrite() {
            return hasFlag("overwrite");
        }

        /**
         * Returns the parsed flags.
         *
         * @return a copy of the flags, so that callers cannot alter this configuration
         */
        public String[] getValues() {
            return values.clone();
        }

        /** Checks whether any parsed flag equals the given name, ignoring case. */
        private boolean hasFlag(final String name) {
            for (final String value : values) {
                if (name.equalsIgnoreCase(value)) {
                    return true;
                }
            }
            return false;
        }
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,11 @@

import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.Map;

import org.apache.groovy.util.Maps;
import org.codelibs.fess.Constants;
import org.codelibs.fess.crawler.util.FieldConfigs;
import org.codelibs.fess.exception.FessSystemException;
import org.codelibs.fess.unit.UnitFessTestCase;

Expand Down Expand Up @@ -269,6 +272,21 @@ public void test_getSite_unexpected() {
assertEquals(exp, transformer.getSiteOnFile(url, "UTF-8"));
}

// Only fields whose configuration contains "overwrite" are reduced to their last value;
// a field configured with "cache" alone keeps its whole array.
public void test_processFieldConfigs() {
    final FessFileTransformer fileTransformer = createInstance();
    final FieldConfigs configs = new FieldConfigs(Maps.of("foo", "cache", "bar", "overwrite", "baz", "cache|overwrite"));
    final Map<String, Object> source = Map.of(//
            "foo", new String[] { "aaa", "bbb" }, //
            "bar", new String[] { "ccc", "ddd" }, //
            "baz", new String[] { "eee", "fff" });

    final Map<String, Object> processed = fileTransformer.processFieldConfigs(source, configs);

    final String[] fooValues = (String[]) processed.get("foo");
    assertEquals("aaa", fooValues[0]);
    assertEquals("bbb", fooValues[1]);
    assertEquals("ddd", processed.get("bar"));
    assertEquals("fff", processed.get("baz"));
}

private FessFileTransformer createInstance() {
final FessFileTransformer transformer = new FessFileTransformer();
transformer.init();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;

import org.apache.groovy.util.Maps;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.codelibs.core.lang.ClassUtil;
Expand All @@ -42,6 +43,7 @@
import org.codelibs.fess.crawler.entity.ResponseData;
import org.codelibs.fess.crawler.entity.ResultData;
import org.codelibs.fess.crawler.exception.ChildUrlsException;
import org.codelibs.fess.crawler.util.FieldConfigs;
import org.codelibs.fess.es.config.exentity.CrawlingConfig.ConfigName;
import org.codelibs.fess.es.config.exentity.WebConfig;
import org.codelibs.fess.helper.CrawlingConfigHelper;
Expand Down Expand Up @@ -626,7 +628,7 @@ protected Map<String, String> getConfigPrameterMap(final ResponseData responseDa
String data = "<html><body>aaa</body></html>";
Document document = getDocument(data);
try {
transformer.putAdditionalData(dataMap, responseData, document);
transformer.processAdditionalData(dataMap, responseData, document);
fail();
} catch (final ComponentNotFoundException e) {
// ignore
Expand All @@ -635,7 +637,7 @@ protected Map<String, String> getConfigPrameterMap(final ResponseData responseDa
data = "<html><head><link rel=\"canonical\" href=\"http://example.com/\"></head><body>aaa</body></html>";
document = getDocument(data);
try {
transformer.putAdditionalData(dataMap, responseData, document);
transformer.processAdditionalData(dataMap, responseData, document);
fail();
} catch (final ComponentNotFoundException e) {
// ignore
Expand All @@ -644,7 +646,7 @@ protected Map<String, String> getConfigPrameterMap(final ResponseData responseDa
data = "<html><head><link rel=\"canonical\" href=\"http://example.com/foo\"></head><body>aaa</body></html>";
document = getDocument(data);
try {
transformer.putAdditionalData(dataMap, responseData, document);
transformer.processAdditionalData(dataMap, responseData, document);
fail();
} catch (final ChildUrlsException e) {
final Set<RequestData> childUrlList = e.getChildUrlList();
Expand All @@ -655,7 +657,7 @@ protected Map<String, String> getConfigPrameterMap(final ResponseData responseDa
data = "<html><link rel=\"canonical\" href=\"http://example.com/foo\"><body>aaa</body></html>";
document = getDocument(data);
try {
transformer.putAdditionalData(dataMap, responseData, document);
transformer.processAdditionalData(dataMap, responseData, document);
fail();
} catch (final ChildUrlsException e) {
final Set<RequestData> childUrlList = e.getChildUrlList();
Expand Down Expand Up @@ -904,4 +906,19 @@ public void test_isValidUrl() {
assertFalse(transformer.isValidUrl("http://"));
assertFalse(transformer.isValidUrl("http://http://www.example.com"));
}

// Only fields whose configuration contains "overwrite" are reduced to their last value;
// a field configured with "cache" alone keeps its whole array.
public void test_processFieldConfigs() {
    final FessXpathTransformer xpathTransformer = new FessXpathTransformer();
    final FieldConfigs configs = new FieldConfigs(Maps.of("foo", "cache", "bar", "overwrite", "baz", "cache|overwrite"));
    final Map<String, Object> source = Map.of(//
            "foo", new String[] { "aaa", "bbb" }, //
            "bar", new String[] { "ccc", "ddd" }, //
            "baz", new String[] { "eee", "fff" });

    final Map<String, Object> processed = xpathTransformer.processFieldConfigs(source, configs);

    final String[] fooValues = (String[]) processed.get("foo");
    assertEquals("aaa", fooValues[0]);
    assertEquals("bbb", fooValues[1]);
    assertEquals("ddd", processed.get("bar"));
    assertEquals("fff", processed.get("baz"));
}
}
71 changes: 71 additions & 0 deletions src/test/java/org/codelibs/fess/crawler/util/FieldConfigsTest.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
/*
* Copyright 2012-2024 CodeLibs Project and the Others.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
* either express or implied. See the License for the specific language
* governing permissions and limitations under the License.
*/
package org.codelibs.fess.crawler.util;

import java.util.Collections;
import java.util.Map;

import org.apache.groovy.util.Maps;
import org.codelibs.fess.unit.UnitFessTestCase;

/**
 * Tests for {@link FieldConfigs}: lookup of missing fields and parsing of the
 * cache / overwrite flags from a field configuration string.
 */
public class FieldConfigsTest extends UnitFessTestCase {

    public void test_empty() {
        final FieldConfigs configs = new FieldConfigs(Collections.emptyMap());
        assertTrue(configs.getConfig("test").isEmpty());
    }

    public void test_values() {
        final FieldConfigs configs = configsForFoo("bar");
        assertTrue(configs.getConfig("test").isEmpty());
        assertFalse(configs.getConfig("foo").isEmpty());
        assertFalse(cacheFlag(configs, "foo"));
        assertFalse(overwriteFlag(configs, "foo"));
        final String[] parsed = configs.getConfig("foo").map(FieldConfigs.Config::getValues).orElse(new String[0]);
        assertEquals("bar", parsed[0]);
    }

    public void test_cache_true() {
        // legacy value: a lone "true" enables the cache
        final FieldConfigs configs = configsForFoo("true");
        assertTrue(configs.getConfig("test").isEmpty());
        assertTrue(cacheFlag(configs, "foo"));
        assertFalse(overwriteFlag(configs, "foo"));
    }

    public void test_cache() {
        final FieldConfigs configs = configsForFoo("cache");
        assertTrue(configs.getConfig("test").isEmpty());
        assertTrue(cacheFlag(configs, "foo"));
        assertFalse(overwriteFlag(configs, "foo"));
    }

    public void test_overwrite() {
        final FieldConfigs configs = configsForFoo("overwrite");
        assertTrue(configs.getConfig("test").isEmpty());
        assertFalse(cacheFlag(configs, "foo"));
        assertTrue(overwriteFlag(configs, "foo"));
    }

    public void test_cache_overwrite() {
        final FieldConfigs configs = configsForFoo("cache|overwrite");
        assertTrue(configs.getConfig("test").isEmpty());
        assertTrue(cacheFlag(configs, "foo"));
        assertTrue(overwriteFlag(configs, "foo"));
    }

    // Creates configurations holding a single entry for the field "foo".
    private static FieldConfigs configsForFoo(final String value) {
        final Map<String, String> params = Maps.of("foo", value);
        return new FieldConfigs(params);
    }

    // Cache flag of the named field, false when the field has no configuration.
    private static boolean cacheFlag(final FieldConfigs configs, final String fieldName) {
        return configs.getConfig(fieldName).map(FieldConfigs.Config::isCache).orElse(false);
    }

    // Overwrite flag of the named field, false when the field has no configuration.
    private static boolean overwriteFlag(final FieldConfigs configs, final String fieldName) {
        return configs.getConfig(fieldName).map(FieldConfigs.Config::isOverwrite).orElse(false);
    }
}

0 comments on commit 5d74bd3

Please sign in to comment.