From 0f8476361c567dba92237bc2013d8bd17e8ac683 Mon Sep 17 00:00:00 2001 From: yangyaofei Date: Tue, 8 Dec 2020 01:46:20 +0800 Subject: [PATCH] Attachment ingest processor: add resource_name field (#64389) --- plugins/ingest-attachment/build.gradle | 1 + .../attachment/AttachmentProcessor.java | 15 +++- .../attachment/AttachmentProcessorTests.java | 72 ++++++++++++++++--- .../test/sample-files/text-cjk-big5.txt | 14 ++++ .../test/sample-files/text-cjk-euc-jp.txt | 29 ++++++++ .../test/sample-files/text-cjk-gbk.txt | 3 + 6 files changed, 122 insertions(+), 12 deletions(-) create mode 100644 plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/text-cjk-big5.txt create mode 100644 plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/text-cjk-euc-jp.txt create mode 100644 plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/text-cjk-gbk.txt diff --git a/plugins/ingest-attachment/build.gradle b/plugins/ingest-attachment/build.gradle index df22d03b03e90..ab6ff5bf00b00 100644 --- a/plugins/ingest-attachment/build.gradle +++ b/plugins/ingest-attachment/build.gradle @@ -93,6 +93,7 @@ forbiddenPatterns { exclude '**/*.pdf' exclude '**/*.epub' exclude '**/*.vsdx' + exclude '**/text-cjk-*.txt' } thirdPartyAudit { diff --git a/plugins/ingest-attachment/src/main/java/org/elasticsearch/ingest/attachment/AttachmentProcessor.java b/plugins/ingest-attachment/src/main/java/org/elasticsearch/ingest/attachment/AttachmentProcessor.java index e20b8e6025ba6..4734afdb97e66 100644 --- a/plugins/ingest-attachment/src/main/java/org/elasticsearch/ingest/attachment/AttachmentProcessor.java +++ b/plugins/ingest-attachment/src/main/java/org/elasticsearch/ingest/attachment/AttachmentProcessor.java @@ -56,9 +56,10 @@ public final class AttachmentProcessor extends AbstractProcessor { private final int indexedChars; private final boolean ignoreMissing; private final String indexedCharsField; + private final String resourceName; AttachmentProcessor(String tag, String description, String field, String targetField, Set properties, - int indexedChars, boolean ignoreMissing, String indexedCharsField) { + int indexedChars, boolean ignoreMissing, String indexedCharsField, String resourceName) { super(tag, description); this.field = field; this.targetField = targetField; @@ -66,6 +67,7 @@ public final class AttachmentProcessor extends AbstractProcessor { this.indexedChars = indexedChars; this.ignoreMissing = ignoreMissing; this.indexedCharsField = indexedCharsField; + this.resourceName = resourceName; } boolean isIgnoreMissing() { @@ -77,7 +79,10 @@ public IngestDocument execute(IngestDocument ingestDocument) { Map additionalFields = new HashMap<>(); byte[] input = ingestDocument.getFieldValueAsBytes(field, ignoreMissing); - + String resourceNameInput = null; + if (resourceName != null) { + resourceNameInput = ingestDocument.getFieldValue(resourceName, String.class, true); + } if (input == null && ignoreMissing) { return ingestDocument; } else if (input == null) { @@ -96,6 +101,9 @@ public IngestDocument execute(IngestDocument ingestDocument) { } Metadata metadata = new Metadata(); + if (resourceNameInput != null) { + metadata.set(Metadata.RESOURCE_NAME_KEY, resourceNameInput); + } String parsedContent = ""; try { parsedContent = TikaImpl.parse(input, metadata, indexedChars); @@ -197,6 +205,7 @@ public static final class Factory implements Processor.Factory { public AttachmentProcessor create(Map registry, String processorTag, String description, Map config) throws Exception { String field = readStringProperty(TYPE, processorTag, config, "field"); + String resourceName = readOptionalStringProperty(TYPE, processorTag, config, "resource_name"); String targetField = readStringProperty(TYPE, processorTag, config, "target_field", "attachment"); List propertyNames = readOptionalList(TYPE, processorTag, config, "properties"); int indexedChars = readIntProperty(TYPE, processorTag, config, "indexed_chars", NUMBER_OF_CHARS_INDEXED); @@ -219,7 +228,7 @@ public AttachmentProcessor create(Map registry, Strin } return new AttachmentProcessor(processorTag, description, field, targetField, properties, indexedChars, ignoreMissing, - indexedCharsField); + indexedCharsField, resourceName); } } diff --git a/plugins/ingest-attachment/src/test/java/org/elasticsearch/ingest/attachment/AttachmentProcessorTests.java b/plugins/ingest-attachment/src/test/java/org/elasticsearch/ingest/attachment/AttachmentProcessorTests.java index 38b03c1879e67..9deccb2e13a88 100644 --- a/plugins/ingest-attachment/src/test/java/org/elasticsearch/ingest/attachment/AttachmentProcessorTests.java +++ b/plugins/ingest-attachment/src/test/java/org/elasticsearch/ingest/attachment/AttachmentProcessorTests.java @@ -55,7 +55,7 @@ public class AttachmentProcessorTests extends ESTestCase { @Before public void createStandardProcessor() { processor = new AttachmentProcessor(randomAlphaOfLength(10), null, "source_field", - "target_field", EnumSet.allOf(AttachmentProcessor.Property.class), 10000, false, null); + "target_field", EnumSet.allOf(AttachmentProcessor.Property.class), 10000, false, null, null); } public void testEnglishTextDocument() throws Exception { @@ -88,7 +88,7 @@ public void testHtmlDocumentWithRandomFields() throws Exception { selectedProperties.add(AttachmentProcessor.Property.DATE); } processor = new AttachmentProcessor(randomAlphaOfLength(10), null, "source_field", - "target_field", selectedProperties, 10000, false, null); + "target_field", selectedProperties, 10000, false, null, null); Map attachmentData = parseDocument("htmlWithEmptyDateMeta.html", processor); assertThat(attachmentData.keySet(), hasSize(selectedFieldNames.length)); @@ -160,7 +160,7 @@ public void testLegacyWordDocumentWithVisioSchema() throws Exception { public void testPdf() throws Exception { Map attachmentData = parseDocument("test.pdf", processor); assertThat(attachmentData.get("content"), - is("This is a test, with umlauts, from München\n\nAlso contains newlines for testing.\n\nAnd one more.")); + is("This is a test, with umlauts, from München\n\nAlso contains newlines for testing.\n\nAnd one more.")); assertThat(attachmentData.get("content_type").toString(), is("application/pdf")); assertThat(attachmentData.get("content_length"), is(notNullValue())); } @@ -247,7 +247,8 @@ public void testNullValueWithIgnoreMissing() throws Exception { IngestDocument originalIngestDocument = RandomDocumentPicks.randomIngestDocument(random(), Collections.singletonMap("source_field", null)); IngestDocument ingestDocument = new IngestDocument(originalIngestDocument); - Processor processor = new AttachmentProcessor(randomAlphaOfLength(10), null, "source_field", "randomTarget", null, 10, true, null); + Processor processor = new AttachmentProcessor(randomAlphaOfLength(10), null, "source_field", + "randomTarget", null, 10, true, null, null); processor.execute(ingestDocument); assertIngestDocument(originalIngestDocument, ingestDocument); } @@ -255,7 +256,8 @@ public void testNullValueWithIgnoreMissing() throws Exception { public void testNonExistentWithIgnoreMissing() throws Exception { IngestDocument originalIngestDocument = RandomDocumentPicks.randomIngestDocument(random(), Collections.emptyMap()); IngestDocument ingestDocument = new IngestDocument(originalIngestDocument); - Processor processor = new AttachmentProcessor(randomAlphaOfLength(10), null, "source_field", "randomTarget", null, 10, true, null); + Processor processor = new AttachmentProcessor(randomAlphaOfLength(10), null, "source_field", + "randomTarget", null, 10, true, null, null); processor.execute(ingestDocument); assertIngestDocument(originalIngestDocument, ingestDocument); } @@ -264,7 +266,8 @@ public void testNullWithoutIgnoreMissing() throws Exception { IngestDocument originalIngestDocument = RandomDocumentPicks.randomIngestDocument(random(), Collections.singletonMap("source_field", null)); IngestDocument ingestDocument = new IngestDocument(originalIngestDocument); - Processor processor = new AttachmentProcessor(randomAlphaOfLength(10), null, "source_field", "randomTarget", null, 10, false, null); + Processor processor = new AttachmentProcessor(randomAlphaOfLength(10), null, "source_field", + "randomTarget", null, 10, false, null, null); Exception exception = expectThrows(Exception.class, () -> processor.execute(ingestDocument)); assertThat(exception.getMessage(), equalTo("field [source_field] is null, cannot parse.")); } @@ -272,7 +275,8 @@ public void testNullWithoutIgnoreMissing() throws Exception { public void testNonExistentWithoutIgnoreMissing() throws Exception { IngestDocument originalIngestDocument = RandomDocumentPicks.randomIngestDocument(random(), Collections.emptyMap()); IngestDocument ingestDocument = new IngestDocument(originalIngestDocument); - Processor processor = new AttachmentProcessor(randomAlphaOfLength(10), null, "source_field", "randomTarget", null, 10, false, null); + Processor processor = new AttachmentProcessor(randomAlphaOfLength(10), null, "source_field", + "randomTarget", null, 10, false, null, null); Exception exception = expectThrows(Exception.class, () -> processor.execute(ingestDocument)); assertThat(exception.getMessage(), equalTo("field [source_field] not present as part of path [source_field]")); } @@ -295,9 +299,27 @@ private Map parseDocument(String file, AttachmentProcessor proce return attachmentData; } + private Map parseDocument(String file, AttachmentProcessor processor, Map optionalFields, + boolean includeResourceName) + throws Exception { + Map document = new HashMap<>(); + document.put("source_field", getAsBinaryOrBase64(file)); + if (includeResourceName) { + document.put("resource_name", file); + } + document.putAll(optionalFields); + + IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document); + processor.execute(ingestDocument); + + @SuppressWarnings("unchecked") + Map attachmentData = (Map) ingestDocument.getSourceAndMetadata().get("target_field"); + return attachmentData; + } + public void testIndexedChars() throws Exception { processor = new AttachmentProcessor(randomAlphaOfLength(10), null, "source_field", - "target_field", EnumSet.allOf(AttachmentProcessor.Property.class), 19, false, null); + "target_field", EnumSet.allOf(AttachmentProcessor.Property.class), 19, false, null, null); Map attachmentData = parseDocument("text-in-english.txt", processor); @@ -308,7 +330,7 @@ public void testIndexedChars() throws Exception { assertThat(attachmentData.get("content_length"), is(19L)); processor = new AttachmentProcessor(randomAlphaOfLength(10), null, "source_field", - "target_field", EnumSet.allOf(AttachmentProcessor.Property.class), 19, false, "max_length"); + "target_field", EnumSet.allOf(AttachmentProcessor.Property.class), 19, false, "max_length", null); attachmentData = parseDocument("text-in-english.txt", processor); @@ -333,6 +355,38 @@ public void testIndexedChars() throws Exception { assertThat(attachmentData.get("content"), is("\"God Save the Queen\" (alternatively \"God Save the King\"")); assertThat(attachmentData.get("content_type").toString(), containsString("text/plain")); assertThat(attachmentData.get("content_length"), is(56L)); + + } + + public void testIndexedCharsWithResourceName() throws Exception { + processor = new AttachmentProcessor(randomAlphaOfLength(10), null, "source_field", + "target_field", EnumSet.allOf(AttachmentProcessor.Property.class), 100, + false, null, "resource_name"); + + Map attachmentData = parseDocument("text-cjk-big5.txt", processor, Collections.singletonMap("max_length", 100), + true); + + assertThat(attachmentData.keySet(), containsInAnyOrder("language", "content", "content_type", "content_length")); + assertThat(attachmentData.get("content").toString(), containsString("碩鼠碩鼠,無食我黍!")); + assertThat(attachmentData.get("content_type").toString(), containsString("text/plain")); + assertThat(attachmentData.get("content_type").toString(), containsString("charset=Big5")); + assertThat(attachmentData.get("content_length"), is(100L)); + + attachmentData = parseDocument("text-cjk-gbk.txt", processor, Collections.singletonMap("max_length", 100), true); + + assertThat(attachmentData.keySet(), containsInAnyOrder("language", "content", "content_type", "content_length")); + assertThat(attachmentData.get("content").toString(), containsString("硕鼠硕鼠,无食我黍!")); + assertThat(attachmentData.get("content_type").toString(), containsString("text/plain")); + assertThat(attachmentData.get("content_type").toString(), containsString("charset=GB18030")); + assertThat(attachmentData.get("content_length"), is(100L)); + + attachmentData = parseDocument("text-cjk-euc-jp.txt", processor, Collections.singletonMap("max_length", 100), true); + + assertThat(attachmentData.keySet(), containsInAnyOrder("language", "content", "content_type", "content_length")); + assertThat(attachmentData.get("content").toString(), containsString("碩鼠よ碩鼠よ、\n我が黍を食らう無かれ!")); + assertThat(attachmentData.get("content_type").toString(), containsString("text/plain")); + assertThat(attachmentData.get("content_type").toString(), containsString("charset=EUC-JP")); + assertThat(attachmentData.get("content_length"), is(100L)); } private Object getAsBinaryOrBase64(String filename) throws Exception { diff --git a/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/text-cjk-big5.txt b/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/text-cjk-big5.txt new file mode 100644 index 0000000000000..0efbfad8dd59b --- /dev/null +++ b/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/text-cjk-big5.txt @@ -0,0 +1,14 @@ +ӹӹALڶITekAڪUCuNhkAA֤gC֤g֤gAoکҡC +ӹӹALڳITekAڪּwCuNhkAAְCְְAoڪC +ӹӹALڭ]ITekAڪֳҡCuNhkAA֭C֭֭A֤øC +AI_Aֵ֡A_A_AnnC +AI_AAO¬OA]bAALC +ivAikAڨpAڦA``_AkC +աAճAاhHAIPC +FZRAڰk誡AکhuAHhC +FAڰȶAکhuAHöˡC +FoAڰȨoAڹڱoASoC +huAy_NAh_A餣AܩѮVAPѡC +XJ_A^_AklAS̡AݧڽѩhAEΧBnC +XJ_zA^_A׸ߣA٨ڡAࡿ_A妳`C +ګάuAüۡA䶷PAڤ߱yyArXCAHgڼ~C \ No newline at end of file diff --git a/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/text-cjk-euc-jp.txt b/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/text-cjk-euc-jp.txt new file mode 100644 index 0000000000000..45e4627daecc7 --- /dev/null +++ b/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/text-cjk-euc-jp.txt @@ -0,0 +1,29 @@ +ͤͤ衢 +椬Ф򿩤餦̵졪 +н˴Ӥˡ +ΤƸܤߤ + +¤ƾ˽ꡢ +γڤŬ +ڤڤ衢 +੤˲椬 + +ͤͤ衢 +椬򿩤餦̵졪 +н˴Ӥˡ +Τ + +¤ƾ˽ꡢ +γڹŬ +ڹڹ衢 +੤˲椬ľ + +ͤͤ衢 +椬Ĥ򿩤餦̵졪 +н˴Ӥˡ +Τϫ + +¤ƾ˽ꡢ +γڹ٤Ŭ +ڹ٤ڹ٤衢 +ïǷƱʹ椻 \ No newline at end of file diff --git a/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/text-cjk-gbk.txt b/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/text-cjk-gbk.txt new file mode 100644 index 0000000000000..e8477cd6ce3e2 --- /dev/null +++ b/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/text-cjk-gbk.txt @@ -0,0 +1,3 @@ +˶˶ʳŮĪҿϹˡŽȥŮʱ뼵 +˶˶ʳŮĪҿϵ¡ŽȥŮʱֹֹֹ뼵ֱ +˶˶ʳ磡ŮĪҿ͡ŽȥŮʱֽֽֽ˭֮š \ No newline at end of file