Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Attachment ingest processor: add resource_name field #64389

Merged
merged 18 commits into from
Dec 7, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
e115498
Add filename for Tika to better recognize the txt file
yangyaofei Oct 30, 2020
285a513
bugfix : no field "file_name"
yangyaofei Oct 30, 2020
dcdcb52
add test for adding file_name field in ingest-attachment
yangyaofei Oct 30, 2020
fbd914e
Merge branch 'master' into plugin-attachment-filename
elasticmachine Nov 16, 2020
f8adca4
[Plugin Attachment] update exclude file
yangyaofei Nov 17, 2020
8f88243
[Plugin Attachment] update variable name to meet tika's name
yangyaofei Dec 5, 2020
914648d
[Plugin Attachment] update test
yangyaofei Dec 7, 2020
d8e4638
Merge branch 'master' into plugin-attachment-filename
elasticmachine Dec 7, 2020
bca1175
Update plugins/ingest-attachment/src/test/java/org/elasticsearch/inge…
yangyaofei Dec 7, 2020
0d4d6cf
Update plugins/ingest-attachment/src/test/java/org/elasticsearch/inge…
yangyaofei Dec 7, 2020
93f8cf5
Update plugins/ingest-attachment/src/test/java/org/elasticsearch/inge…
yangyaofei Dec 7, 2020
2d927ca
Update plugins/ingest-attachment/src/test/java/org/elasticsearch/inge…
yangyaofei Dec 7, 2020
9cfded6
Update plugins/ingest-attachment/src/test/java/org/elasticsearch/inge…
yangyaofei Dec 7, 2020
f34c553
Update plugins/ingest-attachment/src/test/java/org/elasticsearch/inge…
yangyaofei Dec 7, 2020
4c55774
Update plugins/ingest-attachment/src/test/java/org/elasticsearch/inge…
yangyaofei Dec 7, 2020
74a38eb
Update plugins/ingest-attachment/src/test/java/org/elasticsearch/inge…
yangyaofei Dec 7, 2020
6e5e033
[Plugin Attachment] make test independent
yangyaofei Dec 7, 2020
0d158d1
[Plugin Attachment] update format
yangyaofei Dec 7, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions plugins/ingest-attachment/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@ forbiddenPatterns {
exclude '**/*.pdf'
exclude '**/*.epub'
exclude '**/*.vsdx'
exclude '**/text-cjk-*.txt'
}

thirdPartyAudit {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,16 +56,18 @@ public final class AttachmentProcessor extends AbstractProcessor {
private final int indexedChars;
private final boolean ignoreMissing;
private final String indexedCharsField;
private final String resourceName;

/**
 * Creates the processor: {@code tag} and {@code description} are handed to the superclass
 * constructor and every remaining argument is stored in the field of the same name.
 *
 * NOTE(review): two variants of the second signature line appear below (one without and one
 * with {@code resourceName}). This looks like the before/after pair of a diff hunk - confirm
 * which one belongs in the real source file.
 *
 * @param resourceName stored as-is; may be {@code null}. {@code execute} checks it for
 *                     {@code null} before using it as a document field name.
 */
AttachmentProcessor(String tag, String description, String field, String targetField, Set<Property> properties,
int indexedChars, boolean ignoreMissing, String indexedCharsField) {
int indexedChars, boolean ignoreMissing, String indexedCharsField, String resourceName) {
super(tag, description);
this.field = field;
this.targetField = targetField;
this.properties = properties;
this.indexedChars = indexedChars;
this.ignoreMissing = ignoreMissing;
this.indexedCharsField = indexedCharsField;
this.resourceName = resourceName;
}

boolean isIgnoreMissing() {
Expand All @@ -77,7 +79,10 @@ public IngestDocument execute(IngestDocument ingestDocument) {
Map<String, Object> additionalFields = new HashMap<>();

byte[] input = ingestDocument.getFieldValueAsBytes(field, ignoreMissing);

String resourceNameInput = null;
if (resourceName != null) {
resourceNameInput = ingestDocument.getFieldValue(resourceName, String.class, true);
}
if (input == null && ignoreMissing) {
return ingestDocument;
} else if (input == null) {
Expand All @@ -96,6 +101,9 @@ public IngestDocument execute(IngestDocument ingestDocument) {
}

Metadata metadata = new Metadata();
if (resourceNameInput != null) {
metadata.set(Metadata.RESOURCE_NAME_KEY, resourceNameInput);
}
String parsedContent = "";
try {
parsedContent = TikaImpl.parse(input, metadata, indexedChars);
Expand Down Expand Up @@ -197,6 +205,7 @@ public static final class Factory implements Processor.Factory {
public AttachmentProcessor create(Map<String, Processor.Factory> registry, String processorTag,
String description, Map<String, Object> config) throws Exception {
String field = readStringProperty(TYPE, processorTag, config, "field");
String resourceName = readOptionalStringProperty(TYPE, processorTag, config, "resource_name");
String targetField = readStringProperty(TYPE, processorTag, config, "target_field", "attachment");
List<String> propertyNames = readOptionalList(TYPE, processorTag, config, "properties");
int indexedChars = readIntProperty(TYPE, processorTag, config, "indexed_chars", NUMBER_OF_CHARS_INDEXED);
Expand All @@ -219,7 +228,7 @@ public AttachmentProcessor create(Map<String, Processor.Factory> registry, Strin
}

return new AttachmentProcessor(processorTag, description, field, targetField, properties, indexedChars, ignoreMissing,
indexedCharsField);
indexedCharsField, resourceName);
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ public class AttachmentProcessorTests extends ESTestCase {
/**
 * Assigns the shared {@code processor} field a fresh instance that reads "source_field", writes
 * to "target_field", requests every {@code Property}, uses an indexed-chars value of 10000, has
 * ignoreMissing set to false and has no indexed-chars field.
 *
 * NOTE(review): the two argument lines below differ only by a trailing {@code null} (the
 * resource-name argument); they look like the before/after lines of a diff hunk.
 */
@Before
public void createStandardProcessor() {
processor = new AttachmentProcessor(randomAlphaOfLength(10), null, "source_field",
"target_field", EnumSet.allOf(AttachmentProcessor.Property.class), 10000, false, null);
"target_field", EnumSet.allOf(AttachmentProcessor.Property.class), 10000, false, null, null);
}

public void testEnglishTextDocument() throws Exception {
Expand Down Expand Up @@ -88,7 +88,7 @@ public void testHtmlDocumentWithRandomFields() throws Exception {
selectedProperties.add(AttachmentProcessor.Property.DATE);
}
processor = new AttachmentProcessor(randomAlphaOfLength(10), null, "source_field",
"target_field", selectedProperties, 10000, false, null);
"target_field", selectedProperties, 10000, false, null, null);

Map<String, Object> attachmentData = parseDocument("htmlWithEmptyDateMeta.html", processor);
assertThat(attachmentData.keySet(), hasSize(selectedFieldNames.length));
Expand Down Expand Up @@ -160,7 +160,7 @@ public void testLegacyWordDocumentWithVisioSchema() throws Exception {
/**
 * Parses "test.pdf" with the shared {@code processor} and checks the exact extracted text, that
 * the content type is "application/pdf", and that a content length is present.
 *
 * NOTE(review): the expected-content line appears twice below with identical text; this looks
 * like a whitespace-only before/after pair from a diff hunk.
 */
public void testPdf() throws Exception {
Map<String, Object> attachmentData = parseDocument("test.pdf", processor);
assertThat(attachmentData.get("content"),
is("This is a test, with umlauts, from München\n\nAlso contains newlines for testing.\n\nAnd one more."));
is("This is a test, with umlauts, from München\n\nAlso contains newlines for testing.\n\nAnd one more."));
assertThat(attachmentData.get("content_type").toString(), is("application/pdf"));
assertThat(attachmentData.get("content_length"), is(notNullValue()));
}
Expand Down Expand Up @@ -247,15 +247,17 @@ public void testNullValueWithIgnoreMissing() throws Exception {
IngestDocument originalIngestDocument = RandomDocumentPicks.randomIngestDocument(random(),
Collections.singletonMap("source_field", null));
IngestDocument ingestDocument = new IngestDocument(originalIngestDocument);
Processor processor = new AttachmentProcessor(randomAlphaOfLength(10), null, "source_field", "randomTarget", null, 10, true, null);
Processor processor = new AttachmentProcessor(randomAlphaOfLength(10), null, "source_field",
"randomTarget", null, 10, true, null, null);
processor.execute(ingestDocument);
assertIngestDocument(originalIngestDocument, ingestDocument);
}

/**
 * Runs a processor created with ignoreMissing set to true against a document built from an
 * empty source map, then asserts that the processed copy still matches the original document.
 *
 * NOTE(review): {@code processor} is declared twice below (eight-argument and nine-argument
 * constructor calls); this looks like the before/after pair of a diff hunk.
 */
public void testNonExistentWithIgnoreMissing() throws Exception {
IngestDocument originalIngestDocument = RandomDocumentPicks.randomIngestDocument(random(), Collections.emptyMap());
IngestDocument ingestDocument = new IngestDocument(originalIngestDocument);
Processor processor = new AttachmentProcessor(randomAlphaOfLength(10), null, "source_field", "randomTarget", null, 10, true, null);
Processor processor = new AttachmentProcessor(randomAlphaOfLength(10), null, "source_field",
"randomTarget", null, 10, true, null, null);
processor.execute(ingestDocument);
assertIngestDocument(originalIngestDocument, ingestDocument);
}
Expand All @@ -264,15 +266,17 @@ public void testNullWithoutIgnoreMissing() throws Exception {
IngestDocument originalIngestDocument = RandomDocumentPicks.randomIngestDocument(random(),
Collections.singletonMap("source_field", null));
IngestDocument ingestDocument = new IngestDocument(originalIngestDocument);
Processor processor = new AttachmentProcessor(randomAlphaOfLength(10), null, "source_field", "randomTarget", null, 10, false, null);
Processor processor = new AttachmentProcessor(randomAlphaOfLength(10), null, "source_field",
"randomTarget", null, 10, false, null, null);
Exception exception = expectThrows(Exception.class, () -> processor.execute(ingestDocument));
assertThat(exception.getMessage(), equalTo("field [source_field] is null, cannot parse."));
}

/**
 * Runs a processor created with ignoreMissing set to false against a document built from an
 * empty source map and expects {@code execute} to throw with the message
 * "field [source_field] not present as part of path [source_field]".
 *
 * NOTE(review): {@code processor} is declared twice below (eight-argument and nine-argument
 * constructor calls); this looks like the before/after pair of a diff hunk.
 */
public void testNonExistentWithoutIgnoreMissing() throws Exception {
IngestDocument originalIngestDocument = RandomDocumentPicks.randomIngestDocument(random(), Collections.emptyMap());
IngestDocument ingestDocument = new IngestDocument(originalIngestDocument);
Processor processor = new AttachmentProcessor(randomAlphaOfLength(10), null, "source_field", "randomTarget", null, 10, false, null);
Processor processor = new AttachmentProcessor(randomAlphaOfLength(10), null, "source_field",
"randomTarget", null, 10, false, null, null);
Exception exception = expectThrows(Exception.class, () -> processor.execute(ingestDocument));
assertThat(exception.getMessage(), equalTo("field [source_field] not present as part of path [source_field]"));
}
Expand All @@ -295,9 +299,27 @@ private Map<String, Object> parseDocument(String file, AttachmentProcessor proce
return attachmentData;
}

/**
 * Builds a source document for {@code file}, runs {@code processor} over it and returns whatever
 * the processor stored under "target_field".
 *
 * @param file                name of the fixture; its encoded content goes into "source_field"
 * @param processor           processor to execute against the generated document
 * @param optionalFields      extra source fields, copied in last so they can override the
 *                            entries set by this method
 * @param includeResourceName when true, {@code file} is also stored under "resource_name"
 * @return the map found under "target_field" after execution
 */
private Map<String, Object> parseDocument(String file, AttachmentProcessor processor, Map<String, Object> optionalFields,
                                          boolean includeResourceName) throws Exception {
    final Map<String, Object> source = new HashMap<>();
    source.put("source_field", getAsBinaryOrBase64(file));
    if (includeResourceName) {
        source.put("resource_name", file);
    }
    source.putAll(optionalFields);

    final IngestDocument wrapped = RandomDocumentPicks.randomIngestDocument(random(), source);
    processor.execute(wrapped);

    final Object target = wrapped.getSourceAndMetadata().get("target_field");
    @SuppressWarnings("unchecked")
    final Map<String, Object> parsed = (Map<String, Object>) target;
    return parsed;
}

public void testIndexedChars() throws Exception {
processor = new AttachmentProcessor(randomAlphaOfLength(10), null, "source_field",
"target_field", EnumSet.allOf(AttachmentProcessor.Property.class), 19, false, null);
"target_field", EnumSet.allOf(AttachmentProcessor.Property.class), 19, false, null, null);

Map<String, Object> attachmentData = parseDocument("text-in-english.txt", processor);

Expand All @@ -308,7 +330,7 @@ public void testIndexedChars() throws Exception {
assertThat(attachmentData.get("content_length"), is(19L));

processor = new AttachmentProcessor(randomAlphaOfLength(10), null, "source_field",
"target_field", EnumSet.allOf(AttachmentProcessor.Property.class), 19, false, "max_length");
"target_field", EnumSet.allOf(AttachmentProcessor.Property.class), 19, false, "max_length", null);

attachmentData = parseDocument("text-in-english.txt", processor);

Expand All @@ -333,6 +355,38 @@ public void testIndexedChars() throws Exception {
assertThat(attachmentData.get("content"), is("\"God Save the Queen\" (alternatively \"God Save the King\""));
assertThat(attachmentData.get("content_type").toString(), containsString("text/plain"));
assertThat(attachmentData.get("content_length"), is(56L));

}

/**
 * Parses three CJK plain-text fixtures (Big5, GBK and EUC-JP) with a processor whose
 * indexed-chars value is 100 and whose resource-name field is "resource_name", passing each
 * file name along in that field. For every fixture the reported content type must carry the
 * expected charset and the content must contain the expected text.
 */
public void testIndexedCharsWithResourceName() throws Exception {
    processor = new AttachmentProcessor(randomAlphaOfLength(10), null, "source_field", "target_field",
        EnumSet.allOf(AttachmentProcessor.Property.class), 100, false, null, "resource_name");

    assertCjkAttachment("text-cjk-big5.txt", "碩鼠碩鼠,無食我黍!", "charset=Big5");
    assertCjkAttachment("text-cjk-gbk.txt", "硕鼠硕鼠,无食我黍!", "charset=GB18030");
    assertCjkAttachment("text-cjk-euc-jp.txt", "碩鼠よ碩鼠よ、\n我が黍を食らう無かれ!", "charset=EUC-JP");
}

/**
 * Parses {@code file} with the shared processor (resource name included, "max_length" set to 100)
 * and runs the common assertions: exact key set, expected content snippet, a "text/plain"
 * content type carrying {@code expectedCharset}, and a content length of 100.
 */
private void assertCjkAttachment(String file, String expectedSnippet, String expectedCharset) throws Exception {
    Map<String, Object> parsed = parseDocument(file, processor, Collections.singletonMap("max_length", 100), true);

    assertThat(parsed.keySet(), containsInAnyOrder("language", "content", "content_type", "content_length"));
    assertThat(parsed.get("content").toString(), containsString(expectedSnippet));
    assertThat(parsed.get("content_type").toString(), containsString("text/plain"));
    assertThat(parsed.get("content_type").toString(), containsString(expectedCharset));
    assertThat(parsed.get("content_length"), is(100L));
}

private Object getAsBinaryOrBase64(String filename) throws Exception {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
�ӹ��ӹ��A�L���ڶ��I�T���e�k�A���ڪ��U�C�u�N�h�k�A�A���֤g�C�֤g�֤g�A���o�کҡC
�ӹ��ӹ��A�L���ڳ��I�T���e�k�A���ڪּw�C�u�N�h�k�A�A���ְ�C�ְ�ְ�A���o�ڪ��C
�ӹ��ӹ��A�L���ڭ]�I�T���e�k�A���ڪֳҡC�u�N�h�k�A�A���֭��C�֭��֭��A�֤��ø��C
���������A�I�_�����A�����ֵ֡A�����_���A���_���A����n�n�C
���������A�I�_�����A���������A�O�¬O��A���]���b�A�A���L��C
���i�v��A���i���k�A�����ڨp�A����ڦ�A�`��`�_�A�k������C
�������աA���ճ����A�ا��h�H�A�I���P��C
�F���Z�R�A�ڰ��k誡A�کh�u�������A���H�����h�C
�F�������A�ڰ��ȶ��A�کh�u�������A���H���öˡC
�F�����o�A�ڰ��Ȩo�A�ڹ�ڱ�o�A����S�o�C
�h���u���A��y�_�N�A���h�_���A���餣��A�ܩ��ѮV�A��P���ѡC
�X�J�_���A���^�_�A�k�l����A�������S�̡A�ݧڽѩh�A�E�ΧB�n�C
�X�J�_�z�A���^�_���A���׸�ߣ�A�٨����ڡAࡿ��_���A���妳�`�C
�ګ�άu�A�����üۡA�䶷�P���A�ڤ߱y�y�A�r���X�C�A�H�g�ڼ~�C
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
���ͤ����ͤ衢
�椬�Ф򿩤餦̵���졪
���н��˴Ӥ����ˡ�
���Τ��Ƹܤߤ�������

�¤��ƾ��˽����ꡢ
��γ��ڤ�Ŭ����
���ڤ���ڤ衢
੤˲椬�������

���ͤ����ͤ衢
�椬���򿩤餦̵���졪
���н��˴Ӥ����ˡ�
������������������

�¤��ƾ��˽����ꡢ
��γڹ��Ŭ����
�ڹ��ڹ�衢
੤˲椬ľ������

���ͤ����ͤ衢
�椬�Ĥ򿩤餦̵���졪
���н��˴Ӥ����ˡ�
���Τ���ϫ��������

�¤��ƾ��˽����ꡢ
��γڹ٤�Ŭ����
�ڹ٤�ڹ٤衢
ï��Ƿ���Ʊʹ椻��
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
˶��˶����ʳ���������Ů��Ī�ҿϹˡ��Ž�ȥŮ���ʱ�����������������뼵�������
˶��˶����ʳ���������Ů��Ī�ҿϵ¡��Ž�ȥŮ���ʱ��ֹ����ֹ��ֹ���뼵���ֱ��
˶��˶����ʳ���磡�����Ů��Ī�ҿ��͡��Ž�ȥŮ���ʱ��ֽ����ֽ��ֽ���˭֮���š�