Skip to content

Commit

Permalink
Attachment ingest processor: add resource_name field (#64389)
Browse files Browse the repository at this point in the history
  • Loading branch information
yangyaofei authored Dec 7, 2020
1 parent 92e803a commit 0f84763
Show file tree
Hide file tree
Showing 6 changed files with 122 additions and 12 deletions.
1 change: 1 addition & 0 deletions plugins/ingest-attachment/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@ forbiddenPatterns {
exclude '**/*.pdf'
exclude '**/*.epub'
exclude '**/*.vsdx'
exclude '**/text-cjk-*.txt'
}

thirdPartyAudit {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,16 +56,18 @@ public final class AttachmentProcessor extends AbstractProcessor {
private final int indexedChars;
private final boolean ignoreMissing;
private final String indexedCharsField;
private final String resourceName;

/**
 * Creates an attachment processor. {@code tag} and {@code description} are handed to the superclass
 * constructor; every other argument is stored unchanged in the field of the same name.
 *
 * @param resourceName name of the document field that {@code execute} reads the resource name from,
 *                     or {@code null} when no such field is configured
 */
AttachmentProcessor(String tag, String description, String field, String targetField, Set<Property> properties,
int indexedChars, boolean ignoreMissing, String indexedCharsField) {
int indexedChars, boolean ignoreMissing, String indexedCharsField, String resourceName) {
super(tag, description);
this.field = field;
this.targetField = targetField;
this.properties = properties;
this.indexedChars = indexedChars;
this.ignoreMissing = ignoreMissing;
this.indexedCharsField = indexedCharsField;
this.resourceName = resourceName;
}

boolean isIgnoreMissing() {
Expand All @@ -77,7 +79,10 @@ public IngestDocument execute(IngestDocument ingestDocument) {
Map<String, Object> additionalFields = new HashMap<>();

byte[] input = ingestDocument.getFieldValueAsBytes(field, ignoreMissing);

String resourceNameInput = null;
if (resourceName != null) {
resourceNameInput = ingestDocument.getFieldValue(resourceName, String.class, true);
}
if (input == null && ignoreMissing) {
return ingestDocument;
} else if (input == null) {
Expand All @@ -96,6 +101,9 @@ public IngestDocument execute(IngestDocument ingestDocument) {
}

Metadata metadata = new Metadata();
if (resourceNameInput != null) {
metadata.set(Metadata.RESOURCE_NAME_KEY, resourceNameInput);
}
String parsedContent = "";
try {
parsedContent = TikaImpl.parse(input, metadata, indexedChars);
Expand Down Expand Up @@ -197,6 +205,7 @@ public static final class Factory implements Processor.Factory {
public AttachmentProcessor create(Map<String, Processor.Factory> registry, String processorTag,
String description, Map<String, Object> config) throws Exception {
String field = readStringProperty(TYPE, processorTag, config, "field");
String resourceName = readOptionalStringProperty(TYPE, processorTag, config, "resource_name");
String targetField = readStringProperty(TYPE, processorTag, config, "target_field", "attachment");
List<String> propertyNames = readOptionalList(TYPE, processorTag, config, "properties");
int indexedChars = readIntProperty(TYPE, processorTag, config, "indexed_chars", NUMBER_OF_CHARS_INDEXED);
Expand All @@ -219,7 +228,7 @@ public AttachmentProcessor create(Map<String, Processor.Factory> registry, Strin
}

return new AttachmentProcessor(processorTag, description, field, targetField, properties, indexedChars, ignoreMissing,
indexedCharsField);
indexedCharsField, resourceName);
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ public class AttachmentProcessorTests extends ESTestCase {
/**
 * Builds the default processor shared by the tests: it reads {@code source_field}, writes to
 * {@code target_field}, requests every {@code Property}, uses an indexed-chars value of 10000,
 * has {@code ignoreMissing} set to false and passes {@code null} for the remaining optional arguments.
 */
@Before
public void createStandardProcessor() {
processor = new AttachmentProcessor(randomAlphaOfLength(10), null, "source_field",
"target_field", EnumSet.allOf(AttachmentProcessor.Property.class), 10000, false, null);
"target_field", EnumSet.allOf(AttachmentProcessor.Property.class), 10000, false, null, null);
}

public void testEnglishTextDocument() throws Exception {
Expand Down Expand Up @@ -88,7 +88,7 @@ public void testHtmlDocumentWithRandomFields() throws Exception {
selectedProperties.add(AttachmentProcessor.Property.DATE);
}
processor = new AttachmentProcessor(randomAlphaOfLength(10), null, "source_field",
"target_field", selectedProperties, 10000, false, null);
"target_field", selectedProperties, 10000, false, null, null);

Map<String, Object> attachmentData = parseDocument("htmlWithEmptyDateMeta.html", processor);
assertThat(attachmentData.keySet(), hasSize(selectedFieldNames.length));
Expand Down Expand Up @@ -160,7 +160,7 @@ public void testLegacyWordDocumentWithVisioSchema() throws Exception {
/**
 * Parses {@code test.pdf} with the shared processor and checks the extracted text (including the
 * non-ASCII "München" and the embedded newlines), the {@code application/pdf} content type and
 * that a content length is present.
 */
public void testPdf() throws Exception {
Map<String, Object> attachmentData = parseDocument("test.pdf", processor);
assertThat(attachmentData.get("content"),
is("This is a test, with umlauts, from München\n\nAlso contains newlines for testing.\n\nAnd one more."));
is("This is a test, with umlauts, from München\n\nAlso contains newlines for testing.\n\nAnd one more."));
assertThat(attachmentData.get("content_type").toString(), is("application/pdf"));
assertThat(attachmentData.get("content_length"), is(notNullValue()));
}
Expand Down Expand Up @@ -247,15 +247,17 @@ public void testNullValueWithIgnoreMissing() throws Exception {
IngestDocument originalIngestDocument = RandomDocumentPicks.randomIngestDocument(random(),
Collections.singletonMap("source_field", null));
IngestDocument ingestDocument = new IngestDocument(originalIngestDocument);
Processor processor = new AttachmentProcessor(randomAlphaOfLength(10), null, "source_field", "randomTarget", null, 10, true, null);
Processor processor = new AttachmentProcessor(randomAlphaOfLength(10), null, "source_field",
"randomTarget", null, 10, true, null, null);
processor.execute(ingestDocument);
assertIngestDocument(originalIngestDocument, ingestDocument);
}

/**
 * Executes a processor created with {@code ignoreMissing = true} on a document built from an empty
 * field map (so {@code source_field} is not supplied) and compares the result with the original
 * document through {@code assertIngestDocument}.
 */
public void testNonExistentWithIgnoreMissing() throws Exception {
IngestDocument originalIngestDocument = RandomDocumentPicks.randomIngestDocument(random(), Collections.emptyMap());
// Work on a copy so the original stays available for the comparison below.
IngestDocument ingestDocument = new IngestDocument(originalIngestDocument);
Processor processor = new AttachmentProcessor(randomAlphaOfLength(10), null, "source_field", "randomTarget", null, 10, true, null);
Processor processor = new AttachmentProcessor(randomAlphaOfLength(10), null, "source_field",
"randomTarget", null, 10, true, null, null);
processor.execute(ingestDocument);
assertIngestDocument(originalIngestDocument, ingestDocument);
}
Expand All @@ -264,15 +266,17 @@ public void testNullWithoutIgnoreMissing() throws Exception {
IngestDocument originalIngestDocument = RandomDocumentPicks.randomIngestDocument(random(),
Collections.singletonMap("source_field", null));
IngestDocument ingestDocument = new IngestDocument(originalIngestDocument);
Processor processor = new AttachmentProcessor(randomAlphaOfLength(10), null, "source_field", "randomTarget", null, 10, false, null);
Processor processor = new AttachmentProcessor(randomAlphaOfLength(10), null, "source_field",
"randomTarget", null, 10, false, null, null);
Exception exception = expectThrows(Exception.class, () -> processor.execute(ingestDocument));
assertThat(exception.getMessage(), equalTo("field [source_field] is null, cannot parse."));
}

/**
 * Executes a processor created with {@code ignoreMissing = false} on a document built from an empty
 * field map and expects an exception whose message reports that {@code source_field} is not present.
 */
public void testNonExistentWithoutIgnoreMissing() throws Exception {
IngestDocument originalIngestDocument = RandomDocumentPicks.randomIngestDocument(random(), Collections.emptyMap());
IngestDocument ingestDocument = new IngestDocument(originalIngestDocument);
Processor processor = new AttachmentProcessor(randomAlphaOfLength(10), null, "source_field", "randomTarget", null, 10, false, null);
Processor processor = new AttachmentProcessor(randomAlphaOfLength(10), null, "source_field",
"randomTarget", null, 10, false, null, null);
Exception exception = expectThrows(Exception.class, () -> processor.execute(ingestDocument));
assertThat(exception.getMessage(), equalTo("field [source_field] not present as part of path [source_field]"));
}
Expand All @@ -295,9 +299,27 @@ private Map<String, Object> parseDocument(String file, AttachmentProcessor proce
return attachmentData;
}

/**
 * Executes {@code processor} on an ingest document that carries the given file under
 * {@code source_field} and returns the value stored under {@code target_field} afterwards.
 *
 * @param file                name passed to {@code getAsBinaryOrBase64}; also used as the
 *                            {@code resource_name} value when {@code includeResourceName} is true
 * @param processor           the processor to execute
 * @param optionalFields      extra fields for the document; they are copied in last, so on a key
 *                            collision they replace the fields set by this method
 * @param includeResourceName whether to add a {@code resource_name} field holding {@code file}
 * @return the map found under {@code target_field} once the processor has run
 * @throws Exception propagated from {@code getAsBinaryOrBase64} or from executing the processor
 */
private Map<String, Object> parseDocument(String file, AttachmentProcessor processor, Map<String, Object> optionalFields,
                                          boolean includeResourceName) throws Exception {
    final Map<String, Object> sourceFields = new HashMap<>();
    sourceFields.put("source_field", getAsBinaryOrBase64(file));
    if (includeResourceName) {
        sourceFields.put("resource_name", file);
    }
    sourceFields.putAll(optionalFields);

    final IngestDocument doc = RandomDocumentPicks.randomIngestDocument(random(), sourceFields);
    processor.execute(doc);

    @SuppressWarnings("unchecked")
    final Map<String, Object> extracted = (Map<String, Object>) doc.getSourceAndMetadata().get("target_field");
    return extracted;
}

public void testIndexedChars() throws Exception {
processor = new AttachmentProcessor(randomAlphaOfLength(10), null, "source_field",
"target_field", EnumSet.allOf(AttachmentProcessor.Property.class), 19, false, null);
"target_field", EnumSet.allOf(AttachmentProcessor.Property.class), 19, false, null, null);

Map<String, Object> attachmentData = parseDocument("text-in-english.txt", processor);

Expand All @@ -308,7 +330,7 @@ public void testIndexedChars() throws Exception {
assertThat(attachmentData.get("content_length"), is(19L));

processor = new AttachmentProcessor(randomAlphaOfLength(10), null, "source_field",
"target_field", EnumSet.allOf(AttachmentProcessor.Property.class), 19, false, "max_length");
"target_field", EnumSet.allOf(AttachmentProcessor.Property.class), 19, false, "max_length", null);

attachmentData = parseDocument("text-in-english.txt", processor);

Expand All @@ -333,6 +355,38 @@ public void testIndexedChars() throws Exception {
assertThat(attachmentData.get("content"), is("\"God Save the Queen\" (alternatively \"God Save the King\""));
assertThat(attachmentData.get("content_type").toString(), containsString("text/plain"));
assertThat(attachmentData.get("content_length"), is(56L));

}

/**
 * Runs a processor configured with {@code resource_name} as its resource-name field over the three
 * {@code text-cjk-*} fixtures, each time supplying the file name in that field, and checks the
 * extracted keys, a content excerpt, the reported charset and a content length of 100.
 */
public void testIndexedCharsWithResourceName() throws Exception {
    processor = new AttachmentProcessor(randomAlphaOfLength(10), null, "source_field", "target_field",
        EnumSet.allOf(AttachmentProcessor.Property.class), 100, false, null, "resource_name");

    // One row per fixture: file name, excerpt expected in the content, charset expected in the content type.
    final String[][] fixtures = {
        { "text-cjk-big5.txt", "碩鼠碩鼠,無食我黍!", "charset=Big5" },
        { "text-cjk-gbk.txt", "硕鼠硕鼠,无食我黍!", "charset=GB18030" },
        { "text-cjk-euc-jp.txt", "碩鼠よ碩鼠よ、\n我が黍を食らう無かれ!", "charset=EUC-JP" }
    };

    for (String[] fixture : fixtures) {
        final String fileName = fixture[0];
        final String expectedExcerpt = fixture[1];
        final String expectedCharset = fixture[2];

        // NOTE(review): the processor is built with a null indexedCharsField, so the max_length field
        // presumably has no effect here and the 100 limit comes from the constructor argument - confirm.
        Map<String, Object> attachmentData = parseDocument(fileName, processor, Collections.singletonMap("max_length", 100), true);

        assertThat(attachmentData.keySet(), containsInAnyOrder("language", "content", "content_type", "content_length"));
        assertThat(attachmentData.get("content").toString(), containsString(expectedExcerpt));
        assertThat(attachmentData.get("content_type").toString(), containsString("text/plain"));
        assertThat(attachmentData.get("content_type").toString(), containsString(expectedCharset));
        assertThat(attachmentData.get("content_length"), is(100L));
    }
}

private Object getAsBinaryOrBase64(String filename) throws Exception {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
�ӹ��ӹ��A�L���ڶ��I�T���e�k�A���ڪ��U�C�u�N�h�k�A�A���֤g�C�֤g�֤g�A���o�کҡC
�ӹ��ӹ��A�L���ڳ��I�T���e�k�A���ڪּw�C�u�N�h�k�A�A���ְ�C�ְ�ְ�A���o�ڪ��C
�ӹ��ӹ��A�L���ڭ]�I�T���e�k�A���ڪֳҡC�u�N�h�k�A�A���֭��C�֭��֭��A�֤��ø��C
���������A�I�_�����A�����ֵ֡A�����_���A���_���A����n�n�C
���������A�I�_�����A���������A�O�¬O��A���]���b�A�A���L��C
���i�v��A���i���k�A�����ڨp�A����ڦ�A�`��`�_�A�k������C
�������աA���ճ����A�ا��h�H�A�I���P��C
�F���Z�R�A�ڰ��k誡A�کh�u�������A���H�����h�C
�F�������A�ڰ��ȶ��A�کh�u�������A���H���öˡC
�F�����o�A�ڰ��Ȩo�A�ڹ�ڱ�o�A����S�o�C
�h���u���A��y�_�N�A���h�_���A���餣��A�ܩ��ѮV�A��P���ѡC
�X�J�_���A���^�_�A�k�l����A�������S�̡A�ݧڽѩh�A�E�ΧB�n�C
�X�J�_�z�A���^�_���A���׸�ߣ�A�٨����ڡAࡿ��_���A���妳�`�C
�ګ�άu�A�����üۡA�䶷�P���A�ڤ߱y�y�A�r���X�C�A�H�g�ڼ~�C
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
���ͤ����ͤ衢
�椬�Ф򿩤餦̵���졪
���н��˴Ӥ����ˡ�
���Τ��Ƹܤߤ�������

�¤��ƾ��˽����ꡢ
��γ��ڤ�Ŭ����
���ڤ���ڤ衢
੤˲椬�������

���ͤ����ͤ衢
�椬���򿩤餦̵���졪
���н��˴Ӥ����ˡ�
������������������

�¤��ƾ��˽����ꡢ
��γڹ��Ŭ����
�ڹ��ڹ�衢
੤˲椬ľ������

���ͤ����ͤ衢
�椬�Ĥ򿩤餦̵���졪
���н��˴Ӥ����ˡ�
���Τ���ϫ��������

�¤��ƾ��˽����ꡢ
��γڹ٤�Ŭ����
�ڹ٤�ڹ٤衢
ï��Ƿ���Ʊʹ椻��
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
˶��˶����ʳ���������Ů��Ī�ҿϹˡ��Ž�ȥŮ���ʱ�����������������뼵�������
˶��˶����ʳ���������Ů��Ī�ҿϵ¡��Ž�ȥŮ���ʱ��ֹ����ֹ��ֹ���뼵���ֱ��
˶��˶����ʳ���磡�����Ů��Ī�ҿ��͡��Ž�ȥŮ���ʱ��ֽ����ֽ��ֽ���˭֮���š�

0 comments on commit 0f84763

Please sign in to comment.