Skip to content

Commit

Permalink
TIKA-4357 -- improve metadata key prefixing for PDFs and html (#2061)
Browse files: browse the repository at this point in the history
* TIKA-4357 -- improve metadata key prefixing for PDFs and html

* TIKA-4357 -- fix unit test
  • Loading branch information
tballison authored Dec 4, 2024
1 parent 3806e55 commit 6a098b7
Show file tree
Hide file tree
Showing 6 changed files with 21 additions and 25 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -219,7 +219,7 @@ public void testMetadataOutput() throws Exception {
public void testJsonMetadataOutput() throws Exception {
String json = getParamOutContent("--json", "--digest=MD2", resourcePrefix + "testJsonMultipleInts.html");
//TIKA-1310
assertTrue(json.contains("\"" + "fb:admins\":\"1,2,3,4\","));
assertTrue(json.contains("\"html_meta:fb:admins\":\"1,2,3,4\","));
assertTrue(json.contains("\"X-TIKA:digest:MD2\":"));
}

Expand Down
4 changes: 2 additions & 2 deletions tika-core/src/main/java/org/apache/tika/metadata/HTML.java
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,14 @@
*/

public interface HTML {
String PREFIX_HTML_META = "html_meta";
String PREFIX_HTML_META = "html_meta" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER;


/**
* If a script element contains a src value, this value
* is set in the embedded document's metadata
*/
Property SCRIPT_SOURCE = Property.internalText(
PREFIX_HTML_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "scriptSrc");
PREFIX_HTML_META + "scriptSrc");

}
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ public void startElement(String uri, String local, String name, Attributes atts)
addHtmlMetadata(atts.getValue("name"), atts.getValue("content"));
} else if (atts.getValue("property") != null) {
// TIKA-983: Handle <meta property="og:xxx" content="yyy" /> tags
metadata.add(atts.getValue("property"), atts.getValue("content"));
metadata.add(HTML.PREFIX_HTML_META + atts.getValue("property"), atts.getValue("content"));
}
} else if ("BASE".equals(name) && atts.getValue("href") != null) {
startElementWithSafeAttributes("base", atts);
Expand Down Expand Up @@ -222,14 +222,15 @@ private void addHtmlMetadata(String name, String value) {
if (property.equals(TikaCoreProperties.TITLE) && isTitleSetToMetadata) {
//prefer the title element if it is already set: leave TikaCoreProperties.TITLE untouched
//and record the meta tag's title under the html_meta prefix instead
metadata.add(HTML.PREFIX_HTML_META + TikaCoreProperties.TITLE.getName(), value);
} else if (property.isMultiValuePermitted()) {
metadata.add(property, value);
} else {
metadata.set(property, value);
}
} else {
metadata.add(HTML.PREFIX_HTML_META + name, value);
}
//TODO -- we should prefix these raw names to avoid collisions
metadata.add(name, value);
}

private void startElementWithSafeAttributes(String name, Attributes atts) throws SAXException {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Geographic;
import org.apache.tika.metadata.HTML;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.TikaCoreProperties;
Expand Down Expand Up @@ -109,8 +110,8 @@ public void startElement(String u, String l, String n, Attributes a)
}

assertEquals("Title : Test Indexation Html", metadata.get(TikaCoreProperties.TITLE));
assertEquals("Tika Developers", metadata.get("Author"));
assertEquals("5", metadata.get("refresh"));
assertEquals("Tika Developers", metadata.get(TikaCoreProperties.CREATOR));
assertEquals("5", metadata.get(HTML.PREFIX_HTML_META + "refresh"));

assertEquals("51.2312", metadata.get(Geographic.LATITUDE));
assertEquals("-5.1987", metadata.get(Geographic.LONGITUDE));
Expand Down Expand Up @@ -152,8 +153,8 @@ public void testXhtmlParsing() throws Exception {
metadata.get(Metadata.CONTENT_TYPE).startsWith("application/xhtml+xml; charset="));
assertEquals("XHTML test document", metadata.get(TikaCoreProperties.TITLE));

assertEquals("Tika Developers", metadata.get("Author"));
assertEquals("5", metadata.get("refresh"));
assertEquals("Tika Developers", metadata.get(TikaCoreProperties.CREATOR));
assertEquals("5", metadata.get(HTML.PREFIX_HTML_META + "refresh"));
assertContains("ability of Apache Tika", content);
assertContains("extract content", content);
assertContains("an XHTML document", content);
Expand Down Expand Up @@ -809,8 +810,8 @@ public void testOpenGraphMetadata() throws Exception {
Metadata metadata = new Metadata();
new JSoupParser().parse(new ByteArrayInputStream(test1.getBytes(ISO_8859_1)),
new BodyContentHandler(), metadata, new ParseContext());
assertEquals("some description", metadata.get("og:description"));
assertTrue(metadata.isMultiValued("og:image"));
assertEquals("some description", metadata.get(HTML.PREFIX_HTML_META + "og:description"));
assertTrue(metadata.isMultiValued(HTML.PREFIX_HTML_META + "og:image"));
}

// TIKA-1011
Expand Down Expand Up @@ -1220,19 +1221,15 @@ public void testMetadataMapping() throws Exception {
List<Metadata> metadataList = getRecursiveMetadata("testHTML_metadata.html");
Metadata m = metadataList.get(0);
assertEquals("Free Web tutorials", m.get(TikaCoreProperties.DESCRIPTION));
assertEquals("Free Web tutorials", m.get("description"));

assertEquals("HTML,CSS,XML,JavaScript", m.get(TikaCoreProperties.SUBJECT));
assertEquals("HTML,CSS,XML,JavaScript", m.get("keywords"));

assertEquals("HTML,CSS,XML,JavaScript", m.get(Office.KEYWORDS));
assertEquals("HTML,CSS,XML,JavaScript", m.get(Office.KEYWORDS));

assertEquals("OldMetaTitle", m.get(TikaCoreProperties.TITLE));
assertEquals("OldMetaTitle", m.get("title"));

assertEquals("John Doe", m.get(TikaCoreProperties.CREATOR));
assertEquals("John Doe", m.get("author"));
}

@Test
Expand All @@ -1242,7 +1239,7 @@ public void testPreferenceForTitleElement() throws Exception {
Metadata m = metadataList.get(0);

assertEquals("ActualTitle", m.get(TikaCoreProperties.TITLE));
assertEquals("OldMetaTitle", m.get("title"));
assertEquals("OldMetaTitle", m.get(HTML.PREFIX_HTML_META + TikaCoreProperties.TITLE.getName()));
}

@Test
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -630,8 +630,6 @@ private void extractMetadata(PDDocument document, Metadata metadata, ParseContex
for (COSName key : info.getCOSObject().keySet()) {
String name = key.getName();
if (!handledMetadata.contains(name)) {
PDMetadataExtractor
.addMetadata(metadata, name, info.getCOSObject().getDictionaryObject(key));
PDMetadataExtractor.addMetadata(metadata, PDF.PDF_DOC_INFO_CUSTOM_PREFIX + name,
info.getCOSObject().getDictionaryObject(key));
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -150,20 +150,20 @@ public void testPdfParsingMetadataOnly() throws Exception {
}

@Test
public void testCustomMetadata() throws Exception {
public void testCustomMetadataInPDDocInfo() throws Exception {

XMLResult r = getXML("testPDF-custommetadata.pdf");
Metadata metadata = r.metadata;
assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE));
assertEquals("Document author", metadata.get(TikaCoreProperties.CREATOR));
assertEquals("Document title", metadata.get(TikaCoreProperties.TITLE));

assertEquals("Custom Value", metadata.get("Custom Property"));
assertEquals("Custom Value", metadata.get(PDF.PDF_DOC_INFO_CUSTOM_PREFIX + "Custom Property"));

assertEquals("Array Entry 1", metadata.get("Custom Array"));
assertEquals(2, metadata.getValues("Custom Array").length);
assertEquals("Array Entry 1", metadata.getValues("Custom Array")[0]);
assertEquals("Array Entry 2", metadata.getValues("Custom Array")[1]);
assertEquals("Array Entry 1", metadata.get(PDF.PDF_DOC_INFO_CUSTOM_PREFIX + "Custom Array"));
assertEquals(2, metadata.getValues(PDF.PDF_DOC_INFO_CUSTOM_PREFIX + "Custom Array").length);
assertEquals("Array Entry 1", metadata.getValues(PDF.PDF_DOC_INFO_CUSTOM_PREFIX + "Custom Array")[0]);
assertEquals("Array Entry 2", metadata.getValues(PDF.PDF_DOC_INFO_CUSTOM_PREFIX + "Custom Array")[1]);

assertContains("Hello World!", r.xml);
}
Expand Down

0 comments on commit 6a098b7

Please sign in to comment.