Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

TIKA-4357 -- improve metadata key prefixing for PDFs and html #2061

Merged
merged 3 commits into from
Dec 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -219,7 +219,7 @@ public void testMetadataOutput() throws Exception {
/**
 * Checks the JSON produced for testJsonMultipleInts.html when output is
 * requested with the "--json" and "--digest=MD2" arguments.
 *
 * NOTE(review): this span is taken from a rendered diff, so the two
 * "fb:admins" assertions below look like the before/after forms of a single
 * check rather than two checks meant to coexist -- confirm against the
 * merged file.
 */
public void testJsonMetadataOutput() throws Exception {
// Output is captured through the getParamOutContent helper (defined outside this view).
String json = getParamOutContent("--json", "--digest=MD2", resourcePrefix + "testJsonMultipleInts.html");
//TIKA-1310
// Unprefixed key form: expects "fb:admins":"1,2,3,4", in the output.
assertTrue(json.contains("\"" + "fb:admins\":\"1,2,3,4\","));
// Prefixed key form: expects "html_meta:fb:admins":"1,2,3,4", in the output.
assertTrue(json.contains("\"html_meta:fb:admins\":\"1,2,3,4\","));
// Only the presence of the MD2 digest key is checked, not its value.
assertTrue(json.contains("\"X-TIKA:digest:MD2\":"));
}

Expand Down
4 changes: 2 additions & 2 deletions tika-core/src/main/java/org/apache/tika/metadata/HTML.java
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,14 @@
*/

/**
 * Constants used to build HTML-related metadata keys.
 *
 * NOTE(review): this span is taken from a rendered diff; PREFIX_HTML_META and
 * the argument line of SCRIPT_SOURCE each appear twice, presumably as the
 * before/after forms of the same line -- confirm against the merged file.
 */
public interface HTML {
// Bare prefix form: no delimiter included, so the delimiter has to be appended at each use.
String PREFIX_HTML_META = "html_meta";
// Prefix form with TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER already appended.
String PREFIX_HTML_META = "html_meta" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER;


/**
* If a script element contains a src value, this value
* is set in the embedded document's metadata
*/
Property SCRIPT_SOURCE = Property.internalText(
// Key assembled from the bare prefix plus an explicit delimiter.
PREFIX_HTML_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "scriptSrc");
// Key assembled from the prefix alone; relies on the prefix already ending in the delimiter.
PREFIX_HTML_META + "scriptSrc");

}
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ public void startElement(String uri, String local, String name, Attributes atts)
addHtmlMetadata(atts.getValue("name"), atts.getValue("content"));
} else if (atts.getValue("property") != null) {
// TIKA-983: Handle <meta property="og:xxx" content="yyy" /> tags
metadata.add(atts.getValue("property"), atts.getValue("content"));
metadata.add(HTML.PREFIX_HTML_META + atts.getValue("property"), atts.getValue("content"));
}
} else if ("BASE".equals(name) && atts.getValue("href") != null) {
startElementWithSafeAttributes("base", atts);
Expand Down Expand Up @@ -222,14 +222,15 @@ private void addHtmlMetadata(String name, String value) {
if (property.equals(TikaCoreProperties.TITLE) && isTitleSetToMetadata) {
//prefer the title element if it is already set
//do nothing
metadata.add(HTML.PREFIX_HTML_META + TikaCoreProperties.TITLE.getName(), value);
} else if (property.isMultiValuePermitted()) {
metadata.add(property, value);
} else {
metadata.set(property, value);
}
} else {
metadata.add(HTML.PREFIX_HTML_META + name, value);
}
//TODO -- we should prefix these raw names to avoid collisions
metadata.add(name, value);
}

private void startElementWithSafeAttributes(String name, Attributes atts) throws SAXException {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Geographic;
import org.apache.tika.metadata.HTML;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.TikaCoreProperties;
Expand Down Expand Up @@ -109,8 +110,8 @@ public void startElement(String u, String l, String n, Attributes a)
}

assertEquals("Title : Test Indexation Html", metadata.get(TikaCoreProperties.TITLE));
assertEquals("Tika Developers", metadata.get("Author"));
assertEquals("5", metadata.get("refresh"));
assertEquals("Tika Developers", metadata.get(TikaCoreProperties.CREATOR));
assertEquals("5", metadata.get(HTML.PREFIX_HTML_META + "refresh"));

assertEquals("51.2312", metadata.get(Geographic.LATITUDE));
assertEquals("-5.1987", metadata.get(Geographic.LONGITUDE));
Expand Down Expand Up @@ -152,8 +153,8 @@ public void testXhtmlParsing() throws Exception {
metadata.get(Metadata.CONTENT_TYPE).startsWith("application/xhtml+xml; charset="));
assertEquals("XHTML test document", metadata.get(TikaCoreProperties.TITLE));

assertEquals("Tika Developers", metadata.get("Author"));
assertEquals("5", metadata.get("refresh"));
assertEquals("Tika Developers", metadata.get(TikaCoreProperties.CREATOR));
assertEquals("5", metadata.get(HTML.PREFIX_HTML_META + "refresh"));
assertContains("ability of Apache Tika", content);
assertContains("extract content", content);
assertContains("an XHTML document", content);
Expand Down Expand Up @@ -809,8 +810,8 @@ public void testOpenGraphMetadata() throws Exception {
Metadata metadata = new Metadata();
new JSoupParser().parse(new ByteArrayInputStream(test1.getBytes(ISO_8859_1)),
new BodyContentHandler(), metadata, new ParseContext());
assertEquals("some description", metadata.get("og:description"));
assertTrue(metadata.isMultiValued("og:image"));
assertEquals("some description", metadata.get(HTML.PREFIX_HTML_META + "og:description"));
assertTrue(metadata.isMultiValued(HTML.PREFIX_HTML_META + "og:image"));
}

// TIKA-1011
Expand Down Expand Up @@ -1220,19 +1221,15 @@ public void testMetadataMapping() throws Exception {
List<Metadata> metadataList = getRecursiveMetadata("testHTML_metadata.html");
Metadata m = metadataList.get(0);
assertEquals("Free Web tutorials", m.get(TikaCoreProperties.DESCRIPTION));
assertEquals("Free Web tutorials", m.get("description"));

assertEquals("HTML,CSS,XML,JavaScript", m.get(TikaCoreProperties.SUBJECT));
assertEquals("HTML,CSS,XML,JavaScript", m.get("keywords"));

assertEquals("HTML,CSS,XML,JavaScript", m.get(Office.KEYWORDS));
assertEquals("HTML,CSS,XML,JavaScript", m.get(Office.KEYWORDS));

assertEquals("OldMetaTitle", m.get(TikaCoreProperties.TITLE));
assertEquals("OldMetaTitle", m.get("title"));

assertEquals("John Doe", m.get(TikaCoreProperties.CREATOR));
assertEquals("John Doe", m.get("author"));
}

@Test
Expand All @@ -1242,7 +1239,7 @@ public void testPreferenceForTitleElement() throws Exception {
Metadata m = metadataList.get(0);

assertEquals("ActualTitle", m.get(TikaCoreProperties.TITLE));
assertEquals("OldMetaTitle", m.get("title"));
assertEquals("OldMetaTitle", m.get(HTML.PREFIX_HTML_META + TikaCoreProperties.TITLE.getName()));
}

@Test
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -630,8 +630,6 @@ private void extractMetadata(PDDocument document, Metadata metadata, ParseContex
for (COSName key : info.getCOSObject().keySet()) {
String name = key.getName();
if (!handledMetadata.contains(name)) {
PDMetadataExtractor
.addMetadata(metadata, name, info.getCOSObject().getDictionaryObject(key));
PDMetadataExtractor.addMetadata(metadata, PDF.PDF_DOC_INFO_CUSTOM_PREFIX + name,
info.getCOSObject().getDictionaryObject(key));
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -150,20 +150,20 @@ public void testPdfParsingMetadataOnly() throws Exception {
}

/**
 * Parses testPDF-custommetadata.pdf and checks the content type, creator,
 * title, the "Custom Property" and "Custom Array" metadata entries, and that
 * the XML output contains "Hello World!".
 *
 * NOTE(review): this span is taken from a rendered diff; the two method
 * signatures and the paired assertions (keys with and without
 * PDF.PDF_DOC_INFO_CUSTOM_PREFIX) are presumably the before/after forms of
 * the same lines -- confirm against the merged file.
 */
@Test
public void testCustomMetadata() throws Exception {
public void testCustomMetadataInPDDocInfo() throws Exception {

// getXML is a helper defined outside this view; its result exposes both metadata and xml.
XMLResult r = getXML("testPDF-custommetadata.pdf");
Metadata metadata = r.metadata;
assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE));
assertEquals("Document author", metadata.get(TikaCoreProperties.CREATOR));
assertEquals("Document title", metadata.get(TikaCoreProperties.TITLE));

// Single-valued custom entry: looked up by raw name, then by prefixed name.
assertEquals("Custom Value", metadata.get("Custom Property"));
assertEquals("Custom Value", metadata.get(PDF.PDF_DOC_INFO_CUSTOM_PREFIX + "Custom Property"));

// Multi-valued custom entry, raw-name lookups: get() is expected to return the first of two values.
assertEquals("Array Entry 1", metadata.get("Custom Array"));
assertEquals(2, metadata.getValues("Custom Array").length);
assertEquals("Array Entry 1", metadata.getValues("Custom Array")[0]);
assertEquals("Array Entry 2", metadata.getValues("Custom Array")[1]);
// The same four checks using the PDF.PDF_DOC_INFO_CUSTOM_PREFIX-prefixed key.
assertEquals("Array Entry 1", metadata.get(PDF.PDF_DOC_INFO_CUSTOM_PREFIX + "Custom Array"));
assertEquals(2, metadata.getValues(PDF.PDF_DOC_INFO_CUSTOM_PREFIX + "Custom Array").length);
assertEquals("Array Entry 1", metadata.getValues(PDF.PDF_DOC_INFO_CUSTOM_PREFIX + "Custom Array")[0]);
assertEquals("Array Entry 2", metadata.getValues(PDF.PDF_DOC_INFO_CUSTOM_PREFIX + "Custom Array")[1]);

// The extracted text itself must still be present in the XML output.
assertContains("Hello World!", r.xml);
}
Expand Down
Loading