Skip to content

Commit 388097b

Browse files
authored
TIKA-4453 -- decrement embedded depth in rpwh via Stephen H (#2277)
1 parent 5667e6d commit 388097b

File tree

4 files changed

+836
-0
lines changed

4 files changed

+836
-0
lines changed

tika-core/src/main/java/org/apache/tika/fork/RecursiveMetadataContentHandlerProxy.java

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -72,6 +72,7 @@ public void init(DataInputStream input, DataOutputStream output) {
7272
public void endEmbeddedDocument(ContentHandler contentHandler, Metadata metadata) // end-of-embedded-document callback: proxies the event, then decrements the depth
7373
throws SAXException {
7474
proxyBackToClient(EMBEDDED_DOCUMENT, contentHandler, metadata); // presumably ships the handler and metadata back to the client side — confirm against proxyBackToClient
75+
decrementEmbeddedDepth(); // added by this commit: mirrors the embeddedDepth-- done in AbstractRecursiveParserWrapperHandler.endEmbeddedDocument
7576
}
7677

7778
@Override

tika-core/src/main/java/org/apache/tika/sax/AbstractRecursiveParserWrapperHandler.java

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -94,6 +94,14 @@ public void endEmbeddedDocument(ContentHandler contentHandler, Metadata metadata
9494
embeddedDepth--;
9595
}
9696

97+
/**
98+
* Decrements the embedded depth counter by one. This is called by
99+
* RecursiveMetadataContentHandlerProxy, as it cannot call endEmbeddedDocument().
100+
*/
101+
protected void decrementEmbeddedDepth() {
102+
embeddedDepth--;
103+
}
104+
97105
/**
98106
* This is called after the full parse has completed. Override this
99107
* for custom behavior. Make sure to call this as <code>super.endDocument(...)</code>

tika-core/src/test/java/org/apache/tika/fork/ForkParserTest.java

Lines changed: 29 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -278,6 +278,35 @@ public void testRecursiveParserWrapper() throws Exception {
278278
assertEquals("/embed1.xml", m1.get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH));
279279
}
280280

281+
// Test added with TIKA-4453: parses massive_embedded.xml through a ForkParser wrapping a
// RecursiveParserWrapper and checks that the container plus all 110 embedded documents
// are reported with the expected creator, content and embedded resource path.
@Test
282+
public void testRecursiveParserWrapperMassiveEmbedded() throws Exception {
283+
Parser parser = new AutoDetectParser();
284+
RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parser);
285+
RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(
286+
new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT,
287+
20000));
288+
try (ForkParser fork = new ForkParser(ForkParserTest.class.getClassLoader(), wrapper); // both the fork and the stream are closed by try-with-resources
289+
InputStream is = getResourceAsStream("/test-documents/massive_embedded.xml")) {
290+
Metadata metadata = new Metadata();
291+
ParseContext context = new ParseContext();
292+
fork.parse(is, handler, metadata, context);
293+
}
294+
List<Metadata> metadataList = handler.getMetadataList();
295+
assertEquals(111, metadataList.size()); // index 0 is the container, indices 1..110 are the embedded documents checked below
296+
Metadata m0 = metadataList.get(0);
297+
assertEquals("Nikolai Lobachevsky", m0.get(TikaCoreProperties.CREATOR));
298+
assertContains("main_content", m0.get(TikaCoreProperties.TIKA_CONTENT));
299+

300+
for (int i = 1; i <= 110; i++) {
301+
assertContains("embed" + i + ".xml", m0.get(TikaCoreProperties.TIKA_CONTENT)); // the container's content must mention every embedded file name
302+

303+
Metadata m1 = metadataList.get(i); // metadata of the i-th embedded document
304+
assertEquals("embeddedAuthor", m1.get(TikaCoreProperties.CREATOR));
305+
assertContains("some_embedded_content", m1.get(TikaCoreProperties.TIKA_CONTENT));
306+
assertEquals("/embed" + i + ".xml", m1.get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH));
307+
}
308+
}
309+
281310
@Test
282311
public void testRPWWithEmbeddedNPE() throws Exception {
283312
Parser parser = new AutoDetectParser();

0 commit comments

Comments
 (0)