Skip to content

Commit

Permalink
TIKA-3441 -- improve likelihood that tesseract processes will be shut…
Browse files Browse the repository at this point in the history
…down on jvm restart.
  • Loading branch information
tballison committed Jun 10, 2021
1 parent e8ec223 commit 1224f88
Show file tree
Hide file tree
Showing 5 changed files with 138 additions and 7 deletions.
6 changes: 6 additions & 0 deletions CHANGES.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
Release 1.27 - ???

* Prevent rare infinite loop in tika-server's -spawnChild mode
when restart fails because of failure to bind to the port (TIKA-3441).

* Improve likelihood that tesseract will not be orphaned on
jvm restart in tika-server (TIKA-3441).

* Deprecate experimental PDFPreflightParser (TIKA-3437).

* Apply encoding detection to zip entry names via Ryan421 (TIKA-3374).
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser;

import java.util.UUID;
import java.util.concurrent.ConcurrentHashMap;

/**
* Abstract base class for parsers that call external processes. This
* adds one more layer of 'hope' that processes won't be orphaned if
* the jvm has to be restarted. This does not guarantee that the
* processes won't be orphaned in case of, e.g. kill -9, but this
* increases the chances that under normal circumstances or if the jvm
* itself exits, that external processes won't be orphaned.
*
* @since Apache Tika 1.27
*/
public abstract class AbstractExternalProcessParser extends AbstractParser {

/**
* Serial version UID.
*/
private static final long serialVersionUID = 7186985395903074255L;

private static final ConcurrentHashMap<String, Process> PROCESS_MAP = new ConcurrentHashMap<>();

static {
Runtime.getRuntime().addShutdownHook(new Thread(() -> {
PROCESS_MAP.forEachValue(1, Process::destroyForcibly);
}));
}

protected String register(Process p) {
String id = UUID.randomUUID().toString();
PROCESS_MAP.put(id, p);
return id;
}

protected Process release(String id) {
return PROCESS_MAP.remove(id);
}
}

Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.mime.MediaTypeRegistry;
import org.apache.tika.parser.AbstractExternalProcessParser;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.CompositeParser;
import org.apache.tika.parser.ParseContext;
Expand Down Expand Up @@ -99,7 +100,7 @@
*
*
*/
public class TesseractOCRParser extends AbstractParser implements Initializable {
public class TesseractOCRParser extends AbstractExternalProcessParser implements Initializable {
private static final Logger LOG = LoggerFactory.getLogger(TesseractOCRParser.class);

private static volatile boolean HAS_WARNED = false;
Expand Down Expand Up @@ -531,13 +532,18 @@ private void doOCR(File input, File output, TesseractOCRConfig config) throws IO
ProcessBuilder pb = new ProcessBuilder(cmd);
setEnv(config, pb);
Process process = null;
String id = null;
try {
process = pb.start();
id = register(process);
runOCRProcess(process, config.getTimeout());
} finally {
if (process != null) {
process.destroyForcibly();
}
if (id != null) {
release(id);
}
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,9 @@ private enum CHILD_STATUS {
private static final Logger LOG = LoggerFactory.getLogger(TikaServerWatchDog.class);
private static final String DEFAULT_CHILD_STATUS_FILE_PREFIX = "tika-server-child-process-mmap-";

//this is the shutdown hook for the child (forked) process
private static Thread SHUTDOWN_HOOK = null;

private Object[] childStatusLock = new Object[0];
private volatile CHILD_STATUS childStatus = CHILD_STATUS.INITIALIZING;
private volatile Instant lastPing = null;
Expand Down Expand Up @@ -251,7 +254,6 @@ private static List<String> extractJVMArgs(String[] args) {
}

private class ChildProcess {
private Thread SHUTDOWN_HOOK = null;

private final Process process;
private final DataOutputStream toChild;
Expand Down Expand Up @@ -439,12 +441,12 @@ private Process startProcess(String[] args, int numRestarts, Path childStatusFil
//redirect stdout to parent stderr to avoid error msgs
//from maven during build: Corrupted STDOUT by directly writing to native stream in forked
redirectIO(process.getInputStream(), System.err);
if (SHUTDOWN_HOOK != null) {
Runtime.getRuntime().removeShutdownHook(SHUTDOWN_HOOK);
}
Thread oldHook = SHUTDOWN_HOOK;
SHUTDOWN_HOOK = new Thread(() -> this.close());
Runtime.getRuntime().addShutdownHook(SHUTDOWN_HOOK);

if (oldHook != null) {
Runtime.getRuntime().removeShutdownHook(oldHook);
}
return process;
}
}
Expand All @@ -470,9 +472,12 @@ public void run() {
}

private static synchronized void destroyChildForcibly(Process process) {
process = process.destroyForcibly();

try {
//wait for process to stop itself -- this can help prevent
// orphaned processes, e.g.tesseract
process.waitFor(10, TimeUnit.SECONDS);
process = process.destroyForcibly();
boolean destroyed = process.waitFor(60, TimeUnit.SECONDS);
if (! destroyed) {
LOG.error("Child process still alive after 60 seconds. " +
Expand All @@ -490,6 +495,8 @@ private static synchronized void destroyChildForcibly(Process process) {
} catch (InterruptedException e) {
LOG.warn("interrupted exception while trying to destroy the forked process");
//swallow
} finally {
process.destroyForcibly();
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -637,4 +637,59 @@ private void testBaseline() throws Exception {
assertEquals("Microsoft Office Word", metadataList.get(0).get(OfficeOpenXMLExtended.APPLICATION));
assertContains("plundered our seas", metadataList.get(6).get("X-TIKA:content"));
}

/*
this is useful for manual tests that spawned tesseract processes are correctly cleaned up
fairly often. :(
java -jar tika-server-1.27-SNAPSHOT.jar -taskTimeoutMillis 20000 -spawnChild
-c tika-config-ocr-only.xml -p 9999 -JXmx256m
@Test
public void loadTest() throws Exception {
List<Thread> threads = new ArrayList<>();
//this should tie up tesseract for longer than -taskTimeoutMillis
Path largePDF = Paths.get("..../testPDF_childAttachments.pdf");
//this should cause an oom
Path largeDocx = Paths.get("..../mobydick.docx");
for (int t = 0; t < 6; t++) {
final int num = t;
Thread thread = new Thread(new Runnable() {
@Override
public void run() {
Response response = null;
for (int i = 0; i < 10000; i++) {
try {
if (num < 5) {
response = WebClient.create(endPoint + META_PATH).accept("application/json")
.put(Files.newInputStream(largePDF));
} else if ( num == 5) {
Thread.sleep(8000);
response = WebClient.create(endPoint + META_PATH).accept("application/json")
.put(Files.newInputStream(largeDocx));
}
} catch (Exception e) {
e.printStackTrace();
try {
Thread.sleep(1000);
} catch (InterruptedException interruptedException) {
interruptedException.printStackTrace();
}
//oom may or may not cause an exception depending
//on the timing
}
}
}
});
threads.add(thread);
thread.start();
}
for (Thread t : threads) {
t.join();
}
}
*/
}

0 comments on commit 1224f88

Please sign in to comment.