diff --git a/docs/changelog/103873.yaml b/docs/changelog/103873.yaml new file mode 100644 index 0000000000000..937106043ecf4 --- /dev/null +++ b/docs/changelog/103873.yaml @@ -0,0 +1,5 @@ +pr: 103873 +summary: Catch exceptions during `pytorch_inference` startup +area: Machine Learning +type: bug +issues: [] diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/deployment/DeploymentManager.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/deployment/DeploymentManager.java index ef5de2718e702..18e89732daf21 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/deployment/DeploymentManager.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/deployment/DeploymentManager.java @@ -17,6 +17,7 @@ import org.elasticsearch.action.support.master.AcknowledgedResponse; import org.elasticsearch.client.internal.Client; import org.elasticsearch.common.Strings; +import org.elasticsearch.common.util.concurrent.AbstractRunnable; import org.elasticsearch.common.util.concurrent.EsRejectedExecutionException; import org.elasticsearch.common.xcontent.LoggingDeprecationHandler; import org.elasticsearch.core.TimeValue; @@ -200,9 +201,18 @@ public void startDeployment(TrainedModelDeploymentTask task, ActionListener processContext.startAndLoad(modelConfig.getLocation(), modelLoadedListener) - ); + executorServiceForDeployment.execute(new AbstractRunnable() { + + @Override + public void onFailure(Exception e) { + failedDeploymentListener.onFailure(e); + } + + @Override + protected void doRun() { + processContext.startAndLoad(modelConfig.getLocation(), modelLoadedListener); + } + }); }, failedDeploymentListener::onFailure) ); } else { diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/pytorch/process/NativePyTorchProcessFactory.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/pytorch/process/NativePyTorchProcessFactory.java index b26c6720ed179..e538a6c686881 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/pytorch/process/NativePyTorchProcessFactory.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/pytorch/process/NativePyTorchProcessFactory.java @@ -89,7 +89,7 @@ public NativePyTorchProcess createProcess( process.start(executorService); } catch (IOException | EsRejectedExecutionException e) { String msg = "Failed to connect to pytorch process for job " + task.getDeploymentId(); - logger.error(msg); + logger.error(msg, e); try { IOUtils.close(process); } catch (IOException ioe) {