From 11d76561d1e7807672ce27af5ab15da56a0c6d32 Mon Sep 17 00:00:00 2001
From: Jeremy Lewi
Date: Fri, 3 Jan 2020 08:00:38 -0800
Subject: [PATCH] * Catch TensorFlow FailedPreconditionErrors and force the process to restart.

* Attempt to fix kubeflow/code-intelligence#88
* We see some predictions succeed but then subsequent ones fail.
* Try to deal with workload identity issues by testing for a service account on startup.
---
 py/label_microservice/worker.py | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/py/label_microservice/worker.py b/py/label_microservice/worker.py
index f7d46de3dc3..e5215951157 100644
--- a/py/label_microservice/worker.py
+++ b/py/label_microservice/worker.py
@@ -187,6 +187,22 @@ def callback(message):
                       f"The program will restart to try to recover.")
         sys.exit(1)
 
+      # TODO(jlewi): I observed cases where some of the initial inferences
+      # would succeed but subsequent ones would start failing;
+      # see: https://github.com/kubeflow/code-intelligence/issues/70#issuecomment-570491289
+      # Restarting is a bit of a hack. We should try to figure out
+      # why it's happening and fix it.
+      except tf_errors.FailedPreconditionError as e:
+        logging.fatal(f"Exception occurred while handling issue "
+                      f"{repo_owner}/{repo_name}#{issue_num}.\n"
+                      f"Exception: {e}\n"
+                      f"{traceback.format_exc()}\n"
+                      f"This usually indicates an issue with "
+                      f"trying to use the model in a thread different "
+                      f"from the one it was created in. "
+                      f"The program will restart to try to recover.")
+        sys.exit(1)
+
       #TODO(jlewi): We should catch a more narrow exception.
       # On exception if we don't ack the message then we risk problems
       # caused by poison pills repeatedly crashing our workers
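
Note: the added except clause refers to tf_errors.FailedPreconditionError, but the
hunk adds no import, so it assumes tf_errors (along with logging, traceback, and
sys, which the handler just above already uses) is already bound in worker.py. If
it is not, the handler itself would raise a NameError instead of exiting cleanly.
A typical binding would look like the sketch below; the exact alias used in
worker.py is an assumption, not something shown in this patch.

    # Assumed import for the tf_errors alias; not part of the diff above.
    from tensorflow import errors as tf_errors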
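
The last bullet of the commit message (testing for a service account on startup to
catch workload identity problems) is not part of the hunk shown above, which only
adds the FailedPreconditionError handler. A minimal sketch of what such a startup
check could look like follows; the check_service_account helper, the module-level
URL constant, and their placement are illustrative assumptions, not code from this
patch. With GKE Workload Identity, a pod that is correctly bound to a Google
service account can read that account's email from the metadata server, so failing
fast when that lookup fails surfaces misconfiguration before the worker starts
pulling messages.

    import logging
    import sys

    import requests

    _METADATA_EMAIL_URL = ("http://metadata.google.internal/computeMetadata/v1/"
                           "instance/service-accounts/default/email")


    def check_service_account():
      """Hypothetical startup check that the pod has usable GCP credentials.

      Exits the process if the metadata server cannot report a service account,
      which usually means workload identity is not configured correctly.
      """
      try:
        response = requests.get(_METADATA_EMAIL_URL,
                                headers={"Metadata-Flavor": "Google"},
                                timeout=5)
        response.raise_for_status()
        logging.info(f"Running as service account {response.text}")
      except Exception as e:  # pylint: disable=broad-except
        logging.fatal(f"Could not determine the service account from the "
                      f"metadata server. Workload identity may not be "
                      f"configured correctly: {e}")
        sys.exit(1)

The worker's main() could call check_service_account() before subscribing, so a
misconfigured pod crash-loops immediately rather than failing on its first Cloud
API call.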