From 11d76561d1e7807672ce27af5ab15da56a0c6d32 Mon Sep 17 00:00:00 2001
From: Jeremy Lewi
Date: Fri, 3 Jan 2020 08:00:38 -0800
Subject: [PATCH] * Catch TensorFlow FailedPreconditionErrors and force the process to restart.

* Attempt to fix kubeflow/code-intelligence#88
* We see some predictions succeed but then subsequent ones fail.
* Try to deal with workload identity issues by testing for a service account on startup.
---
 py/label_microservice/worker.py | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/py/label_microservice/worker.py b/py/label_microservice/worker.py
index f7d46de3dc3..e5215951157 100644
--- a/py/label_microservice/worker.py
+++ b/py/label_microservice/worker.py
@@ -187,6 +187,22 @@ def callback(message):
                       f"The program will restart to try to recover.")
         sys.exit(1)
 
+      # TODO(jlewi): I observed cases where some of the initial inferences
+      # would succeed but subsequent ones would start failing;
+      # see: https://github.com/kubeflow/code-intelligence/issues/70#issuecomment-570491289
+      # Restarting is a bit of a hack. We should try to figure out
+      # why it's happening and fix it.
+      except tf_errors.FailedPreconditionError as e:
+        logging.fatal(f"Exception occurred while handling issue "
+                      f"{repo_owner}/{repo_name}#{issue_num}.\n"
+                      f"Exception: {e}\n"
+                      f"{traceback.format_exc()}\n"
+                      f"This usually indicates an issue with "
+                      f"trying to use the model in a thread different "
+                      f"from the one it was created in. "
+                      f"The program will restart to try to recover.")
+        sys.exit(1)
+
       #TODO(jlewi): We should catch a more narrow exception.
       # On exception if we don't ack the message then we risk problems
       # caused by poison pills repeatedly crashing our workers
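
Note: the added except clause refers to tf_errors.FailedPreconditionError, but the
hunk adds no import, so it assumes tf_errors (along with logging, traceback, and
sys, which the handler just above already uses) is already bound in worker.py. If
it is not, the handler itself would raise a NameError instead of exiting cleanly.
A typical binding would look like the sketch below; the exact alias used in
worker.py is an assumption, not something shown in this patch.

    # Assumed import for the tf_errors alias; not part of the diff above.
    from tensorflow import errors as tf_errors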
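
The last bullet of the commit message (testing for a service account on startup to
catch workload identity problems) is not part of the hunk shown above, which only
adds the FailedPreconditionError handler. A minimal sketch of what such a startup
check could look like follows; the check_service_account helper, the module-level
URL constant, and their placement are illustrative assumptions, not code from this
patch. With GKE Workload Identity, a pod that is correctly bound to a Google
service account can read that account's email from the metadata server, so failing
fast when that lookup fails surfaces misconfiguration before the worker starts
pulling messages.

    import logging
    import sys

    import requests

    _METADATA_EMAIL_URL = ("http://metadata.google.internal/computeMetadata/v1/"
                           "instance/service-accounts/default/email")


    def check_service_account():
      """Hypothetical startup check that the pod has usable GCP credentials.

      Exits the process if the metadata server cannot report a service account,
      which usually means workload identity is not configured correctly.
      """
      try:
        response = requests.get(_METADATA_EMAIL_URL,
                                headers={"Metadata-Flavor": "Google"},
                                timeout=5)
        response.raise_for_status()
        logging.info(f"Running as service account {response.text}")
      except Exception as e:  # pylint: disable=broad-except
        logging.fatal(f"Could not determine the service account from the "
                      f"metadata server. Workload identity may not be "
                      f"configured correctly: {e}")
        sys.exit(1)

The worker's main() could call check_service_account() before subscribing, so a
misconfigured pod crash-loops immediately rather than failing on its first Cloud
API call.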