Skip to content

Commit

Permalink
WaitForJob should use conditions for v1alpha2. (#771)
Browse files Browse the repository at this point in the history
* Now that #673 is fixed wait_for_job for v1alpha2 should for condition Succeeded or failed.

* If there is a timeout waiting for the job to finish include the job spec
  in the exception so we can print it out to see what the final status is.
  • Loading branch information
jlewi authored and k8s-ci-robot committed Aug 7, 2018
1 parent bb430c9 commit d2509aa
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 14 deletions.
12 changes: 9 additions & 3 deletions py/test_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -481,6 +481,8 @@ def run_test(args): # pylint: disable=too-many-branches,too-many-statements
api_client, namespace, name, args.tfjob_version,
status_callback=tf_job_client.log_status)

logging.info("Final TFJob:\n %s", json.dumps(results, indent=2))

if args.tfjob_version == "v1alpha1":
if results.get("status", {}).get("state", {}).lower() != "succeeded":
t.failure = "Trial {0} Job {1} in namespace {2} in state {3}".format(
Expand Down Expand Up @@ -566,9 +568,13 @@ def run_test(args): # pylint: disable=too-many-branches,too-many-statements
# TODO(jlewi): Add an option to add chaos and randomly kill various resources?
# TODO(jlewi): Are there other generic validation checks we should
# run.
except util.TimeoutError:
t.failure = "Timeout waiting for {0} in namespace {1} to finish.".format(
name, namespace)
except util.JobTimeoutError as e:
if e.job:
spec = "Job:\n" + json.dumps(e.job, indent=2)
else:
spec = "JobTimeoutError did not contain job"
t.failure = ("Timeout waiting for {0} in namespace {1} to finish; ").format(
name, namespace) + spec
logging.exception(t.failure)
except Exception as e: # pylint: disable-msg=broad-except
# TODO(jlewi): I'm observing flakes where the exception has message "status"
Expand Down
22 changes: 11 additions & 11 deletions py/tf_job_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,10 +228,10 @@ def wait_for_condition(client,
return results

if datetime.datetime.now() + polling_interval > end_time:
raise util.TimeoutError(
raise util.JobTimeoutError(
"Timeout waiting for job {0} in namespace {1} to enter one of the "
"conditions {2}.".format(
name, namespace, conditions))
name, namespace, conditions), results)

time.sleep(polling_interval.seconds)

Expand All @@ -258,6 +258,13 @@ def wait_for_job(client,
invoked after we poll the job. Callable takes a single argument which
is the job.
"""
if version != "v1alpha1":
return wait_for_condition(
client, namespace, name, ["Succeeded", "Failed"],
timeout=timeout,
polling_interval=polling_interval,
status_callback=status_callback)

crd_api = k8s_client.CustomObjectsApi(client)
end_time = datetime.datetime.now() + timeout
while True:
Expand All @@ -278,15 +285,8 @@ def wait_for_job(client,
status_callback(results)

# If we poll the CRD quick enough status won't have been set yet.
if version == "v1alpha1":
if results.get("status", {}).get("phase", {}) == "Done":
return results
else:
# For v1alpha2 check for non-empty completionTime
# TODO(jlewi): https://github.com/kubeflow/tf-operator/issues/673
# Once that issue is fixed we should be able to look at the condition.
if results.get("status", {}).get("completionTime", ""):
return results
if results.get("status", {}).get("phase", {}) == "Done":
return results

if datetime.datetime.now() + polling_interval > end_time:
raise util.TimeoutError(
Expand Down
9 changes: 9 additions & 0 deletions py/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -426,6 +426,15 @@ def setup_cluster(api_client):
class TimeoutError(Exception): # pylint: disable=redefined-builtin
"""An error indicating an operation timed out."""

class JobTimeError(TimeoutError):
"""An error indicating the job timed out.
The job spec/status can be found in .job.
"""

def __init__(self, message, job):
super(JobTimeError, self).__init__(message)
self.job = job

GCS_REGEX = re.compile("gs://([^/]*)(/.*)?")

Expand Down

0 comments on commit d2509aa

Please sign in to comment.