Fix leaking of clusters in E2E tests #80 (#250)
* setup_cluster needs to push the cluster name before we try to set up the
  cluster, so that the name is available to the teardown step even if
  setup_cluster fails.

* setup_cluster also needs to handle the case where a previous attempt has
  already been made; in that case we should reuse the same cluster (see the
  sketch after this list).

* Fix #80
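
A condensed sketch of the pattern the change below implements, assuming an
Airflow PythonOperator that is invoked with the task instance (ti); the
surrounding DAG wiring and the trailing "create the cluster" step are
illustrative, not the exact code from this commit:

import logging
import uuid
from datetime import datetime


def setup_cluster(dag_run=None, ti=None, **_kwargs):
  # Reuse the name recorded by an earlier (possibly failed) attempt, so that
  # retries all target the same cluster and teardown only has one to delete.
  cluster = ti.xcom_pull(None, key="cluster")
  if not cluster:
    cluster = "e2e-" + datetime.now().strftime("%m%d-%H%M-") + uuid.uuid4().hex[0:4]
    # Record the name *before* creating anything, so the teardown step can
    # find it even if cluster creation fails partway through.
    ti.xcom_push(key="cluster", value=cluster)
  logging.info("Using cluster=%s", cluster)
  # ... create the GKE cluster and deploy the helm chart ...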
jlewi authored Dec 27, 2017
1 parent e98b10c commit 420f9aa
Showing 3 changed files with 17 additions and 9 deletions.
2 changes: 1 addition & 1 deletion test-infra/airflow/Dockerfile.prow
@@ -16,5 +16,5 @@
 #
 # TODO(jlewi) With Docker v17.0 we should just be able to use Docker
 # build args to specify the base image.
-FROM gcr.io/mlkube-testing/airflow:v20171222-f2644ab-dirty-6078d0
+FROM gcr.io/mlkube-testing/airflow:v20171227-e98b10c-dirty-02bfd3
 ENTRYPOINT ["python", "-m", "py.airflow"]
22 changes: 15 additions & 7 deletions test-infra/airflow/dags/e2e_tests_dag.py
@@ -239,7 +239,21 @@ def setup_cluster(dag_run=None, ti=None, **_kwargs):
   chart = ti.xcom_pull("build_images", key="helm_chart")

   now = datetime.now()
-  cluster = "e2e-" + now.strftime("%m%d-%H%M-") + uuid.uuid4().hex[0:4]
+
+  # Check if we already set the cluster. This can happen if there was a
+  # previous failed attempt to setup the cluster. We want to reuse the same
+  # cluster across attempts because this way we only have 1 cluster to teardown
+  # which minimizes the likelihood of leaking clusters.
+  cluster = ti.xcom_pull(None, key="cluster")
+  if not cluster:
+    cluster = "e2e-" + now.strftime("%m%d-%H%M-") + uuid.uuid4().hex[0:4]
+    logging.info("Set cluster=%s", cluster)
+    # Push the value for cluster so that it is saved even if setup_cluster
+    # fails.
+    logging.info("xcom push: cluster=%s", cluster)
+    ti.xcom_push(key="cluster", value=cluster)
+  else:
+    logging.info("Cluster was set to %s", cluster)

   logging.info("conf=%s", conf)
   artifacts_path = conf.get("ARTIFACTS_PATH",
@@ -262,12 +276,6 @@ def setup_cluster(dag_run=None, ti=None, **_kwargs):
   # output is squashed together.
   run(ti, args, use_print=True, dryrun=dryrun)

-  values = {
-    "cluster": cluster,
-  }
-  for k, v in six.iteritems(values):
-    logging.info("xcom push: %s=%s", k, v)
-    ti.xcom_push(key=k, value=v)

 def run_tests(dag_run=None, ti=None, **_kwargs):
   conf = dag_run.conf
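The teardown task itself is not part of this diff. A minimal sketch of how it
could consume the pushed name, assuming it is also a PythonOperator that
receives the task-instance context and is wired with trigger_rule="all_done"
so it runs even when setup_cluster fails; delete_cluster is a hypothetical
helper:

import logging


def teardown_cluster(dag_run=None, ti=None, **_kwargs):
  # Because setup_cluster pushes the name before creating anything, this pull
  # succeeds even when cluster creation failed partway through.
  cluster = ti.xcom_pull(None, key="cluster")
  if not cluster:
    logging.info("No cluster was recorded; nothing to tear down.")
    return
  logging.info("Deleting cluster %s", cluster)
  delete_cluster(cluster)  # hypothetical helper wrapping cluster deletion
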
2 changes: 1 addition & 1 deletion test-infra/airflow/deployment.yaml
@@ -25,7 +25,7 @@ spec:
     spec:
       containers:
       - name: airflow
-        image: gcr.io/mlkube-testing/airflow:v20171222-f2644ab-dirty-6078d0
+        image: gcr.io/mlkube-testing/airflow:v20171227-e98b10c-dirty-02bfd3
         # What arguments do we need to set to use the LocalExecutor
         command:
         # Tells entrypoint.sh to start the webserver
