Skip to content

Commit

Permalink
Fix cleanup_ci.py - fix cleanup of argo workflows and IAM policy bind…
Browse files Browse the repository at this point in the history
…ings (kubeflow#359)

* Fix cleanup_ci.py - cleaning of Argo and service account binding

* Add logging to make it easier to monitor progress.
* Print out stack traces on errors.

* cleanup_workflows needs to get GKE credentials.
  * The project might be different from the project where KF is deployed.

Related to kubeflow#358 cleanup ci not cleaning up Argo workflows

Related to kubeflow#357 cleanup ci not cleaning up IAM policies.

* Fix bug in cleaning up Argo workflows; we should not break out of loop.

* Don't exit if delete of Workflow fails; it looks like there might be
  some race conditions.

* Update the docker image to use google-api-python-client library 1.7.0 instead
  of 1.6.5. It looks like using 1.6.5 gives us an error when getting IAM
  policy.

* Prune the OWNERs file - remove approvers who haven't been very active.

* Fix lint.
  • Loading branch information
jlewi authored and k8s-ci-robot committed Apr 16, 2019
1 parent 5c78690 commit 92ffdd0
Show file tree
Hide file tree
Showing 5 changed files with 69 additions and 26 deletions.
14 changes: 0 additions & 14 deletions OWNERS
Original file line number Diff line number Diff line change
@@ -1,21 +1,7 @@
# TODO(jlewi): We should probably have OWNERs files in subdirectories that
# list approvers for individual components (e.g. Seldon folks for Seldon component)
approvers:
- DjangoPeng
- gaocegege
- jlewi
- lluunn
- pdmack
- ScorpioCPH
- kunmingg
- richardsliu
reviewers:
- DjangoPeng
- gaocegege
- Jimexist
- jlewi
- lluunn
- ScorpioCPH
- wbuchwalter
- zjj2wry
- kunmingg
6 changes: 5 additions & 1 deletion images/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ RUN set -ex \
wget \
ca-certificates \
git \
emacs \
jq \
zip \
unzip \
Expand Down Expand Up @@ -136,14 +137,17 @@ RUN cd /tmp/ && \
pip2 install -U wheel filelock && \
pip2 install pipenv && \
pip2 install requests && \
pip2 install google-api-python-client && \
pip2 install prometheus_client && \
pipenv install --system --two && \
pip3 install -U wheel filelock

RUN pip3 install pipenv==2018.10.9
RUN cd /tmp/ && pipenv install --system --three

# Force update of googleapipython client
# Do this after pipenv because we want to override what pipenv installs.
RUN pip2 install --upgrade google-api-python-client==1.7.0

RUN pip install yq

COPY checkout.sh /usr/local/bin
Expand Down
6 changes: 3 additions & 3 deletions images/gcb_build/image_build.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"images": [
"gcr.io/kubeflow-ci/test-worker:v20190302-c0829e8-dirty-f1d98c",
"gcr.io/kubeflow-ci/test-worker:v20190415-53ad3b5-dirty-5bc1cf",
"gcr.io/kubeflow-ci/test-worker:latest"
],
"steps": [
Expand All @@ -19,7 +19,7 @@
"args": [
"build",
"-t",
"gcr.io/kubeflow-ci/test-worker:v20190302-c0829e8-dirty-f1d98c",
"gcr.io/kubeflow-ci/test-worker:v20190415-53ad3b5-dirty-5bc1cf",
"--label=git-versions=",
"--file=./Dockerfile",
"--cache-from=gcr.io/kubeflow-ci/test-worker:latest",
Expand All @@ -34,7 +34,7 @@
{
"args": [
"tag",
"gcr.io/kubeflow-ci/test-worker:v20190302-c0829e8-dirty-f1d98c",
"gcr.io/kubeflow-ci/test-worker:v20190415-53ad3b5-dirty-5bc1cf",
"gcr.io/kubeflow-ci/test-worker:latest"
],
"id": "tag-test-worker",
Expand Down
67 changes: 60 additions & 7 deletions py/kubeflow/testing/cleanup_ci.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
import retrying
import socket
import subprocess
import sys
import traceback
import tempfile
import yaml

Expand Down Expand Up @@ -51,6 +53,13 @@ def is_retryable_exception(exception):
return isinstance(exception, socket.error)

def cleanup_workflows(args):
logging.info("Cleanup Argo workflows")
util.maybe_activate_service_account()

util.run(["gcloud", "container", "clusters", "get-credentials",
args.testing_cluster, "--zone=" + args.testing_zone,
"--project=" + args.testing_project,])

# We need to load the kube config so that we can have credentials to
# talk to the APIServer.
util.load_kube_config(persist_config=False)
Expand All @@ -75,20 +84,24 @@ def cleanup_workflows(args):
logging.info("Deleting workflow: %s", name)
is_expired = True
if not args.dryrun:
crd_api.delete_namespaced_custom_object(
argo_client.GROUP, argo_client.VERSION, args.namespace,
argo_client.PLURAL, name, k8s_client.V1DeleteOptions())
break

try:
crd_api.delete_namespaced_custom_object(
argo_client.GROUP, argo_client.VERSION, args.namespace,
argo_client.PLURAL, name, k8s_client.V1DeleteOptions())
except Exception as e: # pylint: disable=broad-except
logging.error("There was a problem deleting workflow %s.%s; "
"error: %s", args.namespace, args.name, e)
if is_expired:
expired.append(name)
else:
unexpired.append(name)

logging.info("Unexpired workflows:\n%s", "\n".join(unexpired))
logging.info("expired workflows:\n%s", "\n".join(expired))
logging.info("Finished cleanup of Argo workflows")

def cleanup_endpoints(args):
logging.info("Cleanup Google Cloud Endpoints")
credentials = GoogleCredentials.get_application_default()

services_management = discovery.build('servicemanagement', 'v1', credentials=credentials)
Expand Down Expand Up @@ -142,8 +155,11 @@ def cleanup_endpoints(args):
logging.info("Unmatched services:\n%s", "\n".join(unmatched))
logging.info("Unexpired services:\n%s", "\n".join(unexpired))
logging.info("expired services:\n%s", "\n".join(expired))
logging.info("Finished cleanup Google Cloud Endpoints")

def cleanup_disks(args):
logging.info("Cleanup persistent disks")

credentials = GoogleCredentials.get_application_default()

compute = discovery.build('compute', 'v1', credentials=credentials)
Expand Down Expand Up @@ -183,8 +199,11 @@ def cleanup_disks(args):
logging.info("Unmatched disks:\n%s", "\n".join(unmatched))
logging.info("Unexpired disks:\n%s", "\n".join(unexpired))
logging.info("expired disks:\n%s", "\n".join(expired))
logging.info("Finished cleanup persistent disks")

def cleanup_firewall_rules(args):
logging.info("Cleanup firewall rules")

credentials = GoogleCredentials.get_application_default()

compute = discovery.build('compute', 'v1', credentials=credentials)
Expand Down Expand Up @@ -288,8 +307,11 @@ def cleanup_health_checks(args):

logging.info("Unmatched health checks:\n%s", "\n".join(unmatched))
logging.info("Matched health checks:\n%s", "\n".join(matched))
logging.info("Finished cleanup firewall rules")

def cleanup_service_accounts(args):
logging.info("Cleanup service accounts")

credentials = GoogleCredentials.get_application_default()

iam = discovery.build('iam', 'v1', credentials=credentials)
Expand Down Expand Up @@ -341,7 +363,7 @@ def cleanup_service_accounts(args):
logging.info("Unmatched emails:\n%s", "\n".join(unmatched_emails))
logging.info("Unexpired emails:\n%s", "\n".join(unexpired_emails))
logging.info("expired emails:\n%s", "\n".join(expired_emails))

logging.info("Finished cleanup service accounts")

def trim_unused_bindings(iamPolicy, accounts):
keepBindings = []
Expand All @@ -361,10 +383,13 @@ def trim_unused_bindings(iamPolicy, accounts):
binding['members'] = members_to_keep
keepBindings.append(binding)
if members_to_delete:
logging.info("Delete binding for members:\n%s", ", ".join(members_to_delete))
logging.info("Delete binding for members:\n%s", "\n".join(
members_to_delete))
iamPolicy['bindings'] = keepBindings

def cleanup_service_account_bindings(args):
logging.info("Cleanup service account bindings")

credentials = GoogleCredentials.get_application_default()
iam = discovery.build('iam', 'v1', credentials=credentials)
accounts = []
Expand All @@ -379,13 +404,15 @@ def cleanup_service_account_bindings(args):
next_page_token = service_accounts["nextPageToken"]

resourcemanager = discovery.build('cloudresourcemanager', 'v1', credentials=credentials)
logging.info("Get IAM policy for project %s", args.project)
iamPolicy = resourcemanager.projects().getIamPolicy(resource=args.project).execute()
trim_unused_bindings(iamPolicy, accounts)

setBody = {'policy': iamPolicy}
if not args.dryrun:
resourcemanager.projects().setIamPolicy(resource=args.project, body=setBody).execute()

logging.info("Finished cleanup service account bindings")

def getAge(tsInRFC3339):
# The docs say insert time will be in RFC339 format.
Expand Down Expand Up @@ -415,6 +442,7 @@ def execute_rpc(rpc):
return rpc.execute()

def cleanup_deployments(args): # pylint: disable=too-many-statements,too-many-branches
logging.info("Cleanup deployments")
if not args.delete_script:
raise ValueError("--delete_script must be specified.")

Expand Down Expand Up @@ -529,6 +557,7 @@ def cleanup_deployments(args): # pylint: disable=too-many-statements,too-many-br
if not args.dryrun:
clusters_client.delete(projectId=args.project, zone=zone,
clusterId=name).execute()
logging.info("Finished cleanup deployments")

def cleanup_all(args):
ops = [cleanup_deployments,
Expand All @@ -544,6 +573,8 @@ def cleanup_all(args):
op(args)
except Exception as e: # pylint: disable=broad-except
logging.error(e)
exc_type, exc_value, exc_tb = sys.exc_info()
traceback.print_exception(exc_type, exc_value, exc_tb)

def add_workflow_args(parser):
parser.add_argument(
Expand Down Expand Up @@ -575,6 +606,21 @@ def main():
parser.add_argument(
"--project", default="kubeflow-ci", type=str, help=("The project."))

# The values prefixed with testing_ refer to the test cluster where the
# Argo workflows run. In contrast --project is the project where the tests
# spin up Kubeflow instances.
parser.add_argument(
"--testing_project", default="kubeflow-ci", type=str,
help=("The cluster used for Argo workflows."))

parser.add_argument(
"--testing_cluster", default="kubeflow-testing", type=str,
help=("The cluster used for Argo workflows."))

parser.add_argument(
"--testing_zone", default="us-east1-d", type=str,
help=("The zone of the cluster used for Argo workflows."))

parser.add_argument(
"--max_age_hours", default=3, type=int, help=("The age of deployments to gc."))

Expand Down Expand Up @@ -631,6 +677,13 @@ def main():

parser_service_account.set_defaults(func=cleanup_service_accounts)

######################################################
# Parser for service account bindings
parser_service_account = subparsers.add_parser(
"service_account_bindings", help="Cleanup service account bindings")

parser_service_account.set_defaults(func=cleanup_service_account_bindings)

######################################################
# Parser for deployments
parser_deployments = subparsers.add_parser(
Expand Down
2 changes: 1 addition & 1 deletion test-infra/ks_app/components/cleanup-ci.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
"--delete_script=/src/kubeflow/kubeflow/scripts/gke/delete_deployment.sh",
],
]),
"image": "gcr.io/kubeflow-ci/test-worker/test-worker:v20190116-b7abb8d-e3b0c4",
"image": "gcr.io/kubeflow-ci/test-worker:v20190415-53ad3b5-dirty-5bc1cf",
"name": "label-sync",
env: [
{
Expand Down

0 comments on commit 92ffdd0

Please sign in to comment.