diff --git a/acm-repo/namespaces/auto-deploy/batch_v1beta1_cronjob_cleanup-ci-kubeflow-ci-deployment.yaml b/acm-repo/namespaces/auto-deploy/batch_v1beta1_cronjob_cleanup-ci-kubeflow-ci-deployment.yaml
new file mode 100644
index 000000000..1f1c6c1dc
--- /dev/null
+++ b/acm-repo/namespaces/auto-deploy/batch_v1beta1_cronjob_cleanup-ci-kubeflow-ci-deployment.yaml
@@ -0,0 +1,50 @@
+apiVersion: batch/v1beta1
+kind: CronJob
+metadata:
+  labels:
+    app: cleanup-ci-kubeflow-ci-deployment
+  name: cleanup-ci-kubeflow-ci-deployment
+  namespace: auto-deploy
+spec:
+  concurrencyPolicy: Forbid
+  failedJobsHistoryLimit: 1
+  jobTemplate:
+    metadata:
+      annotations:
+        sidecar.istio.io/inject: "false"
+      creationTimestamp: null
+      labels:
+        job: cleanup-kubeflow-ci-deployment
+    spec:
+      template:
+        metadata:
+          annotations:
+            sidecar.istio.io/inject: "false"
+          labels:
+            job: cleanup-kubeflow-ci-deployment
+        spec:
+          containers:
+          - command:
+            - kubectl
+            - create
+            - -f
+            - /configs/cleanup-blueprints-pipeline.yaml
+            image: gcr.io/kubeflow-ci/test-worker-py3@sha256:b679ce5d7edbcc373fd7d28c57454f4f22ae987f200f601252b6dcca1fd8823b
+            imagePullPolicy: IfNotPresent
+            name: create-pipeline
+            terminationMessagePath: /dev/termination-log
+            terminationMessagePolicy: File
+            volumeMounts:
+            - mountPath: /configs
+              name: cleanup-config
+          restartPolicy: OnFailure
+          serviceAccountName: default-editor
+          volumes:
+          - configMap:
+              name: cleanup-config-4bm54d2bmb
+            name: cleanup-config
+  schedule: 0 */2 * * *
+  successfulJobsHistoryLimit: 3
+  suspend: false
+status:
+  lastScheduleTime: "2020-05-07T14:00:00Z"
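The `acm-repo/namespaces/auto-deploy/` manifests in this change (this CronJob plus the Task and ConfigMap that follow) appear to be hydrated kustomize output rather than hand-edited files; assuming the Makefile targets added later in this change behave as written, they are regenerated by running `make hydrate` in `tekton/templates` and in `test-infra/cleanup` instead of being edited directly.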
diff --git a/acm-repo/namespaces/auto-deploy/tekton.dev_v1alpha1_task_cleanup-kubeflow-ci.yaml b/acm-repo/namespaces/auto-deploy/tekton.dev_v1alpha1_task_cleanup-kubeflow-ci.yaml
new file mode 100644
index 000000000..64347c633
--- /dev/null
+++ b/acm-repo/namespaces/auto-deploy/tekton.dev_v1alpha1_task_cleanup-kubeflow-ci.yaml
@@ -0,0 +1,66 @@
+apiVersion: tekton.dev/v1alpha1
+kind: Task
+metadata:
+  annotations:
+    sidecar.istio.io/inject: "false"
+  name: cleanup-kubeflow-ci
+  namespace: auto-deploy
+spec:
+  inputs:
+    params:
+    - default: kf-vbp-{uid}
+      description: The name for the Kubeflow deployment
+      name: name
+      type: string
+    - default: kubeflow-ci-deployment
+      description: The project to clean up.
+      name: project
+      type: string
+    - default: kf-ci-management
+      description: The name of the management cluster.
+      name: management-cluster-name
+      type: string
+    - default: kubeflow-ci
+      description: The project containing the management cluster
+      name: management-project
+      type: string
+    - default: us-central1
+      description: The location of the management cluster
+      name: management-location
+      type: string
+    resources:
+    - description: The GitHub repo containing kubeflow testing scripts
+      name: testing-repo
+      type: git
+  steps:
+  - command:
+    - python
+    - -m
+    - kubeflow.testing.create_context
+    - create
+    - --name=$(inputs.params.management-project)
+    - --project=$(inputs.params.management-project)
+    - --location=$(inputs.params.management-location)
+    - --cluster=$(inputs.params.management-cluster-name)
+    - --namespace=$(inputs.params.project)
+    env:
+    - name: KUBECONFIG
+      value: /workspace/kubeconfig
+    - name: PYTHONPATH
+      value: /workspace/$(inputs.resources.testing-repo.name)/py
+    image: gcr.io/kubeflow-ci/test-worker-py3@sha256:b679ce5d7edbcc373fd7d28c57454f4f22ae987f200f601252b6dcca1fd8823b
+    name: create-context
+  - command:
+    - python
+    - -m
+    - kubeflow.testing.cleanup_blueprints
+    - auto-blueprints
+    - --project=$(inputs.params.project)
+    - --context=$(inputs.params.management-project)
+    env:
+    - name: KUBECONFIG
+      value: /workspace/kubeconfig
+    - name: PYTHONPATH
+      value: /workspace/$(inputs.resources.testing-repo.name)/py
+    image: gcr.io/kubeflow-ci/test-worker-py3@sha256:b679ce5d7edbcc373fd7d28c57454f4f22ae987f200f601252b6dcca1fd8823b
+    name: cleanup-ci
diff --git a/acm-repo/namespaces/auto-deploy/~g_v1_configmap_cleanup-config-4bm54d2bmb.yaml b/acm-repo/namespaces/auto-deploy/~g_v1_configmap_cleanup-config-4bm54d2bmb.yaml
new file mode 100644
index 000000000..cb4cdf948
--- /dev/null
+++ b/acm-repo/namespaces/auto-deploy/~g_v1_configmap_cleanup-config-4bm54d2bmb.yaml
@@ -0,0 +1,23 @@
+apiVersion: v1
+data:
+  cleanup-blueprints-pipeline.yaml: "# A Tekton PipelineRun to do a one-off cleanup\n#
+    of the Kubeflow auto-deployed blueprints.\n#\napiVersion: tekton.dev/v1alpha1\nkind:
+    PipelineRun\nmetadata:\n  generateName: cleanup-blueprints-\n  namespace: auto-deploy\nspec:\n
+    \  # TODO(jlewi): Override any parameters?\n  #params: {}\n  resources:\n  -
+    name: testing-repo\n    resourceSpec:\n      type: git\n      params:\n      #
+    TODO(jlewi): Switch to master on kubeflow/gcp-blueprints\n      - name: revision\n
+    \        value: gcp_blueprint\n      - name: url\n        value: https://github.com/jlewi/testing.git\n
+    \  # Need to use a KSA with appropriate GSA\n  serviceAccountName: default-editor\n
+    \  pipelineSpec:\n    params:\n    - name: management-cluster-name\n      type:
+    string\n      description: The name of the management cluster.\n      default:
+    \"kf-ci-management\"\n    resources:\n    - name: testing-repo\n      type: git\n
+    \    tasks:\n    - name: cleanup-blueprints\n      # TODO(jlewi): expose other
+    parameters? Right now\n      # we are just relying on the defaults defined in
+    the task\n      params:\n      - name: management-cluster-name\n        value:
+    \"$(params.management-cluster-name)\"\n      resources:\n        inputs:\n
+    \        - name: testing-repo\n          resource: testing-repo\n      taskRef:\n
+    \        name: cleanup-kubeflow-ci\n        kind: namespaced"
+kind: ConfigMap
+metadata:
+  name: cleanup-config-4bm54d2bmb
+  namespace: auto-deploy
diff --git a/playbook/README.md b/playbook/README.md
index fe2441184..08a4ed164 100644
--- a/playbook/README.md
+++ b/playbook/README.md
@@ -2,6 +2,6 @@ This directory contains various playbooks for the Kubeflow test infrastructure.
 
-* [auto_deploy.md][auto_deploy.md] - Playbook for auto deployed infrastructure
+* [auto_deploy.md](auto_deploy.md) - Playbook for auto-deployed infrastructure
 * [buildcop.md](buildcop.md) - Playbook for the buildcop
 * [playbook.md](playbook.md) - General playbook for the test infrastructure
\ No newline at end of file
diff --git a/py/kubeflow/testing/cleanup_blueprints.py b/py/kubeflow/testing/cleanup_blueprints.py
new file mode 100644
index 000000000..792d39eb0
--- /dev/null
+++ b/py/kubeflow/testing/cleanup_blueprints.py
@@ -0,0 +1,232 @@
+"""Cleanup auto deployed blueprints.
+
+Note: This is in a separate file from cleanup_ci because we wanted to start
+using Fire and python3.
+"""
+import collections
+import datetime
+from dateutil import parser as date_parser
+import fire
+import logging
+
+from kubeflow.testing import cnrm_clients
+from kubeflow.testing import util
+from kubernetes import client as k8s_client
+
+# The names of the labels used to encode information about the blueprints.
+#
+# Which branch the blueprint was deployed from
+BRANCH_LABEL = "blueprint-branch"
+NAME_LABEL = "kf-name"
+AUTO_DEPLOY_LABEL = "auto-deploy"
+
+def _iter_blueprints(namespace, context=None):
+  """Return an iterator over blueprints.
+
+  Args:
+    namespace: The namespace to look for blueprints.
+    context: The kube context to use.
+  """
+  # We need to load the kube config so that we can have credentials to
+  # talk to the APIServer.
+  util.load_kube_config(persist_config=False, context=context)
+
+  client = k8s_client.ApiClient()
+  crd_api = cnrm_clients.CnrmClientApi(client, "containercluster")
+
+  clusters = crd_api.list_namespaced(namespace)
+
+  for c in clusters.get("items"):
+    yield c
+
+def _delete_blueprints(namespace, to_keep_names, context=None, dryrun=True):
+  """Delete all auto-deployed resources that we don't want to keep.
+
+  Args:
+    namespace: The namespace that owns the CNRM objects.
+    to_keep_names: Names of the blueprints to keep.
+    context: The kubeconfig context to use.
+
+
+  This function deletes all auto-deployed resources that we don't want
+  to keep; it is intended to clean up any orphaned resources.
+  It works as follows.
+
+    1. For each type of resource we issue a list to find all auto-deployed
+       resources
+    2. We then exclude any resource which belongs to a blueprint to keep
+    3. We also exclude any resource that is less than 1 hour old
+       * This is to avoid race conditions where a blueprint was created
+         after the list of blueprints to keep was computed
+    4. The remaining resources are deleted.
+  """
+
+  util.load_kube_config(persist_config=False, context=context)
+
+  client = k8s_client.ApiClient()
+  crd_api = k8s_client.CustomObjectsApi(client)
+
+  BASE_GROUP = "cnrm.cloud.google.com"
+  CNRM_VERSION = "v1beta1"
+
+
+  # List of resources to GC
+  kinds = ["containercluster", "iampolicymember",
+           "iamserviceaccount", "containernodepool",
+           "computeaddress", "computedisk"]
+
+
+  # Mappings from resource type to list of resources
+  to_keep = collections.defaultdict(lambda: [])
+  to_delete = collections.defaultdict(lambda: [])
+
+  api_client = k8s_client.ApiClient()
+
+  # Loop over resources and identify resources to delete.
+  for kind in kinds:
+    client = cnrm_clients.CnrmClientApi(api_client, kind)
+
+    selector = "{0}=true".format(AUTO_DEPLOY_LABEL)
+    results = client.list_namespaced(namespace, label_selector=selector)
+
+    for i in results.get("items"):
+      name = i["metadata"]["name"]
+
+      if name in to_keep_names:
+        to_keep[kind].append(name)
+        continue
+
+      creation = date_parser.parse(i["metadata"]["creationTimestamp"])
+      age = datetime.datetime.now(creation.tzinfo) - creation
+      if age < datetime.timedelta(hours=1):
+        to_keep[kind].append(name)
+        logging.info("Not GC'ing %s %s; it was created too recently", kind,
+                     name)
+        continue
+
+      to_delete[kind].append(name)
+
+  for kind in kinds:
+    client = cnrm_clients.CnrmClientApi(api_client, kind)
+    for name in to_delete[kind]:
+      if dryrun:
+        logging.info("Dryrun: %s %s would be deleted", kind, name)
+      else:
+        logging.info("Deleting: %s %s", kind, name)
+        client.delete_namespaced(namespace, name, {})
+
+  for kind in kinds:
+    logging.info("Deleted %s:\n%s", kind, "\n".join(to_delete[kind]))
+    logging.info("Kept %s:\n%s", kind, "\n".join(to_keep[kind]))
+
+class Cleanup:
+  @staticmethod
+  def auto_blueprints(project, context, dryrun=True, blueprints=None): # pylint: disable=too-many-branches
+    """Cleanup auto deployed blueprints.
+
+    For auto blueprints we only want to keep the most recent N deployments.
+
+    Args:
+      project: The project that owns the deployments.
+      context: The kubernetes context to use to talk to the Cloud Config
+        Connector cluster.
+      dryrun: Defaults to True; set to False to actually delete resources.
+      blueprints: (Optional) iterator over CNRM ContainerCluster resources
+        corresponding to blueprints.
+
+    Returns:
+      blueprints_to_delete: List of deployments to delete
+      blueprints_to_keep: List of deployments to keep
+    """
+    logging.info("Cleanup auto blueprints")
+
+    # Map from blueprint version e.g. "master" to a map of blueprint names to
+    # their insert time e.g.
+    # auto_deployments["master"]["kf-vbp-abcd"] returns the creation time
+    # of blueprint "kf-vbp-abcd" which was created from the master branch
+    # of the blueprints repo.
+    auto_deployments = collections.defaultdict(lambda: {})
+
+    if not blueprints:
+      blueprints = _iter_blueprints(project, context=context)
+
+    for b in blueprints:
+      name = b["metadata"]["name"]
+      if not b["metadata"].get("creationTimestamp", None):
+        # This should not happen; all K8s objects should have a creation timestamp
+        logging.error("Cluster %s doesn't have a creation timestamp; "
+                      "skipping it", b["metadata"]["name"])
+        continue
+
+      # Use labels to identify auto-deployed instances
+      auto_deploy_label = b["metadata"].get("labels", {}).get(AUTO_DEPLOY_LABEL,
+                                                              "false")
+
+      is_auto_deploy = auto_deploy_label.lower() == "true"
+
+      if not is_auto_deploy:
+        logging.info("Cluster %s is missing the auto-deploy label",
+                     name)
+
+      # The name of the blueprint
+      kf_name = b["metadata"].get("labels", {}).get(NAME_LABEL, "")
+
+      if not kf_name:
+        logging.info("Skipping cluster %s; it is not an auto-deployed instance",
+                     name)
+        continue
+
+      if kf_name != name:
+        # TODO(jlewi): This shouldn't be happening. Hopefully this was just a
+        # temporary issue with the first couple of auto-deployed clusters I
+        # created and we can delete this code.
+        logging.error("Found cluster named %s with label kf-name=%s. The name "
+                      "will be used. This shouldn't happen; it was hopefully "
+                      "just a temporary bug in early versions of "
+                      "create_kf_from_gcp_blueprint.py that should be fixed, "
+                      "so it shouldn't happen for new instances.",
+                      name, kf_name)
+        kf_name = name
+
+      logging.info("Blueprint %s is auto deployed", kf_name)
+
+      blueprint_branch = b["metadata"]["labels"].get(BRANCH_LABEL, "unknown")
+
+      if blueprint_branch == "unknown":
+        logging.warning("Blueprint %s was missing label %s", kf_name,
+                        BRANCH_LABEL)
+
+      if kf_name in auto_deployments[blueprint_branch]:
+        continue
+
+      auto_deployments[blueprint_branch][kf_name] = (
+        date_parser.parse(b["metadata"]["creationTimestamp"]))
+
+    # Garbage collect the blueprints
+    to_keep = []
+    to_delete = []
+    for version, matched_deployments in auto_deployments.items():
+      logging.info("For version=%s found deployments:\n%s", version,
+                   "\n".join(matched_deployments.keys()))
+
+      # Sort the deployments by their creation time
+      pairs = matched_deployments.items()
+      sorted_pairs = sorted(pairs, key=lambda x: x[1])
+
+      # Keep the 3 most recent deployments
+      to_keep.extend([p[0] for p in sorted_pairs[-3:]])
+      to_delete.extend([p[0] for p in sorted_pairs[:-3]])
+
+    _delete_blueprints(project, to_keep, context=context,
+                       dryrun=dryrun)
+
+    logging.info("Finished cleanup of auto blueprints")
+
+if __name__ == "__main__":
+  logging.basicConfig(level=logging.INFO,
+                      format=('%(levelname)s|%(asctime)s'
+                              '|%(pathname)s|%(lineno)d| %(message)s'),
+                      datefmt='%Y-%m-%dT%H:%M:%S',
+                      )
+  logging.getLogger().setLevel(logging.INFO)
+  fire.Fire(Cleanup)
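The module is exposed through Fire; elsewhere in this change the Tekton task invokes it as `python -m kubeflow.testing.cleanup_blueprints auto-blueprints --project=... --context=...`. A minimal sketch of the equivalent direct call (the context name is an assumption matching the defaults wired into this change; nothing is deleted while `dryrun=True`):

```python
from kubeflow.testing.cleanup_blueprints import Cleanup

# Dry run against the CI project, talking to the management (Config Connector)
# cluster through an existing kubeconfig context named "kf-ci-management".
Cleanup.auto_blueprints(project="kubeflow-ci-deployment",
                        context="kf-ci-management",
                        dryrun=True)
```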
diff --git a/py/kubeflow/testing/cleanup_ci.py b/py/kubeflow/testing/cleanup_ci.py
index 9cdde2b3d..ba4675308 100644
--- a/py/kubeflow/testing/cleanup_ci.py
+++ b/py/kubeflow/testing/cleanup_ci.py
@@ -14,6 +14,7 @@ import yaml
 
 from kubeflow.testing import argo_client
+
 from kubeflow.testing import util
 from kubernetes import client as k8s_client
 from googleapiclient import discovery
 
@@ -1247,9 +1248,10 @@ def cleanup_clusters(args):
 
 # https://github.com/kubernetes/ingress-gce/issues/136#issuecomment-371254595
 def cleanup_all(args):
-  ops = [# Deleting deploymens should be called first because hopefully that will
+  ops = [# Deleting deployments should be called first because hopefully that will
          # cleanup all the resources associated with the deployment
         cleanup_auto_deployments,
+        cleanup_auto_blueprints,
         cleanup_deployments,
         cleanup_clusters,
         cleanup_endpoints,
@@ -1294,6 +1296,12 @@ def add_deployments_args(parser):
     "--zones", default="us-east1-d,us-central1-a", type=str,
     help="Comma separated list of zones to check.")
 
+def add_blueprint_args(parser):
+  parser.add_argument(
+    "--management_context", default="kf-ci-deployment-management",
+    help="Kubeconfig context for the management cluster used with "
+         "blueprints.")
+
 def main():
   logging.basicConfig(level=logging.INFO,
                       format=('%(levelname)s|%(asctime)s'
@@ -1370,7 +1378,6 @@
   parser_firewall.set_defaults(func=cleanup_firewall_rules)
 
-
 
   ######################################################
   # Parser for health checks
   parser_health = subparsers.add_parser(
@@ -1433,6 +1440,13 @@
   add_deployments_args(parser_ig)
   parser_ig.set_defaults(func=cleanup_instance_groups)
 
+  ######################################################
+  # Parser for auto_blueprints
+  parser_blueprints = subparsers.add_parser(
+    "blueprints", help="Cleanup blueprints")
+  add_blueprint_args(parser_blueprints)
+  parser_blueprints.set_defaults(func=cleanup_auto_blueprints)
+
   args = parser.parse_args()
 
   # Update max age
diff --git a/py/kubeflow/testing/cnrm_clients.py b/py/kubeflow/testing/cnrm_clients.py
new file mode 100644
index 000000000..d12aa6a58
--- /dev/null
+++ b/py/kubeflow/testing/cnrm_clients.py
@@ -0,0 +1,42 @@
+"""Some helper functions for working with CNRM resources."""
+from kubernetes import client as k8s_client
+
+class CnrmClientApi(k8s_client.CustomObjectsApi):
+  """A wrapper around CustomObjectsApi."""
+
+  def __init__(self, client, kind):
+    """Create the client.
+
+    Args:
+      client: K8s client
+      kind: The kind to generate the client for.
+    """
+    super(CnrmClientApi, self).__init__(client)
+
+    self.kind = kind
+    self.version = "v1beta1"
+
+    if kind in ["containercluster", "containernodepool"]:
+      self.group = "container.cnrm.cloud.google.com"
+    elif kind in ["iampolicymember", "iamserviceaccount"]:
+      self.group = "iam.cnrm.cloud.google.com"
+    elif kind in ["computeaddress", "computedisk"]:
+      self.group = "compute.cnrm.cloud.google.com"
+    else:
+      raise ValueError("No CNRM client configured for kind {0}".format(kind))
+
+    if kind[-1] != "s":
+      self.plural = kind + "s"
+    else:
+      self.plural = kind + "es"
+
+  def list_namespaced(self, namespace, **kwargs):
+    return self.list_namespaced_custom_object(
+      self.group, self.version, namespace, self.plural, **kwargs)
+
+  def delete_namespaced(self, namespace, name, body, **kwargs):
+    return self.delete_namespaced_custom_object(self.group, self.version,
+                                                namespace, self.plural, name,
+                                                body, **kwargs)
+
+  # TODO(jlewi): Add other methods as needed.
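A minimal sketch of how this wrapper is meant to be used, mirroring `_iter_blueprints` in `cleanup_blueprints.py` (the context and project names below are assumptions, not part of this change):

```python
from kubernetes import client as k8s_client

from kubeflow.testing import cnrm_clients, util

# Load credentials for the management (Config Connector) cluster.
util.load_kube_config(persist_config=False, context="kf-ci-management")

# List auto-deployed ContainerCluster objects in the CI project's namespace.
crd_api = cnrm_clients.CnrmClientApi(k8s_client.ApiClient(), "containercluster")
clusters = crd_api.list_namespaced("kubeflow-ci-deployment",
                                   label_selector="auto-deploy=true")
for c in clusters.get("items", []):
  print(c["metadata"]["name"])
```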
+ """ + + if re.match("[^-]+-[^-]+-[^-]", location): + location_type = "zone" + else: + location_type = "region" + subprocess.check_call(["gcloud", f"--project={project}", "container", + "clusters", "get-credentials", + f"--{location_type}={location}", cluster]) + + current_context = subprocess.check_output(["kubectl", "config", + "current-context"]).strip() + subprocess.check_call(["kubectl", "config", "rename-context", + current_context, name]) + + # Set the namespace + subprocess.check_call(["kubectl", "config", "set-context", "--current", + "--namespace={namespace}"]) + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO, + format=('%(levelname)s|%(asctime)s' + '|%(pathname)s|%(lineno)d| %(message)s'), + datefmt='%Y-%m-%dT%H:%M:%S', + ) + logging.getLogger().setLevel(logging.INFO) + fire.Fire(ContextCreator) diff --git a/tekton/templates/Makefile b/tekton/templates/Makefile index 8dbe09802..e8a8edc15 100644 --- a/tekton/templates/Makefile +++ b/tekton/templates/Makefile @@ -1,4 +1,9 @@ ACM_REPO_DIR=../../acm-repo +AUTO_DEPLOY_CONTEXT=kf-ci-v1 + .PHONY: hydrate hydrate: - kustomize build -o $(ACM_REPO_DIR)/namespaces/auto-deploy ./installs/auto-deploy \ No newline at end of file + kustomize build -o $(ACM_REPO_DIR)/namespaces/auto-deploy ./installs/auto-deploy + +apply: hydrate + kubectl --context=kf-ci-v1 -n auto-deploy apply -f ../../acm-repo/namespaces/auto-deploy diff --git a/tekton/templates/tasks/cleanup-kubeflow-ci.yaml b/tekton/templates/tasks/cleanup-kubeflow-ci.yaml new file mode 100644 index 000000000..89af2c42f --- /dev/null +++ b/tekton/templates/tasks/cleanup-kubeflow-ci.yaml @@ -0,0 +1,68 @@ +# A Tekton task to cleanup the kubeflow-ci-project +apiVersion: tekton.dev/v1alpha1 +kind: Task +metadata: + name: cleanup-kubeflow-ci + # TODO(jlewi): Should we use a ClusterTask + namespace: tektoncd + annotations: + sidecar.istio.io/inject: "false" +spec: + inputs: + params: + - name: name + type: string + description: The name for the Kubeflow deployment + default: "kf-vbp-{uid}" + - name: project + type: string + description: The project to clean up. + default: "kubeflow-ci-deployment" + - name: management-cluster-name + type: string + description: The name of the management cluster. 
+ default: "kf-ci-management" + - name: management-project + type: string + description: The project containing the management cluster + default: kubeflow-ci + - name: management-location + type: string + description: The location of the management cluster + default: us-central1 + resources: + - name: testing-repo + type: git + description: The GitHub repo containing kubeflow testing scripts + steps: + - name: create-context + image: gcr.io/kubeflow-ci/test-worker-py3@sha256:b679ce5d7edbcc373fd7d28c57454f4f22ae987f200f601252b6dcca1fd8823b + command: + - python + - -m + - kubeflow.testing.create_context + - create + - --name=$(inputs.params.management-project) + - --project=$(inputs.params.management-project) + - --location=$(inputs.params.management-location) + - --cluster=$(inputs.params.management-cluster-name) + - --namespace=$(inputs.params.project) + env: + - name: KUBECONFIG + value: /workspace/kubeconfig + - name: PYTHONPATH + value: /workspace/$(inputs.resources.testing-repo.name)/py + - name: cleanup-ci + image: gcr.io/kubeflow-ci/test-worker-py3@sha256:b679ce5d7edbcc373fd7d28c57454f4f22ae987f200f601252b6dcca1fd8823b + command: + - python + - -m + - kubeflow.testing.cleanup_blueprints + - auto-blueprints + - --project=$(inputs.params.project) + - --context=$(inputs.params.management-project) + env: + - name: KUBECONFIG + value: /workspace/kubeconfig + - name: PYTHONPATH + value: /workspace/$(inputs.resources.testing-repo.name)/py diff --git a/tekton/templates/tasks/kustomization.yaml b/tekton/templates/tasks/kustomization.yaml index 83bc88891..08bf1999d 100644 --- a/tekton/templates/tasks/kustomization.yaml +++ b/tekton/templates/tasks/kustomization.yaml @@ -4,5 +4,6 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization resources: +- cleanup-kubeflow-ci.yaml - deploy-gcp-blueprint.yaml - notebook-test-task.yaml \ No newline at end of file diff --git a/test-infra/README.md b/test-infra/README.md new file mode 100644 index 000000000..13d92ec93 --- /dev/null +++ b/test-infra/README.md @@ -0,0 +1,5 @@ +# Test-infra + +Declarative configurations for various pieces of our CI infrastructure. + +* **ks_app** Some lingering ksonnet configurations for various jobs. \ No newline at end of file diff --git a/test-infra/cleanup/Makefile b/test-infra/cleanup/Makefile new file mode 100644 index 000000000..2331936b0 --- /dev/null +++ b/test-infra/cleanup/Makefile @@ -0,0 +1,11 @@ +ACM_REPO_DIR=../../acm-repo +AUTO_DEPLOY_CONTEXT=kf-ci-v1 + +.PHONY: hydrate +hydrate: + # Remove old config maps. + rm -f $(ACM_REPO_DIR)/namespaces/auto-deploy/~g_v1_configmap_cleanup-config-* + kustomize build -o $(ACM_REPO_DIR)/namespaces/auto-deploy ./ + +apply: hydrate + kubectl --context=kf-ci-v1 -n auto-deploy apply -f ../../acm-repo/namespaces/auto-deploy diff --git a/test-infra/cleanup/README.md b/test-infra/cleanup/README.md new file mode 100644 index 000000000..b9d8b50c2 --- /dev/null +++ b/test-infra/cleanup/README.md @@ -0,0 +1,16 @@ +# K8s resources to cleanup test infrastructure + +This directory contains K8s manifests to cleanup the Kubeflow CI. + +Per [kubeflow/testing#654](https://github.com/kubeflow/testing/issues/654) we are +in the process of: + +* Migrating from using K8s Jobs to using Tekton +* Using Kustomize as opposed to ksonnet +* Using GitOps(ACM) to keep the test infra up to date with the latest configs. + +This directory contains a kustomize manifest for a cron-job to submit +a Tekton Pipeline to cleanup blueprint auto-deployments. 
diff --git a/test-infra/cleanup/cleanup-blueprints-pipeline.yaml b/test-infra/cleanup/cleanup-blueprints-pipeline.yaml
new file mode 100644
index 000000000..61a7cd649
--- /dev/null
+++ b/test-infra/cleanup/cleanup-blueprints-pipeline.yaml
@@ -0,0 +1,46 @@
+# A Tekton PipelineRun to do a one-off cleanup
+# of the Kubeflow auto-deployed blueprints.
+#
+apiVersion: tekton.dev/v1alpha1
+kind: PipelineRun
+metadata:
+  generateName: cleanup-blueprints-
+  namespace: auto-deploy
+spec:
+  # TODO(jlewi): Override any parameters?
+  #params: {}
+  resources:
+  - name: testing-repo
+    resourceSpec:
+      type: git
+      params:
+      # TODO(jlewi): Switch to master on kubeflow/gcp-blueprints
+      - name: revision
+        value: gcp_blueprint
+      - name: url
+        value: https://github.com/jlewi/testing.git
+  # Need to use a KSA with appropriate GSA
+  serviceAccountName: default-editor
+  pipelineSpec:
+    params:
+    - name: management-cluster-name
+      type: string
+      description: The name of the management cluster.
+      default: "kf-ci-management"
+    resources:
+    - name: testing-repo
+      type: git
+    tasks:
+    - name: cleanup-blueprints
+      # TODO(jlewi): expose other parameters? Right now
+      # we are just relying on the defaults defined in the task
+      params:
+      - name: management-cluster-name
+        value: "$(params.management-cluster-name)"
+      resources:
+        inputs:
+        - name: testing-repo
+          resource: testing-repo
+      taskRef:
+        name: cleanup-kubeflow-ci
+        kind: namespaced
\ No newline at end of file
diff --git a/test-infra/cleanup/cleanup-ci-cronjob.yaml b/test-infra/cleanup/cleanup-ci-cronjob.yaml
new file mode 100644
index 000000000..d1b86234a
--- /dev/null
+++ b/test-infra/cleanup/cleanup-ci-cronjob.yaml
@@ -0,0 +1,56 @@
+# A CronJob to regularly run the cleanup pipeline.
+#
+# The CronJob uses kubectl to launch a Tekton PipelineRun.
+# The PipelineRun is provided via a ConfigMap.
+apiVersion: batch/v1beta1
+kind: CronJob
+metadata:
+  labels:
+    app: cleanup-ci-kubeflow-ci-deployment
+  name: cleanup-ci-kubeflow-ci-deployment
+spec:
+  concurrencyPolicy: Forbid
+  failedJobsHistoryLimit: 1
+  jobTemplate:
+    metadata:
+      creationTimestamp: null
+      labels:
+        job: cleanup-kubeflow-ci-deployment
+      annotations:
+        sidecar.istio.io/inject: "false"
+    spec:
+      template:
+        metadata:
+          labels:
+            job: cleanup-kubeflow-ci-deployment
+          annotations:
+            sidecar.istio.io/inject: "false"
+        spec:
+          containers:
+          - command:
+            - kubectl
+            - create
+            - -f
+            - /configs/cleanup-blueprints-pipeline.yaml
+            image: gcr.io/kubeflow-ci/test-worker-py3@sha256:b679ce5d7edbcc373fd7d28c57454f4f22ae987f200f601252b6dcca1fd8823b
+            imagePullPolicy: IfNotPresent
+            name: create-pipeline
+            terminationMessagePath: /dev/termination-log
+            terminationMessagePolicy: File
+            volumeMounts:
+            - mountPath: /configs
+              name: cleanup-config
+          restartPolicy: OnFailure
+          # Need to use a service account bound to GSA
+          serviceAccountName: default-editor
+          volumes:
+          - name: cleanup-config
+            configMap:
+              # Kustomize will automatically replace the name with the unique name given
+              # to the configmap based on the config contents.
+              name: cleanup-config
+  schedule: 0 */2 * * *
+  successfulJobsHistoryLimit: 3
+  suspend: false
+status:
+  lastScheduleTime: "2020-05-07T14:00:00Z"
diff --git a/test-infra/cleanup/kustomization.yaml b/test-infra/cleanup/kustomization.yaml
new file mode 100644
index 000000000..0757fac36
--- /dev/null
+++ b/test-infra/cleanup/kustomization.yaml
@@ -0,0 +1,11 @@
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+# Currently run in auto-deploy namespace to have appropriate credentials
+namespace: auto-deploy
+resources:
+- cleanup-ci-cronjob.yaml
+configMapGenerator:
+- name: cleanup-config
+  files:
+  # key will be name of the file
+  - ./cleanup-blueprints-pipeline.yaml
\ No newline at end of file
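A note on the generated name: kustomize's `configMapGenerator` appends a hash of the ConfigMap contents to its name (hence `cleanup-config-4bm54d2bmb` in the hydrated output) and rewrites references such as the CronJob's volume to match, which is why the `hydrate` target above deletes stale `~g_v1_configmap_cleanup-config-*` files before rebuilding.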