diff --git a/images/Dockerfile.py3.aws b/images/Dockerfile.py3.aws new file mode 100644 index 000000000..686b73989 --- /dev/null +++ b/images/Dockerfile.py3.aws @@ -0,0 +1,93 @@ +# Build the docker image used to run the scripts +# to continuously update our docker files. +# +# The context for this docker file should be the root of the kubeflow/testing repository. +FROM ubuntu:18.04 + +RUN apt-get update -y && \ + apt-get install -y curl git python3.8 python3-pip wget jq && \ + ln -sf /usr/bin/python3.8 /usr/bin/python + +RUN python3.8 -m pip install \ + filelock \ + fire \ + google-api-python-client \ + google-cloud \ + google-cloud-storage \ + junit-xml \ + # See https://github.com/kubeflow/gcp-blueprints/issues/52#issuecomment-645446088 + # our libs seem to break with 11.0.0 + kubernetes==9.0.0 \ + lint \ + oauth2client \ + pytest==5.4 \ + pytest-timeout==1.4 \ + python-dateutil \ + retrying \ + watchdog \ + awscli \ + boto3 + +# Install go +RUN cd /tmp && \ + wget -O /tmp/go.tar.gz https://dl.google.com/go/go1.14.2.linux-amd64.tar.gz && \ + tar -C /usr/local -xzf go.tar.gz + +# Install the hub CLI for git +RUN cd /tmp && \ + curl -LO https://github.com/github/hub/releases/download/v2.13.0/hub-linux-amd64-2.13.0.tgz && \ + tar -xvf hub-linux-amd64-2.13.0.tgz && \ + mv hub-linux-amd64-2.13.0 /usr/local && \ + ln -sf /usr/local/hub-linux-amd64-2.13.0/bin/hub /usr/local/bin/hub + +RUN export KUSTOMIZE_VERSION=3.2.0 && \ + cd /tmp && \ + curl -LO https://github.com/kubernetes-sigs/kustomize/releases/download/v${KUSTOMIZE_VERSION}/kustomize_${KUSTOMIZE_VERSION}_linux_amd64 && \ + mv kustomize_${KUSTOMIZE_VERSION}_linux_amd64 /usr/local/bin/kustomize && \ + chmod a+x /usr/local/bin/kustomize + +# Install kubectl +RUN curl -LO https://amazon-eks.s3.us-west-2.amazonaws.com/1.17.9/2020-08-04/bin/linux/amd64/kubectl && \ + mv kubectl /usr/local/bin && \ + chmod a+x /usr/local/bin/kubectl + +# Install aws-iam-authenticator +RUN curl -o aws-iam-authenticator https://amazon-eks.s3.us-west-2.amazonaws.com/1.17.9/2020-08-04/bin/linux/amd64/aws-iam-authenticator && \ + mv aws-iam-authenticator /usr/local/bin && \ + chmod a+x /usr/local/bin/aws-iam-authenticator + +# Install eksctl +RUN curl --silent --location "https://github.com/weaveworks/eksctl/releases/latest/download/eksctl_$(uname -s)_amd64.tar.gz" | tar xz -C /tmp && \ + mv /tmp/eksctl /usr/local/bin && \ + chmod a+x /usr/local/bin/eksctl + +# Install ks +RUN curl --silent --location https://github.com/ksonnet/ksonnet/releases/download/v0.13.1/ks_0.13.1_linux_amd64.tar.gz | tar xz -C /tmp && \ + mv /tmp/ks_0.13.1_linux_amd64/ks /usr/local/bin/ks-13 && \ + chmod a+x /usr/local/bin/ks-13 + +# Create go symlinks +RUN ln -sf /usr/local/go/bin/go /usr/local/bin && \ + ln -sf /usr/local/go/bin/gofmt /usr/local/bin && \ + ln -sf /usr/local/go/bin/godoc /usr/local/bin + +RUN go get -u github.com/jstemmer/go-junit-report + +COPY ./images/checkout.sh /usr/local/bin +COPY ./images/checkout_repos.sh /usr/local/bin +RUN chmod a+x /usr/local/bin/checkout* + +COPY ./images/run_workflows.sh /usr/local/bin +RUN chmod a+x /usr/local/bin/run_workflows.sh + +# AWS BASH SCRIPTS +COPY ./images/aws-scripts/*.sh /usr/local/bin/ +RUN chmod a+x /usr/local/bin/*.sh + +ENV PYTHONPATH /src/kubeflow/testing/py + +ENV CLOUD_PROVIDER aws + +ENV PATH=/root/go/bin:${PATH} + +ENTRYPOINT ["/usr/local/bin/run_workflows.sh"] \ No newline at end of file diff --git a/images/aws-scripts/OWNERS b/images/aws-scripts/OWNERS new file mode 100644 index 000000000..cbc5a1109 --- /dev/null +++ 
b/images/aws-scripts/OWNERS @@ -0,0 +1,3 @@ +approvers: +- Jeffwan +- PatrickXYS \ No newline at end of file diff --git a/images/aws-scripts/check-load-balancer-status.sh b/images/aws-scripts/check-load-balancer-status.sh new file mode 100755 index 000000000..70d4b0d61 --- /dev/null +++ b/images/aws-scripts/check-load-balancer-status.sh @@ -0,0 +1,49 @@ +#!/bin/bash + +# Copyright 2018 The Kubernetes Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This shell script is used to check if all deployments are RUNNING + +set -o errexit +set -o nounset +set -o pipefail + +EKS_CLUSTER_NAME="${CLUSTER_NAME}" + +aws eks update-kubeconfig --name=$EKS_CLUSTER_NAME + +echo "Start Fetching Ingress IP Address" + +# Retry 10 times w/ 30 seconds interval +retry_times=0 +retry_limit=10 +while [ "$retry_times" -lt "$retry_limit" ] +do + echo "See if we can fetch ingress" + ingress_ip=$(kubectl get ingress istio-ingress -n istio-system -o json | jq -r '.status.loadBalancer.ingress[0].hostname') + if [ ${#ingress_ip} -eq 0 ] ; + then + sleep 30 + echo "Retrying Fetching Ingress IP Address" + else + echo "The Kubeflow Deployment succeeded" + exit 0 + fi + + retry_times=$((retry_times+1)) +done + +echo "Kubeflow Deployment Status: ERROR" +exit 64 \ No newline at end of file diff --git a/images/aws-scripts/create-eks-cluster.sh b/images/aws-scripts/create-eks-cluster.sh new file mode 100755 index 000000000..8b4d9964b --- /dev/null +++ b/images/aws-scripts/create-eks-cluster.sh @@ -0,0 +1,36 @@ +#!/bin/bash + +# Copyright 2018 The Kubernetes Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This shell script is used to build an EKS cluster from our argo workflow + +set -o errexit +set -o nounset +set -o pipefail + +EKS_CLUSTER_NAME="${CLUSTER_NAME}" + +# Create EKS Cluster +# TODO (PatrickXYS): Need to determine which NG template we need +eksctl create cluster \ +--name $EKS_CLUSTER_NAME \ +--version ${EKS_CLUSTER_VERSION:-"1.17"} \ +--region ${AWS_REGION:-"us-west-2"} \ +--nodegroup-name linux-nodes \ +--node-type ${EKS_NODE_TYPE:-"m5.xlarge"} \ +--nodes ${DESIRED_NODE:-"2"} \ +--nodes-min ${MIN_NODE:-"1"} \ +--nodes-max ${MAX_NODE:-"4"} \ +--managed diff --git a/images/aws-scripts/delete-eks-cluster.sh b/images/aws-scripts/delete-eks-cluster.sh new file mode 100755 index 000000000..279b3de96 --- /dev/null +++ b/images/aws-scripts/delete-eks-cluster.sh @@ -0,0 +1,26 @@ +#!/bin/bash + +# Copyright 2018 The Kubernetes Authors. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This shell script is used to delete an EKS cluster from our argo workflow + +set -o errexit +set -o nounset +set -o pipefail + +EKS_CLUSTER_NAME="${CLUSTER_NAME}" + +# Delete EKS Cluster +eksctl delete cluster $EKS_CLUSTER_NAME \ No newline at end of file diff --git a/images/aws-scripts/deploy-kubeflow.sh b/images/aws-scripts/deploy-kubeflow.sh new file mode 100755 index 000000000..a5be7f5e4 --- /dev/null +++ b/images/aws-scripts/deploy-kubeflow.sh @@ -0,0 +1,55 @@ +#!/bin/bash + +# Copyright 2018 The Kubernetes Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This shell script is used to deploy kubeflow by kfctl + +set -o errexit +set -o nounset +set -o pipefail + +EKS_CLUSTER_NAME="${CLUSTER_NAME}" +EKS_NAMESPACE_NAME="${EKS_NAMESPACE}" + +# Load kubeconfig +aws eks update-kubeconfig --name=$EKS_CLUSTER_NAME + +# Fetch v1.1-branch kfctl +wget https://github.com/PatrickXYS/kfctl/releases/download/test1/kfctl_v1.1.0-2-g08ee6e4_linux.tar.gz -O kfctl.tar.gz +tar -xvf kfctl.tar.gz + +# Add kfctl to PATH, to make the kfctl binary easier to use. +export PATH=$PATH:"$PWD:kfctl" + +echo "kfctl version: " +kfctl version + +### Workaround to fix issue +## msg="Encountered error applying application bootstrap: (kubeflow.error): Code 500 with message: Apply.Run +## : error when creating \"/tmp/kout927048001\": namespaces \"kubeflow-test-infra\" not found" filename="kustomize/kustomize.go:266" +kubectl create namespace $EKS_NAMESPACE_NAME +### + +# Use the following kfctl configuration file for the AWS setup without authentication: +export CONFIG_URI="https://raw.githubusercontent.com/kubeflow/manifests/v1.1-branch/kfdef/kfctl_aws.v1.1.0.yaml" + +# Create the directory you want to store deployment, this has to be ${EKS_CLUSTER_NAME} +mkdir ${EKS_CLUSTER_NAME} && cd ${EKS_CLUSTER_NAME} + +# Download your configuration files, so that you can customize the configuration before deploying Kubeflow. +wget -O kfctl_aws.yaml $CONFIG_URI + +# Deploy Kubeflow +kfctl apply -V -f kfctl_aws.yaml diff --git a/images/aws-scripts/uninstall-kubeflow.sh b/images/aws-scripts/uninstall-kubeflow.sh new file mode 100644 index 000000000..d19c06b3a --- /dev/null +++ b/images/aws-scripts/uninstall-kubeflow.sh @@ -0,0 +1,42 @@ +#!/bin/bash + +# Copyright 2018 The Kubernetes Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This shell script is used to uninstall kubeflow by kfctl + +set -o errexit +set -o nounset +set -o pipefail + +EKS_CLUSTER_NAME="${CLUSTER_NAME}" +EKS_NAMESPACE_NAME="${EKS_NAMESPACE}" + +# Load kubeconfig +aws eks update-kubeconfig --name=$EKS_CLUSTER_NAME + +# Add kfctl to PATH, to make the kfctl binary easier to use. +export PATH=$PATH:"$PWD:kfctl" + +echo "kfctl version: " +kfctl version + +# Cd directory ${EKS_CLUSTER_NAME} +cd ${EKS_CLUSTER_NAME} + +# Print YAML file +cat kfctl_aws.yaml + +# Uninstall Kubeflow +kfctl delete -V -f kfctl_aws.yaml \ No newline at end of file diff --git a/images/run_workflows.sh b/images/run_workflows.sh index 4c0bcc4a0..ccdbf75ac 100644 --- a/images/run_workflows.sh +++ b/images/run_workflows.sh @@ -9,10 +9,25 @@ set -ex /usr/local/bin/checkout.sh /src # Trigger a workflow -python -m kubeflow.testing.run_e2e_workflow \ - --project=kubeflow-ci \ - --zone=us-east1-d \ - --cluster=kubeflow-testing \ - --bucket=kubernetes-jenkins \ - --config_file=/src/${REPO_OWNER}/${REPO_NAME}/prow_config.yaml \ - --repos_dir=/src +if [ -z "$CLOUD_PROVIDER" ] || [ "$CLOUD_PROVIDER" == "gcp" ] +then + python -m kubeflow.testing.run_e2e_workflow \ + --project=kubeflow-ci \ + --zone=us-east1-d \ + --cluster=kubeflow-testing \ + --bucket=kubernetes-jenkins \ + --config_file=/src/${REPO_OWNER}/${REPO_NAME}/prow_config.yaml \ + --repos_dir=/src +else + if [[ "$CLOUD_PROVIDER" == "aws" ]] + then + echo "Triggering AWS Argo Workflows" + python -m kubeflow.testing.run_e2e_workflow \ + --cluster=${AWS_EKS_CLUSTER:-"kubeflow-shared-test-infra-poc-argo"} \ + --bucket=${ARTIFACTS_S3_BUCKET:-"aws-kubernetes-jenkins"} \ + --config_file=/src/${REPO_OWNER}/${REPO_NAME}/prow_config.yaml \ + --repos_dir=/src \ + --cloud_provider=aws \ + --aws_region=${AWS_DEFAULT_REGION:-"us-west-2"} + fi +fi \ No newline at end of file diff --git a/py/kubeflow/testing/cloudprovider/aws/OWNERS b/py/kubeflow/testing/cloudprovider/aws/OWNERS new file mode 100644 index 000000000..cbc5a1109 --- /dev/null +++ b/py/kubeflow/testing/cloudprovider/aws/OWNERS @@ -0,0 +1,3 @@ +approvers: +- Jeffwan +- PatrickXYS \ No newline at end of file diff --git a/py/kubeflow/testing/cloudprovider/aws/__init__.py b/py/kubeflow/testing/cloudprovider/aws/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/py/kubeflow/testing/cloudprovider/aws/prow_artifacts.py b/py/kubeflow/testing/cloudprovider/aws/prow_artifacts.py new file mode 100644 index 000000000..0ff51a715 --- /dev/null +++ b/py/kubeflow/testing/cloudprovider/aws/prow_artifacts.py @@ -0,0 +1,371 @@ +"""Script to create artifacts needed by Gubernator. + +For reference see: +https://github.com/kubernetes/test-infra/tree/master/gubernator +""" +import argparse +import logging +import json +import os +import six +import time +from kubeflow.testing import test_util +import boto3 + +from kubeflow.testing.cloudprovider.aws import util as aws_util + +# The default bucket where we should upload artifacts to in +# prow. 
Currently AWS test-grid and spyglass are looking at the aws-kubernetes-jenkins bucket +AWS_PROW_RESULTS_BUCKET = "aws-kubernetes-jenkins" + +# TODO(jlewi): Replace create_finished in tensorflow/k8s/py/prow.py with this +# version. We should do that when we switch tensorflow/k8s to use Argo instead +# of Airflow. +def create_started(ui_urls): + """Return a string containing the contents of started.json for gubernator. + + ui_urls: Dictionary of workflow name to URL corresponding to the Argo UI + for the workflows launched. + """ + # See: + # https://github.com/kubernetes/test-infra/tree/master/gubernator#job-artifact-gcs-layout + # For a list of fields expected by gubernator + started = { + "timestamp": int(time.time()), + "repos": { + }, + "metadata": { + }, + } + + repo_owner = os.getenv("REPO_OWNER", "") + repo_name = os.getenv("REPO_NAME", "") + + if repo_owner: + sha = os.getenv("PULL_PULL_SHA", "") + if not sha: + # It's a post-submit job. + sha = os.getenv("PULL_BASE_SHA", "") + + started["repos"][repo_owner + "/" + repo_name] = sha + + PULL_REFS = os.getenv("PULL_REFS", "") + if PULL_REFS: + started["pull"] = PULL_REFS + + if six.PY3: + items = ui_urls.items() + else: + items = ui_urls.iteritems() + + for n, v in items: + started["metadata"][n + "-ui"] = v + return json.dumps(started) + + +# TODO(jlewi): Replace create_finished in tensorflow/k8s/py/prow.py with this +# version. We should do that when we switch tensorflow/k8s to use Argo instead +# of Airflow. +def create_finished(success, workflow_phase, ui_urls): + """Create a string containing the contents for finished.json. + + Args: + success: Bool indicating whether the workflow succeeded or not. + workflow_phase: Dictionary of workflow name to phase. + ui_urls: Dictionary of workflow name to URL corresponding to the Argo UI + for the workflows launched. + """ + if success: + result = "SUCCESS" + else: + result = "FAILED" + finished = { + "timestamp": int(time.time()), + "result": result, + # Dictionary of extra key value pairs to display to the user. + # TODO(jlewi): Perhaps we should add the GCR path of the Docker image + # we are running in. We'd have to plumb this in from bootstrap. + "metadata": { + }, + } + # kettle (https://github.com/kubernetes/test-infra/tree/master/kettle) expects + # to get commit information in finished["metadata"]["repos"]. + # We leverage kettle to upload kubeflow test logs into bigquery.
+ PULL_REFS = os.getenv("PULL_REFS", "") + repo_owner = os.getenv("REPO_OWNER", "") + repo_name = os.getenv("REPO_NAME", "") + if repo_owner and PULL_REFS: + finished["metadata"]["repos"] = {} + finished["metadata"]["repos"][repo_owner + "/" + repo_name] = PULL_REFS + + names = set() + names.update(workflow_phase.keys()) + names.update(ui_urls.keys()) + for n in names: + finished["metadata"][n + "-phase"] = workflow_phase.get(n, "") + finished["metadata"][n + "-ui"] = ui_urls.get(n, "") + return json.dumps(finished) + + +def create_finished_file_s3(bucket, success, workflow_phase, ui_urls): + """Create the finished file in S3 for gubernator.""" + contents = create_finished(success, workflow_phase, ui_urls) + + target = os.path.join(get_s3_dir(bucket), "finished.json") + aws_util.upload_to_s3(contents, target, "finished.json") + + +def get_s3_dir(bucket): + """Return the s3 directory for this job.""" + pull_number = os.getenv("PULL_NUMBER") + repo_owner = os.getenv("REPO_OWNER") + repo_name = os.getenv("REPO_NAME") + job_name = os.getenv("JOB_NAME") + job_type = os.getenv("JOB_TYPE") + + # Based on the prow docs the variable is BUILD_ID + # https://github.com/kubernetes/test-infra/blob/45246b09ed105698aa8fb928b7736d14480def29/prow/jobs.md#job-environment-variables + # But it looks like the original version of this code was using BUILD_NUMBER. + # BUILD_NUMBER is now deprecated. + # https://github.com/kubernetes/test-infra/blob/master/prow/ANNOUNCEMENTS.md + # In an effort to be defensive we try BUILD_ID and fall back to BUILD_NUMBER + build = os.getenv("BUILD_ID") + if not build: + logging.warning("BUILD_ID not set; trying BUILD_NUMBER; BUILD_NUMBER is deprecated") + build = os.getenv("BUILD_NUMBER") + + if job_type == "presubmit": + output = ("s3://{bucket}/pr-logs/pull/{owner}_{repo}/" + "{pull_number}/{job}/{build}").format( + bucket=bucket, + owner=repo_owner, repo=repo_name, + pull_number=pull_number, + job=os.getenv("JOB_NAME"), + build=build) + elif job_type == "postsubmit": + # It is a postsubmit job + output = ("s3://{bucket}/logs/{owner}_{repo}/" + "{job}/{build}").format( + bucket=bucket, owner=repo_owner, + repo=repo_name, job=job_name, + build=build) + else: + # It's a periodic job + output = ("s3://{bucket}/logs/{job}/{build}").format( + bucket=bucket, + job=job_name, + build=build) + + return output + + +def copy_artifacts_to_s3(args): + """Sync artifacts to S3.""" + # S3 layout is defined here: + # Example S3 layout: + # https://github.com/kubernetes/test-infra/tree/master/gubernator#job-artifact-gcs-layout + + output = get_s3_dir(args.bucket) + + if args.suffix: + logging.info("Renaming all artifact files to include %s", args.suffix) + for dirpath, _, files in os.walk(args.artifacts_dir): + for filename in files: + full_path = os.path.join(dirpath, filename) + + name, ext = os.path.splitext(filename) + new_name = "{0}-{1}{2}".format(name, args.suffix, ext) + new_path = os.path.join(dirpath, new_name) + logging.info("Rename %s to %s", full_path, new_path) + os.rename(full_path, new_path) + aws_util.run(["aws", "s3", "sync", args.artifacts_dir, output]) + + +def create_pr_symlink_s3(args): + """Create a 'symlink' in S3 pointing at the results for a PR. + + This is a null op if PROW environment variables indicate this is not a PR + job.
+ """ + s3 = boto3.resource('s3') + # S3 layout is defined here: + # https://github.com/kubernetes/test-infra/tree/master/gubernator#job-artifact-gcs-layout + pull_number = os.getenv("PULL_NUMBER") + if not pull_number: + # Symlinks are only created for pull requests. + return + + path = "pr-logs/directory/{job}/{build}.txt".format( + job=os.getenv("JOB_NAME"), build=os.getenv("BUILD_NUMBER")) + + pull_number = os.getenv("PULL_NUMBER") + + source = aws_util.to_s3_uri(args.bucket, path) + target = get_s3_dir(args.bucket) + logging.info("Creating symlink %s pointing to %s", source, target) + + file_name = "{build}.txt".format(build=os.getenv("BUILD_NUMBER")) + with open(file_name, "w+") as data: + data.write(target) + s3.meta.client.upload_file(file_name, args.bucket, path) + + +def check_no_errors_s3(s3_client, artifacts_dir): + """Check that all the XML files exist and there were no errors. + Args: + s3_client: The S3 client. + artifacts_dir: The directory where artifacts should be stored. + Returns: + True if there were no errors and false otherwise. + """ + bucket, prefix = aws_util.split_s3_uri(artifacts_dir) + no_errors = True + + junit_objects = s3_client.list_objects(Bucket=bucket, Prefix=os.path.join(prefix, "junit")) + + if "Contents" not in junit_objects.keys(): + return no_errors + + for b in junit_objects["Contents"]: + full_path = aws_util.to_s3_uri(bucket, b["Key"]) + if not os.path.splitext(b["Key"])[-1] == ".xml": + logging.info("Skipping %s; not an xml file", full_path) + continue + logging.info("Checking %s", full_path) + tmp_file = "/tmp/junit.xml" + s3_client.download_file(bucket, b["Key"], tmp_file) + with open(tmp_file) as f: + xml_contents = f.read() + + if test_util.get_num_failures(xml_contents) > 0: + logging.info("Test failures in %s", full_path) + no_errors = False + + return no_errors + +def finalize_prow_job_to_s3(bucket, workflow_success, workflow_phase, ui_urls): + """Finalize a prow job. + + Finalizing a PROW job consists of determining the status of the + prow job by looking at the junit files and then creating finished.json. + + Args + bucket: The S3 bucket where results are stored. + workflow_success: Bool indicating whether the job should be considered succeeded or failed. + workflow_phase: Dictionary of workflow name to phase the workflow is in. + ui_urls: Dictionary of workflow name to URL corresponding to the Argo UI + for the workflows launched. + Returns: + test_success: Bool indicating whether all tests succeeded. + """ + # logic for finalizing prow jon to S3 + s3_client = boto3.client("s3") + + output_dir = get_s3_dir(bucket) + artifacts_dir = os.path.join(output_dir, "artifacts") + + # If the workflow failed then we will mark the prow job as failed. + # We don't need to check the junit files for test failures because we + # already know it failed; furthermore we can't rely on the junit files + # if the workflow didn't succeed because not all junit files might be there. 
+ test_success = True + if workflow_success: + test_success = check_no_errors_s3(s3_client, artifacts_dir) + else: + test_success = False + + create_finished_file_s3(bucket, test_success, workflow_phase, ui_urls) + + return test_success + + +def main(unparsed_args=None): # pylint: disable=too-many-locals + logging.getLogger().setLevel(logging.INFO) # pylint: disable=too-many-locals + # create the top-level parser + parser = argparse.ArgumentParser( + description="Create prow artifacts.") + + parser.add_argument( + "--artifacts_dir", + default="", + type=str, + help="Directory to use for all the gubernator artifacts.") + + subparsers = parser.add_subparsers() + + ############################################################################# + # Copy artifacts to S3. + parser_copy = subparsers.add_parser( + "copy_artifacts_to_s3", help="Copy the artifacts.") + + parser_copy.add_argument( + "--bucket", + default=AWS_PROW_RESULTS_BUCKET, + type=str, + help="S3 Bucket to copy the artifacts to.") + + parser_copy.add_argument( + "--suffix", + default="", + type=str, + help=("Optional if supplied add this suffix to the names of all artifact " + "files before copying them to the S3 bucket.")) + + parser_copy.set_defaults(func=copy_artifacts_to_s3) + + ############################################################################# + # Create the pr symlink S3. + parser_link = subparsers.add_parser( + "create_pr_symlink_s3", help="Create a symlink pointing at PR output dir in S3; null " + "op if prow job is not a presubmit job.") + + parser_link.add_argument( + "--bucket", + default=AWS_PROW_RESULTS_BUCKET, + type=str, + help="S3 Bucket to copy the artifacts to") + + parser_link.set_defaults(func=create_pr_symlink_s3) + + ############################################################################# + # Process the command line arguments. + + # Parse the args + args = parser.parse_args(args=unparsed_args) + + # Setup a logging file handler. This way we can upload the log outputs + # to gubernator. + root_logger = logging.getLogger() + + test_log = os.path.join(os.path.join(args.artifacts_dir, "artifacts"), + "logs", "prow_artifacts." + args.func.__name__ + + ".log") + if not os.path.exists(os.path.dirname(test_log)): + try: + os.makedirs(os.path.dirname(test_log)) + # Ignore OSError because sometimes another process + # running in parallel creates this directory at the same time + except OSError: + pass + + + file_handler = logging.FileHandler(test_log) + root_logger.addHandler(file_handler) + # We need to explicitly set the formatter because it will not pick up + # the BasicConfig. 
+ formatter = logging.Formatter(fmt=("%(levelname)s|%(asctime)s" + "|%(pathname)s|%(lineno)d| %(message)s"), + datefmt="%Y-%m-%dT%H:%M:%S") + file_handler.setFormatter(formatter) + logging.info("Logging to %s", test_log) + + args.func(args) + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO, + format=('%(levelname)s|%(asctime)s' + '|%(pathname)s|%(lineno)d| %(message)s'), + datefmt='%Y-%m-%dT%H:%M:%S', + ) + logging.getLogger().setLevel(logging.INFO) + main() \ No newline at end of file diff --git a/py/kubeflow/testing/cloudprovider/aws/util.py b/py/kubeflow/testing/cloudprovider/aws/util.py new file mode 100644 index 000000000..241428acd --- /dev/null +++ b/py/kubeflow/testing/cloudprovider/aws/util.py @@ -0,0 +1,181 @@ +"""Utilities used by our python scripts for building and releasing.""" +import datetime +import logging +import os +import re +import six +import subprocess +import time +import yaml + +from kubernetes.config import kube_config +from kubernetes.client import configuration as kubernetes_configuration + +import boto3 + + +def run(command, + cwd=None, + env=None, + polling_interval=datetime.timedelta(seconds=1)): + """Run a subprocess. + + Any subprocess output is emitted through the logging modules. + + Returns: + output: A string containing the output. + """ + logging.info("Running: %s \ncwd=%s", " ".join(command), cwd) + + if not env: + env = os.environ + else: + keys = sorted(env.keys()) + + lines = [] + for k in keys: + lines.append("{0}={1}".format(k, env[k])) + logging.info("Running: Environment:\n%s", "\n".join(lines)) + + process = subprocess.Popen( + command, cwd=cwd, env=env, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + + logging.info("Subprocess output:\n") + output = [] + while process.poll() is None: + process.stdout.flush() + for line in iter(process.stdout.readline, b''): + if six.PY2: + line = line.strip() + else: + line = line.decode().strip() + + output.append(line) + logging.info(line) + + time.sleep(polling_interval.total_seconds()) + + process.stdout.flush() + for line in iter(process.stdout.readline, b''): + if six.PY2: + line = line.strip() + else: + line = line.decode().strip() + output.append(line) + logging.info(line) + + if process.returncode != 0: + raise subprocess.CalledProcessError( + process.returncode, "cmd: {0} exited with code {1}".format( + " ".join(command), process.returncode), "\n".join(output)) + + return "\n".join(output) + + +def to_s3_uri(bucket, path): + """Convert bucket and path to a S3 URI.""" + return "s3://" + os.path.join(bucket, path) + + +S3_REGEX = re.compile("s3://([^/]*)(/.*)?") + + +def split_s3_uri(s3_uri): + """Split a S3 URI into bucket and path.""" + m = S3_REGEX.match(s3_uri) + bucket = m.group(1) + path = "" + if m.group(2): + path = m.group(2).lstrip("/") + return bucket, path + + +# TODO(jlewi): This was originally a work around for +# https://github.com/kubernetes-incubator/client-python/issues/339. +# +# There was a fix (see issue) that sets the scope but userinfo.email scope +# wasn't included. Which I think will cause problems see +# https://github.com/kubernetes-client/python-base/issues/54 +# +# So as a work around we use this function to allow us to specify the scopes. +# +# This function is based on +# https://github.com/kubernetes-client/python-base/blob/master/config/kube_config.py#L331 +# we modify it though so that we can pass through the function to get credentials. 
+def load_kube_config(config_file=None, + context=None, + client_configuration=None, + persist_config=True, + get_google_credentials=None, + print_config=False, + **kwargs): + """Loads authentication and cluster information from kube-config file + and stores them in kubernetes.client.configuration. + + :param config_file: Name of the kube-config file. + :param context: set the active context. If is set to None, current_context + from config file will be used. + :param client_configuration: The kubernetes.client.ConfigurationObject to + set configs to. + :param persist_config: If True, config file will be updated when changed + (e.g GCP token refresh). + """ + + if config_file is None: + config_file = os.path.expanduser(kube_config.KUBE_CONFIG_DEFAULT_LOCATION) + logging.info("Using Kubernetes config file: %s", config_file) + + config_persister = None + if persist_config: + + def _save_kube_config(config_map): + with open(config_file, 'w') as f: + yaml.safe_dump(config_map, f, default_flow_style=False) + + config_persister = _save_kube_config + + loader = kube_config._get_kube_config_loader_for_yaml_file( # pylint: disable=protected-access + config_file, + active_context=context, + config_persister=config_persister, + get_google_credentials=get_google_credentials, + **kwargs) + + if client_configuration is None: + config = type.__call__(kubernetes_configuration.Configuration) + loader.load_and_set(config) # pylint: disable=too-many-function-args + kubernetes_configuration.Configuration.set_default(config) + else: + loader.load_and_set(client_configuration) # pylint: disable=too-many-function-args + # Dump the loaded config. + + # Warning this will print out any access tokens stored in your kubeconfig + if print_config: + run(["kubectl", "config", "view"]) + + +def aws_configure_credential(): + if os.getenv("AWS_ACCESS_KEY_ID") and os.getenv("AWS_SECRET_ACCESS_KEY"): + logging.info("AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY are set;") + else: + logging.info("AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY are not set.") + run([ + "aws", "eks", "update-kubeconfig", "--name=" + os.getenv("AWS_EKS_CLUSTER") + ]) + + +def upload_to_s3(contents, target, file_name): + """Uploading contents to s3""" + s3 = boto3.resource('s3') + bucket_name, path = split_s3_uri(target) + with open(file_name, "w+") as data: + data.write(contents) + logging.info("Uploading file %s to %s.", file_name, target) + s3.meta.client.upload_file(file_name, bucket_name, path) + + +def upload_file_to_s3(source, target): + s3 = boto3.resource('s3') + bucket_name, path = split_s3_uri(target) + logging.info("Uploading file %s to %s.", source, target) + s3.meta.client.upload_file(source, bucket_name, path) diff --git a/py/kubeflow/testing/run_e2e_workflow.py b/py/kubeflow/testing/run_e2e_workflow.py index d9e9301cf..7d5f3c84f 100644 --- a/py/kubeflow/testing/run_e2e_workflow.py +++ b/py/kubeflow/testing/run_e2e_workflow.py @@ -72,6 +72,10 @@ import traceback import yaml +if os.getenv("CLOUD_PROVIDER") == "aws": + from kubeflow.testing.cloudprovider.aws import prow_artifacts as aws_prow_artifacts + from kubeflow.testing.cloudprovider.aws import util as aws_util + # The name of the command line argument for workflows for the var # to contain the test target name. 
# The goal is to be able to use target name grouping in test grid @@ -131,6 +135,15 @@ def create_started_file(bucket, ui_urls): target = os.path.join(prow_artifacts.get_gcs_dir(bucket), "started.json") util.upload_to_gcs(contents, target) + +def create_started_file_s3(bucket, ui_urls): + """Create the started file in S3 for gubernator.""" + contents = aws_prow_artifacts.create_started(ui_urls) + + target = os.path.join(aws_prow_artifacts.get_s3_dir(bucket), "started.json") + aws_util.upload_to_s3(contents, target, "started.json") + + def parse_config_file(config_file, root_dir): with open(config_file) as hf: config = yaml.load(hf) @@ -236,13 +249,18 @@ def run(args, file_handler): # pylint: disable=too-many-statements,too-many-bran logging.info("Extra python paths: %s", ":".join(extra_py_paths)) - # Create an initial version of the file with no urls - create_started_file(args.bucket, {}) + if not args.cloud_provider or args.cloud_provider == "gcp": + # Create an initial version of the file with no urls + create_started_file(args.bucket, {}) - util.maybe_activate_service_account() + util.maybe_activate_service_account() - util.configure_kubectl(args.project, args.zone, args.cluster) - util.load_kube_config() + util.configure_kubectl(args.project, args.zone, args.cluster) + util.load_kube_config() + elif args.cloud_provider == "aws": + create_started_file_s3(args.bucket, {}) + aws_util.aws_configure_credential() + aws_util.load_kube_config() tekton_runner = tekton_client.TektonRunner() workflow_names = [] @@ -323,7 +341,7 @@ def run(args, file_handler): # pylint: disable=too-many-statements,too-many-bran # Create a new environment for this run env = workflow_name - util.run([ks_cmd, "env", "add", env, "--namespace=" + get_namespace(args)], + util.run([ks_cmd, "env", "add", env, "--namespace=" + get_namespace(args), "--api-spec=version:v1.8.0"], cwd=w.app_dir) util.run([ks_cmd, "param", "set", "--env=" + env, w.component, @@ -348,6 +366,9 @@ def run(args, file_handler): # pylint: disable=too-many-statements,too-many-bran get_namespace(args)], cwd=w.app_dir) util.run([ks_cmd, "param", "set", "--env=" + env, w.component, "bucket", args.bucket], cwd=w.app_dir) + if args.cloud_provider == "aws": + util.run([ks_cmd, "param", "set", "--env=" + env, w.component, "cluster_name", + "eks-cluster-{}".format(uuid.uuid4().hex[0:8])], cwd=w.app_dir) if args.release: util.run([ks_cmd, "param", "set", "--env=" + env, w.component, "versionTag", os.getenv("VERSION_TAG")], cwd=w.app_dir) @@ -355,7 +376,12 @@ def run(args, file_handler): # pylint: disable=too-many-statements,too-many-bran # Set any extra params. We do this in alphabetical order to make it easier to verify in # the unittest. 
param_names = w.params.keys() - param_names.sort() + if six.PY3: + # In python3, dict_keys.sort() not work given + # https://docs.python.org/3/whatsnew/3.0.html#views-and-iterators-instead-of-lists + param_names = sorted(param_names) + else: + param_names.sort() for k in param_names: util.run([ks_cmd, "param", "set", "--env=" + env, w.component, k, "{0}".format(w.params[k])], cwd=w.app_dir) @@ -364,8 +390,12 @@ def run(args, file_handler): # pylint: disable=too-many-statements,too-many-bran util.run([ks_cmd, "show", env, "-c", w.component], cwd=w.app_dir) util.run([ks_cmd, "apply", env, "-c", w.component], cwd=w.app_dir) - ui_url = ("http://testing-argo.kubeflow.org/workflows/kubeflow-test-infra/{0}" - "?tab=workflow".format(workflow_name)) + if not args.cloud_provider or args.cloud_provider == "gcp": + ui_url = ("http://testing-argo.kubeflow.org/workflows/kubeflow-test-infra/{0}" + "?tab=workflow".format(workflow_name)) + elif args.cloud_provider == "aws": + ui_url = ("http://86308603-argo-argo-5ce9-1162466691.us-west-2.elb.amazonaws.com/workflows/kubeflow-test-infra/{0}" + "?tab=workflow".format(workflow_name)) ui_urls[workflow_name] = ui_url logging.info("URL for workflow: %s", ui_url) elif w.tekton_run: @@ -449,18 +479,27 @@ def run(args, file_handler): # pylint: disable=too-many-statements,too-many-bran body=wf_result) logging.info("Created workflow:\n%s", yaml.safe_dump(py_func_result)) - ui_url = ("http://testing-argo.kubeflow.org/workflows/kubeflow-test-infra/{0}" - "?tab=workflow".format(workflow_name)) + if not args.cloud_provider or args.cloud_provider == "gcp": + ui_url = ("http://testing-argo.kubeflow.org/workflows/kubeflow-test-infra/{0}" + "?tab=workflow".format(workflow_name)) + elif args.cloud_provider == "aws": + ui_url = ("http://86308603-argo-argo-5ce9-1162466691.us-west-2.elb.amazonaws.com/workflows/kubeflow-test-infra/{0}" + "?tab=workflow".format(workflow_name)) ui_urls[workflow_name] = ui_url logging.info("URL for workflow: %s", ui_url) - ui_urls.update(tekton_runner.run( - tekton_client.ClusterInfo(args.project, - TEKTON_CLUSTER_ZONE, - TEKTON_CLUSTER_NAME), - tekton_client.ClusterInfo(args.project, args.zone, args.cluster))) - # We delay creating started.json until we know the Argo workflow URLs - create_started_file(args.bucket, ui_urls) + if not args.cloud_provider or args.cloud_provider == "gcp": + ui_urls.update(tekton_runner.run( + tekton_client.ClusterInfo(args.project, + TEKTON_CLUSTER_ZONE, + TEKTON_CLUSTER_NAME), + tekton_client.ClusterInfo(args.project, args.zone, args.cluster))) + + # We delay creating started.json until we know the Argo workflow URLs + create_started_file(args.bucket, ui_urls) + elif args.cloud_provider == "aws": + # We delay creating started.json until we know the Argo workflow URLs + create_started_file_s3(args.bucket, ui_urls) workflow_success = False workflow_phase = {} @@ -473,9 +512,12 @@ def run(args, file_handler): # pylint: disable=too-many-statements,too-many-bran timeout=datetime.timedelta(minutes=180), status_callback=argo_client.log_status ) - util.configure_kubectl(args.project, "us-east1-d", "kf-ci-v1") - util.load_kube_config() - tekton_results = tekton_runner.join() + if not args.cloud_provider or args.cloud_provider == "gcp": + util.configure_kubectl(args.project, "us-east1-d", "kf-ci-v1") + util.load_kube_config() + tekton_results = tekton_runner.join() + elif args.cloud_provider == "aws": + aws_util.load_kube_config() workflow_success = True except util.ExceptionWithWorkflowResults as e: # We explicitly log any 
exceptions so that they will be captured in the @@ -487,11 +529,14 @@ def run(args, file_handler): # pylint: disable=too-many-statements,too-many-bran logging.exception("Other exception: %s", e) raise finally: - util.configure_kubectl(args.project, args.zone, args.cluster) - util.load_kube_config() - prow_artifacts_dir = prow_artifacts.get_gcs_dir(args.bucket) - - # Upload workflow status to GCS. + if not args.cloud_provider or args.cloud_provider == "gcp": + util.configure_kubectl(args.project, args.zone, args.cluster) + util.load_kube_config() + prow_artifacts_dir = prow_artifacts.get_gcs_dir(args.bucket) + elif args.cloud_provider == "aws": + prow_artifacts_dir = aws_prow_artifacts.get_s3_dir(args.bucket) + + # Upload workflow status to GCS/S3. for r in results: phase = r.get("status", {}).get("phase") name = r.get("metadata", {}).get("name") @@ -501,10 +546,17 @@ def run(args, file_handler): # pylint: disable=too-many-statements,too-many-bran workflow_success = False logging.info("Workflow %s/%s finished phase: %s", get_namespace(args), name, phase) - for wf_name, wf_status in workflow_status_yamls.items(): - util.upload_to_gcs( - wf_status, - os.path.join(prow_artifacts_dir, '{}.yaml'.format(wf_name))) + if not args.cloud_provider or args.cloud_provider == "gcp": + for wf_name, wf_status in workflow_status_yamls.items(): + util.upload_to_gcs( + wf_status, + os.path.join(prow_artifacts_dir, '{}.yaml'.format(wf_name))) + elif args.cloud_provider == "aws": + for wf_name, wf_status in workflow_status_yamls.items(): + aws_util.upload_to_s3( + wf_status, + os.path.join(prow_artifacts_dir, '{}.yaml'.format(wf_name)), + '{}.yaml'.format(wf_name)) for r in tekton_results: condition = "Failed" @@ -521,12 +573,20 @@ def run(args, file_handler): # pylint: disable=too-many-statements,too-many-bran # Upload logs to GCS. No logs after this point will appear in the # file in gcs file_handler.flush() - util.upload_file_to_gcs( - file_handler.baseFilename, - os.path.join(prow_artifacts_dir, "build-log.txt")) - all_tests_success = prow_artifacts.finalize_prow_job( - args.bucket, workflow_success, workflow_phase, ui_urls) + if not args.cloud_provider or args.cloud_provider == "gcp": + util.upload_file_to_gcs( + file_handler.baseFilename, + os.path.join(prow_artifacts_dir, "build-log.txt")) + all_tests_success = prow_artifacts.finalize_prow_job( + args.bucket, workflow_success, workflow_phase, ui_urls) + elif args.cloud_provider == "aws": + aws_util.upload_file_to_s3( + file_handler.baseFilename, + os.path.join(prow_artifacts_dir, "build-log.txt")) + all_tests_success = aws_prow_artifacts.finalize_prow_job_to_s3( + args.bucket, workflow_success, workflow_phase, ui_urls + ) return all_tests_success @@ -590,12 +650,26 @@ def main(unparsed_args=None): # pylint: disable=too-many-locals default="tektoncd", help="Optional Tekton namespace to use") + ############ Other cloud provider like AWS/IBM/Arrikto + parser.add_argument( + "--cloud_provider", + type=str, + default="", ## can be "aws" / "ibm" / .... + help="Option to enable other cloud provider functionalities aws/ibm/cisco ..." + ) + + parser.add_argument( + "--aws_region", + type=str, + default="us-west-2", + help="region containing the EKS cluster to use to run the workflow." + ) + ############################################################################# # Process the command line arguments. # Parse the args args = parser.parse_args(args=unparsed_args) - # Setup a logging file handler. This way we can upload the log outputs # to gubernator. 
root_logger = logging.getLogger() @@ -628,4 +702,4 @@ def main(unparsed_args=None): # pylint: disable=too-many-locals # Exit with a non-zero exit code by to signal failure to prow. logging.error("One or more test steps failed exiting with non-zero exit " "code.") - sys.exit(1) + sys.exit(1) \ No newline at end of file
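
Reviewer note (not part of the diff): a minimal usage sketch of the S3 artifact-layout helpers this PR adds under py/kubeflow/testing/cloudprovider/aws/. It assumes the repository's Python dependencies (boto3, kubernetes, six) are installed and that the py/ directory is on PYTHONPATH; the Prow environment values below are hypothetical.

# Illustrative only: exercises the new helpers with made-up Prow values; no AWS calls are made.
import os

from kubeflow.testing.cloudprovider.aws import prow_artifacts as aws_prow_artifacts
from kubeflow.testing.cloudprovider.aws import util as aws_util

# Hypothetical presubmit job environment (normally injected by Prow).
os.environ.update({
    "JOB_TYPE": "presubmit",
    "JOB_NAME": "kubeflow-testing-presubmit",
    "REPO_OWNER": "kubeflow",
    "REPO_NAME": "testing",
    "PULL_NUMBER": "123",
    "BUILD_ID": "456",
})

# Presubmit results land under pr-logs/pull/{owner}_{repo}/{pull}/{job}/{build}.
output = aws_prow_artifacts.get_s3_dir("aws-kubernetes-jenkins")
print(output)
# s3://aws-kubernetes-jenkins/pr-logs/pull/kubeflow_testing/123/kubeflow-testing-presubmit/456

# The URI helpers in util.py round-trip bucket/path pairs.
bucket, path = aws_util.split_s3_uri(output)
assert bucket == "aws-kubernetes-jenkins"
assert aws_util.to_s3_uri(bucket, path) == output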
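
A second sketch, under the same assumptions, of the started.json / finished.json payloads that create_started / create_finished emit for Gubernator; the commit values are again hypothetical.

# Illustrative only: shows the JSON contract consumed by Gubernator/kettle.
import json
import os

from kubeflow.testing.cloudprovider.aws import prow_artifacts as aws_prow_artifacts

# Hypothetical commit metadata that Prow would normally set.
os.environ.update({
    "REPO_OWNER": "kubeflow",
    "REPO_NAME": "testing",
    "PULL_PULL_SHA": "abc1234",
    "PULL_REFS": "master:def5678,123:abc1234",
})

ui_urls = {"kubeflow-e2e": "http://argo.example.com/workflows/kubeflow-test-infra/kubeflow-e2e"}

started = json.loads(aws_prow_artifacts.create_started(ui_urls))
# started["repos"]["kubeflow/testing"] == "abc1234"
# started["pull"] == "master:def5678,123:abc1234"
# started["metadata"]["kubeflow-e2e-ui"] points at the Argo UI

finished = json.loads(aws_prow_artifacts.create_finished(
    success=True,
    workflow_phase={"kubeflow-e2e": "Succeeded"},
    ui_urls=ui_urls))
# finished["result"] == "SUCCESS"; each workflow contributes "<name>-phase" and
# "<name>-ui" entries under "metadata", and PULL_REFS is recorded under
# finished["metadata"]["repos"]["kubeflow/testing"].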