diff --git a/Makefile b/Makefile index 737ebb72..1fc550eb 100644 --- a/Makefile +++ b/Makefile @@ -73,7 +73,7 @@ all: build .PHONY: help help: ## Display this help. - @awk 'BEGIN {FS = ":.*##"; printf "\nUsage:\n make \033[36m\033[0m\n"} /^[a-zA-Z_0-9-]+:.*?##/ { printf " \033[36m%-15s\033[0m %s\n", $$1, $$2 } /^##@/ { printf "\n\033[1m%s\033[0m\n", substr($$0, 5) } ' $(MAKEFILE_LIST) + @awk 'BEGIN {FS = ":.*##"; printf "\nUsage:\n make \033[36m\033[0m\n"} /^[a-zA-Z_0-9-]+:.*?##/ { printf " \033[36m%-15s\033[0m %s\n", $$1, $$2 } /^##@/ { printf "\n\033[1m%s\033[0m\n", substr($$0, 5) } ' $(MAKEFILE_LIST) ##@ Development @@ -120,40 +120,60 @@ skaffold-dev-gcpmanager: protoc skaffold protogen render-skaffold-manifests ## R build: manifests generate fmt vet ## Build manager binary. go build -o bin/manager cmd/controllermanager/main.go -.PHONY: dev-up -dev-up: - docker build ./install -t substratus-installer && \ +.PHONY: gcp-dev-up +gcp-dev-up: build-installer ## Create the GCP dev environment (runs gcp-up.sh in the installer image). docker run -it \ - -v ${HOME}/.kube:/root/.kube \ - -e PROJECT=$(shell gcloud config get project) \ - -e TOKEN=$(shell gcloud auth print-access-token) \ - -e TF_VAR_attach_gpu_nodepools=${ATTACH_GPU_NODEPOOLS} \ - -e INSTALL_OPERATOR=false \ - substratus-installer gcp-up.sh + -v ${HOME}/.kube:/root/.kube \ + -e PROJECT=$(shell gcloud config get project) \ + -e TOKEN=$(shell gcloud auth print-access-token) \ + -e TF_VAR_attach_gpu_nodepools=${ATTACH_GPU_NODEPOOLS} \ + -e INSTALL_OPERATOR=false \ + substratus-installer gcp-up.sh mkdir -p secrets gcloud iam service-accounts keys create --iam-account=substratus-gcp-manager@$(shell gcloud config get project).iam.gserviceaccount.com ./secrets/gcp-manager-key.json -.PHONY: dev-down -dev-down: +.PHONY: gcp-dev-down +gcp-dev-down: build-installer ## Tear down the GCP dev environment (runs gcp-down.sh in the installer image). docker run -it \ - -v ${HOME}/.kube:/root/.kube \ - -e PROJECT=$(shell gcloud config get project) \ - -e TOKEN=$(shell gcloud auth print-access-token) \ - -e TF_VAR_attach_gpu_nodepools=${ATTACH_GPU_NODEPOOLS} \ - 
substratus-installer gcp-down.sh + -v ${HOME}/.kube:/root/.kube \ + -e PROJECT=$(shell gcloud config get project) \ + -e TOKEN=$(shell gcloud auth print-access-token) \ + -e TF_VAR_attach_gpu_nodepools=${ATTACH_GPU_NODEPOOLS} \ + substratus-installer gcp-down.sh rm ./secrets/gcp-manager-key.json -.PHONY: dev-run +.PHONY: aws-dev-up +aws-dev-up: build-installer ## Create the AWS dev environment (runs aws-up.sh in the installer image). + docker run -it \ + -v ${HOME}/.kube:/root/.kube \ + -e AWS_ACCOUNT_ID="$(shell aws sts get-caller-identity --query Account --output text)" \ + -e AWS_ACCESS_KEY_ID=$(shell aws configure get aws_access_key_id) \ + -e AWS_SECRET_ACCESS_KEY=$(shell aws configure get aws_secret_access_key) \ + -e AWS_SESSION_TOKEN=$(shell aws configure get aws_session_token) \ + -e INSTALL_OPERATOR=false \ + substratus-installer aws-up.sh + +.PHONY: aws-dev-down +aws-dev-down: build-installer ## Tear down the AWS dev environment (runs aws-down.sh in the installer image). + docker run -it \ + -v ${HOME}/.kube:/root/.kube \ + -e AWS_ACCOUNT_ID="$(shell aws sts get-caller-identity --query Account --output text)" \ + -e AWS_ACCESS_KEY_ID=$(shell aws configure get aws_access_key_id) \ + -e AWS_SECRET_ACCESS_KEY=$(shell aws configure get aws_secret_access_key) \ + -e AWS_SESSION_TOKEN=$(shell aws configure get aws_session_token) \ + substratus-installer aws-down.sh + +.PHONY: gcp-dev-run # Controller manager configuration # -dev-run: export CLOUD=gcp -dev-run: export GPU_TYPE=nvidia-l4 -dev-run: export PROJECT_ID=$(shell gcloud config get project) -dev-run: export CLUSTER_NAME=substratus -dev-run: export CLUSTER_LOCATION=us-central1 +gcp-dev-run: export CLOUD=gcp +gcp-dev-run: export GPU_TYPE=nvidia-l4 +gcp-dev-run: export PROJECT_ID=$(shell gcloud config get project) +gcp-dev-run: export CLUSTER_NAME=substratus +gcp-dev-run: export CLUSTER_LOCATION=us-central1 # Cloud manager configuration # -dev-run: export GOOGLE_APPLICATION_CREDENTIALS=./secrets/gcp-manager-key.json +gcp-dev-run: export GOOGLE_APPLICATION_CREDENTIALS=./secrets/gcp-manager-key.json # Run the controller manager and the cloud manager. 
-dev-run: manifests kustomize install-crds +gcp-dev-run: manifests kustomize install-crds ## Run the controller manager and the GCP manager locally. go run ./cmd/gcpmanager & \ go run ./cmd/controllermanager/main.go \ --sci-address=localhost:10080 \ @@ -176,16 +196,17 @@ docker-push: ## Push docker image with the manager. .PHONY: docs docs: crd-ref-docs embedmd - $(CRD_REF_DOCS) --config=./docs/api/config.yaml \ + $(CRD_REF_DOCS) \ + --config=./docs/api/config.yaml \ --log-level=INFO \ --output-path=./docs/api/generated.md \ --source-path=./api \ - --templates-dir=./docs/api/templates/markdown \ + --templates-dir=./docs/api/templates/markdown \ --renderer=markdown # TODO: Embed YAML examples into the generate API documentation. # $(EMBEDMD) -w ./docs/api/generated.md -# PLATFORMS defines the target platforms for the manager image be build to provide support to multiple +# PLATFORMS defines the target platforms for the manager image be build to provide support to multiple # architectures. (i.e. make docker-buildx IMG=myregistry/mypoperator:0.0.1). To use this option you need to: # - able to use docker buildx . More info: https://docs.docker.com/build/buildx/ # - have enable BuildKit, More info: https://docs.docker.com/develop/develop-images/build_enhancements/ @@ -209,7 +230,7 @@ protogen: protoc ## Generate protobuf files. ##@ Deployment ifndef ignore-not-found - ignore-not-found = false + ignore-not-found=false endif .PHONY: install-crds diff --git a/docs/development.md b/docs/development.md index bcdb9c25..b5000690 100644 --- a/docs/development.md +++ b/docs/development.md @@ -5,19 +5,19 @@ Create a GCP environment. ```sh -make dev-up +make gcp-dev-up ``` Run Substratus control plane locally. ```sh -make dev-run +make gcp-dev-run ``` Delete GCP infra. ```sh -make dev-down +make gcp-dev-down ``` TODO: Automate the cleanup of PVs... Don't forget to manually clean them up for now. 
diff --git a/install/Dockerfile b/install/Dockerfile index 8d3c0b2d..30ee639d 100644 --- a/install/Dockerfile +++ b/install/Dockerfile @@ -32,6 +32,7 @@ RUN DEBIAN_FRONTEND="noninteractive" \ curl \ git \ tzdata \ + gettext-base \ keyboard-configuration # AWS CLI diff --git a/install/kubernetes/eks-cluster.yaml.tpl b/install/kubernetes/eks-cluster.yaml.tpl index 9c349cae..2982b105 100644 --- a/install/kubernetes/eks-cluster.yaml.tpl +++ b/install/kubernetes/eks-cluster.yaml.tpl @@ -1,21 +1,20 @@ apiVersion: eksctl.io/v1alpha5 kind: ClusterConfig metadata: - name: substratus - region: us-west-2 + name: ${CLUSTER_NAME} + region: ${REGION} version: "1.27" tags: createdBy: eksctl environment: dev - karpenter.sh/discovery: substratus + karpenter.sh/discovery: ${CLUSTER_NAME} karpenter: createServiceAccount: true withSpotInterruptionQueue: true - defaultInstanceProfile: "KarpenterNodeInstanceProfile-substratus" + defaultInstanceProfile: "KarpenterNodeInstanceProfile-${CLUSTER_NAME}" version: "v0.29.0" -# TODO(bjb): do we need mngs with karpenter? 
# if karpenter doesn't suffice: https://github.com/eksctl-io/eksctl/blob/main/examples/23-kubeflow-spot-instance.yaml managedNodeGroups: - name: builder-ng @@ -26,7 +25,7 @@ managedNodeGroups: volumeSize: 100 minSize: 0 maxSize: 3 - desiredCapacity: 2 + desiredCapacity: 1 iam: withAddonPolicies: ebs: true @@ -64,8 +63,8 @@ iam: wellKnownPolicies: ebsCSIController: true - metadata: - name: substratus - namespace: substratus + name: ${CLUSTER_NAME} + namespace: ${CLUSTER_NAME} attachPolicy: Version: "2012-10-17" Statement: @@ -83,7 +82,7 @@ iam: - "arn:aws:s3:::${ARTIFACTS_BUCKET_NAME}" - metadata: name: aws-manager - namespace: substratus + namespace: ${CLUSTER_NAME} attachPolicy: # https://docs.aws.amazon.com/AmazonS3/latest/userguide/using-presigned-url.html Version: "2012-10-17" diff --git a/install/kubernetes/karpenter-provisioner.yaml.tpl b/install/kubernetes/karpenter-provisioner.yaml.tpl index 1fafec36..f614e8ad 100644 --- a/install/kubernetes/karpenter-provisioner.yaml.tpl +++ b/install/kubernetes/karpenter-provisioner.yaml.tpl @@ -20,7 +20,6 @@ spec: karpenter.sh/discovery: ${CLUSTER_NAME} securityGroupSelector: karpenter.sh/discovery: ${CLUSTER_NAME} - ttlSecondsAfterEmpty: 30 consolidation: enabled: true taints: @@ -34,46 +33,9 @@ spec: - - key: node.kubernetes.io/instance-type - operator: In - values: - # aws ec2 describe-instance-types --region us-west-2 --query "InstanceTypes[?GpuInfo!=null].InstanceType" --output json | jq -r '.[]' | sort | grep -v dl1 | grep -v inf | grep -v p5 | grep -v trn1 | awk '{print "\""$1"\","}' - [ - "g2.2xlarge", - "g2.8xlarge", - "g3.16xlarge", - "g3.4xlarge", - "g3.8xlarge", - "g3s.xlarge", - "g4ad.16xlarge", - "g4ad.2xlarge", - "g4ad.4xlarge", - "g4ad.8xlarge", - "g4ad.xlarge", - "g4dn.12xlarge", - "g4dn.16xlarge", - "g4dn.2xlarge", - "g4dn.4xlarge", - "g4dn.8xlarge", - "g4dn.metal", - "g4dn.xlarge", - "g5.12xlarge", - "g5.16xlarge", - "g5.24xlarge", - "g5.2xlarge", - "g5.48xlarge", - "g5.4xlarge", - "g5.8xlarge", - 
"g5.xlarge", - "g5g.16xlarge", - "g5g.2xlarge", - "g5g.4xlarge", - "g5g.8xlarge", - "g5g.metal", - "g5g.xlarge", - "p2.16xlarge", - "p2.8xlarge", - "p2.xlarge", - "p3.16xlarge", - "p3.2xlarge", - "p3.8xlarge", - "p3dn.24xlarge", - "p4d.24xlarge", - ] + - key: karpenter.k8s.aws/instance-category + operator: In + values: ["g", "p"] + - key: karpenter.k8s.aws/instance-family + operator: NotIn + values: ["p5"] + - key: "kubernetes.io/arch" + operator: In + values: ["amd64"] diff --git a/install/scripts/aws-down.sh b/install/scripts/aws-down.sh index b776f377..1facf3f5 100755 --- a/install/scripts/aws-down.sh +++ b/install/scripts/aws-down.sh @@ -4,21 +4,37 @@ set -e set -u # Required env variables: -# : "$TOKEN $PROJECT" +: "$AWS_ACCOUNT_ID $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY" SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" KUBERENTES_DIR=${SCRIPT_DIR}/../kubernetes -export EKSCTL_ENABLE_CREDENTIAL_CACHE=1 +export EKSCTL_ENABLE_CREDENTIAL_CACHE=1 # exported so the eksctl child process inherits it export CLUSTER_NAME=substratus export REGION=us-west-2 -export ARTIFACTS_REPO_NAME=substratus -export AWS_ACCOUNT_ID="$(aws sts get-caller-identity --query Account --output text)" -export ARTIFACTS_BUCKET_NAME=${AWS_ACCOUNT_ID}-substratus-artifacts +export ARTIFACTS_REPO_NAME=${CLUSTER_NAME} +export ARTIFACTS_BUCKET_NAME=${AWS_ACCOUNT_ID}-${CLUSTER_NAME}-artifacts + +aws eks update-kubeconfig \ + --region ${REGION} \ + --name ${CLUSTER_NAME} && + kubectl delete deployments --namespace=karpenter --all && + kubectl delete deployments --namespace=kube-system --all || + true + +aws iam delete-policy \ + --policy-arn arn:aws:iam::${AWS_ACCOUNT_ID}:policy/KarpenterControllerPolicy-${CLUSTER_NAME} || + true -aws s3 rb s3://${ARTIFACTS_BUCKET_NAME} --region ${REGION} >/dev/null || true -aws ecr delete-repository --repository-name ${ARTIFACTS_REPO_NAME} >/dev/null || true aws cloudformation delete-stack \ - --stack-name "Karpenter-${CLUSTER_NAME}" || true + --stack-name "Karpenter-${CLUSTER_NAME}" \ + --region ${REGION} || 
true envsubst <${KUBERENTES_DIR}/eks-cluster.yaml.tpl >${KUBERENTES_DIR}/eks-cluster.yaml -eksctl delete cluster -f ${KUBERENTES_DIR}/eks-cluster.yaml +eksctl delete cluster -f ${KUBERENTES_DIR}/eks-cluster.yaml || true + +aws ecr delete-repository \ + --repository-name ${ARTIFACTS_REPO_NAME} \ + --region ${REGION} >/dev/null || true + +aws s3 rb s3://${ARTIFACTS_BUCKET_NAME} \ + --region ${REGION} >/dev/null || true diff --git a/install/scripts/aws-up.sh b/install/scripts/aws-up.sh index 424c4eeb..4792de4f 100755 --- a/install/scripts/aws-up.sh +++ b/install/scripts/aws-up.sh @@ -4,26 +4,31 @@ set -e set -u # Required env variables: -# : "$TOKEN $PROJECT" +: "$AWS_ACCOUNT_ID $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY" + +INSTALL_OPERATOR="${INSTALL_OPERATOR:-yes}" -# # TODO(bjb): pass AWS creds into script -# export CLOUDSDK_AUTH_ACCESS_TOKEN=${TOKEN} SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" KUBERENTES_DIR=${SCRIPT_DIR}/../kubernetes -# INSTALL_OPERATOR="${INSTALL_OPERATOR:-yes}" -export EKSCTL_ENABLE_CREDENTIAL_CACHE=1 + +export EKSCTL_ENABLE_CREDENTIAL_CACHE=1 # exported so the eksctl child process inherits it export CLUSTER_NAME=substratus export REGION=us-west-2 -export ARTIFACTS_REPO_NAME=substratus -export AWS_ACCOUNT_ID="$(aws sts get-caller-identity --query Account --output text)" -export ARTIFACTS_BUCKET_NAME=${AWS_ACCOUNT_ID}-substratus-artifacts - -aws s3 mb s3://${ARTIFACTS_BUCKET_NAME} --region ${REGION} >/dev/null || true -aws ecr create-repository --repository-name ${ARTIFACTS_REPO_NAME} --region ${REGION} >/dev/null || true -# install karpenter: https://karpenter.sh/docs/getting-started/getting-started-with-karpenter/ +export ARTIFACTS_REPO_NAME=${CLUSTER_NAME} +export ARTIFACTS_BUCKET_NAME=${AWS_ACCOUNT_ID}-${CLUSTER_NAME}-artifacts export KARPENTER_VERSION=v0.29.2 export AWS_PARTITION="aws" -export TEMPOUT=$(mktemp) +export KARPENTER_IAM_ROLE_ARN="arn:${AWS_PARTITION}:iam::${AWS_ACCOUNT_ID}:role/${CLUSTER_NAME}-karpenter" +TEMPOUT=$(mktemp) + +aws s3 mb s3://${ARTIFACTS_BUCKET_NAME} \ + 
--region ${REGION} >/dev/null || true + +aws ecr create-repository \ + --repository-name ${ARTIFACTS_REPO_NAME} \ + --region ${REGION} >/dev/null || true + +# install karpenter: https://karpenter.sh/docs/getting-started/getting-started-with-karpenter/ curl -fsSL https://raw.githubusercontent.com/aws/karpenter/"${KARPENTER_VERSION}"/website/content/en/preview/getting-started/getting-started-with-karpenter/cloudformation.yaml >$TEMPOUT && aws cloudformation deploy \ --stack-name "Karpenter-${CLUSTER_NAME}" \ @@ -33,15 +38,23 @@ curl -fsSL https://raw.githubusercontent.com/aws/karpenter/"${KARPENTER_VERSION} --region ${REGION} envsubst <${KUBERENTES_DIR}/eks-cluster.yaml.tpl >${KUBERENTES_DIR}/eks-cluster.yaml -eksctl create cluster -f ${KUBERENTES_DIR}/eks-cluster.yaml || eksctl upgrade cluster -f ${KUBERENTES_DIR}/eks-cluster.yaml +eksctl create cluster -f ${KUBERENTES_DIR}/eks-cluster.yaml || + eksctl upgrade cluster -f ${KUBERENTES_DIR}/eks-cluster.yaml + +aws iam create-service-linked-role \ + --aws-service-name spot.amazonaws.com || true + +aws eks update-kubeconfig \ + --region ${REGION} \ + --name ${CLUSTER_NAME} -export KARPENTER_IAM_ROLE_ARN="arn:${AWS_PARTITION}:iam::${AWS_ACCOUNT_ID}:role/${CLUSTER_NAME}-karpenter" -aws iam create-service-linked-role --aws-service-name spot.amazonaws.com || true -aws eks --region ${REGION} update-kubeconfig --name ${CLUSTER_NAME} # Logout of helm registry to perform an unauthenticated pull against the public ECR helm registry logout public.ecr.aws || true - -helm upgrade --install karpenter oci://public.ecr.aws/karpenter/karpenter --version ${KARPENTER_VERSION} --namespace karpenter --create-namespace \ +helm upgrade \ + --create-namespace \ + --install karpenter oci://public.ecr.aws/karpenter/karpenter \ + --version ${KARPENTER_VERSION} \ + --namespace karpenter \ --set serviceAccount.annotations."eks\.amazonaws\.com/role-arn"=${KARPENTER_IAM_ROLE_ARN} \ --set settings.aws.clusterName=${CLUSTER_NAME} \ --set 
settings.aws.defaultInstanceProfile=KarpenterNodeInstanceProfile-${CLUSTER_NAME} \ @@ -64,8 +77,8 @@ helm upgrade \ eks/aws-node-termination-handler # Install the substratus operator. -# if [ "${INSTALL_OPERATOR}" == "yes" ]; then -# kubectl apply -f kubernetes/namespace.yaml -# kubectl apply -f kubernetes/config.yaml -# kubectl apply -f kubernetes/system.yaml -# fi +if [ "${INSTALL_OPERATOR}" == "yes" ]; then + kubectl apply -f kubernetes/namespace.yaml + kubectl apply -f kubernetes/config.yaml + kubectl apply -f kubernetes/system.yaml +fi