From 6a30dca4c0c2293bb8c296580b28a791a687431e Mon Sep 17 00:00:00 2001 From: Alex Collins Date: Fri, 15 Jan 2021 16:43:27 -0800 Subject: [PATCH 01/12] feat(controller): Support rate-limitng pod creation. Signed-off-by: Alex Collins --- .github/workflows/ci-build.yaml | 15 ++++++------ Makefile | 24 +++++++++++++------ Procfile | 1 + config/config.go | 9 +++++++ go.mod | 1 + .../mixins/workflow-controller-configmap.yaml | 6 +++++ workflow/controller/config.go | 6 +++++ workflow/controller/controller.go | 2 ++ workflow/controller/operator.go | 9 ++++--- workflow/controller/workflowpod.go | 9 +++++++ 10 files changed, 65 insertions(+), 17 deletions(-) diff --git a/.github/workflows/ci-build.yaml b/.github/workflows/ci-build.yaml index 3789c50f877a..440541a53b4a 100644 --- a/.github/workflows/ci-build.yaml +++ b/.github/workflows/ci-build.yaml @@ -33,7 +33,7 @@ jobs: - name: Run tests env: GOPATH: /home/runner/go - run: make test STATIC_FILES=false + run: make test e2e-tests: name: E2E Tests @@ -111,7 +111,8 @@ jobs: if [ ${{matrix.test}} == smoke ] || [ ${{matrix.test}} == test-e2e-cron ] ; then PROFILE=minimal fi - KUBECONFIG=~/.kube/config make start PROFILE=$PROFILE E2E_EXECUTOR=${{matrix.containerRuntimeExecutor}} ALWAYS_OFFLOAD_NODE_STATUS=${{matrix.alwaysOffloadNodeStatus}} DEV_IMAGE=true STATIC_FILES=false 2>&1 > /tmp/log/argo-e2e/argo.log & + KUBECONFIG=~/.kube/config make install PROFILE=$PROFILE E2E_EXECUTOR=${{matrix.containerRuntimeExecutor}} ALWAYS_OFFLOAD_NODE_STATUS=${{matrix.alwaysOffloadNodeStatus}} DEV_IMAGE=true + KUBECONFIG=~/.kube/config make start PROFILE=$PROFILE E2E_EXECUTOR=${{matrix.containerRuntimeExecutor}} ALWAYS_OFFLOAD_NODE_STATUS=${{matrix.alwaysOffloadNodeStatus}} DEV_IMAGE=true 2>&1 > /tmp/log/argo-e2e/argo.log & - name: Install gotestsum run: go install gotest.tools/gotestsum - name: Wait for Argo Server to be ready @@ -123,10 +124,10 @@ jobs: GOPATH: /home/runner/go run: make ${{ matrix.test }} GOTEST='gotestsum --format testname --' - name: Upload logs - if: ${{ always() }} + if: ${{ failure() }} uses: actions/upload-artifact@v1 with: - name: ${{ matrix.test }}-${{ github.run_id }}-argo.log + name: ${{ matrix.test }}-${{matrix.containerRuntimeExecutor}}-${{matrix.alwaysOffloadNodeStatus}}-${{ github.run_id }}-argo.log path: /tmp/log/argo-e2e/argo.log codegen: @@ -171,15 +172,15 @@ jobs: ln -s "$PWD" /home/runner/go/src/github.com/argoproj/argo # we use -B to force make to always make targets - make -B codegen STATIC_FILES=false + make -B codegen - name: Make lint env: GOPATH: /home/runner/go - run: make lint STATIC_FILES=false + run: make lint - name: Make validate-examples env: GOPATH: /home/runner/go - run: make validate-examples STATIC_FILES=false + run: make validate-examples - name: Ensure nothing changed run: git diff --exit-code diff --git a/Makefile b/Makefile index 0bc722841b1c..917afd57d5d4 100644 --- a/Makefile +++ b/Makefile @@ -13,7 +13,13 @@ GIT_COMMIT = $(shell git rev-parse HEAD) GIT_REMOTE = origin GIT_BRANCH = $(shell git rev-parse --symbolic-full-name --verify --quiet --abbrev-ref HEAD) GIT_TAG = $(shell git describe --always --tags --abbrev=0 || echo untagged) -GIT_TREE_STATE = $(shell if [ -z "`git status --porcelain`" ]; then echo "clean" ; else echo "dirty"; fi) +GIT_TREE_STATE = $(shell [ -z "`git status --porcelain`" ] && echo clean || echo dirty) + +# this is automatically set on CI +CI = +YARN = $(shell [ "`command -v yarn`" != '' ] && echo true || echo false) +K3D = $(shell [ "`command -v kubectl`" != '' ] && [ "`kubectl config current-context`" == "k3d-"* ] && echo true || echo false) +DEV_MACHINE = $(shell [ "`uname -s`" = Darwin ] && echo true || echo false) export DOCKER_BUILDKIT = 1 @@ -48,7 +54,7 @@ CONTROLLER_IMAGE_FILE := dist/controller-image.marker # perform static compilation STATIC_BUILD ?= true -STATIC_FILES ?= true +STATIC_FILES ?= $(shell [ $YARN = true ] && [ $DEV_MACHINE = true ] && echo true || echo false) GOTEST ?= go test PROFILE ?= minimal # by keeping this short we speed up the tests @@ -59,14 +65,13 @@ AUTH_MODE := hybrid ifeq ($(PROFILE),sso) AUTH_MODE := sso endif -ifeq ($(STATIC_FILES),false) +ifneq ($(CI),) AUTH_MODE := client endif # Which mode to run in: # * `local` run the workflow–controller and argo-server as single replicas on the local machine (default) # * `kubernetes` run the workflow-controller and argo-server on the Kubernetes cluster RUN_MODE := local -K3D := $(shell if [[ "`which kubectl`" != '' ]] && [[ "`kubectl config current-context`" == "k3d-"* ]]; then echo true; else echo false; fi) LOG_LEVEL := debug UPPERIO_DB_DEBUG := 0 NAMESPACED := true @@ -171,9 +176,11 @@ cli: dist/argo argo-server.crt argo-server.key ui/dist/app/index.html: $(shell find ui/src -type f && find ui -maxdepth 1 -type f) # Build UI @mkdir -p ui/dist/app -ifeq ($(STATIC_FILES),true) +ifeq ($(YARN),true) # `yarn install` is fast (~2s), so you can call it safely. JOBS=max yarn --cwd ui install +endif +ifeq ($(STATIC_FILES),true) # `yarn build` is slow, so we guard it with a up-to-date check. JOBS=max yarn --cwd ui build else @@ -391,7 +398,7 @@ lint: server/static/files.go $(GOPATH)/bin/golangci-lint # Lint Go files golangci-lint run --fix --verbose --concurrency 4 --timeout 5m # Lint UI files -ifeq ($(STATIC_FILES),true) +ifeq ($(YARN),true) yarn --cwd ui lint endif @@ -426,7 +433,9 @@ argosay: test/e2e/images/argosay/v2/argosay ifeq ($(K3D),true) k3d image import argoproj/argosay:v2 endif +ifeq ($(DOCKER_PUSH),true) docker push argoproj/argosay:v2 +endif test/e2e/images/argosay/v2/argosay: test/e2e/images/argosay/v2/main/argosay.go cd test/e2e/images/argosay/v2 && GOOS=linux CGO_ENABLED=0 go build -ldflags '-w -s' -o argosay ./main @@ -446,6 +455,7 @@ start: controller-image cli-image install executor-image else start: install controller cli executor-image $(GOPATH)/bin/goreman endif + @echo "starting STATIC_FILES=$(STATIC_FILES), YARN=$(YARN), AUTH_MODE=$(AUTH_MODE), RUN_MODE=$(RUN_MODE)" ifeq ($(RUN_MODE),kubernetes) kubectl -n $(KUBE_NAMESPACE) wait --for=condition=Available deploy argo-server kubectl -n $(KUBE_NAMESPACE) wait --for=condition=Available deploy workflow-controller @@ -467,7 +477,7 @@ endif sleep 10s ./hack/port-forward.sh ifeq ($(RUN_MODE),local) - env DEFAULT_REQUEUE_TIME=$(DEFAULT_REQUEUE_TIME) SECURE=$(SECURE) ALWAYS_OFFLOAD_NODE_STATUS=$(ALWAYS_OFFLOAD_NODE_STATUS) LOG_LEVEL=$(LOG_LEVEL) UPPERIO_DB_DEBUG=$(UPPERIO_DB_DEBUG) VERSION=$(VERSION) AUTH_MODE=$(AUTH_MODE) NAMESPACED=$(NAMESPACED) NAMESPACE=$(KUBE_NAMESPACE) $(GOPATH)/bin/goreman -set-ports=false -logtime=false start + env DEFAULT_REQUEUE_TIME=$(DEFAULT_REQUEUE_TIME) SECURE=$(SECURE) ALWAYS_OFFLOAD_NODE_STATUS=$(ALWAYS_OFFLOAD_NODE_STATUS) LOG_LEVEL=$(LOG_LEVEL) UPPERIO_DB_DEBUG=$(UPPERIO_DB_DEBUG) VERSION=$(VERSION) AUTH_MODE=$(AUTH_MODE) NAMESPACED=$(NAMESPACED) NAMESPACE=$(KUBE_NAMESPACE) $(GOPATH)/bin/goreman -set-ports=false -logtime=false start controller argo-server $(shell [ $(YARN) = true ] && echo ui || echo) endif .PHONY: wait diff --git a/Procfile b/Procfile index 988c299f6246..498c12e6765a 100644 --- a/Procfile +++ b/Procfile @@ -1,2 +1,3 @@ controller: DEFAULT_REQUEUE_TIME=${DEFAULT_REQUEUE_TIME} LEADER_ELECTION_IDENTITY=local ALWAYS_OFFLOAD_NODE_STATUS=${ALWAYS_OFFLOAD_NODE_STATUS} OFFLOAD_NODE_STATUS_TTL=30s WORKFLOW_GC_PERIOD=30s UPPERIO_DB_DEBUG=${UPPERIO_DB_DEBUG} ARCHIVED_WORKFLOW_GC_PERIOD=30s ./dist/workflow-controller --executor-image argoproj/argoexec:${VERSION} --namespaced=${NAMESPACED} --namespace ${NAMESPACE} --loglevel ${LOG_LEVEL} argo-server: UPPERIO_DB_DEBUG=${UPPERIO_DB_DEBUG} ./dist/argo --loglevel ${LOG_LEVEL} server --namespaced=${NAMESPACED} --namespace ${NAMESPACE} --auth-mode ${AUTH_MODE} --secure=$SECURE --x-frame-options=SAMEORIGIN +ui: yarn --cwd ui install && yarn --cwd ui start diff --git a/config/config.go b/config/config.go index d0cec26874ec..67c9071a19a4 100644 --- a/config/config.go +++ b/config/config.go @@ -12,6 +12,11 @@ import ( var EmptyConfigFunc = func() interface{} { return &Config{} } +type ResourceRateLimit struct { + Limit float64 `json:"limit"` + Burst int `json:"burst"` +} + // Config contain the configuration settings for the workflow controller type Config struct { @@ -75,6 +80,10 @@ type Config struct { // Parallelism limits the max total parallel workflows that can execute at the same time Parallelism int `json:"parallelism,omitempty"` + ResourceLimit int `json:"resourceLimit,omitempty"` + + ResourceRateLimit *ResourceRateLimit `json:"resourceRateLimit,omitempty"` + // Persistence contains the workflow persistence DB configuration Persistence *PersistConfig `json:"persistence,omitempty"` diff --git a/go.mod b/go.mod index 8533355e67f5..be0da40033b0 100644 --- a/go.mod +++ b/go.mod @@ -55,6 +55,7 @@ require ( golang.org/x/net v0.0.0-20201216054612-986b41b23924 golang.org/x/oauth2 v0.0.0-20200107190931-bf48bf16ab8d golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9 + golang.org/x/time v0.0.0-20200630173020-3af7569d3a1e golang.org/x/tools v0.0.0-20200717024301-6ddee64345a6 google.golang.org/api v0.20.0 google.golang.org/genproto v0.0.0-20200806141610-86f49bd18e98 diff --git a/test/e2e/manifests/mixins/workflow-controller-configmap.yaml b/test/e2e/manifests/mixins/workflow-controller-configmap.yaml index f5a9b9173457..a864b4b22472 100644 --- a/test/e2e/manifests/mixins/workflow-controller-configmap.yaml +++ b/test/e2e/manifests/mixins/workflow-controller-configmap.yaml @@ -4,6 +4,12 @@ metadata: name: workflow-controller-configmap data: containerRuntimeExecutor: pns + # total number of pending + running resources (i.e. pods) + resourceLimit: "5" + # limit or creation of resources (i.e. pods) + resourceRateLimit: | + limit: 0.01 + burst: 1 workflowDefaults: | spec: activeDeadlineSeconds: 300 diff --git a/workflow/controller/config.go b/workflow/controller/config.go index c05c08ef4eec..04edee192018 100644 --- a/workflow/controller/config.go +++ b/workflow/controller/config.go @@ -4,6 +4,7 @@ import ( "context" log "github.com/sirupsen/logrus" + "golang.org/x/time/rate" apiv1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/labels" "sigs.k8s.io/yaml" @@ -82,6 +83,11 @@ func (wfc *WorkflowController) updateConfig(v interface{}) error { } wfc.hydrator = hydrator.New(wfc.offloadNodeStatusRepo) wfc.updateEstimatorFactory() + if x := config.ResourceRateLimit; x != nil { + wfc.rateLimiter = rate.NewLimiter(rate.Limit(x.Limit), x.Burst) + } else { + wfc.rateLimiter = nil + } return nil } diff --git a/workflow/controller/controller.go b/workflow/controller/controller.go index 67fa259236e7..9e81d4993c40 100644 --- a/workflow/controller/controller.go +++ b/workflow/controller/controller.go @@ -11,6 +11,7 @@ import ( "github.com/argoproj/pkg/errors" syncpkg "github.com/argoproj/pkg/sync" log "github.com/sirupsen/logrus" + "golang.org/x/time/rate" apiv1 "k8s.io/api/core/v1" apierr "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -79,6 +80,7 @@ type WorkflowController struct { // restConfig is used by controller to send a SIGUSR1 to the wait sidecar using remotecommand.NewSPDYExecutor(). restConfig *rest.Config kubeclientset kubernetes.Interface + rateLimiter *rate.Limiter dynamicInterface dynamic.Interface wfclientset wfclientset.Interface diff --git a/workflow/controller/operator.go b/workflow/controller/operator.go index 78437e74f827..04249c6f4b67 100644 --- a/workflow/controller/operator.go +++ b/workflow/controller/operator.go @@ -89,7 +89,8 @@ type wfOperationCtx struct { deadline time.Time // activePods tracks the number of active (Running/Pending) pods for controlling // parallelism - activePods int64 + activePods int64 + createdResources int // we need to count how many resources we created in this operation // workflowDeadline is the deadline which the workflow is expected to complete before we // terminate the workflow. workflowDeadline *time.Time @@ -111,7 +112,9 @@ var ( // ErrDeadlineExceeded indicates the operation exceeded its deadline for execution ErrDeadlineExceeded = errors.New(errors.CodeTimeout, "Deadline exceeded") // ErrParallelismReached indicates this workflow reached its parallelism limit - ErrParallelismReached = errors.New(errors.CodeForbidden, "Max parallelism reached") + ErrParallelismReached = errors.New(errors.CodeForbidden, "Max parallelism reached") + ErrResourceRateLimitReached = errors.New(errors.CodeForbidden, "rate-limit for resource creation reached") + ErrResourceLimitReached = errors.New(errors.CodeForbidden, "limiting for max resources reached") // ErrTimeout indicates a specific template timed out ErrTimeout = errors.New(errors.CodeTimeout, "timeout") ) @@ -883,7 +886,7 @@ func (woc *wfOperationCtx) podReconciliation(ctx context.Context) error { if node.Fulfilled() && !node.IsDaemoned() { if tmpVal, tmpOk := pod.Labels[common.LabelKeyCompleted]; tmpOk { if tmpVal == "true" { - return + return } } woc.completedPods[pod.ObjectMeta.Name] = true diff --git a/workflow/controller/workflowpod.go b/workflow/controller/workflowpod.go index 774437b67329..8a9c88419c2a 100644 --- a/workflow/controller/workflowpod.go +++ b/workflow/controller/workflowpod.go @@ -376,6 +376,14 @@ func (woc *wfOperationCtx) createWorkflowPod(ctx context.Context, nodeName strin pod.Spec.ActiveDeadlineSeconds = &newActiveDeadlineSeconds } + if limit := woc.controller.Config.ResourceLimit; limit > 0 && len(woc.controller.podInformer.GetIndexer().ListKeys())+woc.createdResources >= limit { + return nil, ErrResourceLimitReached + } + + if rl := woc.controller.rateLimiter; rl != nil && !rl.Allow() { + return nil, ErrResourceRateLimitReached + } + created, err := woc.controller.kubeclientset.CoreV1().Pods(woc.wf.ObjectMeta.Namespace).Create(ctx, pod, metav1.CreateOptions{}) if err != nil { if apierr.IsAlreadyExists(err) { @@ -392,6 +400,7 @@ func (woc *wfOperationCtx) createWorkflowPod(ctx context.Context, nodeName strin } woc.log.Infof("Created pod: %s (%s)", nodeName, created.Name) woc.activePods++ + woc.createdResources++ return created, nil } From 04e7c544e3f2b4ee7d0f466952ccd5191a8680a1 Mon Sep 17 00:00:00 2001 From: Alex Collins Date: Fri, 15 Jan 2021 17:30:47 -0800 Subject: [PATCH 02/12] :limit: M Makefile Signed-off-by: Alex Collins --- Makefile | 8 +++----- .../mixins/workflow-controller-configmap.yaml | 11 +++++++---- .../app/workflows/components/workflow-creator.tsx | 4 ++-- workflow/controller/operator.go | 13 ++++++------- workflow/controller/workflowpod.go | 3 ++- 5 files changed, 20 insertions(+), 19 deletions(-) diff --git a/Makefile b/Makefile index 917afd57d5d4..c463f5ab96fb 100644 --- a/Makefile +++ b/Makefile @@ -173,17 +173,15 @@ images: cli-image executor-image controller-image .PHONY: cli cli: dist/argo argo-server.crt argo-server.key +ifeq ($(STATIC_FILES),true) ui/dist/app/index.html: $(shell find ui/src -type f && find ui -maxdepth 1 -type f) - # Build UI - @mkdir -p ui/dist/app -ifeq ($(YARN),true) # `yarn install` is fast (~2s), so you can call it safely. JOBS=max yarn --cwd ui install -endif -ifeq ($(STATIC_FILES),true) # `yarn build` is slow, so we guard it with a up-to-date check. JOBS=max yarn --cwd ui build else +ui/dist/app/index.html: + @mkdir -p ui/dist/app echo "Built without static files" > ui/dist/app/index.html endif diff --git a/test/e2e/manifests/mixins/workflow-controller-configmap.yaml b/test/e2e/manifests/mixins/workflow-controller-configmap.yaml index a864b4b22472..05bb20c2805b 100644 --- a/test/e2e/manifests/mixins/workflow-controller-configmap.yaml +++ b/test/e2e/manifests/mixins/workflow-controller-configmap.yaml @@ -4,11 +4,14 @@ metadata: name: workflow-controller-configmap data: containerRuntimeExecutor: pns - # total number of pending + running resources (i.e. pods) - resourceLimit: "5" - # limit or creation of resources (i.e. pods) + # The maximum number number of incomplete pods. + resourceLimit: "3" + # Rate-limit or creation of resources (i.e. pods) + # In this example, we throttle to 10 per second. + # When pod creation is throttled by this, the workflow will get re-queued. + # This means the workflow may not be reconciled for 10s (by default), therefore no more pods may be created. resourceRateLimit: | - limit: 0.01 + limit: 10 burst: 1 workflowDefaults: | spec: diff --git a/ui/src/app/workflows/components/workflow-creator.tsx b/ui/src/app/workflows/components/workflow-creator.tsx index 6c58b5813003..0a45a8bbf847 100644 --- a/ui/src/app/workflows/components/workflow-creator.tsx +++ b/ui/src/app/workflows/components/workflow-creator.tsx @@ -68,14 +68,14 @@ export const WorkflowCreator = ({namespace, onCreate}: {namespace: string; onCre

Submit new workflow

Either:

-