Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(controller): k8s_request_total and workflow_condition metrics #4811

Merged
merged 20 commits into from
Jan 12, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -34,3 +34,4 @@ git-ask-pass.sh
/.brew_home
/go-diagrams/
/.run/
pprof
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍

52 changes: 31 additions & 21 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ KUBE_NAMESPACE ?= argo

VERSION := latest
DEV_IMAGE := true
DOCKER_PUSH := false

# VERSION is the version to be used for files in manifests and should always be latest unless we are releasing
# we assume HEAD means you are on a tag
Expand Down Expand Up @@ -73,6 +74,9 @@ NAMESPACED := true
ifeq ($(PROFILE),prometheus)
RUN_MODE := kubernetes
endif
ifeq ($(PROFILE),stress)
RUN_MODE := kubernetes
endif

ALWAYS_OFFLOAD_NODE_STATUS := false
ifeq ($(PROFILE),mysql)
Expand Down Expand Up @@ -144,6 +148,7 @@ define docker_build
docker build --progress plain -t $(IMAGE_NAMESPACE)/$(1):$(VERSION) --target $(1) -f $(DOCKERFILE) --build-arg IMAGE_OS=$(OUTPUT_IMAGE_OS) --build-arg IMAGE_ARCH=$(OUTPUT_IMAGE_ARCH) .
if [ $(DEV_IMAGE) = true ]; then mv $(2) dist/$(2)-$(OUTPUT_IMAGE_OS)-$(OUTPUT_IMAGE_ARCH); fi
if [ $(K3D) = true ]; then k3d image import $(IMAGE_NAMESPACE)/$(1):$(VERSION); fi
if [ $(DOCKER_PUSH) = true ] && [ $(IMAGE_NAMESPACE) != argoproj ] ; then docker push $(IMAGE_NAMESPACE)/$(1):$(VERSION) ; fi
touch $(3)
endef
define docker_pull
Expand Down Expand Up @@ -390,15 +395,21 @@ endif
test: server/static/files.go
env KUBECONFIG=/dev/null $(GOTEST) ./...

dist/$(PROFILE).yaml: $(MANIFESTS) $(E2E_MANIFESTS) /usr/local/bin/kustomize
mkdir -p dist
kustomize build --load_restrictor=none test/e2e/manifests/$(PROFILE) | sed 's/:latest/:$(VERSION)/' | sed 's/pns/$(E2E_EXECUTOR)/' > dist/$(PROFILE).yaml

.PHONY: install
install: dist/$(PROFILE).yaml
cat test/e2e/manifests/argo-ns.yaml | sed 's/argo/$(KUBE_NAMESPACE)/' > dist/argo-ns.yaml
kubectl apply -f dist/argo-ns.yaml
kubectl -n $(KUBE_NAMESPACE) apply -l app.kubernetes.io/part-of=argo --prune --force -f dist/$(PROFILE).yaml
install: $(MANIFESTS) $(E2E_MANIFESTS) /usr/local/bin/kustomize
kubectl get ns $(KUBE_NAMESPACE) || kubectl create ns $(KUBE_NAMESPACE)
kubectl config set-context --current --namespace=$(KUBE_NAMESPACE)
@echo "installing PROFILE=$(PROFILE) VERSION=$(VERSION), E2E_EXECUTOR=$(E2E_EXECUTOR)"
kustomize build --load_restrictor=none test/e2e/manifests/$(PROFILE) | sed 's/image: argoproj/image: $(IMAGE_NAMESPACE)/' | sed 's/:latest/:$(VERSION)/' | sed 's/pns/$(E2E_EXECUTOR)/' | kubectl -n $(KUBE_NAMESPACE) apply -f -
kubectl -n $(KUBE_NAMESPACE) apply -f test/stress/massive-workflow.yaml
kubectl -n $(KUBE_NAMESPACE) rollout restart deploy workflow-controller
kubectl -n $(KUBE_NAMESPACE) rollout restart deploy argo-server
kubectl -n $(KUBE_NAMESPACE) rollout restart deploy minio
ifeq ($(RUN_MODE),kubernetes)
# scale to 2 replicas so we touch upon leader election
kubectl -n $(KUBE_NAMESPACE) scale deploy/workflow-controller --replicas 2
kubectl -n $(KUBE_NAMESPACE) scale deploy/argo-server --replicas 1
endif

.PHONY: pull-build-images
pull-build-images:
Expand All @@ -421,36 +432,35 @@ test-images:
$(call docker_pull,argoproj/argosay:v2)
$(call docker_pull,python:alpine3.6)

.PHONY: stop
stop:
killall argo workflow-controller kubectl || true

$(GOPATH)/bin/goreman:
go get github.com/mattn/goreman

.PHONY: start
start: stop install controller cli executor-image $(GOPATH)/bin/goreman
kubectl config set-context --current --namespace=$(KUBE_NAMESPACE)
ifeq ($(RUN_MODE),kubernetes)
$(MAKE) controller-image cli-image
kubectl -n $(KUBE_NAMESPACE) scale deploy/workflow-controller --replicas 1
kubectl -n $(KUBE_NAMESPACE) scale deploy/argo-server --replicas 1
start: controller-image cli-image install executor-image
else
start: install controller cli executor-image $(GOPATH)/bin/goreman
endif
ifeq ($(RUN_MODE),kubernetes)
kubectl -n $(KUBE_NAMESPACE) wait --for=condition=Ready pod -l app=argo-server --timeout 1m
kubectl -n $(KUBE_NAMESPACE) wait --for=condition=Ready pod -l app=workflow-controller --timeout 1m
kubectl -n $(KUBE_NAMESPACE) wait --for=condition=Available deploy argo-server
kubectl -n $(KUBE_NAMESPACE) wait --for=condition=Available deploy workflow-controller
endif
ifeq ($(PROFILE),prometheus)
kubectl -n $(KUBE_NAMESPACE) wait --for=condition=Ready pod -l app=prometheus --timeout 1m
kubectl -n $(KUBE_NAMESPACE) wait --for=condition=Available deploy prometheus
endif
ifeq ($(PROFILE),stress)
kubectl -n $(KUBE_NAMESPACE) wait --for=condition=Available deploy prometheus
endif
./hack/port-forward.sh
# Check dex, minio, postgres and mysql are in hosts file
ifeq ($(AUTH_MODE),sso)
grep '127.0.0.1[[:blank:]]*dex' /etc/hosts
endif
grep '127.0.0.1[[:blank:]]*minio' /etc/hosts
grep '127.0.0.1[[:blank:]]*postgres' /etc/hosts
grep '127.0.0.1[[:blank:]]*mysql' /etc/hosts
# allow time for pods to terminate
sleep 10s
./hack/port-forward.sh
ifeq ($(RUN_MODE),local)
env DEFAULT_REQUEUE_TIME=$(DEFAULT_REQUEUE_TIME) SECURE=$(SECURE) ALWAYS_OFFLOAD_NODE_STATUS=$(ALWAYS_OFFLOAD_NODE_STATUS) LOG_LEVEL=$(LOG_LEVEL) UPPERIO_DB_DEBUG=$(UPPERIO_DB_DEBUG) VERSION=$(VERSION) AUTH_MODE=$(AUTH_MODE) NAMESPACED=$(NAMESPACED) NAMESPACE=$(KUBE_NAMESPACE) $(GOPATH)/bin/goreman -set-ports=false -logtime=false start
endif
Expand Down
9 changes: 9 additions & 0 deletions cmd/workflow-controller/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ package main
import (
"context"
"fmt"
"net/http"
_ "net/http/pprof"
"os"
"time"

Expand All @@ -21,6 +23,7 @@ import (
wfclientset "github.com/argoproj/argo/pkg/client/clientset/versioned"
cmdutil "github.com/argoproj/argo/util/cmd"
"github.com/argoproj/argo/workflow/controller"
"github.com/argoproj/argo/workflow/metrics"
)

const (
Expand Down Expand Up @@ -64,6 +67,8 @@ func NewRootCommand() *cobra.Command {
config.Burst = burst
config.QPS = qps

metrics.AddMetricsTransportWrapper(config)

namespace, _, err := clientConfig.Namespace()
if err != nil {
return err
Expand All @@ -89,6 +94,10 @@ func NewRootCommand() *cobra.Command {

go wfController.Run(ctx, workflowWorkers, workflowTTLWorkers, podWorkers, podCleanupWorkers)

go func() {
log.Println(http.ListenAndServe("localhost:6060", nil))
}()
Comment on lines +97 to +99
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why the log.Println?


// Wait forever
select {}
},
Expand Down
68 changes: 59 additions & 9 deletions docs/metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -61,15 +61,65 @@ a way to view and analyze historical data, consider the [workflow archive](workf

### Default Controller Metrics

There are several controller-level metrics. These include:

* `workflows_processed_count`: a count of all Workflow updates processed by the controller
* `count`: a count of all workflows currently accessible by the controller by status
* `operation_duration_seconds`: a histogram of durations of operations
* `error_count`: a count of certain errors incurred by the controller
* `queue_depth_count`: the depth of the queue of workflows or cron workflows to be processed by the controller
* `queue_adds_count`: the number of adds to the queue of workflows or cron workflows
* `queue_latency`: the time workflows or cron workflows spend in the queue waiting to be processed
Metrics for the Four Golden Signals are:

* Latency: `argo_workflows_queue_latency`
* Traffic: `argo_workflows_count` and `argo_workflows_queue_depth_count`
* Errors: `argo_workflows_count` and `argo_workflows_error_count`
* Saturation: `argo_workflows_workers_busy` and `argo_workflows_workflow_condition`

<!-- titles should be the exact metric name for deep-linking, alphabetical ordered -->

#### argo_pod_missing

Pods were not seen, e.g. because they were deleted by Kubernetes. You should only see this under high load.

!!! NOTE
This metric's name starts with `argo_` not `argo_workflows_`.

#### argo_workflows_count

Number of workflows in each phase. The `Running` count does not mean that a workflow's pods are running, just that the controller has scheduled them. A workflow can be stuck in `Running` with pending pods for a long time.

#### argo_workflows_error_count

A count of certain errors incurred by the controller.

#### argo_workflows_k8s_request_total

Number of API requests sent to the Kubernetes API.

#### argo_workflows_operation_duration_seconds

A histogram of durations of operations.

#### argo_workflows_pods_count

It is possible for a workflow to start, but for no pods to be running (e.g. the cluster is too busy to run them). This metric sheds light on actual work being done.

#### argo_workflows_queue_adds_count

The number of additions to the queue of workflows or cron workflows.

#### argo_workflows_queue_depth_count

The depth of the queue of workflows or cron workflows to be processed by the controller.

#### argo_workflows_queue_latency

The time workflows or cron workflows spend in the queue waiting to be processed.

#### argo_workflows_workers_busy

The number of workers that are busy.

#### argo_workflows_workflow_condition

The number of workflows with different conditions. This will tell you the number of workflows with running pods.

#### argo_workflows_workflows_processed_count

A count of all Workflow updates processed by the controller.

### Metric types

Expand Down
32 changes: 32 additions & 0 deletions docs/stress-testing.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Stress Testing

Create a cluster in [`jesse-sb` project](https://console.cloud.google.com/access/iam?cloudshell=false&project=jesse-sb).

Install `gcloud` binary.

Log in to GCP: `gcloud auth login`

Connect to your new cluster.

Make sure you've logged in to Docker Hub: `docker login`

Run `make start PROFILE=stress IMAGE_NAMESPACE=alexcollinsintuit DOCKER_PUSH=true`.

If this fails, just try running it again.

Open http://localhost:2746 and check you can run a workflow.

Open `test/stress/main.go` and run it with a small number of workflows (e.g. 10) and make sure they complete.

Do you get `ImagePullBackOff`? Make sure image is `argoproj/argosay:v2` in `kubectl -n argo edit workflowtemplate massive-workflow`.

Open http://localhost:9091/graph.

You can use [this Tab Auto Refresh Chrome extension](https://chrome.google.com/webstore/detail/tab-auto-refresh/oomoeacogjkolheacgdkkkhbjipaomkn) to auto-refresh the page.

Open `test/stress/main.go` and run it with a large number (e.g. 10000).

Use Prometheus to analyse this.

Finally, you can capture PProf using `./hack/capture-pprof.sh`.

14 changes: 14 additions & 0 deletions hack/capture-pprof.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#!/usr/bin/env bash
# Capture a set of pprof profiles (heap, allocs, block, mutex and a 30 second
# CPU profile) from the pprof HTTP endpoint on localhost:6060 and render each
# one as a PNG under dist/, suffixed with the current Unix timestamp.
#
# Expects the endpoint to already be reachable on localhost:6060.
set -eu -o pipefail

echo " https://blog.golang.org/pprof"

# Work from the parent of this script's directory. Quoted so that a checkout
# path containing spaces does not get word-split.
cd "$(dirname "$0")/.."

# Make sure the output directory exists before pprof tries to write into it.
mkdir -p dist

# One timestamp shared by every file captured in this run.
n=$(date +%s)

# URLs are quoted: the `?` in the profile URL is otherwise a shell glob character.
go tool pprof -png -output "dist/heap-$n.png" "http://localhost:6060/debug/pprof/heap"
go tool pprof -png -output "dist/allocs-$n.png" "http://localhost:6060/debug/pprof/allocs"
go tool pprof -png -output "dist/block-$n.png" "http://localhost:6060/debug/pprof/block"
go tool pprof -png -output "dist/mutex-$n.png" "http://localhost:6060/debug/pprof/mutex"
go tool pprof -png -output "dist/profile-$n.png" "http://localhost:6060/debug/pprof/profile?seconds=30"
14 changes: 10 additions & 4 deletions hack/port-forward.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,11 @@ info() {
echo '[INFO] ' "$@"
}

pf MinIO pod/minio 9000
killall kubectl || true

if [[ "$(kubectl -n argo get pod -l app=minio -o name)" != "" ]]; then
pf MinIO deploy/minio 9000
fi

dex=$(kubectl -n argo get pod -l app=dex -o name)
if [[ "$dex" != "" ]]; then
Expand All @@ -39,13 +43,15 @@ if [[ "$mysql" != "" ]]; then
fi

if [[ "$(kubectl -n argo get pod -l app=argo-server -o name)" != "" ]]; then
pf "Argo Server" deploy/argo-server 2746
pf "Argo Server" svc/argo-server 2746
fi

if [[ "$(kubectl -n argo get pod -l app=workflow-controller -o name)" != "" ]]; then
pf "Workflow Controller" deploy/workflow-controller 9090
pf "Workflow Controller Metrics" svc/workflow-controller-metrics 9090
pf "Workflow Controller PProf" svc/workflow-controller-pprof 6060
fi

if [[ "$(kubectl -n argo get pod -l app=prometheus -o name)" != "" ]]; then
pf "Prometheus Server" deploy/prometheus 9091 9090
pf "Prometheus Server" svc/prometheus 9091 9090
fi

Loading