Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Move statsd agent to a deployment on the operator nodegroup #2247

Merged
merged 6 commits into from
Jun 17, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 3 additions & 5 deletions cmd/dequeuer/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ func main() {
userContainerPort int
apiName string
jobID string
statsdPort int
statsdAddress string
apiKind string
adminPort int
)
Expand All @@ -58,8 +58,8 @@ func main() {
flag.StringVar(&apiKind, "api-kind", "", fmt.Sprintf("api kind (%s|%s)", userconfig.BatchAPIKind.String(), userconfig.AsyncAPIKind.String()))
flag.StringVar(&apiName, "api-name", "", "api name")
flag.StringVar(&jobID, "job-id", "", "job ID")
flag.StringVar(&statsdAddress, "statsd-address", "", "address to push statsd metrics")
flag.IntVar(&userContainerPort, "user-port", 8080, "target port to which the dequeued messages will be sent to")
flag.IntVar(&statsdPort, "statsd-port", 9125, "port for to send udp statsd metrics")
flag.IntVar(&adminPort, "admin-port", 0, "port where the admin server (for the probes) will be exposed")

flag.Parse()
Expand All @@ -69,8 +69,6 @@ func main() {
version = consts.CortexVersion
}

hostIP := os.Getenv("HOST_IP")

log := logging.GetLogger()
defer func() {
_ = log.Sync()
Expand Down Expand Up @@ -158,7 +156,7 @@ func main() {
TargetURL: targetURL,
}

metricsClient, err := statsd.New(fmt.Sprintf("%s:%d", hostIP, statsdPort))
metricsClient, err := statsd.New(statsdAddress)
if err != nil {
exit(log, err, "unable to initialize metrics client")
}
Expand Down
40 changes: 21 additions & 19 deletions manager/manifests/prometheus-statsd-exporter.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,17 +23,13 @@ data:
observer_type: histogram

---

apiVersion: apps/v1
kind: DaemonSet
kind: Deployment
metadata:
name: prometheus-statsd-exporter
namespace: default
spec:
updateStrategy:
type: RollingUpdate
rollingUpdate:
maxUnavailable: 1
replicas: 1
selector:
matchLabels:
name: prometheus-statsd-exporter
Expand Down Expand Up @@ -62,7 +58,6 @@ spec:
protocol: TCP
- name: statsd-udp
containerPort: 9125
hostPort: 9125
protocol: UDP
livenessProbe:
httpGet:
Expand All @@ -86,16 +81,23 @@ spec:
items:
- key: statsd-mapping.yaml
path: statsd-mapping.yaml
nodeSelector:
workload: "true"
terminationGracePeriodSeconds: 60
tolerations:
- key: aws.amazon.com/neuron
operator: Exists
effect: NoSchedule
- key: nvidia.com/gpu
operator: Exists
effect: NoSchedule
- key: workload
operator: Exists
effect: NoSchedule

---
apiVersion: v1
kind: Service
metadata:
namespace: default
name: prometheus-statsd-exporter
labels:
cortex.dev/name: prometheus-statsd-exporter
spec:
selector:
name: prometheus-statsd-exporter
ports:
- port: 9125
name: statsd-udp
protocol: UDP
- port: 9102
name: metrics
protocol: TCP
5 changes: 1 addition & 4 deletions pkg/consts/consts.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,7 @@ var (
AdminPortName = "admin"
AdminPortStr = "15000"
AdminPortInt32 = int32(15000)

StatsDPortStr = "9125"

AuthHeader = "X-Cortex-Authorization"
AuthHeader = "X-Cortex-Authorization"

DefaultInClusterConfigPath = "/configs/cluster/cluster.yaml"
MaxBucketLifecycleRules = 100
Expand Down
6 changes: 2 additions & 4 deletions pkg/operator/resources/validations.go
Original file line number Diff line number Diff line change
Expand Up @@ -106,23 +106,21 @@ func ValidateClusterAPIs(apis []userconfig.API) error {
CPU Reservations:

FluentBit 100
StatsDExporter 100
NodeExporter 110 (it has two containers)
KubeProxy 100
AWS cni 10
Reserved (150 + 150) see eks.yaml for details
*/
var _cortexCPUReserve = kresource.MustParse("720m")
var _cortexCPUReserve = kresource.MustParse("620m")

/*
Memory Reservations:

FluentBit 150
StatsDExporter 100
NodeExporter 200 (it has two containers)
Reserved (300 + 300 + 200) see eks.yaml for details
*/
var _cortexMemReserve = kresource.MustParse("1250Mi")
var _cortexMemReserve = kresource.MustParse("1150Mi")

var _nvidiaCPUReserve = kresource.MustParse("100m")
var _nvidiaMemReserve = kresource.MustParse("100Mi")
Expand Down
24 changes: 6 additions & 18 deletions pkg/workloads/k8s.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,8 @@ const (
_clusterConfigDirVolume = "cluster-config"
_clusterConfigConfigMap = "cluster-config"
_clusterConfigDir = "/configs/cluster"

_statsdAddress = "prometheus-statsd-exporter.default:9125"
)

var (
Expand Down Expand Up @@ -128,18 +130,11 @@ func asyncDequeuerProxyContainer(api spec.API, queueURL string) (kcore.Container
"--queue", queueURL,
"--api-kind", api.Kind.String(),
"--api-name", api.Name,
"--statsd-address", _statsdAddress,
"--user-port", s.Int32(*api.Pod.Port),
"--statsd-port", consts.StatsDPortStr,
"--admin-port", consts.AdminPortStr,
},
Env: append(baseEnvVars, kcore.EnvVar{
Name: "HOST_IP",
ValueFrom: &kcore.EnvVarSource{
FieldRef: &kcore.ObjectFieldSelector{
FieldPath: "status.hostIP",
},
},
}),
Env: baseEnvVars,
Ports: []kcore.ContainerPort{
{
Name: consts.AdminPortName,
Expand Down Expand Up @@ -181,18 +176,11 @@ func batchDequeuerProxyContainer(api spec.API, jobID, queueURL string) (kcore.Co
"--api-kind", api.Kind.String(),
"--api-name", api.Name,
"--job-id", jobID,
"--statsd-address", _statsdAddress,
"--user-port", s.Int32(*api.Pod.Port),
"--statsd-port", consts.StatsDPortStr,
"--admin-port", consts.AdminPortStr,
},
Env: append(baseEnvVars, kcore.EnvVar{
Name: "HOST_IP",
ValueFrom: &kcore.EnvVarSource{
FieldRef: &kcore.ObjectFieldSelector{
FieldPath: "status.hostIP",
},
},
}),
Env: baseEnvVars,
ReadinessProbe: &kcore.Probe{
Handler: kcore.Handler{
HTTPGet: &kcore.HTTPGetAction{
Expand Down