# Configuration of dependency charts
# -------------------------------------------------------------------------------
#
# cluster-autoscaler is responsible for understanding if k8s nodes need to be
# added or removed. A pending pod is a sign that another node should be added,
# and an underused node is a sign that a node should be removed.
#
# We always need a cluster-autoscaler, but only deploy it ourselves if it's not
# provided as part of the k8s cluster.
#
# values ref: https://github.com/kubernetes/autoscaler/blob/master/charts/cluster-autoscaler/values.yaml
#
cluster-autoscaler:
  enabled: false
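
# A hedged way to check whether a cluster already ships its own autoscaler
# before enabling this one is to look for an autoscaler pod in kube-system;
# note this only catches autoscalers deployed inside the cluster - managed
# offerings (e.g. GKE) run theirs on the control plane instead:
#
#   kubectl get pods -n kube-system | grep -i autoscaler
#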

# ingress-nginx is responsible for proxying traffic based on k8s Ingress
# resources defined in the k8s cluster.
#
# Typically, all inbound traffic arrives at an ingress-nginx controller pod
# via a k8s Service that has a public IP, and is thereafter proxied to a k8s
# Service (a Pod selected by a k8s Service) that doesn't have a public IP.
# A commented example Ingress resource is sketched after this block.
#
# values ref: https://github.com/kubernetes/ingress-nginx/blob/main/charts/ingress-nginx/values.yaml
#
ingress-nginx:
  controller:
    podLabels:
      # ingress-nginx controllers need to be allowed to proxy traffic onwards
      # to JupyterHub's proxy pod (and only the proxy pod) in clusters with
      # NetworkPolicy enforcement enabled. Adding this label on the controller
      # pod allows that.
      #
      # ref: https://z2jh.jupyter.org/en/stable/administrator/security.html#introduction-to-the-chart-s-four-network-policies
      #
      hub.jupyter.org/network-access-proxy-http: "true"
    resources:
      requests:
        cpu: 100m # chart's default is 100m
        # This pod has been observed at 104-131Mi so far in 2i2c clusters, above
        # the default request of 90Mi. To protect it from being evicted for
        # using more memory than requested, we have increased the memory request.
        memory: 250Mi # chart's default is 90Mi
    # allowSnippetAnnotations was set to false by default in the chart as a
    # result of the CVE linked below. We should investigate if there are other
    # ways we can accomplish this type of config.
    # https://github.com/kubernetes/ingress-nginx/issues/7837
    allowSnippetAnnotations: true
    admissionWebhooks:
      # Disabled as cert-manager in EKS 1.30 ran into issues working with
      # Ingress objects because calls to this webhook failed, with an error
      # message including this:
      #
      # err="Internal error occurred: failed calling webhook \"validate.nginx.ingress.kubernetes.io\": failed to call webhook: Post \"https://support-ingress-nginx-controller-admission.support.svc:443/networking/v1/ingresses?timeout=10s\": tls: failed to verify certificate: x509: certificate signed by unknown authority"
      #
      # Disabling this shouldn't be an issue, it just helps catch invalid
      # configurations beyond what the k8s api-server will catch.
      #
      enabled: false
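
# As an illustrative (hypothetical) example of what this controller acts on: an
# Ingress resource like the following, created in a hub's namespace, would make
# ingress-nginx route traffic for the given host to the hub's proxy Service.
# Names and hosts here are made up; the real Ingress objects are created by the
# JupyterHub and other charts, not by this file.
#
#   apiVersion: networking.k8s.io/v1
#   kind: Ingress
#   metadata:
#     name: example-hub
#   spec:
#     ingressClassName: nginx
#     rules:
#       - host: hub.example.2i2c.cloud
#         http:
#           paths:
#             - path: /
#               pathType: Prefix
#               backend:
#                 service:
#                   name: proxy-public
#                   port:
#                     number: 80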

# prometheus is responsible for collecting metrics and providing them to
# grafana on request. It comes with several dependency charts, of which we
# opt out of a few.
#
# values ref: https://github.com/prometheus-community/helm-charts/blob/main/charts/prometheus/values.yaml
#
prometheus:
  # alertmanager is an optional prometheus chart dependency that we opt out of
  # as we favor Grafana for this functionality. Grafana provides alerts and does
  # so with a better UI that we expose publicly behind auth anyhow.
  #
  alertmanager:
    enabled: false
  # prometheus-pushgateway is an optional prometheus chart dependency that we
  # opt out of. pushgateway complements prometheus server's behavior of
  # scraping metrics from services by allowing services to push metrics to
  # prometheus instead.
  #
  prometheus-pushgateway:
    enabled: false
  # kube-state-metrics is an optional prometheus chart dependency that we rely
  # on to collect metrics about the k8s cluster's pods and nodes as reported by
  # the k8s api-server.
  #
  # values ref: https://github.com/prometheus-community/helm-charts/blob/main/charts/kube-state-metrics/values.yaml
  #
  kube-state-metrics:
    # kube-state-metrics stopped collecting *all* labels on all kubernetes
    # objects as prometheus labels, as it exploded the total number of metrics
    # being collected. We have to explicitly allow-list the labels we want.
    # These are the ones needed for the graphs from
    # https://github.com/jupyterhub/grafana-dashboards to work.
    metricLabelsAllowlist:
      - pods=[app,component,hub.jupyter.org/username,app.kubernetes.io/component]
      - nodes=[*]
      - services=[app, component]
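    # The allow-listed labels above surface on kube-state-metrics' kube_*_labels
    # series with a label_ prefix and sanitized characters. As a hedged sketch
    # (per kube-state-metrics' naming conventions), a PromQL query selecting
    # user server pods by their JupyterHub username label could look like:
    #
    #   kube_pod_labels{label_hub_jupyter_org_username!=""}
    #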
  # prometheus-node-exporter is an optional prometheus chart dependency that we
  # rely on to collect metrics about the nodes.
  #
  # values ref: https://github.com/prometheus-community/helm-charts/blob/main/charts/prometheus-node-exporter/values.yaml
  #
  prometheus-node-exporter:
    # resources for the node-exporter were set after inspecting cpu and memory
    # use via prometheus and grafana.
    #
    # node-exporter is typically found using between 0-3m CPU and 2-22Mi memory,
    # but we've seen it fail to report cpu/memory use metrics from time to time
    # when requesting and limiting to 5m, so we've increased the request/limit
    # to 10m.
    #
    # PromQL queries for CPU and memory use:
    # - CPU: sum(rate(container_cpu_usage_seconds_total{container="node-exporter", namespace="support"}[5m])) by (pod)
    # - Memory: sum(container_memory_usage_bytes{container="node-exporter", namespace="support"}) by (pod)
    #
    resources:
      limits:
        cpu: 10m
        memory: 30Mi
      requests:
        cpu: 10m
        memory: 30Mi
  networkPolicy:
    enabled: true
  server:
    # Prometheus sometimes seems to run into bugs where the 'write ahead log'
    # used for recovery from crashes and network failures gets massive, and
    # requires an order of magnitude more RAM to start up than usual. This could
    # have been greatly improved in prometheus 2.45, which is used as of
    # 2023-06-28, but we stick with a workaround for now until other users have
    # concluded things have greatly improved for them in prometheus 2.45+.
    #
    # ref: https://github.com/prometheus/prometheus/issues/6934#issuecomment-1610921555
    #
    # We are deleting the WAL at each startup now, and it has made us lose some
    # data during restarts, but that is preferable to having the entire
    # prometheus server be down.
    #
    # We can't use initContainers here, as they are not run when a container
    # restarts - only when the pod itself is recreated. And unfortunately there
    # isn't a simple script we can run that will pass through the args to
    # /bin/prometheus after running the rm. So we improvise here - this is
    # the command line of what prometheus is actually executing, copied over.
    # This must be revalidated with each bump of the prometheus helm chart!
    #
    # IMPORTANT: When upgrading the prometheus chart we need to manually adjust
    # for changes to...
    #
    # - The Deployment resource template's container args:
    #   https://github.com/prometheus-community/helm-charts/blob/41a8b07fb3477be61a58ad9acd8879625eabf0d0/charts/prometheus/templates/deploy.yaml#L129-L163
    #
    # - The default values of the config referenced in the template:
    #   https://github.com/prometheus-community/helm-charts/blob/41a8b07fb3477be61a58ad9acd8879625eabf0d0/charts/prometheus/values.yaml
    #
    # In the commit 41a8b07f linked above, this config was
    # referenced and defaulted to:
    #
    # server.retention                  - "15d"
    # server.retentionSize              - ""
    # server.configPath                 - "/etc/config/prometheus.yml"
    # server.storagePath                - ""
    # server.persistentVolume.mountPath - "/data"
    # server.extraFlags                 - ["web.enable-lifecycle"]
    # server.extraArgs                  - {}
    # server.prefixURL                  - ""
    # server.baseURL                    - ""
    #
    # Currently our adjustments from the defaults are to
    # server.retention and server.extraFlags, configured
    # directly below server.defaultFlagsOverride.
    #
    command:
      - /bin/sh
    defaultFlagsOverride:
      - -c
      # If changing this, please also find wherever this has been copied to and
      # change those as well, as appropriate.
      - "rm -rvf /data/wal/* && \
        exec /bin/prometheus \
        --storage.tsdb.retention.time=1098d \
        --config.file=/etc/config/prometheus.yml \
        --storage.tsdb.path=/data \
        --web.console.libraries=/etc/prometheus/console_libraries \
        --web.console.templates=/etc/prometheus/consoles \
        --web.enable-lifecycle"
    # extraFlags MUST BE UPDATED in prometheus.server.defaultFlagsOverride as well
    extraFlags:
      - web.enable-lifecycle
    # retention MUST BE UPDATED in prometheus.server.defaultFlagsOverride as well
    retention: 1098d # Keep data for at least 3 years
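    # When revalidating the copied command line above after a chart upgrade, a
    # hedged way to compare it against what is actually running (resource names
    # assumed to match this chart's "support" release in the "support"
    # namespace) is:
    #
    #   kubectl get deploy support-prometheus-server -n support \
    #     -o jsonpath='{.spec.template.spec.containers[*].args}'
    #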
    ingress:
      ingressClassName: nginx
      annotations:
        # Annotations required to enable basic authentication for any ingress
        # into prometheus server from the outside world. The secret is created
        # via the templates/prometheus-ingress-auth/secret.yaml file in the
        # support chart, and its contents are controlled by config under
        # prometheusIngressAuthSecret. Ingress is not enabled by default, so in
        # whichever clusters we want this, we should enable it explicitly.
        nginx.ingress.kubernetes.io/auth-type: basic
        nginx.ingress.kubernetes.io/auth-secret: prometheus-ingress-auth-basic
        nginx.ingress.kubernetes.io/auth-realm: "Authentication Required"
        # If we enable external ingress into prometheus, we must secure it with HTTPS
        cert-manager.io/cluster-issuer: letsencrypt-prod
        # Increase the timeout for each query made by the grafana frontend to
        # the grafana backend to 2min, which is the default timeout for
        # prometheus queries. This also matches the timeout for the dataproxy in
        # grafana, set under `grafana.grafana.ini` below. These two timeouts are
        # set together to give prometheus the best chance of executing the
        # queries we care about. While a grafana in the same cluster does not go
        # through nginx to access prometheus, the central grafana we use does go
        # through nginx, so we increase the timeout here as well.
        nginx.ingress.kubernetes.io/proxy-read-timeout: "120"
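      # Once this ingress and prometheusIngressAuthSecret are enabled for a
      # cluster, the server can be queried from outside with HTTP basic auth.
      # A hedged example (the hostname is hypothetical and cluster specific):
      #
      #   curl -u "$USERNAME:$PASSWORD" \
      #     "https://prometheus.example.2i2c.cloud/api/v1/query?query=up"
      #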
    strategy:
      # type is set to Recreate as we have a persistent disk attached. The
      # default of RollingUpdate would fail by getting stuck waiting for a new
      # pod trying to mount the same storage that only supports being mounted
      # by one pod.
      type: Recreate
    resources:
      # Prometheus cpu/memory requests/limits need to be carefully considered based on:
      #
      # CPU:
      #
      # During startup prometheus-server can compete for ~all of the node's
      # CPU unless it's bounded by a limit. The key to avoiding this isn't to
      # provide a low limit, as what's low is relative to the node's capacity,
      # which could for example be 2, 4, or 8 CPU.
      #
      # To avoid starving other pods, the key is to have a sufficiently low
      # CPU request so that this pod doesn't get far higher priority to use
      # the available CPU on the node when in competition with the other pods
      # for CPU.
      #
      # Memory:
      #
      # During startup prometheus-server will use up a lot of memory, up to X,
      # where X grows as more metrics have been collected. Over time,
      # prometheus-server may run into its memory limit and get OOMKilled, or
      # the node it runs on may run out of memory before that happens and
      # prometheus-server or other pods exceeding their memory requests will get
      # evicted.
      #
      # To fail reliably in prometheus-server without influencing other pods,
      # we should set the memory request to match the limit.
      #
      requests:
        cpu: 0.1
        memory: 4Gi
      limits:
        cpu: 4
        memory: 4Gi
    # prometheus-server may take a long time to start up when loading large
    # amounts of previously scraped data. To avoid having a livenessProbe
    # terminate the starting pod, we opt in to a more tolerant startupProbe that
    # is active instead of the livenessProbe during startup.
    startupProbe:
      enabled: true
      periodSeconds: 10
      failureThreshold: 60
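      # With periodSeconds: 10 and failureThreshold: 60, this tolerates roughly
      # 10 * 60 = 600 seconds (~10 minutes) of startup before the pod is killed.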
    podLabels:
      # To scrape metrics from a hub pod, prometheus-server needs to be
      # explicitly allowed network access to hub pods in clusters with
      # NetworkPolicy enforcement enabled. Adding this label does the trick.
      #
      # ref: https://z2jh.jupyter.org/en/stable/administrator/security.html#introduction-to-the-chart-s-four-network-policies
      #
      hub.jupyter.org/network-access-hub: "true"
    persistentVolume:
      size: 400Gi
    service:
      type: ClusterIP

# grafana presents dashboards based on metrics from a prometheus server.
#
# We set up dashboards programmatically via a grafana REST API based on
# https://github.com/jupyterhub/grafana-dashboards.
#
# values ref: https://github.com/grafana/helm-charts/blob/main/charts/grafana/values.yaml
#
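# As a hedged sketch of that dashboard deployment (check the repo's README for
# the authoritative invocation), the deploy script is run against a grafana URL
# with an API token in the environment, roughly like:
#
#   GRAFANA_TOKEN=... ./deploy.py grafana.example.2i2c.cloud
#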
grafana:
  # We decrypt and pass `grafana.adminPassword` just in time as part of
  # deploying grafana, and find that acceptable - so we disable this strict check.
  assertNoLeakedSecrets: false
  persistence:
    # A PVC is used to enable Grafana to store auth details and dashboard
    # definitions.
    enabled: true
    # size could probably be smaller as not much space should be required, but
    # the chart default of 10Gi has made it a bit tricky to reduce it
    # retroactively.
    size: 10Gi
  deploymentStrategy:
    # type Recreate is required since we attach a PVC that can only be mounted
    # for writing by one pod at a time.
    type: Recreate
  readinessProbe:
    # With one grafana pod replica, having a readiness probe fail is pointless.
    # We ensure it won't fail before the livenessProbe that would restart the
    # container.
    failureThreshold: 1000
    initialDelaySeconds: 1
  rbac:
    # namespaced avoids creating cluster-scoped resources (ClusterRole, service
    # accounts etc.), which we do fine without.
    namespaced: true
  # initChownData refers to an init container, enabled by default, that isn't
  # needed as we don't reconfigure the linux user the grafana server runs as.
  initChownData:
    enabled: false
  # resources for grafana were set after inspecting cpu and memory use via
  # prometheus and grafana.
  #
  # Grafana's memory use seems to increase over time but seems reasonable to
  # stay below 200Mi in general. Memory can peak when dashboards are updated,
  # and the limit was increased to 400Mi as it's been seen getting OOMKilled.
  # Grafana's CPU use seems minuscule, with peaks at up to 9m CPU when one user
  # is browsing its dashboards.
  #
  # PromQL queries for CPU and memory use:
  # - CPU: sum(rate(container_cpu_usage_seconds_total{container="grafana", namespace="support"}[5m])) by (pod)
  # - Memory: sum(container_memory_usage_bytes{container="grafana", namespace="support"}) by (pod)
  #
  resources:
    limits:
      cpu: 100m
      memory: 400Mi
    requests:
      cpu: 10m
      memory: 200Mi
  service:
    type: ClusterIP
  ingress:
    enabled: true
    ingressClassName: nginx
    annotations:
      # nginx.ingress.kubernetes.io/proxy-body-size is increased from the
      # default of 1m to avoid HTTP 413 (Request Entity Too Large) when making
      # POST requests (via jupyterhub/grafana-dashboards' deploy script, from a
      # github workflow) to update dashboards with json files representing them.
      nginx.ingress.kubernetes.io/proxy-body-size: 64m
      # Increase the timeout for each query made by the grafana frontend to the
      # grafana backend to 2min, which is the default timeout for prometheus
      # queries. This also matches the timeout for the dataproxy in grafana,
      # set under `grafana.ini` below. These two timeouts are set together
      # to give prometheus the best chance of executing the queries we care about.
      nginx.ingress.kubernetes.io/proxy-read-timeout: "120"
      cert-manager.io/cluster-issuer: letsencrypt-prod
  # grafana is partially configured for GitHub authentication here, but the
  # following values need to be set as well, where client_secret should be kept
  # secret:
  #
  #   server:
  #     root_url: https://grafana.<cluster_name>.2i2c.cloud/
  #   auth.github:
  #     enabled: true
  #     allowed_organizations: "2i2c-org some-other-gh-org"
  #     client_id: ""
  #     client_secret: ""
  #
  # grafana.ini ref: https://grafana.com/docs/grafana/latest/setup-grafana/configure-grafana/
  #
  grafana.ini:
    # dataproxy is used to make requests to prometheus via the backend.
    # This allows authless access to the prometheus server in the same namespace.
    dataproxy:
      # Enable logging so we can debug grafana timeouts
      logging: true
      # The default prometheus query timeout is 120s, so let's allow grafana to
      # wait that long as well.
      # See https://prometheus.io/docs/prometheus/latest/command-line/prometheus/
      # for default prometheus query timeouts.
      # Because our grafana is behind an nginx ingress, this should match the
      # read timeout set above under `ingress.annotations`.
      timeout: 120
    unified_alerting:
      # Disable alerting for now as it's not something our community users can
      # set up. See https://github.com/2i2c-org/infrastructure/issues/4919.
      enabled: false
    server:
      root_url: ""
      enable_gzip: true
    auth.github:
      enabled: false
      allowed_organizations: ""
      client_id: ""
      client_secret: ""
      allow_sign_up: true
      scopes: user:email,read:org
      auth_url: https://github.com/login/oauth/authorize
      token_url: https://github.com/login/oauth/access_token
      api_url: https://api.github.com/user
  plugins:
    - yesoreyeram-infinity-datasource
  datasources:
    datasources.yaml:
      apiVersion: 1
      datasources:
        # Automatically add the prometheus server in the same namespace as the
        # grafana as a datasource
        - name: prometheus
          orgId: 1
          type: prometheus
          # This is the name of the kubernetes service exposed by the prometheus server
          url: http://support-prometheus-server
          access: proxy
          isDefault: false
          editable: false
        - name: yesoreyeram-infinity-datasource
          type: yesoreyeram-infinity-datasource
          isDefault: false
          editable: true

# cryptnono kills processes attempting to mine the cryptocurrency Monero in the
# k8s cluster.
#
# values ref: https://github.com/yuvipanda/cryptnono/blob/main/cryptnono/values.yaml
#
cryptnono:
  enabled: true
  # resources for cryptnono were set after inspecting cpu and memory use via
  # prometheus and grafana.
  #
  # cryptnono has an init container (fetch-kernel-headers) and one container per
  # detector. We currently only use one detector (execwhacker).
  #
  # In the past, the init container has been found using up to 1.6Gi of memory
  # and up to about 600m CPU for 4 minutes. However, recent changes seem to have
  # made this much faster, and there's no record of the init container because
  # our prometheus scrape interval is 1 minute and the init container seems to
  # complete by then. We retain the older measured figures until we can make new
  # measurements.
  #
  # Since cryptnono is a non-critical service, we are at the moment allowing it
  # to be evicted during node memory pressure by providing a low memory request
  # compared to the limit. We are also not requesting significant amounts of CPU
  # so that it doesn't compete well with others initially.
  fetchKernelHeaders:
    resources:
      limits:
        cpu: 800m
        memory: 2Gi
      requests:
        cpu: 5m
        memory: 100Mi
  image:
    # Brings in https://github.com/cryptnono/cryptnono/pull/33
    # FIXME: Unset this once that gets merged and we update to the newest cryptnono version
    tag: 0.3.1-0.dev.git.138.ha4b11b8
  detectors:
    execwhacker:
      enabled: true
      # Cryptnono can get information about what container the kill happened in,
      # so we can identify which user was cryptomining
      containerdHostPath: /run/containerd/containerd.sock
      metrics:
        enabled: true
      configs:
        unencrypted-test-01:
          bannedCommandStrings:
            # Provide an unencrypted, randomly generated test string. All
            # processes that have this in their command line will be killed.
            # This is helpful for testing that execwhacker is enabled and
            # functional; see the commented example after this list.
            - beiquatohGa1uay0ahMies9couyahPeiz9xohju3Ahvaik3FaeM7eey1thaish1U
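          # A hedged way to test this from a pod on the cluster: start a
          # long-running process whose command line contains the test string and
          # confirm it gets killed shortly afterwards, e.g.
          #
          #   sh -c 'sleep 600' beiquatohGa1uay0ahMies9couyahPeiz9xohju3Ahvaik3FaeM7eey1thaish1U
          #
          # (the extra argument only serves to put the string on the command line)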
    monero:
      # Disable the monero specific detector, as execwhacker catches most of
      # that too. The monero detector uses bpftrace, which has issues with
      # cross-kernel compatibility.
      enabled: false
      resources:
        # Measured with the following prometheus queries:
        # - Memory: sum(container_memory_usage_bytes{container="monero", namespace="support"}) by (instance)
        # - CPU: sum(rate(container_cpu_usage_seconds_total{container="trace", namespace="support"}[5m])) by (instance)
        # Memory seems to hover mostly around the 60Mi mark, and CPU is generally
        # less than 0.0002. But 1m (or 0.001) is the lowest that can be specified
        # in kubernetes, so we use that.
        limits:
          memory: 128Mi
          cpu: 5m
        requests:
          memory: 64Mi
          cpu: 1m

# aws-ce-grafana-backend exposes AWS Cost Explorer API info to Grafana
#
# values ref: https://github.com/2i2c-org/infrastructure/blob/main/helm-charts/aws-ce-grafana-backend/values.yaml
#
aws-ce-grafana-backend:
  enabled: false

# Configuration of templates provided directly by this chart
# -------------------------------------------------------------------------------
#
prometheusIngressAuthSecret:
  enabled: false
  username: ""
  password: ""

redirects:
  rules: []

# Enable a daemonset to install the nvidia device plugin on GPU nodes.
# Not necessary on GCP or AWS, as it is handled automatically by terraform or
# eksctl respectively.
nvidiaDevicePlugin:
  azure:
    enabled: false

# Setup a separate storageClass specifically for prometheus data
prometheusStorageClass:
  gke:
    # Defaults to false, until all GKE clusters have been manually
    # migrated. Could default to true after that.
    enabled: false
    # pd-balanced is SSD backed, much faster than spinning standard disks and
    # cheaper than pd-ssd. We add the -retain suffix to indicate a reclaimPolicy
    # of Retain, rather than the default of Delete.
    name: balanced-rwo-retain
    parameters:
      type: pd-balanced
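    # As a hedged sketch (the actual manifest is rendered by this chart's
    # templates), the resulting StorageClass would look roughly like:
    #
    #   apiVersion: storage.k8s.io/v1
    #   kind: StorageClass
    #   metadata:
    #     name: balanced-rwo-retain
    #   provisioner: pd.csi.storage.gke.io
    #   reclaimPolicy: Retain
    #   volumeBindingMode: WaitForFirstConsumer
    #   parameters:
    #     type: pd-balanced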

# Setup a deployment that will periodically backup the Filestore contents
gcpFilestoreBackups:
  enabled: false

# A placeholder, as global values that can be referenced from the same location
# of any chart should be possible to provide, but aren't necessarily provided or
# used.
global: {}