forked from 2i2c-org/infrastructure
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Use 'jupyter-meets-the-earth' rather than jmte as name, because the existing cluster is already called 'jmte'.
- SFTP service is gone!
- Replicates config from https://github.com/2i2c-org/infrastructure/pull/436/files to the extent possible
- Uses our IRSA config for AWS permissions, rather than the eksctl created service account in use earlier.
- Uses CILogon+GitHub for authentication, rather than auth0+github
- Re-use the same EFS filesystem from before, avoiding the need to copy a few terabytes of data around
- Hub is now at jmte.2i2c.cloud, and the old URL (hub.jupytearth.org) redirects here. Same for staging.

Ref 2i2c-org#2201
- Loading branch information
Showing
13 changed files
with
793 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
# Cluster definition for the "jupyter-meets-the-earth" EKS cluster: where the
# deployer credentials live, which values files configure the support chart,
# and which hubs are deployed onto the cluster.
name: jupyter-meets-the-earth
provider: aws
aws:
  # sops-encrypted deployer credentials file
  key: enc-deployer-credentials.secret.json
  clusterType: eks
  clusterName: jupyter-meets-the-earth
  region: us-west-2
support:
  helm_chart_values_files:
    - support.values.yaml
    - enc-support.secret.values.yaml
hubs:
  # Both hubs share common.values.yaml and layer their own (plain and
  # encrypted) values files on top of it.
  - name: staging
    domain: staging.hub.jupytearth.org
    helm_chart: daskhub
    helm_chart_values_files:
      - common.values.yaml
      - staging.values.yaml
      - enc-staging.secret.values.yaml
  - name: prod
    display_name: "Jupyter Meets the Earth"
    domain: hub.jupytearth.org
    helm_chart: daskhub
    helm_chart_values_files:
      - common.values.yaml
      - prod.values.yaml
      - enc-prod.secret.values.yaml
340 changes: 340 additions & 0 deletions
340
config/clusters/jupyter-meets-the-earth/common.values.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,340 @@ | ||
# Values shared by the staging and prod hubs for the basehub chart: NFS (EFS)
# home directory storage, homepage branding, single-user server setup and
# profiles, and hub authentication.
basehub:
  nfs:
    # enabled is adjusted by staging/prod values
    # enabled: true
    shareCreator:
      enabled: true
    pv:
      serverIP: fs-01707b06.efs.us-west-2.amazonaws.com
      # mountOptions from https://docs.aws.amazon.com/efs/latest/ug/mounting-fs-nfs-mount-settings.html
      mountOptions:
        - rsize=1048576
        - wsize=1048576
        - timeo=600
        - soft  # We pick soft over hard, so NFS lockups don't lead to hung processes
        - retrans=2
        - noresvport
      # baseShareName is required to be just "/" so that we can create
      # various sub folders in the filesystem that our PV to access the
      # NFS server can reference successfully as it isn't supported to
      # access a not yet existing folder. This creation is automated by
      # the nfs-share-creator resource part of the basehub Helm chart.
      baseShareName: /

  jupyterhub:
    custom:
      homepage:
        templateVars:
          org:
            name: Jupyter meets the Earth
            logo_url: https://pangeo-data.github.io/jupyter-earth/_static/jupyter-earth.png
            url: https://jupytearth.org
          designed_by:
            name: 2i2c
            url: https://2i2c.org
          operated_by:
            name: 2i2c
            url: https://2i2c.org
          funded_by:
            name: Jupyter meets the Earth
            url: https://jupytearth.org

    singleuser:
      # extraFiles ref: https://zero-to-jupyterhub.readthedocs.io/en/latest/resources/reference.html#singleuser-extrafiles
      extraFiles:
        jupyter_notebook_config.json:
          mountPath: /etc/jupyter/jupyter_notebook_config.json
          data:
            # Allow jupyterlab option to show hidden files in browser
            # https://github.com/berkeley-dsep-infra/datahub/issues/3160
            ContentsManager:
              allow_hidden: true
      initContainers:
        # Need to explicitly fix ownership here, since EFS doesn't do anonuid
        - name: volume-mount-ownership-fix
          image: busybox
          # Runs as root (see securityContext) so it can chown the three
          # mounted directories to uid/gid 1000 before the user server starts.
          command:
            - "sh"
            - "-c"
            - "id && chown 1000:1000 /home/jovyan /home/jovyan/shared /home/jovyan/shared-public && ls -lhd /home/jovyan"
          securityContext:
            runAsUser: 0
          volumeMounts:
            - name: home
              mountPath: /home/jovyan
              subPath: "{username}"
            - name: home
              mountPath: /home/jovyan/shared
              subPath: _shared
            - name: home
              mountPath: /home/jovyan/shared-public
              subPath: _shared_public

      # /dev/shm is mounted as a filesystem path, where writing to it means to
      # write to memory.
      #
      # How to: https://stackoverflow.com/questions/46085748/define-size-for-dev-shm-on-container-engine/46434614#46434614
      # Request for this by Ellie: https://fperezgroup.slack.com/archives/C020XCEFPEH/p1658168872788389
      #
      storage:
        extraVolumes:
          - name: dev-shm
            emptyDir:
              medium: Memory
        extraVolumeMounts:
          - name: dev-shm
            mountPath: /dev/shm
          # FIXME: we override the list extraVolumeMounts which is also set in
          #        the basehub chart, due to that, we need to add this here
          #        as well. An option is to add hub.extraConfig entries that
          #        append the kubespawner configuration to include these extra
          #        volume mounts.
          #
          - name: home
            mountPath: /home/jovyan/shared
            subPath: _shared
            readOnly: true
          - name: home
            mountPath: /home/jovyan/shared-public
            subPath: _shared_public

      # Increased as we have experienced a too slow image pull at least
      # once. Our pods can take ~6-7 minutes to start on a new node it
      # seems, so this gives us some margin.
      startTimeout: 1200

      extraEnv:
        GH_SCOPED_CREDS_APP_URL: https://github.com/apps/hub-jupytearth-org-github-integ
        GH_SCOPED_CREDS_CLIENT_ID: Iv1.a073b1649637af12

        # FIXME: Until we can set this just for the GPU nodes, we need to set it for everyone
        NVIDIA_DRIVER_CAPABILITIES: compute,utility

      image:
        # NOTE: We use the jupyterhub-configurator so this image/tag is not
        #       relevant. Visit its UI to configure the hub.
        #
        #       staging: https://staging.hub.jupytearth.org/services/configurator/
        #       prod:    https://hub.jupytearth.org/services/configurator/
        pullPolicy: Always
        name: 286354552638.dkr.ecr.us-west-2.amazonaws.com/jmte/user-env
        tag: "latest"

      # Each profile pins the user pod to one instance type via node_selector.
      # mem_limit is explicitly null in every profile.
      profileList:
        - display_name: "16th of Medium: 0.25-4 CPU, 1-16 GB"
          default: true
          description: "A shared machine, the recommended option until you experience a limitation."
          kubespawner_override:
            cpu_guarantee: 0.225
            mem_guarantee: 0.875G
            mem_limit: null
            node_selector:
              node.kubernetes.io/instance-type: m5.xlarge
        - display_name: "4th of Medium: 1-4 CPU, 4-16 GB"
          description: "A shared machine."
          kubespawner_override:
            cpu_guarantee: 0.875
            mem_guarantee: 3.5G
            mem_limit: null
            node_selector:
              node.kubernetes.io/instance-type: m5.xlarge
        - display_name: "Medium: 4 CPU, 16 GB"
          description: "A dedicated machine for you."
          kubespawner_override:
            cpu_guarantee: 3.5
            mem_guarantee: 14G
            mem_limit: null
            node_selector:
              node.kubernetes.io/instance-type: m5.xlarge
        - display_name: "Large: 16 CPU, 64 GB"
          description: "A dedicated machine for you."
          kubespawner_override:
            mem_guarantee: 56G
            mem_limit: null
            node_selector:
              node.kubernetes.io/instance-type: m5.4xlarge
        - display_name: "Massive: 64 CPU, 256 GB"
          description: "A dedicated machine for you."
          kubespawner_override:
            mem_guarantee: 224G
            mem_limit: null
            node_selector:
              node.kubernetes.io/instance-type: m5.16xlarge
        - display_name: "Massive high-memory: 64 CPU, 976 GB"
          description: "A dedicated machine for you."
          kubespawner_override:
            mem_guarantee: 900G
            mem_limit: null
            node_selector:
              node.kubernetes.io/instance-type: x1.16xlarge
        - display_name: "Medium GPU: 4 CPU, 16 GB, 1 T4 Tensor Core GPU"
          description: "A dedicated machine for you with one GPU attached."
          kubespawner_override:
            cpu_guarantee: 3.5
            mem_guarantee: 14G
            mem_limit: null
            node_selector:
              node.kubernetes.io/instance-type: g4dn.xlarge
            extra_resource_limits:
              nvidia.com/gpu: "1"
        - display_name: "Large GPU: 16 CPU, 64 GB, 1 T4 Tensor Core GPU"
          description: "A dedicated machine for you with one GPU attached."
          kubespawner_override:
            mem_guarantee: 56G
            mem_limit: null
            node_selector:
              node.kubernetes.io/instance-type: g4dn.4xlarge
            extra_resource_limits:
              nvidia.com/gpu: "1"
        - display_name: "Massive GPU: 64 CPU, 256 GB, 1 T4 Tensor Core GPU"
          description: "A dedicated machine for you with one GPU attached."
          kubespawner_override:
            mem_guarantee: 224G
            mem_limit: null
            node_selector:
              node.kubernetes.io/instance-type: g4dn.16xlarge
            extra_resource_limits:
              nvidia.com/gpu: "1"
        - display_name: "16th of Medium: 0.25-4 CPU, 1-16 GB - Test of latest image"
          description: "Helps us test an image before we make it the default"
          kubespawner_override:
            image: 286354552638.dkr.ecr.us-west-2.amazonaws.com/jmte/user-env:latest
            image_pull_policy: Always
            cpu_guarantee: 0.225
            mem_guarantee: 0.875G
            node_selector:
              node.kubernetes.io/instance-type: m5.xlarge
            mem_limit: null

    hub:
      config:
        JupyterHub:
          authenticator_class: cilogon
        CILogonOAuthenticator:
          scope:
            - "profile"
          username_claim: "preferred_username"
          # Only show the option to login with GitHub
          shown_idps:
            - http://github.com/login/oauth/authorize
        Authenticator:
          # The same list is reused for admin_users through the YAML alias
          # below, so every user listed here is also an admin.
          allowed_users: &users
            # This is just listing a few of the users/admins, a lot of
            # users have been added manually, see:
            # https://github.com/pangeo-data/jupyter-earth/issues/53
            - abbyazari  # Abby Azari
            - andersy005  # Anderson Banihirwe
            - consideratio  # Erik Sundell
            - choldgraf  # Chris Holdgraf
            - elliesch  # Ellie Abrahams
            - EMscience  # Edom Moges
            - espg  # Shane Grigsby
            - facusapienza21  # Facundo Sapienza
            - fperez  # Fernando Pérez
            - kmpaul  # Kevin Paul
            - lrennels  # Lisa Rennels
            - mrsiegfried  # Matthew Siegfried
            - tsnow03  # Tasha Snow
            - whyjz  # Whyjay Zheng
            - yuvipanda  # Yuvi Panda
            - jonathan-taylor  # Jonathan Taylor
          admin_users: *users
      allowNamedServers: true
|
||
# Values for the dask-gateway chart: scheduler/worker pod resources and
# placement, plus Python config snippets injected into the gateway server.
dask-gateway:
  gateway:
    backend:
      scheduler:
        # IMPORTANT: We have experienced that the scheduler can fail with
        #            1GB memory limit. This was observed as a "stream closed"
        #            error from the python client working against the
        #            Dask-Gateway created DaskCluster.
        #
        #            CommClosedError: in <TLS (closed) ConnectionPool.gather local=tls://192.168.40.210:54296 remote=gateway://traefik-prod-dask-gateway.prod:80/prod.b9600f678bb747c1a5f038b5bef3eb90>: Stream is closed
        #
        cores:
          request: 1
          limit: 64
        memory:
          request: 2G
          limit: 500G
        extraPodConfig:
          nodeSelector:
            hub.jupyter.org/node-purpose: user
            k8s.dask.org/node-purpose: null
          # serviceAccountName is adjusted by staging/prod values
          # serviceAccountName: *user-sa
      worker:
        extraPodConfig:
          nodeSelector:
            k8s.dask.org/node-purpose: worker
          # serviceAccountName is adjusted by staging/prod values
          # serviceAccountName: *user-sa

    # Note that we are overriding options provided in 2i2c's helm chart that has
    # default values for these config entries.
    #
    extraConfig:
      # This configuration represents options that can be presented to users
      # that want to create a Dask cluster using dask-gateway. For more
      # details, see https://gateway.dask.org/cluster-options.html
      #
      # The goal is to provide a simple configuration that allows the user some
      # flexibility while also fitting well on AWS nodes that all have a
      # 1:4 ratio between CPU and GB of memory. By providing the
      # username label, we help administrators to track user pods.
      #
      # NOTE(review): extra_labels is an empty dict in the handler below, so
      # no username label is actually attached to the pods - confirm whether
      # that is intended.
      option_handler: |
        from dask_gateway_server.options import Options, Select, String, Mapping
        def cluster_options(user):
            def option_handler(options):
                if ":" not in options.image:
                    raise ValueError("When specifying an image you must also provide a tag")
                extra_labels = {}
                extra_annotations = {
                    "prometheus.io/scrape": "true",
                    "prometheus.io/port": "8787",
                }
                # The selected option looks like "4CPU, 16GB"; the integer
                # before "CPU" is the CPU count, and memory is 4 GB per CPU.
                chosen_worker_cpu = int(options.worker_specification.split("CPU")[0])
                chosen_worker_memory = 4 * chosen_worker_cpu
                # We multiply the requests by a fraction to ensure that the
                # worker fit well within a node that need some resources
                # reserved for system pods.
                return {
                    # A default image is suggested via DASK_GATEWAY__CLUSTER__OPTIONS__IMAGE env variable
                    "image": options.image,
                    "scheduler_extra_pod_labels": extra_labels,
                    "scheduler_extra_pod_annotations": extra_annotations,
                    "worker_extra_pod_labels": extra_labels,
                    "worker_extra_pod_annotations": extra_annotations,
                    "worker_cores": 0.85 * chosen_worker_cpu,
                    "worker_cores_limit": chosen_worker_cpu,
                    "worker_memory": "%fG" % (0.85 * chosen_worker_memory),
                    "worker_memory_limit": "%fG" % chosen_worker_memory,
                    "environment": options.environment,
                }
            return Options(
                Select(
                    "worker_specification",
                    [
                        "1CPU, 4GB",
                        "2CPU, 8GB",
                        "4CPU, 16GB",
                        "8CPU, 32GB",
                        "16CPU, 64GB",
                        "32CPU, 128GB",
                        "64CPU, 256GB",
                    ],
                    default="1CPU, 4GB",
                    label="Worker specification",
                ),
                # The default image is set via DASK_GATEWAY__CLUSTER__OPTIONS__IMAGE env variable
                String("image", label="Image"),
                Mapping("environment", {}, label="Environment variables"),
                handler=option_handler,
            )
        c.Backend.cluster_options = cluster_options
      idle: |
        # timeout after 30 minutes of inactivity
        c.KubeClusterConfig.idle_timeout = 1800
25 changes: 25 additions & 0 deletions
25
config/clusters/jupyter-meets-the-earth/enc-deployer-credentials.secret.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
{ | ||
"AccessKey": { | ||
"AccessKeyId": "ENC[AES256_GCM,data:A3+Abzcvq+I2hZq2u4coAYzNjvk=,iv:B4kPrUIM8nx/VTrEQI+tUxEySkDDe6eZHJqAJ9B4YcU=,tag:PtO2TdNEJsaYY0nQyvTHSw==,type:str]", | ||
"SecretAccessKey": "ENC[AES256_GCM,data:gfFXGESHTJn6tiQUpMkpbpqNJJ43KxkNvYaH8V7sC5lRKUPl85Dw7w==,iv:krcKBzv/Wzu+jjtd9MJiTQvj6ELo2JHXird+mn0Vt5c=,tag:jv4YANW0drzpjpVekpmzqg==,type:str]", | ||
"UserName": "ENC[AES256_GCM,data:8fWApCCT7IL+9E6t0FkRS3XTaHDL+XA=,iv:/rsHbqCvzulMvT6Jzj20zqfOb39ojUWprFbn8359ozA=,tag:Nc1L5ufStyZMOUxI8xVrzA==,type:str]" | ||
}, | ||
"sops": { | ||
"kms": null, | ||
"gcp_kms": [ | ||
{ | ||
"resource_id": "projects/two-eye-two-see/locations/global/keyRings/sops-keys/cryptoKeys/similar-hubs", | ||
"created_at": "2023-04-07T13:38:22Z", | ||
"enc": "CiUA4OM7eGDmmlUnGoSPNr9unRpxJ7GEcQ5/pXY2SrvhODPp9JWFEkkALQgViOWaFqYsRFv2FP6kqShPvabRqOC6KTPai4WGjiuK10rHIgiBbGNAfwQdenfi/vBU3h0rslaKojCN2qO4H+TAb4LG7eyO" | ||
} | ||
], | ||
"azure_kv": null, | ||
"hc_vault": null, | ||
"age": null, | ||
"lastmodified": "2023-04-07T13:38:23Z", | ||
"mac": "ENC[AES256_GCM,data:HD/8swJpKnpElskOZXFjkJW6SjTIKChIZtHTqqlYexrj1x/HqrkLaGdHAuWIijZ91SOjxWlQxY67RzbpiJgdxG7XUcokrHqs+mEaWV65XVS087jucZo2tVC86wBFwNe4smlAEj6AF8n2gq/UAQbWoBE4fo3Vm/ojzhStqlLL0aQ=,iv:rrI6EO+c1LONQAHbsG7/TfEGlrrlKfzuriO+g29DFno=,tag:ZJqRJHVKlXOI+5S6cpsFtg==,type:str]", | ||
"pgp": null, | ||
"unencrypted_suffix": "_unencrypted", | ||
"version": "3.7.3" | ||
} | ||
} |
Oops, something went wrong.