
Initial commit of JMTE hub
- Use 'jupyter-meets-the-earth' rather than jmte as name,
  because the existing cluster is already called 'jmte'.
- SFTP service is gone!
- Replicates config from
  https://github.com/2i2c-org/infrastructure/pull/436/files
  to the extent possible
- Uses our IRSA config for AWS permissions, rather than the
  eksctl-created service account in use earlier.
- Uses CILogon+GitHub for authentication, rather than auth0+GitHub
- Re-uses the same EFS filesystem from before, avoiding the need to
  copy a few terabytes of data around
- Hub is now at jmte.2i2c.cloud, and the old URL
  (hub.jupytearth.org) redirects here. Same for staging.

Ref 2i2c-org#2201
yuvipanda committed Apr 10, 2023
1 parent dba8b14 commit 5f14abb
Showing 13 changed files with 793 additions and 0 deletions.
27 changes: 27 additions & 0 deletions config/clusters/jupyter-meets-the-earth/cluster.yaml
@@ -0,0 +1,27 @@
name: jupyter-meets-the-earth
provider: aws
aws:
  key: enc-deployer-credentials.secret.json
  clusterType: eks
  clusterName: jupyter-meets-the-earth
  region: us-west-2
support:
  helm_chart_values_files:
    - support.values.yaml
    - enc-support.secret.values.yaml
hubs:
  - name: staging
    domain: staging.hub.jupytearth.org
    helm_chart: daskhub
    helm_chart_values_files:
      - common.values.yaml
      - staging.values.yaml
      - enc-staging.secret.values.yaml
  - name: prod
    display_name: "Jupyter Meets the Earth"
    domain: hub.jupytearth.org
    helm_chart: daskhub
    helm_chart_values_files:
      - common.values.yaml
      - prod.values.yaml
      - enc-prod.secret.values.yaml
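
Each hub above layers several Helm values files on top of the daskhub chart defaults. Assuming the files are passed to Helm in the listed order (the usual convention), later files override earlier ones, so the hub-specific and encrypted secret values win over common.values.yaml. A minimal sketch of reading that layering out of cluster.yaml with PyYAML — illustrative only, not the 2i2c deployer's actual code:

import yaml  # requires PyYAML

with open("config/clusters/jupyter-meets-the-earth/cluster.yaml") as f:
    cluster = yaml.safe_load(f)

for hub in cluster["hubs"]:
    # common.values.yaml is the base layer; <hub>.values.yaml and
    # enc-<hub>.secret.values.yaml apply on top of it, in order.
    print(hub["name"], "->", hub["helm_chart_values_files"])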
340 changes: 340 additions & 0 deletions config/clusters/jupyter-meets-the-earth/common.values.yaml
@@ -0,0 +1,340 @@
basehub:
  nfs:
    # enabled is adjusted by staging/prod values
    # enabled: true
    shareCreator:
      enabled: true
    pv:
      serverIP: fs-01707b06.efs.us-west-2.amazonaws.com
      # mountOptions from https://docs.aws.amazon.com/efs/latest/ug/mounting-fs-nfs-mount-settings.html
      mountOptions:
        - rsize=1048576
        - wsize=1048576
        - timeo=600
        - soft # We pick soft over hard, so NFS lockups don't lead to hung processes
        - retrans=2
        - noresvport
      # baseShareName must be just "/" so that we can create various sub
      # folders in the filesystem that the PV accessing the NFS server can
      # reference successfully, since referencing a folder that doesn't yet
      # exist isn't supported. This creation is automated by the
      # nfs-share-creator resource that is part of the basehub Helm chart.
      baseShareName: /

  jupyterhub:
    custom:
      homepage:
        templateVars:
          org:
            name: Jupyter meets the Earth
            logo_url: https://pangeo-data.github.io/jupyter-earth/_static/jupyter-earth.png
            url: https://jupytearth.org
          designed_by:
            name: 2i2c
            url: https://2i2c.org
          operated_by:
            name: 2i2c
            url: https://2i2c.org
          funded_by:
            name: Jupyter meets the Earth
            url: https://jupytearth.org

    singleuser:
      # extraFiles ref: https://zero-to-jupyterhub.readthedocs.io/en/latest/resources/reference.html#singleuser-extrafiles
      extraFiles:
        jupyter_notebook_config.json:
          mountPath: /etc/jupyter/jupyter_notebook_config.json
          data:
            # Allow the JupyterLab option to show hidden files in the browser
            # https://github.com/berkeley-dsep-infra/datahub/issues/3160
            ContentsManager:
              allow_hidden: true
      initContainers:
        # Need to explicitly fix ownership here, since EFS doesn't do anonuid
        - name: volume-mount-ownership-fix
          image: busybox
          command:
            [
              "sh",
              "-c",
              "id && chown 1000:1000 /home/jovyan /home/jovyan/shared /home/jovyan/shared-public && ls -lhd /home/jovyan",
            ]
          securityContext:
            runAsUser: 0
          volumeMounts:
            - name: home
              mountPath: /home/jovyan
              subPath: "{username}"
            - name: home
              mountPath: /home/jovyan/shared
              subPath: _shared
            - name: home
              mountPath: /home/jovyan/shared-public
              subPath: _shared_public

      # /dev/shm is mounted as a filesystem path, where writing to it means
      # writing to memory.
      #
      # How to: https://stackoverflow.com/questions/46085748/define-size-for-dev-shm-on-container-engine/46434614#46434614
      # Request for this by Ellie: https://fperezgroup.slack.com/archives/C020XCEFPEH/p1658168872788389
      #
      storage:
        extraVolumes:
          - name: dev-shm
            emptyDir:
              medium: Memory
        extraVolumeMounts:
          - name: dev-shm
            mountPath: /dev/shm
          # FIXME: we override the extraVolumeMounts list that is also set in
          #        the basehub chart; because of that, we need to add these
          #        entries here as well. An option is to add hub.extraConfig
          #        entries that append the kubespawner configuration to
          #        include these extra volume mounts.
          #
          - name: home
            mountPath: /home/jovyan/shared
            subPath: _shared
            readOnly: true
          - name: home
            mountPath: /home/jovyan/shared-public
            subPath: _shared_public

      # Increased because we have experienced overly slow image pulls at
      # least once. Our pods seem to take ~6-7 minutes to start on a new
      # node, so this gives us some margin.
      startTimeout: 1200

      extraEnv:
        GH_SCOPED_CREDS_APP_URL: https://github.com/apps/hub-jupytearth-org-github-integ
        GH_SCOPED_CREDS_CLIENT_ID: Iv1.a073b1649637af12

        # FIXME: Until we can set this just for the GPU nodes, we need to set it for everyone
        NVIDIA_DRIVER_CAPABILITIES: compute,utility

      image:
        # NOTE: We use the jupyterhub-configurator so this image/tag is not
        #       relevant. Visit its UI to configure the hub.
        #
        #       staging: https://staging.hub.jupytearth.org/services/configurator/
        #       prod: https://hub.jupytearth.org/services/configurator/
        pullPolicy: Always
        name: 286354552638.dkr.ecr.us-west-2.amazonaws.com/jmte/user-env
        tag: "latest"

      profileList:
        - display_name: "16th of Medium: 0.25-4 CPU, 1-16 GB"
          default: True
          description: "A shared machine, the recommended option until you experience a limitation."
          kubespawner_override:
            cpu_guarantee: 0.225
            mem_guarantee: 0.875G
            mem_limit: null
            node_selector:
              node.kubernetes.io/instance-type: m5.xlarge
        - display_name: "4th of Medium: 1-4 CPU, 4-16 GB"
          description: "A shared machine."
          kubespawner_override:
            cpu_guarantee: 0.875
            mem_guarantee: 3.5G
            mem_limit: null
            node_selector:
              node.kubernetes.io/instance-type: m5.xlarge
        - display_name: "Medium: 4 CPU, 16 GB"
          description: "A dedicated machine for you."
          kubespawner_override:
            cpu_guarantee: 3.5
            mem_guarantee: 14G
            mem_limit: null
            node_selector:
              node.kubernetes.io/instance-type: m5.xlarge
        - display_name: "Large: 16 CPU, 64 GB"
          description: "A dedicated machine for you."
          kubespawner_override:
            mem_guarantee: 56G
            mem_limit: null
            node_selector:
              node.kubernetes.io/instance-type: m5.4xlarge
        - display_name: "Massive: 64 CPU, 256 GB"
          description: "A dedicated machine for you."
          kubespawner_override:
            mem_guarantee: 224G
            mem_limit: null
            node_selector:
              node.kubernetes.io/instance-type: m5.16xlarge
        - display_name: "Massive high-memory: 64 CPU, 976 GB"
          description: "A dedicated machine for you."
          kubespawner_override:
            mem_guarantee: 900G
            mem_limit: null
            node_selector:
              node.kubernetes.io/instance-type: x1.16xlarge
        - display_name: "Medium GPU: 4 CPU, 16 GB, 1 T4 Tensor Core GPU"
          description: "A dedicated machine for you with one GPU attached."
          kubespawner_override:
            cpu_guarantee: 3.5
            mem_guarantee: 14G
            mem_limit: null
            node_selector:
              node.kubernetes.io/instance-type: g4dn.xlarge
            extra_resource_limits:
              nvidia.com/gpu: "1"
        - display_name: "Large GPU: 16 CPU, 64 GB, 1 T4 Tensor Core GPU"
          description: "A dedicated machine for you with one GPU attached."
          kubespawner_override:
            mem_guarantee: 56G
            mem_limit: null
            node_selector:
              node.kubernetes.io/instance-type: g4dn.4xlarge
            extra_resource_limits:
              nvidia.com/gpu: "1"
        - display_name: "Massive GPU: 64 CPU, 256 GB, 1 T4 Tensor Core GPU"
          description: "A dedicated machine for you with one GPU attached."
          kubespawner_override:
            mem_guarantee: 224G
            mem_limit: null
            node_selector:
              node.kubernetes.io/instance-type: g4dn.16xlarge
            extra_resource_limits:
              nvidia.com/gpu: "1"
        - display_name: "16th of Medium: 0.25-4 CPU, 1-16 GB - Test of latest image"
          description: "Helps us test an image before we make it the default"
          kubespawner_override:
            image: 286354552638.dkr.ecr.us-west-2.amazonaws.com/jmte/user-env:latest
            image_pull_policy: Always
            cpu_guarantee: 0.225
            mem_guarantee: 0.875G
            node_selector:
              node.kubernetes.io/instance-type: m5.xlarge
            mem_limit: null

    hub:
      config:
        JupyterHub:
          authenticator_class: cilogon
        CILogonOAuthenticator:
          scope:
            - "profile"
          username_claim: "preferred_username"
          # Only show the option to log in with GitHub
          shown_idps:
            - https://github.com/login/oauth/authorize
        Authenticator:
          allowed_users: &users
            # This lists only a few of the users/admins; a lot of users have
            # been added manually, see:
            # https://github.com/pangeo-data/jupyter-earth/issues/53
            - abbyazari # Abby Azari
            - andersy005 # Anderson Banihirwe
            - consideratio # Erik Sundell
            - choldgraf # Chris Holdgraf
            - elliesch # Ellie Abrahams
            - EMscience # Edom Moges
            - espg # Shane Grigsby
            - facusapienza21 # Facundo Sapienza
            - fperez # Fernando Pérez
            - kmpaul # Kevin Paul
            - lrennels # Lisa Rennels
            - mrsiegfried # Matthew Siegfried
            - tsnow03 # Tasha Snow
            - whyjz # Whyjay Zheng
            - yuvipanda # Yuvi Panda
            - jonathan-taylor # Jonathan Taylor
          admin_users: *users
      allowNamedServers: true

dask-gateway:
  gateway:
    backend:
      scheduler:
        # IMPORTANT: We have experienced that the scheduler can fail with a
        #            1GB memory limit. This was observed as "stream closed"
        #            errors from the Python client working against the
        #            Dask-Gateway-created DaskCluster.
        #
        #            CommClosedError: in <TLS (closed) ConnectionPool.gather local=tls://192.168.40.210:54296 remote=gateway://traefik-prod-dask-gateway.prod:80/prod.b9600f678bb747c1a5f038b5bef3eb90>: Stream is closed
        #
        cores:
          request: 1
          limit: 64
        memory:
          request: 2G
          limit: 500G
        extraPodConfig:
          nodeSelector:
            hub.jupyter.org/node-purpose: user
            k8s.dask.org/node-purpose: null
          # serviceAccountName is adjusted by staging/prod values
          # serviceAccountName: *user-sa
      worker:
        extraPodConfig:
          nodeSelector:
            k8s.dask.org/node-purpose: worker
          # serviceAccountName is adjusted by staging/prod values
          # serviceAccountName: *user-sa

    # Note that we are overriding options provided in 2i2c's Helm chart, which
    # has default values for these config entries.
    #
    extraConfig:
      # This configuration represents options that can be presented to users
      # that want to create a Dask cluster using dask-gateway. For more
      # details, see https://gateway.dask.org/cluster-options.html
      #
      # The goal is to provide a simple configuration that allows the user some
      # flexibility while also fitting well on AWS nodes, which all have a 1:4
      # ratio between CPU and GB of memory. By providing the username label, we
      # help administrators track user pods. (A client-side usage sketch
      # follows after this file.)
      option_handler: |
        from dask_gateway_server.options import Options, Select, String, Mapping

        def cluster_options(user):
            def option_handler(options):
                if ":" not in options.image:
                    raise ValueError("When specifying an image you must also provide a tag")

                extra_labels = {}
                extra_annotations = {
                    "prometheus.io/scrape": "true",
                    "prometheus.io/port": "8787",
                }
                chosen_worker_cpu = int(options.worker_specification.split("CPU")[0])
                chosen_worker_memory = 4 * chosen_worker_cpu

                # We multiply the requests by a fraction to ensure that the
                # workers fit well within a node that needs some resources
                # reserved for system pods.
                return {
                    # A default image is suggested via the DASK_GATEWAY__CLUSTER__OPTIONS__IMAGE env variable
                    "image": options.image,
                    "scheduler_extra_pod_labels": extra_labels,
                    "scheduler_extra_pod_annotations": extra_annotations,
                    "worker_extra_pod_labels": extra_labels,
                    "worker_extra_pod_annotations": extra_annotations,
                    "worker_cores": 0.85 * chosen_worker_cpu,
                    "worker_cores_limit": chosen_worker_cpu,
                    "worker_memory": "%fG" % (0.85 * chosen_worker_memory),
                    "worker_memory_limit": "%fG" % chosen_worker_memory,
                    "environment": options.environment,
                }

            return Options(
                Select(
                    "worker_specification",
                    [
                        "1CPU, 4GB",
                        "2CPU, 8GB",
                        "4CPU, 16GB",
                        "8CPU, 32GB",
                        "16CPU, 64GB",
                        "32CPU, 128GB",
                        "64CPU, 256GB",
                    ],
                    default="1CPU, 4GB",
                    label="Worker specification",
                ),
                # The default image is set via the DASK_GATEWAY__CLUSTER__OPTIONS__IMAGE env variable
                String("image", label="Image"),
                Mapping("environment", {}, label="Environment variables"),
                handler=option_handler,
            )

        c.Backend.cluster_options = cluster_options
      idle: |
        # timeout after 30 minutes of inactivity
        c.KubeClusterConfig.idle_timeout = 1800
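
For context on how the cluster options defined in option_handler above surface to users, here is a minimal client-side sketch using the standard dask-gateway client API from a notebook on the hub. The chosen values are illustrative; on a daskhub deployment, Gateway() picks up its address and credentials from environment variables:

from dask_gateway import Gateway

gateway = Gateway()  # address/auth come from the hub's environment

# Fetch the options defined by option_handler and pick a worker size.
options = gateway.cluster_options()
options.worker_specification = "4CPU, 16GB"
options.image = "286354552638.dkr.ecr.us-west-2.amazonaws.com/jmte/user-env:latest"

cluster = gateway.new_cluster(options)
cluster.scale(4)               # ask for 4 workers
client = cluster.get_client()  # Dask client bound to this cluster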
25 changes: 25 additions & 0 deletions config/clusters/jupyter-meets-the-earth/enc-deployer-credentials.secret.json
@@ -0,0 +1,25 @@
{
  "AccessKey": {
    "AccessKeyId": "ENC[AES256_GCM,data:A3+Abzcvq+I2hZq2u4coAYzNjvk=,iv:B4kPrUIM8nx/VTrEQI+tUxEySkDDe6eZHJqAJ9B4YcU=,tag:PtO2TdNEJsaYY0nQyvTHSw==,type:str]",
    "SecretAccessKey": "ENC[AES256_GCM,data:gfFXGESHTJn6tiQUpMkpbpqNJJ43KxkNvYaH8V7sC5lRKUPl85Dw7w==,iv:krcKBzv/Wzu+jjtd9MJiTQvj6ELo2JHXird+mn0Vt5c=,tag:jv4YANW0drzpjpVekpmzqg==,type:str]",
    "UserName": "ENC[AES256_GCM,data:8fWApCCT7IL+9E6t0FkRS3XTaHDL+XA=,iv:/rsHbqCvzulMvT6Jzj20zqfOb39ojUWprFbn8359ozA=,tag:Nc1L5ufStyZMOUxI8xVrzA==,type:str]"
  },
  "sops": {
    "kms": null,
    "gcp_kms": [
      {
        "resource_id": "projects/two-eye-two-see/locations/global/keyRings/sops-keys/cryptoKeys/similar-hubs",
        "created_at": "2023-04-07T13:38:22Z",
        "enc": "CiUA4OM7eGDmmlUnGoSPNr9unRpxJ7GEcQ5/pXY2SrvhODPp9JWFEkkALQgViOWaFqYsRFv2FP6kqShPvabRqOC6KTPai4WGjiuK10rHIgiBbGNAfwQdenfi/vBU3h0rslaKojCN2qO4H+TAb4LG7eyO"
      }
    ],
    "azure_kv": null,
    "hc_vault": null,
    "age": null,
    "lastmodified": "2023-04-07T13:38:23Z",
    "mac": "ENC[AES256_GCM,data:HD/8swJpKnpElskOZXFjkJW6SjTIKChIZtHTqqlYexrj1x/HqrkLaGdHAuWIijZ91SOjxWlQxY67RzbpiJgdxG7XUcokrHqs+mEaWV65XVS087jucZo2tVC86wBFwNe4smlAEj6AF8n2gq/UAQbWoBE4fo3Vm/ojzhStqlLL0aQ=,iv:rrI6EO+c1LONQAHbsG7/TfEGlrrlKfzuriO+g29DFno=,tag:ZJqRJHVKlXOI+5S6cpsFtg==,type:str]",
    "pgp": null,
    "unencrypted_suffix": "_unencrypted",
    "version": "3.7.3"
  }
}
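
The file above is a sops-encrypted JSON blob (GCP KMS, using the two-eye-two-see keyring shown in its metadata) holding an AWS access key for the deployer. A rough sketch of how such a file can be decrypted and turned into an AWS session — illustrative only, not the 2i2c deployer's actual code, and it assumes the sops CLI and boto3 are available:

import json
import subprocess

import boto3  # assumed available; not part of this commit

# sops decrypts using the GCP KMS key recorded in the file's own metadata.
decrypted = subprocess.run(
    ["sops", "--decrypt", "enc-deployer-credentials.secret.json"],
    check=True, capture_output=True, text=True,
).stdout
creds = json.loads(decrypted)["AccessKey"]

session = boto3.Session(
    aws_access_key_id=creds["AccessKeyId"],
    aws_secret_access_key=creds["SecretAccessKey"],
    region_name="us-west-2",  # the cluster's region, from cluster.yaml
)
print(session.client("sts").get_caller_identity()["Arn"])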
