Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Migrate Farallon Staging hub to this repository. #379

Merged
merged 14 commits into from
May 7, 2021
Merged
196 changes: 196 additions & 0 deletions config/hubs/farallon.cluster.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,196 @@
# Cluster + hub configuration for the Farallon Institute staging hub.
# Authenticates via a sops-encrypted kubeconfig rather than a cloud provider.
name: farallon
provider: kubeconfig
kubeconfig:
  file: secrets/farallon.yaml
hubs:
  - name: farallon-staging
    domain: staging.farallon.2i2c.cloud
    template: daskhub
    auth0:
      connection: github
    config:
      scratchBucket:
        enabled: false
      basehub:
        nfsPVC:
          nfs:
            # from https://docs.aws.amazon.com/efs/latest/ug/mounting-fs-nfs-mount-settings.html
            mountOptions:
              - rsize=1048576
              - wsize=1048576
              - timeo=600
              - soft # We pick soft over hard, so NFS lockups don't lead to hung processes
              - retrans=2
              - noresvport
            serverIP: fs-7b129903.efs.us-east-2.amazonaws.com
            baseShareName: /homes/
          shareCreator:
            tolerations:
              - key: node-role.kubernetes.io/master
                operator: "Exists"
                effect: "NoSchedule"
        jupyterhub:
          homepage:
            templateVars:
              org:
                name: Farallon Institute
                logo_url: https://2i2c.org/media/logo.png
                url: http://www.faralloninstitute.org/
              designed_by:
                name: 2i2c
                url: https://2i2c.org
              operated_by:
                name: 2i2c
                url: https://2i2c.org
              funded_by:
                name: Farallon Institute
                # FIX: key was mistyped as `urL`, so the funder link never rendered
                url: http://www.faralloninstitute.org/
          singleuser:
            initContainers:
              # Need to explicitly fix ownership here, since EFS doesn't do anonuid
              - name: volume-mount-ownership-fix
                image: busybox
                command: ["sh", "-c", "id && chown 1000:1000 /home/jovyan && ls -lhd /home/jovyan"]
                securityContext:
                  runAsUser: 0
                volumeMounts:
                  - name: home
                    mountPath: /home/jovyan
                    subPath: "{username}"
            image:
              name: 677861182063.dkr.ecr.us-east-2.amazonaws.com/2i2c-hub/user-image
              tag: 9cd76f1
            profileList:
              # The mem-guarantees are here so k8s doesn't schedule other pods
              # on these nodes.
              - display_name: "Default: m5.xlarge"
                description: "~4CPUs & ~15GB RAM"
                kubespawner_override:
                  # Explicitly unset mem_limit, so it overrides the default memory limit we set in
                  # basehub/values.yaml
                  mem_limit: null
                  mem_guarantee: 14G
                  cpu_guarantee: 3
                  node_selector:
                    hub.jupyter.org/pool-name: notebook-m5-xlarge
              - display_name: "Default: m5.2xlarge"
                description: "~8CPUs & ~30GB RAM"
                kubespawner_override:
                  # Explicitly unset mem_limit, so it overrides the default memory limit we set in
                  # basehub/values.yaml
                  mem_limit: null
                  mem_guarantee: 28G
                  cpu_guarantee: 7
                  node_selector:
                    hub.jupyter.org/pool-name: notebook-m5-2xlarge
          scheduling:
            userPlaceholder:
              enabled: false
              replicas: 0
            userScheduler:
              enabled: false
          proxy:
            service:
              type: LoadBalancer
            https:
              enabled: true
              hosts:
                - staging.farallon.2i2c.cloud
            chp:
              nodeSelector: {}
              tolerations:
                - key: "node-role.kubernetes.io/master"
                  effect: "NoSchedule"
            traefik:
              nodeSelector: {}
              tolerations:
                - key: "node-role.kubernetes.io/master"
                  effect: "NoSchedule"
          hub:
            allowNamedServers: true
            networkPolicy:
              # FIXME: For dask gateway
              enabled: false
            readinessProbe:
              enabled: false
            nodeSelector: {}
            tolerations:
              - key: "node-role.kubernetes.io/master"
                effect: "NoSchedule"
      dask-gateway:
        traefik:
          tolerations:
            - key: "node-role.kubernetes.io/master"
              effect: "NoSchedule"
        controller:
          tolerations:
            - key: "node-role.kubernetes.io/master"
              effect: "NoSchedule"
        gateway:
          tolerations:
            - key: "node-role.kubernetes.io/master"
              effect: "NoSchedule"
          backend:
            scheduler:
              extraPodConfig:
                nodeSelector:
                  hub.jupyter.org/pool-name: dask-worker
                tolerations:
                  - key: "k8s.dask.org/dedicated"
                    operator: "Equal"
                    value: "worker"
                    effect: "NoSchedule"
                  - key: "k8s.dask.org_dedicated"
                    operator: "Equal"
                    value: "worker"
                    effect: "NoSchedule"
            worker:
              extraPodConfig:
                nodeSelector:
                  hub.jupyter.org/pool-name: dask-worker
                tolerations:
                  - key: "k8s.dask.org/dedicated"
                    operator: "Equal"
                    value: "worker"
                    effect: "NoSchedule"
                  - key: "k8s.dask.org_dedicated"
                    operator: "Equal"
                    value: "worker"
                    effect: "NoSchedule"

          # TODO: figure out a replacement for userLimits.
          extraConfig:
            optionHandler: |
              from dask_gateway_server.options import Options, Integer, Float, String
              def cluster_options(user):
                  def option_handler(options):
                      if ":" not in options.image:
                          raise ValueError("When specifying an image you must also provide a tag")
                      extra_annotations = {
                          "hub.jupyter.org/username": user.name,
                          "prometheus.io/scrape": "true",
                          "prometheus.io/port": "8787",
                      }
                      extra_labels = {
                          "hub.jupyter.org/username": user.name,
                      }
                      return {
                          "worker_cores_limit": options.worker_cores,
                          "worker_cores": min(options.worker_cores / 2, 1),
                          "worker_memory": "%fG" % options.worker_memory,
                          "image": options.image,
                          "scheduler_extra_pod_annotations": extra_annotations,
                          "worker_extra_pod_annotations": extra_annotations,
                          "scheduler_extra_pod_labels": extra_labels,
                          "worker_extra_pod_labels": extra_labels,
                      }
                  return Options(
                      Integer("worker_cores", 2, min=1, max=16, label="Worker Cores"),
                      Float("worker_memory", 4, min=1, max=32, label="Worker Memory (GiB)"),
                      String("image", default="pangeo/pangeo-notebook:latest", label="Image"),
                      handler=option_handler,
                  )
              c.Backend.cluster_options = cluster_options
            idle: |
              # timeout after 30 minutes of inactivity
              c.KubeClusterConfig.idle_timeout = 1800
damianavila marked this conversation as resolved.
Show resolved Hide resolved
16 changes: 15 additions & 1 deletion config/hubs/schema.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,23 @@ properties:
type: string
description: |
Cloud provider this cluster is running on. Used to perform
authentication against the cluster. Currently supports gcp.
authentication against the cluster. Currently supports gcp
and raw kubeconfig files.
enum:
- gcp
- kubeconfig
kubeconfig:
type: object
description: |
Configuration to connect to a cluster purely via a kubeconfig
file.
additionalProperties: false
properties:
file:
type: string
description: |
Path to kubeconfig file (encrypted with sops) to use for
connecting to the cluster
gcp:
type: object
additionalProperties: false
Expand Down
54 changes: 36 additions & 18 deletions deployer/hub.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,12 +37,30 @@ def build_image(self):

@contextmanager
def auth(self):
with tempfile.NamedTemporaryFile() as kubeconfig:
# FIXME: This is dumb
os.environ['KUBECONFIG'] = kubeconfig.name
assert self.spec['provider'] == 'gcp'

if self.spec['provider'] == 'gcp':
yield from self.auth_gcp()
elif self.spec['provider'] == 'kubeconfig':
yield from self.auth_kubeconfig()
else:
raise ValueError(f'Provider {self.spec["provider"]} not supported')


def auth_kubeconfig(self):
"""
Context manager for authenticating with just a kubeconfig file

For the duration of the contextmanager, we:
1. Decrypt the file specified in kubeconfig.file with sops
2. Set `KUBECONFIG` env var to our decrypted file path, so applications
we call (primarily helm) will use that as config
"""
config = self.spec['kubeconfig']
config_path = config['file']

with decrypt_file(config_path) as decrypted_key_path:
# FIXME: Unset this after our yield
os.environ['KUBECONFIG'] = decrypted_key_path
yield

def auth_gcp(self):
config = self.spec['gcp']
Expand All @@ -52,23 +70,23 @@ def auth_gcp(self):
# Else, it'll just have a `zone` key set. Let's respect either.
location = config.get('zone', config.get('region'))
cluster = config['cluster']
with tempfile.NamedTemporaryFile() as kubeconfig:
with decrypt_file(key_path) as decrypted_key_path:
subprocess.check_call([
'gcloud', 'auth',
'activate-service-account',
'--key-file', os.path.abspath(decrypted_key_path)
])

with decrypt_file(key_path) as decrypted_key_path:
subprocess.check_call([
'gcloud', 'auth',
'activate-service-account',
'--key-file', os.path.abspath(decrypted_key_path)
'gcloud', 'container', 'clusters',
# --zone works with regions too
f'--zone={location}',
f'--project={project}',
'get-credentials', cluster
])

subprocess.check_call([
'gcloud', 'container', 'clusters',
# --zone works with regions too
f'--zone={location}',
f'--project={project}',
'get-credentials', cluster
])

yield
yield


class Hub:
Expand Down

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -37,11 +37,4 @@ spec:
apiVersion: resourcemanager.cnrm.cloud.google.com/v1beta1
kind: Project
external: projects/{{ .Values.jupyterhub.cloudResources.gcp.projectId }}
---
apiVersion: v1
kind: ServiceAccount
metadata:
annotations:
iam.gke.io/gcp-service-account: {{ include "cloudResources.gcp.serviceAccountName" .}}@{{ .Values.jupyterhub.cloudResources.gcp.projectId }}.iam.gserviceaccount.com
name: user-sa
{{- end }}
5 changes: 1 addition & 4 deletions hub-templates/basehub/templates/nfs-pvc.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,7 @@ spec:
nfs:
server: {{ .Values.nfsPVC.nfs.serverIP | quote}}
path: "{{ .Values.nfsPVC.nfs.baseShareName }}{{ .Release.Name }}"
mountOptions:
- soft
- noatime
- vers=4.2
mountOptions: {{ .Values.nfsPVC.nfs.mountOptions | toJson }}
---
apiVersion: v1
kind: PersistentVolumeClaim
Expand Down
2 changes: 2 additions & 0 deletions hub-templates/basehub/templates/nfs-share-creator.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ spec:
spec:
restartPolicy: Never
terminationGracePeriodSeconds: 0
tolerations: {{ .Values.nfsPVC.shareCreator.tolerations | toJson }}

containers:
- name: dummy
image: busybox
Expand Down
10 changes: 10 additions & 0 deletions hub-templates/basehub/templates/user-sa.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
apiVersion: v1
kind: ServiceAccount
metadata:
annotations:
{{ if .Values.jupyterhub.cloudResources.scratchBucket.enabled}}
{{ if eq .Values.jupyterhub.cloudResources.provider "gcp" }}
iam.gke.io/gcp-service-account: {{ include "cloudResources.gcp.serviceAccountName" .}}@{{ .Values.jupyterhub.cloudResources.gcp.projectId }}.iam.gserviceaccount.com
{{- end }}
{{- end }}
name: user-sa
4 changes: 4 additions & 0 deletions hub-templates/basehub/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,10 @@ nfsPVC:
shareCreator:
tolerations: []
nfs:
mountOptions:
- soft
- noatime
- vers=4.2
serverIP: nfs-server-01
# MUST HAVE TRAILING SLASH
baseShareName: /export/home-01/homes/
Expand Down
Loading