From 1eb3b86eff5de0e83981a905ab4243ef53f0a30e Mon Sep 17 00:00:00 2001 From: YuviPanda Date: Wed, 9 Jun 2021 01:43:46 +0530 Subject: [PATCH 01/27] Provision per-cluster support components - Set up cert-manager explicitly, since it can not be set up as a dependent chart of our 'support' chart. - 'support' chart deploys prometheus, grafana and nginx-ingress in a standard configuration. - Allow per-cluster overrides with config under support/clusters. - Add a script that will provision the support components. Ref https://github.com/2i2c-org/pilot-hubs/issues/388 --- support/Chart.yaml | 14 +++------ support/clusters/2i2c.yaml | 4 +++ support/provision.py | 58 ++++++++++++++++++++++++++++++++++++++ support/values.yaml | 11 ++++---- 4 files changed, 71 insertions(+), 16 deletions(-) create mode 100644 support/clusters/2i2c.yaml create mode 100644 support/provision.py diff --git a/support/Chart.yaml b/support/Chart.yaml index 0e8d264ac..9d50e68e2 100644 --- a/support/Chart.yaml +++ b/support/Chart.yaml @@ -7,24 +7,18 @@ dependencies: # Prometheus for collection of metrics. # https://github.com/prometheus-community/helm-charts/tree/main/charts/prometheus - name: prometheus - version: 11.15.0 + version: 14.1.1 repository: https://prometheus-community.github.io/helm-charts # Grafana for dashboarding of metrics. # https://github.com/grafana/helm-charts/tree/main/charts/grafana - name: grafana - version: 5.6.4 + version: 6.11.0 repository: https://grafana.github.io/helm-charts # ingress-nginx for a k8s Ingress resource controller that routes traffic from # a single IP entrypoint to various services exposed via k8s Ingress resources # that references this controller. - name: ingress-nginx - version: 2.15.0 - repository: https://kubernetes.github.io/ingress-nginx - - # cert-manager for acquisition of TLS certificates - # https://github.com/jetstack/cert-manager/tree/master/deploy/charts/cert-manager - - name: cert-manager - version: v1.0.0-beta.1 - repository: https://charts.jetstack.io + version: 3.33.0 + repository: https://kubernetes.github.io/ingress-nginx \ No newline at end of file diff --git a/support/clusters/2i2c.yaml b/support/clusters/2i2c.yaml new file mode 100644 index 000000000..70b71ba30 --- /dev/null +++ b/support/clusters/2i2c.yaml @@ -0,0 +1,4 @@ +grafana: + ingress: + hosts: + - grafana.pilot.2i2c.cloud \ No newline at end of file diff --git a/support/provision.py b/support/provision.py new file mode 100644 index 000000000..14be3d5cd --- /dev/null +++ b/support/provision.py @@ -0,0 +1,58 @@ +#!/usr/bin/env python3 +""" +Maintain per-cluster support chart + +Sets up the following charts: + +1. cert-manager (with CRDs), since it can not be + installed as a dependent chart. + https://github.com/jetstack/cert-manager/issues/3062#issuecomment-708252281 +2. support/, a meta chart that installs nginx-ingress, grafana + and prometheus. 
+""" +import shutil +import subprocess +from pathlib import Path +import argparse + + +CERT_MANAGER_VERSION = 'v1.3.1' +HERE = Path(__file__).parent + +def main(): + argparser = argparse.ArgumentParser() + argparser.add_argument( + 'cluster', + help='Name of cluster to provision support charts in' + ) + + args = argparser.parse_args() + + print("Provisioning cert-manager...") + subprocess.check_call([ + 'helm', 'upgrade', '--install', '--create-namespace', + '--namespace', 'cert-manager', + 'cert-manager', 'jetstack/cert-manager', + '--version', CERT_MANAGER_VERSION, + '--set', 'installCRDs=true' + ]) + print("Done!") + + print("Support charts...") + + shutil.rmtree(HERE / 'charts') + subprocess.check_call([ + 'helm', 'dep', 'up', str(HERE) + ]) + + subprocess.check_call([ + 'helm', 'upgrade', '--install', '--create-namespace', + '--namespace', 'support', + 'support', 'support', + '-f', str(HERE / 'clusters' / (args.cluster + '.yaml')), + '--wait' + ]) + print("Done!") + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/support/values.yaml b/support/values.yaml index f9f002143..054edcd45 100644 --- a/support/values.yaml +++ b/support/values.yaml @@ -15,18 +15,20 @@ prometheus: # Deploy onto user nodes key: hub.jupyter.org_dedicated value: user + - effect: NoSchedule + # Deploy onto user nodes + key: k8s.dask.org_dedicated + value: worker updateStrategy: type: RollingUpdate pushgateway: enabled: false - rbac: - create: true server: resources: # Without this, prometheus can easily starve users requests: cpu: 0.2 - memory: 768Mi + memory: 512Mi limits: cpu: 1 memory: 2Gi @@ -58,9 +60,6 @@ grafana: hosts: - grafana.pilot.2i2c.cloud - # grafana.ini: - # server: - # root_url: http://grafana.datahub.berkeley.edu/ datasources: datasources.yaml: apiVersion: 1 From 2361d5169cfeb7c5bebfb7f07df04fca01a8f73d Mon Sep 17 00:00:00 2001 From: YuviPanda Date: Wed, 9 Jun 2021 16:51:38 +0530 Subject: [PATCH 02/27] Deploy support components from deployer script - Automates auth to cluster for automated deployment - Puts config overrides in same place as cluster config, to avoid duplication - Sets up HTTPS for the 2i2c pilot hubs grafana --- config/hubs/2i2c.cluster.yaml | 10 ++++++ deployer/__main__.py | 22 +++++++++++++ deployer/hub.py | 27 ++++++++++++++++ support/clusters/2i2c.yaml | 4 --- support/provision.py | 58 ----------------------------------- support/values.yaml | 2 -- 6 files changed, 59 insertions(+), 64 deletions(-) delete mode 100644 support/clusters/2i2c.yaml delete mode 100644 support/provision.py diff --git a/config/hubs/2i2c.cluster.yaml b/config/hubs/2i2c.cluster.yaml index 26c7fbc91..234f5cc30 100644 --- a/config/hubs/2i2c.cluster.yaml +++ b/config/hubs/2i2c.cluster.yaml @@ -6,6 +6,16 @@ gcp: project: two-eye-two-see cluster: pilot-hubs-cluster zone: us-central1-b +support: + config: + grafana: + ingress: + hosts: + - grafana.pilot.2i2c.cloud + tls: + - secretName: grafana-tls + hosts: + - grafana.pilot.2i2c.cloud hubs: - name: staging domain: staging.pilot.2i2c.cloud diff --git a/deployer/__main__.py b/deployer/__main__.py index 70f2fabeb..8dab3b395 100644 --- a/deployer/__main__.py +++ b/deployer/__main__.py @@ -29,6 +29,23 @@ def build(cluster_name): cluster.build_image() +def deploy_support(cluster_name): + """ + Deploy support components to a cluster + """ + + # Validate our config with JSON Schema first before continuing + validate(cluster_name) + + + config_file_path = Path(os.getcwd()) / "config/hubs" / f'{cluster_name}.cluster.yaml' + with 
open(config_file_path) as f: + cluster = Cluster(yaml.load(f)) + + if cluster.support: + with cluster.auth(): + cluster.deploy_support() + def deploy(cluster_name, hub_name, skip_hub_health_test, config_path): """ Deploy one or more hubs in a given cluster @@ -97,6 +114,7 @@ def main(): build_parser = subparsers.add_parser("build") deploy_parser = subparsers.add_parser("deploy") validate_parser = subparsers.add_parser("validate") + deploy_support_parser = subparsers.add_parser("deploy-support") build_parser.add_argument("cluster_name") @@ -107,6 +125,8 @@ def main(): validate_parser.add_argument("cluster_name") + deploy_support_parser.add_argument("cluster_name") + args = argparser.parse_args() if args.action == "build": @@ -115,6 +135,8 @@ def main(): deploy(args.cluster_name, args.hub_name, args.skip_hub_health_test, args.config_path) elif args.action == 'validate': validate(args.cluster_name) + elif args.action == 'deploy-support': + deploy_support(args.cluster_name) else: # Print help message and exit when no arguments are passed # FIXME: Is there a better way to do this? diff --git a/deployer/hub.py b/deployer/hub.py index 5c020d1b1..b41883981 100644 --- a/deployer/hub.py +++ b/deployer/hub.py @@ -29,6 +29,7 @@ def __init__(self, spec): Hub(self, hub_yaml) for hub_yaml in self.spec['hubs'] ] + self.support = self.spec.get('support', {}) def build_image(self): self.ensure_docker_credhelpers() @@ -77,6 +78,32 @@ def ensure_docker_credhelpers(self): with open(dockercfg_path, 'w') as f: json.dump(config, f, indent=4) + def deploy_support(self): + cert_manager_version = 'v1.3.1' + + print("Provisioning cert-manager...") + subprocess.check_call([ + 'helm', 'upgrade', '--install', '--create-namespace', + '--namespace', 'cert-manager', + 'cert-manager', 'jetstack/cert-manager', + '--version', cert_manager_version, + '--set', 'installCRDs=true' + ]) + print("Done!") + + print("Support charts...") + + with tempfile.NamedTemporaryFile(mode='w') as f: + yaml.dump(self.support.get('config', {}), f) + f.flush() + subprocess.check_call([ + 'helm', 'upgrade', '--install', '--create-namespace', + '--namespace', 'support', + 'support', 'support', + '-f', f.name, + '--wait' + ]) + print("Done!") def auth_kubeconfig(self): """ diff --git a/support/clusters/2i2c.yaml b/support/clusters/2i2c.yaml deleted file mode 100644 index 70b71ba30..000000000 --- a/support/clusters/2i2c.yaml +++ /dev/null @@ -1,4 +0,0 @@ -grafana: - ingress: - hosts: - - grafana.pilot.2i2c.cloud \ No newline at end of file diff --git a/support/provision.py b/support/provision.py deleted file mode 100644 index 14be3d5cd..000000000 --- a/support/provision.py +++ /dev/null @@ -1,58 +0,0 @@ -#!/usr/bin/env python3 -""" -Maintain per-cluster support chart - -Sets up the following charts: - -1. cert-manager (with CRDs), since it can not be - installed as a dependent chart. - https://github.com/jetstack/cert-manager/issues/3062#issuecomment-708252281 -2. support/, a meta chart that installs nginx-ingress, grafana - and prometheus. 
-""" -import shutil -import subprocess -from pathlib import Path -import argparse - - -CERT_MANAGER_VERSION = 'v1.3.1' -HERE = Path(__file__).parent - -def main(): - argparser = argparse.ArgumentParser() - argparser.add_argument( - 'cluster', - help='Name of cluster to provision support charts in' - ) - - args = argparser.parse_args() - - print("Provisioning cert-manager...") - subprocess.check_call([ - 'helm', 'upgrade', '--install', '--create-namespace', - '--namespace', 'cert-manager', - 'cert-manager', 'jetstack/cert-manager', - '--version', CERT_MANAGER_VERSION, - '--set', 'installCRDs=true' - ]) - print("Done!") - - print("Support charts...") - - shutil.rmtree(HERE / 'charts') - subprocess.check_call([ - 'helm', 'dep', 'up', str(HERE) - ]) - - subprocess.check_call([ - 'helm', 'upgrade', '--install', '--create-namespace', - '--namespace', 'support', - 'support', 'support', - '-f', str(HERE / 'clusters' / (args.cluster + '.yaml')), - '--wait' - ]) - print("Done!") - -if __name__ == '__main__': - main() \ No newline at end of file diff --git a/support/values.yaml b/support/values.yaml index 054edcd45..01aed37a8 100644 --- a/support/values.yaml +++ b/support/values.yaml @@ -57,8 +57,6 @@ grafana: annotations: kubernetes.io/ingress.class: nginx cert-manager.io/cluster-issuer: letsencrypt-prod - hosts: - - grafana.pilot.2i2c.cloud datasources: datasources.yaml: From 11a393e616546cc0cb53546f5a885c42b3f868c7 Mon Sep 17 00:00:00 2001 From: YuviPanda Date: Wed, 26 May 2021 00:33:35 +0530 Subject: [PATCH 03/27] Add hub for MOEM-IGE group - traefik tag bump was required to get LE working. It's already bumped in newer z2jh versions - NFS server was again set up manually, and needed the `insecure` flag - even though other hubs are setup the same way and didn't need this. NFS situation needs to be sorted. 
Ref https://github.com/2i2c-org/pilot-hubs/issues/207 --- config/hubs/meom.cluster.yaml | 88 +++++++++++++++++++++++++++++++ hub-templates/basehub/values.yaml | 2 + terraform/meom.tfvars | 15 ++++++ 3 files changed, 105 insertions(+) create mode 100644 config/hubs/meom.cluster.yaml create mode 100644 terraform/meom.tfvars diff --git a/config/hubs/meom.cluster.yaml b/config/hubs/meom.cluster.yaml new file mode 100644 index 000000000..49810756a --- /dev/null +++ b/config/hubs/meom.cluster.yaml @@ -0,0 +1,88 @@ +name: meom +provider: gcp +gcp: + key: secrets/meom.json + project: meom-ige-cnrs + cluster: meom-cluster + zone: us-central1-b +hubs: + - name: staging + domain: staging.meom-ige.2i2c.cloud + template: daskhub + auth0: + connection: github + config: &meomConfig + basehub: + nfsPVC: + nfs: + # from https://docs.aws.amazon.com/efs/latest/ug/mounting-fs-nfs-mount-settings.html + mountOptions: + - rsize=1048576 + - wsize=1048576 + - timeo=600 + - soft # We pick soft over hard, so NFS lockups don't lead to hung processes + - retrans=2 + - noresvport + serverIP: nfs-server-01 + baseShareName: /export/home-01/homes/ + jupyterhub: + custom: + homepage: + templateVars: + org: + name: "SWOT Ocean Pangeo Team" + logo_url: https://2i2c.org/media/logo.png + url: https://2i2c.org + designed_by: + name: 2i2c + url: https://2i2c.org + operated_by: + name: 2i2c + url: https://2i2c.org + funded_by: + name: SWOT Ocean Pangeo Team + url: https://2i2c.org + singleuser: + image: + name: pangeo/pangeo-notebook + tag: 2021.02.19 + scheduling: + userPlaceholder: + enabled: false + replicas: 0 + userScheduler: + enabled: false + proxy: + service: + type: LoadBalancer + https: + enabled: true + hub: + config: + Authenticator: + allowed_users: &users + - roxyboy + - lesommer + - auraoupa + - yuvipanda + - choldgraf + - GeorgianaElena + admin_users: *users + + allowNamedServers: true + networkPolicy: + # FIXME: For dask gateway + enabled: false + readinessProbe: + enabled: false + dask-gateway: + extraConfig: + idle: | + # timeout after 30 minutes of inactivity + c.KubeClusterConfig.idle_timeout = 1800 + - name: prod + domain: meom-ige.2i2c.cloud + template: daskhub + auth0: + connection: github + config: *meomConfig diff --git a/hub-templates/basehub/values.yaml b/hub-templates/basehub/values.yaml index 60a41b75e..0b9b9a625 100644 --- a/hub-templates/basehub/values.yaml +++ b/hub-templates/basehub/values.yaml @@ -86,6 +86,8 @@ jupyterhub: limits: memory: 1Gi traefik: + image: + tag: v2.4.8 nodeSelector: hub.jupyter.org/node-purpose: core resources: diff --git a/terraform/meom.tfvars b/terraform/meom.tfvars new file mode 100644 index 000000000..60bd9f85f --- /dev/null +++ b/terraform/meom.tfvars @@ -0,0 +1,15 @@ +prefix = "meom" +project_id = "meom-ige-cnrs" + +# Inane CPU requests mean we need at least 3 CPUs for a base node?!?! +# But, we can't have custom machine sizes with odd number of CPUs - +# only even numbers. So we go with 4. 3840 is the smallest amount +# of RAM a 4 CPU n1 instance can have. 
+core_node_machine_type = "n1-custom-4-3840"
+
+# Give each user ~4G of RAM and ~2CPU
+user_node_machine_type = "n1-custom-2-4096"
+
+
+# dask nodes are e2 since they are expected to autoscale
+dask_node_machine_type = "e2-custom-2-4096"

From 119bdcfb65d1cce699229cbe7fed286dcaab0b69 Mon Sep 17 00:00:00 2001
From: YuviPanda
Date: Thu, 27 May 2021 22:21:00 +0530
Subject: [PATCH 04/27] Use g1-small instances for core nodes

---
 terraform/meom.tfvars | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/terraform/meom.tfvars b/terraform/meom.tfvars
index 60bd9f85f..e0a97c529 100644
--- a/terraform/meom.tfvars
+++ b/terraform/meom.tfvars
@@ -1,15 +1,16 @@
 prefix     = "meom"
 project_id = "meom-ige-cnrs"
 
-# Inane CPU requests mean we need at least 3 CPUs for a base node?!?!
-# But, we can't have custom machine sizes with odd number of CPUs -
-# only even numbers. So we go with 4. 3840 is the smallest amount
-# of RAM a 4 CPU n1 instance can have.
-core_node_machine_type = "n1-custom-4-3840"
+# Minimum number of nodes required to fit kube-system is either
+# 2 n1-highcpu-2 nodes, or 3 g1-small nodes. If you don't enable
+# networkpolicy, you can get away with 1 n1-custom-4-3840 node -
+# but with that enabled, calico-typha wants 2 replicas that
+# must run on two nodes, since they both want the same hostport.
+# 3 g1-small is $13 a month, while a single n2-highcpu-2 is
+# already $36 a month. We want a very low base price, and
+# our core nodes will barely see any CPU usage, so g1-small is
+# the way to go.
+core_node_machine_type = "g1-small"
 
 # Give each user ~4G of RAM and ~2CPU
 user_node_machine_type = "n1-custom-2-4096"
-
-
-# dask nodes are e2 since they are expected to autoscale
-dask_node_machine_type = "e2-custom-2-4096"

From ad19ba5c3cb64c17ebdbc7baa0ed3dad60dc4fc6 Mon Sep 17 00:00:00 2001
From: YuviPanda
Date: Fri, 28 May 2021 06:52:00 +0530
Subject: [PATCH 05/27] meom: Use JupyterLab as default interface

---
 config/hubs/meom.cluster.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/config/hubs/meom.cluster.yaml b/config/hubs/meom.cluster.yaml
index 49810756a..31cd2c5e3 100644
--- a/config/hubs/meom.cluster.yaml
+++ b/config/hubs/meom.cluster.yaml
@@ -43,6 +43,7 @@ hubs:
                   url: https://2i2c.org
         singleuser:
+          defaultUrl: /lab
           image:
             name: pangeo/pangeo-notebook
             tag: 2021.02.19

From ecb25c3971f4d7c1f9f3523b88d5dd1584915c0f Mon Sep 17 00:00:00 2001
From: YuviPanda
Date: Sun, 30 May 2021 11:54:45 +0530
Subject: [PATCH 06/27] Refactor GCP terraform code

- Set up the cluster with the [terraform google provider][1], instead
  of the higher level [gke module][2]. The code gets simpler, and makes
  more terraform features (like for_each) accessible more easily.
- Allow multiple notebook and dask nodepools to be set up. Most
  research hubs want 2-3 options of notebook sizes to optimize for
  spend. I attempted to use [gke node autoprovisioning][3] instead of
  requiring manual nodepool provisioning, but it consistently
  provisioned nodes bigger than required. We should re-evaluate it
  later.
- Expose the GCP SA used by the k8s nodes to the user pods. A highly
  restricted SA is used for this, to prevent damage as much as
  possible. Users can then make requests to GCS buckets in other
  projects on behalf of this project.
- Dask nodepools will default to matching the sizes of the notebook
  nodepools. This can be overridden if necessary.
- Move tfvars files into a subdirectory, and split terraform code into
  multiple files for easier maintenance
- Remove unused terraform variables
- Add some inline terraform docs
- Set up MEOM-IGE cluster + hub with new terraform code

[1]: https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_cluster
[2]: https://registry.terraform.io/modules/terraform-google-modules/kubernetes-engine/google/latest
[3]: https://cloud.google.com/kubernetes-engine/docs/how-to/node-auto-provisioning
---
 ...eom.cluster.yaml => meom-ige.cluster.yaml} |  53 ++-
 terraform/cd.tf                               |  31 ++
 terraform/main.tf                             | 316 +++++++++++-------
 terraform/{ => projects}/cloudbank.tfvars     |   0
 .../hackathon-2i2c-project-alpha.tfvars       |   0
 .../{meom.tfvars => projects/meom-ige.tfvars} |  29 +-
 terraform/{ => projects}/pilot-hubs.tfvars    |   0
 terraform/registry.tf                         |  13 +
 terraform/variables.tf                        |  94 ++++--
 9 files changed, 376 insertions(+), 160 deletions(-)
 rename config/hubs/{meom.cluster.yaml => meom-ige.cluster.yaml} (54%)
 create mode 100644 terraform/cd.tf
 rename terraform/{ => projects}/cloudbank.tfvars (100%)
 rename terraform/{ => projects}/hackathon-2i2c-project-alpha.tfvars (100%)
 rename terraform/{meom.tfvars => projects/meom-ige.tfvars} (56%)
 rename terraform/{ => projects}/pilot-hubs.tfvars (100%)
 create mode 100644 terraform/registry.tf

diff --git a/config/hubs/meom.cluster.yaml b/config/hubs/meom-ige.cluster.yaml
similarity index 54%
rename from config/hubs/meom.cluster.yaml
rename to config/hubs/meom-ige.cluster.yaml
index 31cd2c5e3..f80fba552 100644
--- a/config/hubs/meom.cluster.yaml
+++ b/config/hubs/meom-ige.cluster.yaml
@@ -1,9 +1,9 @@
-name: meom
+name: meom-ige
 provider: gcp
 gcp:
   key: secrets/meom.json
   project: meom-ige-cnrs
-  cluster: meom-cluster
+  cluster: meom-ige-cluster
   zone: us-central1-b
 hubs:
   - name: staging
@@ -32,7 +32,7 @@ hubs:
               org:
                 name: "SWOT Ocean Pangeo Team"
                 logo_url: https://2i2c.org/media/logo.png
-                url: https://2i2c.org
+                url: https://meom-group.github.io/
               designed_by:
                 name: 2i2c
                 url: https://2i2c.org
@@ -41,8 +41,40 @@ hubs:
                 url: https://2i2c.org
               funded_by:
                 name: SWOT Ocean Pangeo Team
-                url: https://2i2c.org
+                url: https://meom-group.github.io/
         singleuser:
+          profileList:
+            # The mem-guarantees are here so k8s doesn't schedule other pods
+            # on these nodes. They need to be just under total allocatable
+            # RAM on a node, not total node capacity
+            - display_name: "Small"
+              description: "~2 CPU, ~8G RAM"
+              kubespawner_override:
+                mem_limit: 8G
+                mem_guarantee: 5.5G
+                node_selector:
+                  node.kubernetes.io/instance-type: e2-standard-2
+            - display_name: "Medium"
+              description: "~8 CPU, ~32G RAM"
+              kubespawner_override:
+                mem_limit: 32G
+                mem_guarantee: 25G
+                node_selector:
+                  node.kubernetes.io/instance-type: e2-standard-8
+            - display_name: "Large"
+              description: "~16 CPU, ~64G RAM"
+              kubespawner_override:
+                mem_limit: 64G
+                mem_guarantee: 55G
+                node_selector:
+                  node.kubernetes.io/instance-type: e2-standard-16
+            - display_name: "Very Large"
+              description: "~32 CPU, ~128G RAM"
+              kubespawner_override:
+                mem_limit: 128G
+                mem_guarantee: 115G
+                node_selector:
+                  node.kubernetes.io/instance-type: e2-standard-32
           defaultUrl: /lab
           image:
             name: pangeo/pangeo-notebook
             tag: 2021.02.19
@@ -58,7 +90,20 @@ hubs:
               type: LoadBalancer
             https:
               enabled: true
+            chp:
+              resources:
+                requests:
+                  # FIXME: We want no guarantees here!!!
+                  # This is lowest possible value
+                  cpu: 0.01
+                  memory: 1Mi
           hub:
+            resources:
+              requests:
+                # FIXME: We want no guarantees here!!!
+                # This is lowest possible value
+                cpu: 0.01
+                memory: 1Mi
             config:
               Authenticator:
                 allowed_users: &users
diff --git a/terraform/cd.tf b/terraform/cd.tf
new file mode 100644
index 000000000..7aedfbd6f
--- /dev/null
+++ b/terraform/cd.tf
@@ -0,0 +1,31 @@
+/**
+* Setup Service Accounts for authentication during continuous deployment
+*/
+
+// Service account used by GitHub Actions to deploy to the cluster
+resource "google_service_account" "cd_sa" {
+  account_id   = "${var.prefix}-cd-sa"
+  display_name = "Continuous Deployment SA for ${var.prefix}"
+  project      = var.project_id
+}
+
+// Roles the service account needs to deploy hubs to the cluster
+resource "google_project_iam_member" "cd_sa_roles" {
+  for_each = var.cd_sa_roles
+
+  project = var.project_id
+  role    = each.value
+  member  = "serviceAccount:${google_service_account.cd_sa.email}"
+}
+
+// JSON encoded private key to be kept in secrets/* for the
+// deployment script to authenticate to the cluster
+resource "google_service_account_key" "cd_sa" {
+  service_account_id = google_service_account.cd_sa.name
+  public_key_type    = "TYPE_X509_PEM_FILE"
+}
+
+output "ci_deployer_key" {
+  value     = base64decode(google_service_account_key.cd_sa.private_key)
+  sensitive = true
+}
diff --git a/terraform/main.tf b/terraform/main.tf
index b9f011a63..a8af85916 100644
--- a/terraform/main.tf
+++ b/terraform/main.tf
@@ -5,155 +5,227 @@ terraform {
   }
 }
 
-module "service_accounts" {
-  source        = "terraform-google-modules/service-accounts/google"
-  version       = "~> 2.0"
-  project_id    = var.project_id
-  prefix        = var.prefix
-  generate_keys = true
-  names         = ["cd-sa"]
-  project_roles = [
-    "${var.project_id}=>roles/container.admin",
-    "${var.project_id}=>roles/artifactregistry.writer",
-    # FIXME: This is way too much perms just to ssh into a node
-    "${var.project_id}=>roles/compute.instanceAdmin.v1"
-  ]
+// Service account used by all the nodes and pods in our cluster
+resource "google_service_account" "cluster_sa" {
+  account_id   = "${var.prefix}-cluster-sa"
+  display_name = "Cluster SA for ${var.prefix}"
+  project      = var.project_id
 }
 
-output "ci_deployer_key" {
-  value = module.service_accounts.keys["cd-sa"]
-  sensitive = true
+// To access GCS buckets with requester pays, the calling code needs
+// to have serviceusage.services.use permission. We create a role
+// granting just this to the cluster SA, so user pods can
+// use it. See https://cloud.google.com/storage/docs/requester-pays
+// for more info
+resource "google_project_iam_custom_role" "identify_project_role" {
+  // Role names can't contain -, so we swap them out. BOO
+  role_id     = replace("${var.prefix}_user_sa_role", "-", "_")
+  project     = var.project_id
+  title       = "Identify as project role for users in ${var.prefix}"
+  description = "Grants serviceusage.services.use, so users can act as this project when accessing requester pays buckets"
+  permissions = ["serviceusage.services.use"]
+}
+
+resource "google_project_iam_member" "identify_project_binding" {
+  project = var.project_id
+  role    = google_project_iam_custom_role.identify_project_role.name
+  member  = "serviceAccount:${google_service_account.cluster_sa.email}"
+}
+
+resource "google_project_iam_member" "cluster_sa_roles" {
+  for_each = var.cluster_sa_roles
+
+  project = var.project_id
+  role    = each.value
+  member  = "serviceAccount:${google_service_account.cluster_sa.email}"
+}
+
+resource "google_container_cluster" "cluster" {
+  # config_connector_config is in beta
+  provider = google-beta
+
+  name     = "${var.prefix}-cluster"
+  location = var.zone
+  project  = var.project_id
+
+  initial_node_count       = 1
+  remove_default_node_pool = true
+
+  addons_config {
+    http_load_balancing {
+      // FIXME: This used to not work well with websockets, and
+      // cost extra money as well. Let's validate if this is still
+      // true?
+ disabled = true + } + horizontal_pod_autoscaling { + // This isn't used anywhere, so let's turn this off + disabled = true + } + config_connector_config { + enabled = var.config_connector_enabled + } + } + + release_channel { + # We upgrade clusters manually so we can manage downtime of + # master *and* nodes. When a cluster is in a release channel, + # upgrades (including disruptive node upgrades) happen automatically. + # So we disable it. + channel = "UNSPECIFIED" + } + + network_policy { + enabled = var.enable_network_policy } - node_pools_labels = { - all = {} + node_config { + # DO NOT TOUCH THIS BLOCK, IT REPLACES ENTIRE CLUSTER LOL + service_account = google_service_account.cluster_sa.email + } +} + +resource "google_container_node_pool" "core" { + name = "core-pool" + cluster = google_container_cluster.cluster.name + project = google_container_cluster.cluster.project + location = google_container_cluster.cluster.location + + + initial_node_count = 1 + autoscaling { + min_node_count = 1 + max_node_count = var.core_node_max_count + } + + management { + auto_repair = true + # Auto upgrade will drain and setup nodes without us knowing, + # and this can cause outages when it hits the proxy nodes. + auto_upgrade = false + } - core-pool = { - default-node-pool = true - "hub.jupyter.org/pool-name" = "core-pool", + + node_config { + labels = { "hub.jupyter.org/node-purpose" = "core", "k8s.dask.org/node-purpose" = "core" } - user-pool = { - "hub.jupyter.org/pool-name" = "user-pool" - "hub.jupyter.org/node-purpose" = "user", - "k8s.dask.org/node-purpose" = "scheduler" - } - dask-worker-pool = { - "hub.jupyter.org/pool-name" = "dask-worker-pool" - "k8s.dask.org/node-purpose" = "worker" - } + machine_type = var.core_node_machine_type + disk_size_gb = 30 + + # Our service account gets all OAuth scopes so it can access + # all APIs, but only fine grained permissions + roles are + # granted via the service account. + service_account = google_service_account.cluster_sa.email + oauth_scopes = [ + "https://www.googleapis.com/auth/cloud-platform" + ] } +} - node_pools_taints = { - all = [] +resource "google_container_node_pool" "notebook" { + name = "nb-${each.key}" + cluster = google_container_cluster.cluster.name + project = google_container_cluster.cluster.project + location = google_container_cluster.cluster.location - user-pool = [{ + for_each = var.notebook_nodes + + initial_node_count = 0 + autoscaling { + min_node_count = each.value.min + max_node_count = each.value.max + } + + management { + auto_repair = true + auto_upgrade = false + } + + + node_config { + workload_metadata_config { + // Use node concealment - https://cloud.google.com/kubernetes-engine/docs/how-to/protecting-cluster-metadata + // This exposes the cluster Google SA to the pods, so we can + // access GCS appropriately. + node_metadata = "SECURE" + } + labels = { + # Notebook pods and dask schedulers can exist here + "hub.jupyter.org/node-purpose" = "user", + "k8s.dask.org/node-purpose" = "scheduler", + } + + taint = [{ key = "hub.jupyter.org_dedicated" value = "user" effect = "NO_SCHEDULE" }] - dask-worker-pool = [{ + machine_type = each.value.machine_type + + # Our service account gets all OAuth scopes so it can access + # all APIs, but only fine grained permissions + roles are + # granted via the service account. 
+ service_account = google_service_account.cluster_sa.email + oauth_scopes = [ + "https://www.googleapis.com/auth/cloud-platform" + ] + } +} + +resource "google_container_node_pool" "dask_worker" { + name = "dask-${each.key}" + cluster = google_container_cluster.cluster.name + project = google_container_cluster.cluster.project + location = google_container_cluster.cluster.location + + # Default to same config as notebook nodepools config + for_each = var.dask_nodes == {} ? var.dask_nodes : var.notebook_nodes + + initial_node_count = 0 + autoscaling { + min_node_count = each.value.min + max_node_count = each.value.max + } + + management { + auto_repair = true + auto_upgrade = false + } + + node_config { + + preemptible = true + # SSD Disks for dask workers make image pulls much faster + # Since we might have many dask workers spinning up at the + # same time, the extra cost of using this is probably worth it. + disk_type = "pd-ssd" + + workload_metadata_config { + // Use node concealment - https://cloud.google.com/kubernetes-engine/docs/how-to/protecting-cluster-metadata + // This exposes the cluster Google SA to the pods, so we can + // access GCS appropriately. + node_metadata = "SECURE" + } + labels = { + "k8s.dask.org/node-purpose" = "worker", + } + + taint = [{ key = "k8s.dask.org_dedicated" value = "worker" effect = "NO_SCHEDULE" }] + machine_type = each.value.machine_type + + # Our service account gets all OAuth scopes so it can access + # all APIs, but only fine grained permissions + roles are + # granted via the service account. + service_account = google_service_account.cluster_sa.email + oauth_scopes = [ + "https://www.googleapis.com/auth/cloud-platform" + ] } } + diff --git a/terraform/cloudbank.tfvars b/terraform/projects/cloudbank.tfvars similarity index 100% rename from terraform/cloudbank.tfvars rename to terraform/projects/cloudbank.tfvars diff --git a/terraform/hackathon-2i2c-project-alpha.tfvars b/terraform/projects/hackathon-2i2c-project-alpha.tfvars similarity index 100% rename from terraform/hackathon-2i2c-project-alpha.tfvars rename to terraform/projects/hackathon-2i2c-project-alpha.tfvars diff --git a/terraform/meom.tfvars b/terraform/projects/meom-ige.tfvars similarity index 56% rename from terraform/meom.tfvars rename to terraform/projects/meom-ige.tfvars index e0a97c529..d75f94e9c 100644 --- a/terraform/meom.tfvars +++ b/terraform/projects/meom-ige.tfvars @@ -1,4 +1,4 @@ -prefix = "meom" +prefix = "meom-ige" project_id = "meom-ige-cnrs" # Minimum number of nodes required to fit kube-system is either @@ -12,5 +12,28 @@ project_id = "meom-ige-cnrs" # the way to go core_node_machine_type = "g1-small" -# Give each user ~4G of RAM and ~2CPU -user_node_machine_type = "n1-custom-2-4096" +enable_network_policy = false +config_connector_enabled = false + +notebook_nodes = { + "small" : { + min : 0, + max : 20, + machine_type : "e2-standard-2" + }, + "medium" : { + min : 0, + max : 20, + machine_type : "e2-standard-8" + }, + "large" : { + min : 0, + max : 20, + machine_type : "e2-standard-16" + }, + "very-large" : { + min : 0, + max : 20, + machine_type : "e2-standard-32" + } +} diff --git a/terraform/pilot-hubs.tfvars b/terraform/projects/pilot-hubs.tfvars similarity index 100% rename from terraform/pilot-hubs.tfvars rename to terraform/projects/pilot-hubs.tfvars diff --git a/terraform/registry.tf b/terraform/registry.tf new file mode 100644 index 000000000..29c8e3975 --- /dev/null +++ b/terraform/registry.tf @@ -0,0 +1,13 @@ +/** +* Artifact Registry to store user images 
for this cluster. +* +* Hosting it in the same project makes node startup time faster. +*/ +resource "google_artifact_registry_repository" "registry" { + provider = google-beta + + location = var.region + repository_id = "${var.prefix}-registry" + format = "DOCKER" + project = var.project_id +} diff --git a/terraform/variables.tf b/terraform/variables.tf index eeb8d51a0..22562fcfb 100644 --- a/terraform/variables.tf +++ b/terraform/variables.tf @@ -1,54 +1,86 @@ variable "prefix" { - type = string + type = string + description = "Prefix used for all objects, to prevent collisions in the project" } variable "project_id" { - type = string - # This is in Toronto! - default = "two-eye-two-see" + type = string + description = "ID of the GCP project resources should be created in" } -variable "region" { - type = string - default = "us-central1" +variable "notebook_nodes" { + type = map(map(string)) + description = "Notebook node pools to create" + default = {} } -variable "zone" { - type = string - default = "us-central1-b" +variable "dask_nodes" { + type = map(map(string)) + description = "Dask node pools to create. Defaults to notebook_nodes" + default = {} } -variable "regional_cluster" { - type = string - default = "false" +variable "config_connector_enabled" { + type = bool + default = false + description = "Enable config connector to manage GCP resources as kubernetes objects" } -variable "core_node_machine_type" { - type = string - default = "n1-highmem-4" +variable "cluster_sa_roles" { + type = set(string) + default = [ + "roles/logging.logWriter", + "roles/monitoring.metricWriter", + "roles/monitoring.viewer", + "roles/stackdriver.resourceMetadata.writer", + "roles/artifactregistry.reader" + ] + description = "List of roles for the service account the nodes in the cluster run as" } -variable "core_node_max_count" { - type = number - default = 5 +variable "cd_sa_roles" { + type = set(string) + default = [ + "roles/container.admin", + "roles/artifactregistry.writer" + ] + description = "List of roles for the service account used for continuous deployment" +} + +variable "region" { + type = string + default = "us-central1" + description = "GCP Region the resources should be created in" + +} + +variable "zone" { + type = string + default = "us-central1-b" + description = "GCP Zone the nodes of the cluster should be created in" } -variable "core_node_disk_size_gb" { - type = number - default = 50 +variable "regional_cluster" { + type = string + default = "false" + description = "Set to 'true' for a HA regional master" } -variable "user_node_machine_type" { - type = string - default = "n1-standard-4" +variable "core_node_machine_type" { + type = string + default = "g1-small" + description = "Machine type for core nodes" } -variable "user_node_max_count" { - type = number - default = 10 +variable "core_node_max_count" { + type = number + default = 5 + description = "Maximum number of core nodes allowed" } -variable "dask_worker_machine_type" { - type = string - default = "e2-highmem-2" + +variable "enable_network_policy" { + type = bool + default = true + description = "Enable kubernetes network policy for access to fine-grained firewall rules" } From a34361c10f4886401eb5810a826bc2660691db40 Mon Sep 17 00:00:00 2001 From: YuviPanda Date: Thu, 3 Jun 2021 18:13:50 +0530 Subject: [PATCH 07/27] Split cluster terraform setup to its own file --- terraform/cluster.tf | 189 ++++++++++++++++++++++++++++++++++++++++++ terraform/main.tf | 190 ------------------------------------------- 2 files changed, 
189 insertions(+), 190 deletions(-) create mode 100644 terraform/cluster.tf diff --git a/terraform/cluster.tf b/terraform/cluster.tf new file mode 100644 index 000000000..782f4f33c --- /dev/null +++ b/terraform/cluster.tf @@ -0,0 +1,189 @@ +resource "google_container_cluster" "cluster" { + # config_connector_config is in beta + provider = google-beta + + name = "${var.prefix}-cluster" + location = var.zone + project = var.project_id + + initial_node_count = 1 + remove_default_node_pool = true + + addons_config { + http_load_balancing { + // FIXME: This used to not work well with websockets, and + // cost extra money as well. Let's validate if this is still + // true? + disabled = true + } + horizontal_pod_autoscaling { + // This isn't used anywhere, so let's turn this off + disabled = true + } + config_connector_config { + enabled = var.config_connector_enabled + } + } + + release_channel { + # We upgrade clusters manually so we can manage downtime of + # master *and* nodes. When a cluster is in a release channel, + # upgrades (including disruptive node upgrades) happen automatically. + # So we disable it. + channel = "UNSPECIFIED" + } + + network_policy { + enabled = var.enable_network_policy + } + + node_config { + # DO NOT TOUCH THIS BLOCK, IT REPLACES ENTIRE CLUSTER LOL + service_account = google_service_account.cluster_sa.email + } +} + +resource "google_container_node_pool" "core" { + name = "core-pool" + cluster = google_container_cluster.cluster.name + project = google_container_cluster.cluster.project + location = google_container_cluster.cluster.location + + + initial_node_count = 1 + autoscaling { + min_node_count = 1 + max_node_count = var.core_node_max_count + } + + management { + auto_repair = true + # Auto upgrade will drain and setup nodes without us knowing, + # and this can cause outages when it hits the proxy nodes. + auto_upgrade = false + } + + + node_config { + labels = { + "hub.jupyter.org/node-purpose" = "core", + "k8s.dask.org/node-purpose" = "core" + } + machine_type = var.core_node_machine_type + disk_size_gb = 30 + + # Our service account gets all OAuth scopes so it can access + # all APIs, but only fine grained permissions + roles are + # granted via the service account. + service_account = google_service_account.cluster_sa.email + oauth_scopes = [ + "https://www.googleapis.com/auth/cloud-platform" + ] + } +} + +resource "google_container_node_pool" "notebook" { + name = "nb-${each.key}" + cluster = google_container_cluster.cluster.name + project = google_container_cluster.cluster.project + location = google_container_cluster.cluster.location + + for_each = var.notebook_nodes + + initial_node_count = 0 + autoscaling { + min_node_count = each.value.min + max_node_count = each.value.max + } + + management { + auto_repair = true + auto_upgrade = false + } + + + node_config { + workload_metadata_config { + // Use node concealment - https://cloud.google.com/kubernetes-engine/docs/how-to/protecting-cluster-metadata + // This exposes the cluster Google SA to the pods, so we can + // access GCS appropriately. 
+ node_metadata = "SECURE" + } + labels = { + # Notebook pods and dask schedulers can exist here + "hub.jupyter.org/node-purpose" = "user", + "k8s.dask.org/node-purpose" = "scheduler", + } + + taint = [{ + key = "hub.jupyter.org_dedicated" + value = "user" + effect = "NO_SCHEDULE" + }] + machine_type = each.value.machine_type + + # Our service account gets all OAuth scopes so it can access + # all APIs, but only fine grained permissions + roles are + # granted via the service account. + service_account = google_service_account.cluster_sa.email + oauth_scopes = [ + "https://www.googleapis.com/auth/cloud-platform" + ] + } +} + +resource "google_container_node_pool" "dask_worker" { + name = "dask-${each.key}" + cluster = google_container_cluster.cluster.name + project = google_container_cluster.cluster.project + location = google_container_cluster.cluster.location + + # Default to same config as notebook nodepools config + for_each = var.dask_nodes == {} ? var.dask_nodes : var.notebook_nodes + + initial_node_count = 0 + autoscaling { + min_node_count = each.value.min + max_node_count = each.value.max + } + + management { + auto_repair = true + auto_upgrade = false + } + + node_config { + + preemptible = true + # SSD Disks for dask workers make image pulls much faster + # Since we might have many dask workers spinning up at the + # same time, the extra cost of using this is probably worth it. + disk_type = "pd-ssd" + + workload_metadata_config { + // Use node concealment - https://cloud.google.com/kubernetes-engine/docs/how-to/protecting-cluster-metadata + // This exposes the cluster Google SA to the pods, so we can + // access GCS appropriately. + node_metadata = "SECURE" + } + labels = { + "k8s.dask.org/node-purpose" = "worker", + } + + taint = [{ + key = "k8s.dask.org_dedicated" + value = "worker" + effect = "NO_SCHEDULE" + }] + machine_type = each.value.machine_type + + # Our service account gets all OAuth scopes so it can access + # all APIs, but only fine grained permissions + roles are + # granted via the service account. + service_account = google_service_account.cluster_sa.email + oauth_scopes = [ + "https://www.googleapis.com/auth/cloud-platform" + ] + } +} + diff --git a/terraform/main.tf b/terraform/main.tf index a8af85916..8bc667a6a 100644 --- a/terraform/main.tf +++ b/terraform/main.tf @@ -39,193 +39,3 @@ resource "google_project_iam_member" "cluster_sa_roles" { role = each.value member = "serviceAccount:${google_service_account.cluster_sa.email}" } - -resource "google_container_cluster" "cluster" { - # config_connector_config is in beta - provider = google-beta - - name = "${var.prefix}-cluster" - location = var.zone - project = var.project_id - - initial_node_count = 1 - remove_default_node_pool = true - - addons_config { - http_load_balancing { - // FIXME: This used to not work well with websockets, and - // cost extra money as well. Let's validate if this is still - // true? - disabled = true - } - horizontal_pod_autoscaling { - // This isn't used anywhere, so let's turn this off - disabled = true - } - config_connector_config { - enabled = var.config_connector_enabled - } - } - - release_channel { - # We upgrade clusters manually so we can manage downtime of - # master *and* nodes. When a cluster is in a release channel, - # upgrades (including disruptive node upgrades) happen automatically. - # So we disable it. 
- channel = "UNSPECIFIED" - } - - network_policy { - enabled = var.enable_network_policy - } - - node_config { - # DO NOT TOUCH THIS BLOCK, IT REPLACES ENTIRE CLUSTER LOL - service_account = google_service_account.cluster_sa.email - } -} - -resource "google_container_node_pool" "core" { - name = "core-pool" - cluster = google_container_cluster.cluster.name - project = google_container_cluster.cluster.project - location = google_container_cluster.cluster.location - - - initial_node_count = 1 - autoscaling { - min_node_count = 1 - max_node_count = var.core_node_max_count - } - - management { - auto_repair = true - # Auto upgrade will drain and setup nodes without us knowing, - # and this can cause outages when it hits the proxy nodes. - auto_upgrade = false - } - - - node_config { - labels = { - "hub.jupyter.org/node-purpose" = "core", - "k8s.dask.org/node-purpose" = "core" - } - machine_type = var.core_node_machine_type - disk_size_gb = 30 - - # Our service account gets all OAuth scopes so it can access - # all APIs, but only fine grained permissions + roles are - # granted via the service account. - service_account = google_service_account.cluster_sa.email - oauth_scopes = [ - "https://www.googleapis.com/auth/cloud-platform" - ] - } -} - -resource "google_container_node_pool" "notebook" { - name = "nb-${each.key}" - cluster = google_container_cluster.cluster.name - project = google_container_cluster.cluster.project - location = google_container_cluster.cluster.location - - for_each = var.notebook_nodes - - initial_node_count = 0 - autoscaling { - min_node_count = each.value.min - max_node_count = each.value.max - } - - management { - auto_repair = true - auto_upgrade = false - } - - - node_config { - workload_metadata_config { - // Use node concealment - https://cloud.google.com/kubernetes-engine/docs/how-to/protecting-cluster-metadata - // This exposes the cluster Google SA to the pods, so we can - // access GCS appropriately. - node_metadata = "SECURE" - } - labels = { - # Notebook pods and dask schedulers can exist here - "hub.jupyter.org/node-purpose" = "user", - "k8s.dask.org/node-purpose" = "scheduler", - } - - taint = [{ - key = "hub.jupyter.org_dedicated" - value = "user" - effect = "NO_SCHEDULE" - }] - machine_type = each.value.machine_type - - # Our service account gets all OAuth scopes so it can access - # all APIs, but only fine grained permissions + roles are - # granted via the service account. - service_account = google_service_account.cluster_sa.email - oauth_scopes = [ - "https://www.googleapis.com/auth/cloud-platform" - ] - } -} - -resource "google_container_node_pool" "dask_worker" { - name = "dask-${each.key}" - cluster = google_container_cluster.cluster.name - project = google_container_cluster.cluster.project - location = google_container_cluster.cluster.location - - # Default to same config as notebook nodepools config - for_each = var.dask_nodes == {} ? var.dask_nodes : var.notebook_nodes - - initial_node_count = 0 - autoscaling { - min_node_count = each.value.min - max_node_count = each.value.max - } - - management { - auto_repair = true - auto_upgrade = false - } - - node_config { - - preemptible = true - # SSD Disks for dask workers make image pulls much faster - # Since we might have many dask workers spinning up at the - # same time, the extra cost of using this is probably worth it. 
- disk_type = "pd-ssd" - - workload_metadata_config { - // Use node concealment - https://cloud.google.com/kubernetes-engine/docs/how-to/protecting-cluster-metadata - // This exposes the cluster Google SA to the pods, so we can - // access GCS appropriately. - node_metadata = "SECURE" - } - labels = { - "k8s.dask.org/node-purpose" = "worker", - } - - taint = [{ - key = "k8s.dask.org_dedicated" - value = "worker" - effect = "NO_SCHEDULE" - }] - machine_type = each.value.machine_type - - # Our service account gets all OAuth scopes so it can access - # all APIs, but only fine grained permissions + roles are - # granted via the service account. - service_account = google_service_account.cluster_sa.email - oauth_scopes = [ - "https://www.googleapis.com/auth/cloud-platform" - ] - } -} - From 2679ae8b22eb8c984bf52b92f9d09e976eb85c1b Mon Sep 17 00:00:00 2001 From: YuviPanda Date: Fri, 4 Jun 2021 00:56:22 +0530 Subject: [PATCH 08/27] Add a data & scratch GCS bucket - Data bucket for storing per-hub data products - Scratch bucket for *per-user* data stuff --- config/hubs/meom-ige.cluster.yaml | 3 +++ terraform/buckets.tf | 18 ++++++++++++++++++ terraform/projects/meom-ige.tfvars | 5 +++++ terraform/variables.tf | 6 ++++++ 4 files changed, 32 insertions(+) create mode 100644 terraform/buckets.tf diff --git a/config/hubs/meom-ige.cluster.yaml b/config/hubs/meom-ige.cluster.yaml index f80fba552..e6deddacc 100644 --- a/config/hubs/meom-ige.cluster.yaml +++ b/config/hubs/meom-ige.cluster.yaml @@ -43,6 +43,9 @@ hubs: name: SWOT Ocean Pangeo Team url: https://meom-group.github.io/ singleuser: + extraEnv: + DATA_BUCKET: gcs://meom-ige-data + SCRATCH_BUCKET: 'gcs://meom-ige-scratch/$(JUPYTERHUB_USER)' profileList: # The mem-guarantees are here so k8s doesn't schedule other pods # on these nodes. 
They need to be just under total allocatable
             # RAM on a node, not total node capacity
diff --git a/terraform/buckets.tf b/terraform/buckets.tf
new file mode 100644
index 000000000..32904d118
--- /dev/null
+++ b/terraform/buckets.tf
@@ -0,0 +1,18 @@
+/**
+* GCS buckets for use by hub users
+*/
+
+resource "google_storage_bucket" "user_buckets" {
+  for_each = var.user_buckets
+  name     = "${var.prefix}-${each.key}"
+  location = var.region
+  project  = var.project_id
+}
+
+resource "google_storage_bucket_iam_member" "member" {
+
+  for_each = var.user_buckets
+  bucket   = google_storage_bucket.user_buckets[each.key].name
+  role     = "roles/storage.admin"
+  member   = "serviceAccount:${google_service_account.cluster_sa.email}"
+}
diff --git a/terraform/projects/meom-ige.tfvars b/terraform/projects/meom-ige.tfvars
index d75f94e9c..0477e06a4 100644
--- a/terraform/projects/meom-ige.tfvars
+++ b/terraform/projects/meom-ige.tfvars
@@ -37,3 +37,8 @@ notebook_nodes = {
     machine_type : "e2-standard-32"
   }
 }
+
+user_buckets = [
+  "scratch",
+  "data"
+]
\ No newline at end of file
diff --git a/terraform/variables.tf b/terraform/variables.tf
index 22562fcfb..3c85f6f6c 100644
--- a/terraform/variables.tf
+++ b/terraform/variables.tf
@@ -84,3 +84,9 @@ variable "enable_network_policy" {
   default     = true
   description = "Enable kubernetes network policy for access to fine-grained firewall rules"
 }
+
+variable "user_buckets" {
+  type        = set(any)
+  default     = []
+  description = "Buckets to create for the project; they will be prefixed with {var.prefix}-"
+}

From ad49f4e0603c98a00ec8bdf487f96b4b6afa1c0f Mon Sep 17 00:00:00 2001
From: YuviPanda
Date: Tue, 8 Jun 2021 16:42:27 +0530
Subject: [PATCH 09/27] Fix dask worker config defaulting to notebook node config

- Testing against {} doesn't seem to work. Checking length is a bit
  more obtuse, but works consistently.
- Fix what goes in the true / false arms of the ternary operator -
  they were swapped earlier.
---
 terraform/cluster.tf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/terraform/cluster.tf b/terraform/cluster.tf
index 782f4f33c..683d62a64 100644
--- a/terraform/cluster.tf
+++ b/terraform/cluster.tf
@@ -139,7 +139,7 @@ resource "google_container_node_pool" "dask_worker" {
   location = google_container_cluster.cluster.location
 
   # Default to same config as notebook nodepools config
-  for_each = var.dask_nodes == {} ? var.dask_nodes : var.notebook_nodes
+  for_each = length(var.dask_nodes) == 0 ? var.notebook_nodes : var.dask_nodes

From 01699d3af5e59d6e300acd79a6540b1d7850ae0b Mon Sep 17 00:00:00 2001
From: YuviPanda
Date: Tue, 8 Jun 2021 16:45:38 +0530
Subject: [PATCH 10/27] Enable workload identity when config connector is enabled

---
 terraform/cluster.tf | 31 +++++++++++++++++++++--------
 1 file changed, 23 insertions(+), 8 deletions(-)

diff --git a/terraform/cluster.tf b/terraform/cluster.tf
index 683d62a64..198fd561b 100644
--- a/terraform/cluster.tf
+++ b/terraform/cluster.tf
@@ -25,6 +25,15 @@ resource "google_container_cluster" "cluster" {
     }
   }
 
+  dynamic "workload_identity_config" {
+    # Setup workload identity only if we're using config connector, otherwise
+    # just metadata concealment is used
+    for_each = var.config_connector_enabled ? [1] : []
+    content {
+      identity_namespace = "${var.project_id}.svc.id.goog"
+    }
+  }
+
   release_channel {
     # We upgrade clusters manually so we can manage downtime of
     # master *and* nodes.
When a cluster is in a release channel, @@ -104,10 +113,13 @@ resource "google_container_node_pool" "notebook" { node_config { workload_metadata_config { - // Use node concealment - https://cloud.google.com/kubernetes-engine/docs/how-to/protecting-cluster-metadata - // This exposes the cluster Google SA to the pods, so we can - // access GCS appropriately. - node_metadata = "SECURE" + # Config Connector requires workload identity to be enabled (via GKE_METADATA_SERVER). + # If config connector is not necessary, we use simple metadata concealment + # (https://cloud.google.com/kubernetes-engine/docs/how-to/protecting-cluster-metadata) + # to expose the node CA to users safely. + # FIXME: This should be a bit more fine-grained - it should be possible to disable + # config connector and completely hide all node metadata from user pods + node_metadata = var.config_connector_enabled ? "GKE_METADATA_SERVER" : "SECURE" } labels = { # Notebook pods and dask schedulers can exist here @@ -161,10 +173,13 @@ resource "google_container_node_pool" "dask_worker" { disk_type = "pd-ssd" workload_metadata_config { - // Use node concealment - https://cloud.google.com/kubernetes-engine/docs/how-to/protecting-cluster-metadata - // This exposes the cluster Google SA to the pods, so we can - // access GCS appropriately. - node_metadata = "SECURE" + # Config Connector requires workload identity to be enabled (via GKE_METADATA_SERVER). + # If config connector is not necessary, we use simple metadata concealment + # (https://cloud.google.com/kubernetes-engine/docs/how-to/protecting-cluster-metadata) + # to expose the node CA to users safely. + # FIXME: This should be a bit more fine-grained - it should be possible to disable + # config connector and completely hide all node metadata from user pods + node_metadata = var.config_connector_enabled ? "GKE_METADATA_SERVER" : "SECURE" } labels = { "k8s.dask.org/node-purpose" = "worker", From 7e6c5d194b0c0c8f17bc1cc9b71a88ac4344110a Mon Sep 17 00:00:00 2001 From: YuviPanda Date: Wed, 9 Jun 2021 01:41:25 +0530 Subject: [PATCH 11/27] Optimize autoscaler profile for batch workloads --- terraform/cluster.tf | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/terraform/cluster.tf b/terraform/cluster.tf index 198fd561b..65ccc55c0 100644 --- a/terraform/cluster.tf +++ b/terraform/cluster.tf @@ -42,6 +42,14 @@ resource "google_container_cluster" "cluster" { channel = "UNSPECIFIED" } + cluster_autoscaling { + # This disables node autoprovisioning, not cluster autoscaling! + enabled = false + # Use a scheduler + autoscaling profile optimized for batch workloads like ours + # https://cloud.google.com/kubernetes-engine/docs/concepts/cluster-autoscaler#autoscaling_profiles + autoscaling_profile = "OPTIMIZE_UTILIZATION" + } + network_policy { enabled = var.enable_network_policy } From 02c4d6ffa1495121762a3b21fc8e31caaea70686 Mon Sep 17 00:00:00 2001 From: YuviPanda Date: Wed, 9 Jun 2021 16:05:40 +0530 Subject: [PATCH 12/27] Document why we use cloud-platform access scope --- terraform/cluster.tf | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/terraform/cluster.tf b/terraform/cluster.tf index 65ccc55c0..42d4fdd59 100644 --- a/terraform/cluster.tf +++ b/terraform/cluster.tf @@ -91,7 +91,8 @@ resource "google_container_node_pool" "core" { # Our service account gets all OAuth scopes so it can access # all APIs, but only fine grained permissions + roles are - # granted via the service account. + # granted via the service account. 
This follows Google's + # recommendation at https://cloud.google.com/compute/docs/access/service-accounts#associating_a_service_account_to_an_instance service_account = google_service_account.cluster_sa.email oauth_scopes = [ "https://www.googleapis.com/auth/cloud-platform" @@ -144,7 +145,8 @@ resource "google_container_node_pool" "notebook" { # Our service account gets all OAuth scopes so it can access # all APIs, but only fine grained permissions + roles are - # granted via the service account. + # granted via the service account. This follows Google's + # recommendation at https://cloud.google.com/compute/docs/access/service-accounts#associating_a_service_account_to_an_instance service_account = google_service_account.cluster_sa.email oauth_scopes = [ "https://www.googleapis.com/auth/cloud-platform" @@ -202,7 +204,8 @@ resource "google_container_node_pool" "dask_worker" { # Our service account gets all OAuth scopes so it can access # all APIs, but only fine grained permissions + roles are - # granted via the service account. + # granted via the service account. This follows Google's + # recommendation at https://cloud.google.com/compute/docs/access/service-accounts#associating_a_service_account_to_an_instance service_account = google_service_account.cluster_sa.email oauth_scopes = [ "https://www.googleapis.com/auth/cloud-platform" From 6088bf9b5b3b64f7ce7dc873bb17b787912c8b3f Mon Sep 17 00:00:00 2001 From: YuviPanda Date: Mon, 14 Jun 2021 18:26:59 +0530 Subject: [PATCH 13/27] Add warning about initial node count --- terraform/cluster.tf | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/terraform/cluster.tf b/terraform/cluster.tf index 42d4fdd59..fa8534f3b 100644 --- a/terraform/cluster.tf +++ b/terraform/cluster.tf @@ -108,7 +108,11 @@ resource "google_container_node_pool" "notebook" { for_each = var.notebook_nodes - initial_node_count = 0 + # WARNING: Do not change this value, it will cause the nodepool + # to be destroyed & re-created. If you want to increase number of + # nodes in a node pool, set the min count to that number and then + # scale the pool manually. + initial_node_count = each.value.min autoscaling { min_node_count = each.value.min max_node_count = each.value.max @@ -163,6 +167,10 @@ resource "google_container_node_pool" "dask_worker" { # Default to same config as notebook nodepools config for_each = length(var.dask_nodes) == 0 ? var.notebook_nodes : var.dask_nodes + # WARNING: Do not change this value, it will cause the nodepool + # to be destroyed & re-created. If you want to increase number of + # nodes in a node pool, set the min count to that number and then + # scale the pool manually. 
  initial_node_count = 0

   autoscaling {
     min_node_count = each.value.min

From e90bca82ff518ee2b008ae492bd7f28b51a16017 Mon Sep 17 00:00:00 2001
From: YuviPanda
Date: Mon, 14 Jun 2021 20:28:09 +0530
Subject: [PATCH 14/27] Autogenerate Terraform variable documentation

Calls terraform-docs dynamically from sphinx
---
 docs/conf.py                  | 100 ++++++++++++++++++------------
 docs/environment.yml          |  13 ++++
 docs/index.md                 |   1 +
 docs/requirements.txt         |   7 ---
 docs/topic/terraform/index.md |   7 +++
 terraform/.terraform-docs.yml |   9 +++
 terraform/variables.tf        | 113 +++++++++++++++++++++++++++++-----
 7 files changed, 187 insertions(+), 63 deletions(-)
 create mode 100644 docs/environment.yml
 delete mode 100644 docs/requirements.txt
 create mode 100644 docs/topic/terraform/index.md
 create mode 100644 terraform/.terraform-docs.yml

diff --git a/docs/conf.py b/docs/conf.py
index c731b4d8e..cb3cd7f07 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -62,43 +62,63 @@ from yaml import safe_load
 import pandas as pd
 from pathlib import Path
-
-# Grab the latest list of clusters defined in pilot-hubs/
-clusters = Path("../config/hubs").glob("*")
-# Add list of repos managed outside pilot-hubs
-hub_list = [{
-    'name': 'University of Toronto',
-    'domain': 'jupyter.utoronto.ca',
-    'id': 'utoronto',
-    'template': 'base-hub ([deployment repo](https://github.com/utoronto-2i2c/jupyterhub-deploy/))'
-}]
-for cluster_info in clusters:
-    if "schema" in cluster_info.name:
-        continue
-    # For each cluster, grab its YAML w/ the config for each hub
-    yaml = cluster_info.read_text()
-    cluster = safe_load(yaml)
-
-    # For each hub in cluster, grab its metadata and add it to the list
-    for hub in cluster['hubs']:
-        config = hub['config']
-        # Config is sometimes nested
-        if 'basehub' in config:
-            hub_config = config['basehub']['jupyterhub']
-        else:
-            hub_config = config['jupyterhub']
-        # Domain can be a list
-        if isinstance(hub['domain'], list):
-            hub['domain'] = hub['domain'][0]
-
-        hub_list.append({
-            'name': hub_config['custom']['homepage']['templateVars']['org']['name'],
-            'domain': f"[{hub['domain']}](https://{hub['domain']})",
-            "id": hub['name'],
-            "template": hub['template'],
-        })
-df = pd.DataFrame(hub_list)
-path_tmp = Path("tmp")
-path_tmp.mkdir(exist_ok=True)
-path_table = path_tmp / "hub-table.csv"
-df.to_csv(path_table, index=None)
\ No newline at end of file
+import subprocess
+
+def render_hubs():
+    # Grab the latest list of clusters defined in pilot-hubs/
+    clusters = Path("../config/hubs").glob("*")
+    # Add list of repos managed outside pilot-hubs
+    hub_list = [{
+        'name': 'University of Toronto',
+        'domain': 'jupyter.utoronto.ca',
+        'id': 'utoronto',
+        'template': 'base-hub ([deployment repo](https://github.com/utoronto-2i2c/jupyterhub-deploy/))'
+    }]
+    for cluster_info in clusters:
+        if "schema" in cluster_info.name:
+            continue
+        # For each cluster, grab its YAML w/ the config for each hub
+        yaml = cluster_info.read_text()
+        cluster = safe_load(yaml)
+
+        # For each hub in cluster, grab its metadata and add it to the list
+        for hub in cluster['hubs']:
+            config = hub['config']
+            # Config is sometimes nested
+            if 'basehub' in config:
+                hub_config = config['basehub']['jupyterhub']
+            else:
+                hub_config = config['jupyterhub']
+            # Domain can be a list
+            if isinstance(hub['domain'], list):
+                hub['domain'] = hub['domain'][0]
+
+            hub_list.append({
+                'name': hub_config['custom']['homepage']['templateVars']['org']['name'],
+                'domain': f"[{hub['domain']}](https://{hub['domain']})",
+                "id": hub['name'],
+                "template": hub['template'],
+            })
+ df = pd.DataFrame(hub_list) + path_tmp = Path("tmp") + path_tmp.mkdir(exist_ok=True) + path_table = path_tmp / "hub-table.csv" + df.to_csv(path_table, index=None) + + +def render_tfdocs(): + tf_path = Path('../terraform') + # Output path is relative to terraform directory + output_path = Path('../docs/topic/terraform/reference.md') + + # Template for output file is in ../terraform/.terraform-docs.yml + subprocess.check_call([ + 'terraform-docs', 'markdown', + f"--output-file={output_path}", + str(tf_path) + ]) + + + +render_hubs() +render_tfdocs() \ No newline at end of file diff --git a/docs/environment.yml b/docs/environment.yml new file mode 100644 index 000000000..3add731f1 --- /dev/null +++ b/docs/environment.yml @@ -0,0 +1,13 @@ +channels: +- conda-forge +dependencies: +- go-terraform-docs +- pip +- pip: + - myst-parser[sphinx,linkify] + - sphinx-book-theme + - sphinx-panels + - sphinx-autobuild + - pandas + - pyyaml + - requests diff --git a/docs/index.md b/docs/index.md index ec6b28e3c..2f3ba1719 100644 --- a/docs/index.md +++ b/docs/index.md @@ -42,6 +42,7 @@ Topic guides go more in-depth on a particular topic. topic/config.md topic/hub-templates.md topic/storage-layer.md +topic/terraform/index.md ``` ## Reference diff --git a/docs/requirements.txt b/docs/requirements.txt deleted file mode 100644 index 3883cb447..000000000 --- a/docs/requirements.txt +++ /dev/null @@ -1,7 +0,0 @@ -myst-parser[sphinx,linkify] -sphinx-book-theme -sphinx-panels -sphinx-autobuild -pandas -pyyaml -requests diff --git a/docs/topic/terraform/index.md b/docs/topic/terraform/index.md new file mode 100644 index 000000000..7e572d918 --- /dev/null +++ b/docs/topic/terraform/index.md @@ -0,0 +1,7 @@ +# Terraform Configuration + +```{toctree} +:maxdepth: 1 + +reference.md +``` \ No newline at end of file diff --git a/terraform/.terraform-docs.yml b/terraform/.terraform-docs.yml new file mode 100644 index 000000000..f6449a2c6 --- /dev/null +++ b/terraform/.terraform-docs.yml @@ -0,0 +1,9 @@ +output: + mode: replace + template: |- + # Reference + + + + {{ .Content }} + \ No newline at end of file diff --git a/terraform/variables.tf b/terraform/variables.tf index 3c85f6f6c..2d8704e04 100644 --- a/terraform/variables.tf +++ b/terraform/variables.tf @@ -1,11 +1,26 @@ variable "prefix" { type = string - description = "Prefix used for all objects, to prevent collisions in the project" + description = <<-EOT + Prefix for all objects created by terraform. + + Primary identifier to 'group' together resources created by + this terraform module. Prevents clashes with other resources + in the cloud project / account. + + Should not be changed after first terraform apply - doing so + will recreate all resources. + + Should not end with a '-', that is automatically added. + EOT } variable "project_id" { type = string - description = "ID of the GCP project resources should be created in" + description = <<-EOT + GCP Project ID to create resources in. + + Should be the id, rather than display name of the project. + EOT } variable "notebook_nodes" { @@ -23,7 +38,17 @@ variable "dask_nodes" { variable "config_connector_enabled" { type = bool default = false - description = "Enable config connector to manage GCP resources as kubernetes objects" + description = <<-EOT + Enable GKE Config Connector to manage GCP resources via kubernetes. + + GKE Config Connector (https://cloud.google.com/config-connector/docs/overview) + allows creating GCP resources (like buckets, VMs, etc) via creating Kubernetes + Custom Resources. 
We use this to create buckets on a per-hub level,
+    and could use it for other purposes in the future.
+
+    Enabling this increases base cost, as config connector related pods
+    need to run on the cluster.
+    EOT
 }

 variable "cluster_sa_roles" {
@@ -35,7 +60,15 @@
     "roles/stackdriver.resourceMetadata.writer",
     "roles/artifactregistry.reader"
   ]
-  description = "List of roles for the service account the nodes in the cluster run as"
+  description = <<-EOT
+    List of roles granted to the SA assumed by cluster nodes.
+
+    The defaults grant just enough access for the components on the node
+    to write metrics & logs to stackdriver, and pull images from artifact registry.
+
+    https://cloud.google.com/kubernetes-engine/docs/how-to/hardening-your-cluster
+    has more information.
+    EOT
 }

 variable "cd_sa_roles" {
@@ -44,45 +77,93 @@
     "roles/container.admin",
     "roles/artifactregistry.writer"
   ]
-  description = "List of roles for the service account used for continuous deployment"
+  description = <<-EOT
+    List of roles granted to the SA used by our CI/CD pipeline.
+
+    We want to automatically build / push images, and deploy to
+    the kubernetes cluster from CI/CD (on GitHub actions, mostly).
+    A JSON key for this will be generated (with
+    `terraform output -raw ci_deployer_key`) and stored in the
+    repo in encrypted form.
+
+    The default provides *full* access to the entire kubernetes
+    cluster! This is dangerous, but it is unclear how to tamp
+    it down.
+    EOT
 }

 variable "region" {
   type    = string
   default = "us-central1"
-  description = "GCP Region the resources should be created in"
+  description = <<-EOT
+    GCP Region the cluster & resources will be placed in.
+
+    For research clusters, this should be closest to where
+    your source data is.
+
+    This does not imply that the cluster will be a regional
+    cluster.
+    EOT
 }

 variable "zone" {
   type    = string
   default = "us-central1-b"
-  description = "GCP Zone the nodes of the cluster should be created in"
-}
+  description = <<-EOT
+    GCP Zone the cluster & nodes will be set up in.

-variable "regional_cluster" {
-  type    = string
-  default = "false"
-  description = "Set to 'true' for a HA regional master"
+    Even with a regional cluster, all the cluster nodes will
+    be on a single zone. NFS and supporting VMs will need to
+    be in this zone as well.
+    EOT
 }

 variable "core_node_machine_type" {
   type    = string
   default = "g1-small"
-  description = "Machine type for core nodes"
+  description = <<-EOT
+    Machine type to use for core nodes.
+
+    Core nodes will always be on, and count as 'base cost'
+    for a cluster. We should try to run with as few of them
+    as possible.
+
+    For single-tenant clusters, a single g1-small node seems
+    enough - if network policy and config connector are not on.
+    For others, please experiment to see what fits.
+    EOT
 }

 variable "core_node_max_count" {
   type    = number
   default = 5
-  description = "Maximum number of core nodes allowed"
-}
+  description = <<-EOT
+    Maximum number of core nodes available.
+
+    Core nodes can scale up to this many nodes if necessary.
+    They are part of the 'base cost', should be kept to a minimum.
+    This number should be small enough to prevent runaway scaling,
+    but large enough to support occasional spikes for whatever reason.
+    Minimum node count is fixed at 1.
+ EOT +} variable "enable_network_policy" { type = bool default = true - description = "Enable kubernetes network policy for access to fine-grained firewall rules" + description = <<-EOT + Enable kubernetes network policy enforcement. + + Our z2jh deploys NetworkPolicies by default - but they are + not enforced unless enforcement is turned on here. This takes + up some cluster resources, so we could turn it off in cases + where we are trying to minimize base cost. + + https://cloud.google.com/kubernetes-engine/docs/how-to/network-policy + has more information. + EOT } variable "user_buckets" { From bc4469c7c362151cd523c689bf69cc29600a33ea Mon Sep 17 00:00:00 2001 From: YuviPanda Date: Tue, 15 Jun 2021 16:50:07 +0530 Subject: [PATCH 15/27] Point RTD to our environment.yml file --- .readthedocs.yaml | 11 +++++++++++ docs/environment.yml | 1 + 2 files changed, 12 insertions(+) create mode 100644 .readthedocs.yaml diff --git a/.readthedocs.yaml b/.readthedocs.yaml new file mode 100644 index 000000000..6499d94dd --- /dev/null +++ b/.readthedocs.yaml @@ -0,0 +1,11 @@ +vesion: 2 + +conda: + environment: docs/environment.yml + +build: + image: latest + +python: + version: 3.8 + install: [] \ No newline at end of file diff --git a/docs/environment.yml b/docs/environment.yml index 3add731f1..fda88096a 100644 --- a/docs/environment.yml +++ b/docs/environment.yml @@ -3,6 +3,7 @@ channels: dependencies: - go-terraform-docs - pip +- python=3.8 - pip: - myst-parser[sphinx,linkify] - sphinx-book-theme From 4a43fd7a4e0b31c0064a20aa098cc1cc45c83c71 Mon Sep 17 00:00:00 2001 From: YuviPanda Date: Tue, 15 Jun 2021 18:52:56 +0530 Subject: [PATCH 16/27] Add terraform conventions doc --- docs/topic/terraform/conventions.md | 49 +++++++++++++++++++++++++++++ docs/topic/terraform/index.md | 1 + 2 files changed, 50 insertions(+) create mode 100644 docs/topic/terraform/conventions.md diff --git a/docs/topic/terraform/conventions.md b/docs/topic/terraform/conventions.md new file mode 100644 index 000000000..c47dbe293 --- /dev/null +++ b/docs/topic/terraform/conventions.md @@ -0,0 +1,49 @@ +# Conventions + +## Workspaces + +We use [terraform workspaces](https://www.terraform.io/docs/language/state/workspaces.html) +to maintain separate terraform states about different clusters we manage. +There should be one workspace per cluster, with the same name as the `.tfvars` +file with variable definitions for that cluster. + +Workspaces are stored centrally in the `two-eye-two-see-org` GCP project, even +when we use Terraform for projects running on AWS / Azure. You must have +access to this project before you can use terraform for our infrastructure. + +## Core node size + +In each cluster, we have a *core node pool* that is fairly static in size +and always running. It needs enough capacity to run: + +1. Kubernetes system components - network policy enforcement, config connector + components, cluster autoscaler, kube-dns, etc. + +2. Per-cluster support components - like prometheus, grafana, cert-manager, + etc. + +3. Hub core components - the hub, proxy, userscheduler, etc + +4. (Optional) Dask gatway core components - the API gateway, controller, etc. + +Since the core nodes are *always running*, they form a big chunk of the +cluster's *base cost* - the amount of money it costs each day, regardless +of current number of running users. Picking an apporpriate node size and +count here has a big effect. 
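To make the base-cost arithmetic above concrete, a quick sketch; the hourly prices are illustrative placeholders, not current GCP list prices:

```python
# Illustrative only: the hourly prices below are placeholders, not real
# GCP rates. Core nodes run 24h/day regardless of how many users are active.
CORE_NODE_HOURLY_USD = {
    "g1-small": 0.026,
    "n1-highmem-4": 0.237,
}

def daily_base_cost(machine_type: str, node_count: int) -> float:
    return CORE_NODE_HOURLY_USD[machine_type] * node_count * 24

print(f"3x g1-small:     ${daily_base_cost('g1-small', 3):.2f}/day")
print(f"1x n1-highmem-4: ${daily_base_cost('n1-highmem-4', 1):.2f}/day")
```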
+ +### On GKE + +GKE makes sizing this nodepool difficult, since `kube-system` components can take up quite +a bit of resources. Even though the kind of clusters we run will most likely +not stress components like `kube-dns` that much, there's no option to provide +them fewer resource requests. So this will be our primary limitation in +many ways. + +Adding [Config Connector](https://cloud.google.com/config-connector/docs/overview) +or enabling [Network Policy](https://cloud.google.com/kubernetes-engine/docs/how-to/network-policy) +requires more resources as well. + +With poorly structured experimentation, the current recommendation is to run +3 `g1-small` instances for a cluster without config connector or network policy, +or a single `n1-highmem-4` instance for a cluster with either of those options +turned on. This needs to be better investigated. \ No newline at end of file diff --git a/docs/topic/terraform/index.md b/docs/topic/terraform/index.md index 7e572d918..ad3d70e3c 100644 --- a/docs/topic/terraform/index.md +++ b/docs/topic/terraform/index.md @@ -3,5 +3,6 @@ ```{toctree} :maxdepth: 1 +conventions.md reference.md ``` \ No newline at end of file From 16b66ffdcbd3834b76b7d01e99812d14cd3e7bc3 Mon Sep 17 00:00:00 2001 From: YuviPanda Date: Wed, 16 Jun 2021 16:24:49 +0530 Subject: [PATCH 17/27] Create GKE-specific cluster design docs - Remove terraform-specific dir, not enough docs there. Autogenerated reference docs will be moved to a different PR as I can't seem to get conda to work --- docs/index.md | 3 +- docs/topic/cluster-design.md | 75 +++++++++++++++++++++++++++++ docs/topic/terraform.md | 16 ++++++ docs/topic/terraform/conventions.md | 49 ------------------- docs/topic/terraform/index.md | 8 --- 5 files changed, 93 insertions(+), 58 deletions(-) create mode 100644 docs/topic/cluster-design.md create mode 100644 docs/topic/terraform.md delete mode 100644 docs/topic/terraform/conventions.md delete mode 100644 docs/topic/terraform/index.md diff --git a/docs/index.md b/docs/index.md index 2f3ba1719..75e6818b6 100644 --- a/docs/index.md +++ b/docs/index.md @@ -42,7 +42,8 @@ Topic guides go more in-depth on a particular topic. topic/config.md topic/hub-templates.md topic/storage-layer.md -topic/terraform/index.md +topic/terraform.md +topic/cluster-design.md ``` ## Reference diff --git a/docs/topic/cluster-design.md b/docs/topic/cluster-design.md new file mode 100644 index 000000000..d17ece45b --- /dev/null +++ b/docs/topic/cluster-design.md @@ -0,0 +1,75 @@ +# Cluster design considerations + +## GKE + +## Core node size + +In each cluster, we have a *core node pool* that is fairly static in size +and always running. It needs enough capacity to run: + +1. Kubernetes system components - network policy enforcement, config connector + components, cluster autoscaler, kube-dns, etc. + +2. Per-cluster support components - like prometheus, grafana, cert-manager, + etc. + +3. Hub core components - the hub, proxy, userscheduler, etc + +4. (Optional) Dask gatway core components - the API gateway, controller, etc. + +Since the core nodes are *always running*, they form a big chunk of the +cluster's *base cost* - the amount of money it costs each day, regardless +of current number of running users. Picking an apporpriate node size and +count here has a big effect. + +### On GKE + +GKE makes sizing this nodepool difficult, since `kube-system` components can take up quite +a bit of resources. 
Even though the kind of clusters we run will most likely +not stress components like `kube-dns` that much, there's no option to provide +them fewer resource requests. So this will be our primary limitation in +many ways. + +Adding [Config Connector](https://cloud.google.com/config-connector/docs/overview) +or enabling [Network Policy](https://cloud.google.com/kubernetes-engine/docs/how-to/network-policy) +requires more resources as well. + +With poorly structured experimentation, the current recommendation is to run +3 `g1-small` instances for a cluster without config connector or network policy, +or a single `n1-highmem-4` instance for a cluster with either of those options +turned on. This needs to be better investigated. + +## Network Policy + +When hubs belonging to multiple organizations are run on the same cluster, +we **must** enable [NetworkPolicy enforcement](https://cloud.google.com/kubernetes-engine/docs/how-to/network-policy) +to isolate them from each other. + +## Cloud access credentials for hub users + +For hub users to access cloud resources (like storage buckets), they will need +to be authorized via a [GCP ServiceAccount](https://cloud.google.com/iam/docs/service-accounts). +This is different from a [Kubernetes ServiceAccount](https://kubernetes.io/docs/tasks/configure-pod-container/configure-service-account/), +which is used to authenticate and authorize accss to kubernetes resources (like spawning pods). + +For dask hubs, we want to provide users with write access to at least one storage +bucket they can use for temporary data storage. User pods need to be given access to +a GCP ServiceAccount that has write permissions to this bucket. There are two ways +to do this: + +1. Provide appropriate permissions to the GCP ServiceAccount used by the node the user + pods are running on. When used with [Metadata Concealment](https://cloud.google.com/kubernetes-engine/docs/how-to/protecting-cluster-metadata#overview), + user pods can read / write from storage buckets. However, this grants the same permissions + to *all* pods on the cluster, and hence is unsuitable for clusters with multiple + hubs running for different organizations. + +2. Use the [GKE Cloud Config Connector](https://cloud.google.com/config-connector/docs/overview) to + create a GCP ServiceAccount + Storage Bucket for each hub via helm. This requires using + [Workload Identity](https://cloud.google.com/kubernetes-engine/docs/how-to/workload-identity) and + is incompatible with (1). This is required for multi-tenant clusters, since users on a hub + have much tighter scoped permissions. + +Long-term, (2) is the appropriate way to do this for everyone. However, it affects the size +of the core node pool, since it runs some components in the cluster. For now, we use (1) for +single-tenant clusters, and (2) for multi-tenant clusters. If nobody wants a scratch GCS bucket, +neither option is required. \ No newline at end of file diff --git a/docs/topic/terraform.md b/docs/topic/terraform.md new file mode 100644 index 000000000..dd35ef962 --- /dev/null +++ b/docs/topic/terraform.md @@ -0,0 +1,16 @@ +# Terraform + +[Terraform](https://www.terraform.io/) is used to manage our infrastructure +on Google Cloud Platform. The source files are under `terraform/` in this repo, +and variables defining each cluster we manage are under `terraform/projects`. 
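The per-cluster workflow this layout implies might look like the following sketch; the cluster name is an example, any flags beyond `-var-file` are omitted, and it assumes it runs from the repository root:

```python
# Sketch: select the workspace named after a cluster, then apply with the
# matching .tfvars from terraform/projects/.
import subprocess

def terraform_apply(cluster: str) -> None:
    subprocess.check_call(
        ["terraform", "workspace", "select", cluster], cwd="terraform"
    )
    subprocess.check_call(
        ["terraform", "apply", f"-var-file=projects/{cluster}.tfvars"],
        cwd="terraform",
    )

# e.g. terraform_apply("pilot-hubs")
```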
+ +## Workspaces + +We use [terraform workspaces](https://www.terraform.io/docs/language/state/workspaces.html) +to maintain separate terraform states about different clusters we manage. +There should be one workspace per cluster, with the same name as the `.tfvars` +file with variable definitions for that cluster. + +Workspaces are stored centrally in the `two-eye-two-see-org` GCP project, even +when we use Terraform for projects running on AWS / Azure. You must have +access to this project before you can use terraform for our infrastructure. diff --git a/docs/topic/terraform/conventions.md b/docs/topic/terraform/conventions.md deleted file mode 100644 index c47dbe293..000000000 --- a/docs/topic/terraform/conventions.md +++ /dev/null @@ -1,49 +0,0 @@ -# Conventions - -## Workspaces - -We use [terraform workspaces](https://www.terraform.io/docs/language/state/workspaces.html) -to maintain separate terraform states about different clusters we manage. -There should be one workspace per cluster, with the same name as the `.tfvars` -file with variable definitions for that cluster. - -Workspaces are stored centrally in the `two-eye-two-see-org` GCP project, even -when we use Terraform for projects running on AWS / Azure. You must have -access to this project before you can use terraform for our infrastructure. - -## Core node size - -In each cluster, we have a *core node pool* that is fairly static in size -and always running. It needs enough capacity to run: - -1. Kubernetes system components - network policy enforcement, config connector - components, cluster autoscaler, kube-dns, etc. - -2. Per-cluster support components - like prometheus, grafana, cert-manager, - etc. - -3. Hub core components - the hub, proxy, userscheduler, etc - -4. (Optional) Dask gatway core components - the API gateway, controller, etc. - -Since the core nodes are *always running*, they form a big chunk of the -cluster's *base cost* - the amount of money it costs each day, regardless -of current number of running users. Picking an apporpriate node size and -count here has a big effect. - -### On GKE - -GKE makes sizing this nodepool difficult, since `kube-system` components can take up quite -a bit of resources. Even though the kind of clusters we run will most likely -not stress components like `kube-dns` that much, there's no option to provide -them fewer resource requests. So this will be our primary limitation in -many ways. - -Adding [Config Connector](https://cloud.google.com/config-connector/docs/overview) -or enabling [Network Policy](https://cloud.google.com/kubernetes-engine/docs/how-to/network-policy) -requires more resources as well. - -With poorly structured experimentation, the current recommendation is to run -3 `g1-small` instances for a cluster without config connector or network policy, -or a single `n1-highmem-4` instance for a cluster with either of those options -turned on. This needs to be better investigated. 
\ No newline at end of file diff --git a/docs/topic/terraform/index.md b/docs/topic/terraform/index.md deleted file mode 100644 index ad3d70e3c..000000000 --- a/docs/topic/terraform/index.md +++ /dev/null @@ -1,8 +0,0 @@ -# Terraform Configuration - -```{toctree} -:maxdepth: 1 - -conventions.md -reference.md -``` \ No newline at end of file From 9ac2c050bd3abe536c7beec101f5e3d4053a59a3 Mon Sep 17 00:00:00 2001 From: YuviPanda Date: Wed, 16 Jun 2021 16:41:46 +0530 Subject: [PATCH 18/27] Add comments on config connector / netpol settings --- terraform/projects/cloudbank.tfvars | 4 ++++ terraform/projects/meom-ige.tfvars | 4 ++++ terraform/projects/pilot-hubs.tfvars | 3 +++ 3 files changed, 11 insertions(+) diff --git a/terraform/projects/cloudbank.tfvars b/terraform/projects/cloudbank.tfvars index 723529467..123d8e54a 100644 --- a/terraform/projects/cloudbank.tfvars +++ b/terraform/projects/cloudbank.tfvars @@ -3,7 +3,11 @@ project_id = "cb-1003-1696" core_node_machine_type = "n1-highmem-4" +# Multi-tenant cluster, network policy is required to enforce separation between hubs enable_network_policy = true + +# No plans to provide storage buckets to users on this hub, so no need to deploy +# config connector config_connector_enabled = false notebook_nodes = { diff --git a/terraform/projects/meom-ige.tfvars b/terraform/projects/meom-ige.tfvars index 0477e06a4..5d127ac3d 100644 --- a/terraform/projects/meom-ige.tfvars +++ b/terraform/projects/meom-ige.tfvars @@ -12,7 +12,11 @@ project_id = "meom-ige-cnrs" # the way to go core_node_machine_type = "g1-small" +# Single-tenant cluster, network policy not needed enable_network_policy = false + +# Single tenant cluster, so bucket access is provided via +# metadata concealment + node SA. Config Connector not needed. 
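Taken together, the comments added across these `.tfvars` files encode a simple decision rule. The hypothetical helper below summarizes it; the dict keys match the terraform variables, but the function itself is not part of this repo:

```python
# Hypothetical summary of the per-cluster decision rule described in the
# tfvars comments; not code that exists in this repo.
def cluster_flags(multi_tenant: bool, per_hub_buckets: bool) -> dict:
    return {
        # Hubs from different orgs must be isolated from each other
        "enable_network_policy": multi_tenant,
        # Per-hub buckets need Config Connector (and workload identity);
        # otherwise metadata concealment + the node SA is enough
        "config_connector_enabled": per_hub_buckets,
    }

print(cluster_flags(multi_tenant=False, per_hub_buckets=False))  # e.g. meom-ige
print(cluster_flags(multi_tenant=True, per_hub_buckets=False))   # e.g. cloudbank
print(cluster_flags(multi_tenant=True, per_hub_buckets=True))    # e.g. pilot-hubs
```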
config_connector_enabled = false notebook_nodes = { diff --git a/terraform/projects/pilot-hubs.tfvars b/terraform/projects/pilot-hubs.tfvars index 2ed7f3a47..4d566b6b8 100644 --- a/terraform/projects/pilot-hubs.tfvars +++ b/terraform/projects/pilot-hubs.tfvars @@ -3,7 +3,10 @@ project_id = "two-eye-two-see" core_node_machine_type = "n1-highmem-4" +# Multi-tenant cluster, network policy is required to enforce separation between hubs enable_network_policy = true + +# Some hubs want a storage bucket, so we need to have config connector enabled config_connector_enabled = true notebook_nodes = { From c77ef3e671d3cf1795621d83ea1fa98939f29e47 Mon Sep 17 00:00:00 2001 From: YuviPanda Date: Wed, 16 Jun 2021 16:44:28 +0530 Subject: [PATCH 19/27] Don't try to auto-built tf reference docs Can't seem to get conda to work on RTD for now --- .readthedocs.yaml | 6 ++---- docs/conf.py | 18 +----------------- docs/{environment.yml => requirements.txt} | 0 3 files changed, 3 insertions(+), 21 deletions(-) rename docs/{environment.yml => requirements.txt} (100%) diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 6499d94dd..5f5650890 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -1,11 +1,9 @@ vesion: 2 -conda: - environment: docs/environment.yml - build: image: latest python: version: 3.8 - install: [] \ No newline at end of file + install: + requirements: docs/requirements.txt \ No newline at end of file diff --git a/docs/conf.py b/docs/conf.py index cb3cd7f07..dbfe69161 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -105,20 +105,4 @@ def render_hubs(): path_table = path_tmp / "hub-table.csv" df.to_csv(path_table, index=None) - -def render_tfdocs(): - tf_path = Path('../terraform') - # Output path is relative to terraform directory - output_path = Path('../docs/topic/terraform/reference.md') - - # Template for output file is in ../terraform/.terraform-docs.yml - subprocess.check_call([ - 'terraform-docs', 'markdown', - f"--output-file={output_path}", - str(tf_path) - ]) - - - -render_hubs() -render_tfdocs() \ No newline at end of file +render_hubs() \ No newline at end of file diff --git a/docs/environment.yml b/docs/requirements.txt similarity index 100% rename from docs/environment.yml rename to docs/requirements.txt From ebdb697d9a944fd71f6447a5db16f9722ee86663 Mon Sep 17 00:00:00 2001 From: YuviPanda Date: Wed, 16 Jun 2021 16:46:59 +0530 Subject: [PATCH 20/27] Fix project SA description --- terraform/main.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/terraform/main.tf b/terraform/main.tf index 8bc667a6a..b134f1fe7 100644 --- a/terraform/main.tf +++ b/terraform/main.tf @@ -22,7 +22,7 @@ resource "google_project_iam_custom_role" "identify_project_role" { role_id = replace("${var.prefix}_user_sa_role", "-", "_") project = var.project_id title = "Identify as project role for users in ${var.prefix}" - description = "A description" + description = "Minimal role for hub users on ${var.prefix} to identify as current project" permissions = ["serviceusage.services.use"] } From 70b39e80243438945943abe5acceec2ac761188c Mon Sep 17 00:00:00 2001 From: YuviPanda Date: Wed, 16 Jun 2021 16:49:44 +0530 Subject: [PATCH 21/27] Fix requirements.txt syntax Leftover from environment.yml conversion --- docs/requirements.txt | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/docs/requirements.txt b/docs/requirements.txt index fda88096a..3883cb447 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,14 +1,7 @@ -channels: -- conda-forge 
-dependencies: -- go-terraform-docs -- pip -- python=3.8 -- pip: - - myst-parser[sphinx,linkify] - - sphinx-book-theme - - sphinx-panels - - sphinx-autobuild - - pandas - - pyyaml - - requests +myst-parser[sphinx,linkify] +sphinx-book-theme +sphinx-panels +sphinx-autobuild +pandas +pyyaml +requests From 9248021e20d0db621a94702e74aee75dddf6f880 Mon Sep 17 00:00:00 2001 From: YuviPanda Date: Fri, 18 Jun 2021 20:55:55 +0530 Subject: [PATCH 22/27] Add bigger instances for meom-ige --- config/hubs/meom-ige.cluster.yaml | 7 +++++++ docs/topic/terraform.md | 2 +- terraform/projects/meom-ige.tfvars | 9 ++++++++- 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/config/hubs/meom-ige.cluster.yaml b/config/hubs/meom-ige.cluster.yaml index e6deddacc..cc2af84a7 100644 --- a/config/hubs/meom-ige.cluster.yaml +++ b/config/hubs/meom-ige.cluster.yaml @@ -78,6 +78,13 @@ hubs: mem_guarantee: 115G node_selector: node.kubernetes.io/instance-type: e2-standard-32 + - display_name: "Huge" + description: "~64 CPU, ~256G RAM" + kubespawner_override: + mem_limit: 256G + mem_guarantee: 230G + node_selector: + node.kubernetes.io/instance-type: n2-standard-64 defaultUrl: /lab image: name: pangeo/pangeo-notebook diff --git a/docs/topic/terraform.md b/docs/topic/terraform.md index dd35ef962..3ff13cbc5 100644 --- a/docs/topic/terraform.md +++ b/docs/topic/terraform.md @@ -13,4 +13,4 @@ file with variable definitions for that cluster. Workspaces are stored centrally in the `two-eye-two-see-org` GCP project, even when we use Terraform for projects running on AWS / Azure. You must have -access to this project before you can use terraform for our infrastructure. +access to this project before you can use terraform for our infrastructure. \ No newline at end of file diff --git a/terraform/projects/meom-ige.tfvars b/terraform/projects/meom-ige.tfvars index 5d127ac3d..c95fdd06f 100644 --- a/terraform/projects/meom-ige.tfvars +++ b/terraform/projects/meom-ige.tfvars @@ -39,7 +39,14 @@ notebook_nodes = { min : 0, max : 20, machine_type : "e2-standard-32" - } + }, + "huge" : { + min : 0, + max : 20, + # e2 instances only go upto 32 cores + machine_type : "n2-standard-64" + }, + } user_buckets = [ From eaa8b7165d1f296990d9c80eae9703521d46393e Mon Sep 17 00:00:00 2001 From: Yuvi Panda Date: Wed, 23 Jun 2021 22:04:51 +0530 Subject: [PATCH 23/27] Fix typo Co-authored-by: Sarah Gibson <44771837+sgibson91@users.noreply.github.com> --- docs/topic/cluster-design.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/topic/cluster-design.md b/docs/topic/cluster-design.md index d17ece45b..d9e9a5a83 100644 --- a/docs/topic/cluster-design.md +++ b/docs/topic/cluster-design.md @@ -50,7 +50,7 @@ to isolate them from each other. For hub users to access cloud resources (like storage buckets), they will need to be authorized via a [GCP ServiceAccount](https://cloud.google.com/iam/docs/service-accounts). This is different from a [Kubernetes ServiceAccount](https://kubernetes.io/docs/tasks/configure-pod-container/configure-service-account/), -which is used to authenticate and authorize accss to kubernetes resources (like spawning pods). +which is used to authenticate and authorize access to kubernetes resources (like spawning pods). For dask hubs, we want to provide users with write access to at least one storage bucket they can use for temporary data storage. User pods need to be given access to @@ -72,4 +72,4 @@ to do this: Long-term, (2) is the appropriate way to do this for everyone. 
However, it affects the size of the core node pool, since it runs some components in the cluster. For now, we use (1) for single-tenant clusters, and (2) for multi-tenant clusters. If nobody wants a scratch GCS bucket, -neither option is required. \ No newline at end of file +neither option is required. From c000c242492869e631dd832989993f3d1384d015 Mon Sep 17 00:00:00 2001 From: Yuvi Panda Date: Wed, 23 Jun 2021 22:05:11 +0530 Subject: [PATCH 24/27] Fix typo Co-authored-by: Sarah Gibson <44771837+sgibson91@users.noreply.github.com> --- .readthedocs.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 5f5650890..1fb83398f 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -1,4 +1,4 @@ -vesion: 2 +version: 2 build: image: latest @@ -6,4 +6,4 @@ build: python: version: 3.8 install: - requirements: docs/requirements.txt \ No newline at end of file + requirements: docs/requirements.txt From f04bc51da45b38e3b64c350f37f64f39ebc9352e Mon Sep 17 00:00:00 2001 From: YuviPanda Date: Thu, 24 Jun 2021 12:02:26 +0530 Subject: [PATCH 25/27] Revert "Don't try to auto-built tf reference docs" This wasn't working earlier because of a typo in .readthedocs.yml that I didn't spot This reverts commit c77ef3e671d3cf1795621d83ea1fa98939f29e47. --- .readthedocs.yaml | 6 ++++-- docs/conf.py | 18 +++++++++++++++++- docs/{requirements.txt => environment.yml} | 0 3 files changed, 21 insertions(+), 3 deletions(-) rename docs/{requirements.txt => environment.yml} (100%) diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 1fb83398f..e7cf22285 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -1,9 +1,11 @@ version: 2 +conda: + environment: docs/environment.yml + build: image: latest python: version: 3.8 - install: - requirements: docs/requirements.txt + install: [] diff --git a/docs/conf.py b/docs/conf.py index dbfe69161..cb3cd7f07 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -105,4 +105,20 @@ def render_hubs(): path_table = path_tmp / "hub-table.csv" df.to_csv(path_table, index=None) -render_hubs() \ No newline at end of file + +def render_tfdocs(): + tf_path = Path('../terraform') + # Output path is relative to terraform directory + output_path = Path('../docs/topic/terraform/reference.md') + + # Template for output file is in ../terraform/.terraform-docs.yml + subprocess.check_call([ + 'terraform-docs', 'markdown', + f"--output-file={output_path}", + str(tf_path) + ]) + + + +render_hubs() +render_tfdocs() \ No newline at end of file diff --git a/docs/requirements.txt b/docs/environment.yml similarity index 100% rename from docs/requirements.txt rename to docs/environment.yml From e7242a944755eae38f4885aef6b2377f29bfb2ac Mon Sep 17 00:00:00 2001 From: YuviPanda Date: Thu, 24 Jun 2021 12:06:35 +0530 Subject: [PATCH 26/27] Revert "Fix requirements.txt syntax" Since conda seems to now work with RTD This reverts commit 70b39e80243438945943abe5acceec2ac761188c. 
--- docs/environment.yml | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/docs/environment.yml b/docs/environment.yml index 3883cb447..fda88096a 100644 --- a/docs/environment.yml +++ b/docs/environment.yml @@ -1,7 +1,14 @@ -myst-parser[sphinx,linkify] -sphinx-book-theme -sphinx-panels -sphinx-autobuild -pandas -pyyaml -requests +channels: +- conda-forge +dependencies: +- go-terraform-docs +- pip +- python=3.8 +- pip: + - myst-parser[sphinx,linkify] + - sphinx-book-theme + - sphinx-panels + - sphinx-autobuild + - pandas + - pyyaml + - requests From f8cf3f734117777f0fadd9ead2ff8526b9a50c1a Mon Sep 17 00:00:00 2001 From: YuviPanda Date: Thu, 24 Jun 2021 15:21:49 +0530 Subject: [PATCH 27/27] Don't autorender tfdocs --output requires a newer version of terraform-docs on conda-forge (https://github.com/conda-forge/go-terraform-docs-feedstock/pull/25). --- docs/conf.py | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index cb3cd7f07..2f3d86dc1 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -106,19 +106,4 @@ def render_hubs(): df.to_csv(path_table, index=None) -def render_tfdocs(): - tf_path = Path('../terraform') - # Output path is relative to terraform directory - output_path = Path('../docs/topic/terraform/reference.md') - - # Template for output file is in ../terraform/.terraform-docs.yml - subprocess.check_call([ - 'terraform-docs', 'markdown', - f"--output-file={output_path}", - str(tf_path) - ]) - - - -render_hubs() -render_tfdocs() \ No newline at end of file +render_hubs() \ No newline at end of file
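
A possible guard for the problem this final patch works around is to probe the installed `terraform-docs` before calling it; the version cutoff below is an assumption, not a pinned upstream requirement:

```python
# Sketch: only regenerate terraform reference docs when terraform-docs is
# installed and recent enough to support --output-file. The v0.12 cutoff
# here is an assumption.
import re
import shutil
import subprocess

def tfdocs_supports_output_file(minimum=(0, 12)) -> bool:
    if shutil.which("terraform-docs") is None:
        return False
    out = subprocess.check_output(["terraform-docs", "--version"], text=True)
    match = re.search(r"v?(\d+)\.(\d+)", out)
    if not match:
        return False
    return (int(match.group(1)), int(match.group(2))) >= minimum

# In conf.py, one could then gate a re-added render_tfdocs() call:
# if tfdocs_supports_output_file():
#     render_tfdocs()
```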