diff --git a/.readthedocs.yaml b/.readthedocs.yaml new file mode 100644 index 0000000000..e7cf222854 --- /dev/null +++ b/.readthedocs.yaml @@ -0,0 +1,11 @@ +version: 2 + +conda: + environment: docs/environment.yml + +build: + image: latest + +python: + version: 3.8 + install: [] diff --git a/config/hubs/2i2c.cluster.yaml b/config/hubs/2i2c.cluster.yaml index 26c7fbc917..234f5cc306 100644 --- a/config/hubs/2i2c.cluster.yaml +++ b/config/hubs/2i2c.cluster.yaml @@ -6,6 +6,16 @@ gcp: project: two-eye-two-see cluster: pilot-hubs-cluster zone: us-central1-b +support: + config: + grafana: + ingress: + hosts: + - grafana.pilot.2i2c.cloud + tls: + - secretName: grafana-tls + hosts: + - grafana.pilot.2i2c.cloud hubs: - name: staging domain: staging.pilot.2i2c.cloud diff --git a/config/hubs/meom-ige.cluster.yaml b/config/hubs/meom-ige.cluster.yaml new file mode 100644 index 0000000000..cc2af84a7f --- /dev/null +++ b/config/hubs/meom-ige.cluster.yaml @@ -0,0 +1,144 @@ +name: meom-ige +provider: gcp +gcp: + key: secrets/meom.json + project: meom-ige-cnrs + cluster: meom-ige-cluster + zone: us-central1-b +hubs: + - name: staging + domain: staging.meom-ige.2i2c.cloud + template: daskhub + auth0: + connection: github + config: &meomConfig + basehub: + nfsPVC: + nfs: + # from https://docs.aws.amazon.com/efs/latest/ug/mounting-fs-nfs-mount-settings.html + mountOptions: + - rsize=1048576 + - wsize=1048576 + - timeo=600 + - soft # We pick soft over hard, so NFS lockups don't lead to hung processes + - retrans=2 + - noresvport + serverIP: nfs-server-01 + baseShareName: /export/home-01/homes/ + jupyterhub: + custom: + homepage: + templateVars: + org: + name: "SWOT Ocean Pangeo Team" + logo_url: https://2i2c.org/media/logo.png + url: https://meom-group.github.io/ + designed_by: + name: 2i2c + url: https://2i2c.org + operated_by: + name: 2i2c + url: https://2i2c.org + funded_by: + name: SWOT Ocean Pangeo Team + url: https://meom-group.github.io/ + singleuser: + extraEnv: + DATA_BUCKET: gcs://meom-ige-data + SCRATCH_BUCKET: 'gcs://meom-ige-scratch/$(JUPYTERHUB_USER)' + profileList: + # The mem-guarantees are here so k8s doesn't schedule other pods + # on these nodes. They need to be just under total allocatable + # RAM on a node, not total node capacity + - display_name: "Small" + description: "~2 CPU, ~8G RAM" + kubespawner_override: + mem_limit: 8G + mem_guarantee: 5.5G + node_selector: + node.kubernetes.io/instance-type: e2-standard-2 + - display_name: "Medium" + description: "~8 CPU, ~32G RAM" + kubespawner_override: + mem_limit: 32G + mem_guarantee: 25G + node_selector: + node.kubernetes.io/instance-type: e2-standard-8 + - display_name: "Large" + description: "~16 CPU, ~64G RAM" + kubespawner_override: + mem_limit: 64G + mem_guarantee: 55G + node_selector: + node.kubernetes.io/instance-type: e2-standard-16 + - display_name: "Very Large" + description: "~32 CPU, ~128G RAM" + kubespawner_override: + mem_limit: 128G + mem_guarantee: 115G + node_selector: + node.kubernetes.io/instance-type: e2-standard-32 + - display_name: "Huge" + description: "~64 CPU, ~256G RAM" + kubespawner_override: + mem_limit: 256G + mem_guarantee: 230G + node_selector: + node.kubernetes.io/instance-type: n2-standard-64 + defaultUrl: /lab + image: + name: pangeo/pangeo-notebook + tag: 2021.02.19 + scheduling: + userPlaceholder: + enabled: false + replicas: 0 + userScheduler: + enabled: false + proxy: + service: + type: LoadBalancer + https: + enabled: true + chp: + resources: + requests: + # FIXME: We want no guarantees here!!! 
+ # This is lowest possible value + cpu: 0.01 + memory: 1Mi + hub: + resources: + requests: + # FIXME: We want no guarantees here!!! + # This is lowest possible value + cpu: 0.01 + memory: 1Mi + config: + Authenticator: + allowed_users: &users + - roxyboy + - lesommer + - auraoupa + - yuvipanda + - choldgraf + - GeorgianaElena + admin_users: *users + + allowNamedServers: true + networkPolicy: + # FIXME: For dask gateway + enabled: false + readinessProbe: + enabled: false + dask-gateway: + extraConfig: + idle: | + # timeout after 30 minutes of inactivity + c.KubeClusterConfig.idle_timeout = 1800 + - name: prod + domain: meom-ige.2i2c.cloud + template: daskhub + auth0: + connection: github + config: *meomConfig diff --git a/deployer/__main__.py b/deployer/__main__.py index 70f2fabeb0..8dab3b3957 100644 --- a/deployer/__main__.py +++ b/deployer/__main__.py @@ -29,6 +29,23 @@ def build(cluster_name): cluster.build_image() +def deploy_support(cluster_name): + """ + Deploy support components to a cluster + """ + + # Validate our config with JSON Schema first before continuing + validate(cluster_name) + + + config_file_path = Path(os.getcwd()) / "config/hubs" / f'{cluster_name}.cluster.yaml' + with open(config_file_path) as f: + cluster = Cluster(yaml.load(f)) + + if cluster.support: + with cluster.auth(): + cluster.deploy_support() + def deploy(cluster_name, hub_name, skip_hub_health_test, config_path): """ Deploy one or more hubs in a given cluster @@ -97,6 +114,7 @@ def main(): build_parser = subparsers.add_parser("build") deploy_parser = subparsers.add_parser("deploy") validate_parser = subparsers.add_parser("validate") + deploy_support_parser = subparsers.add_parser("deploy-support") build_parser.add_argument("cluster_name") @@ -107,6 +125,8 @@ def main(): validate_parser.add_argument("cluster_name") + deploy_support_parser.add_argument("cluster_name") + args = argparser.parse_args() if args.action == "build": @@ -115,6 +135,8 @@ def main(): deploy(args.cluster_name, args.hub_name, args.skip_hub_health_test, args.config_path) elif args.action == 'validate': validate(args.cluster_name) + elif args.action == 'deploy-support': + deploy_support(args.cluster_name) else: # Print help message and exit when no arguments are passed # FIXME: Is there a better way to do this? 
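For context, here is a minimal, hypothetical sketch (not part of this PR) of how the new `deploy-support` subcommand might be exercised the way CI would, by shelling out to the deployer from the repository root. The cluster name `2i2c` is just an example; any `config/hubs/<name>.cluster.yaml` works, and the exact invocation path is an assumption.

```python
# Hypothetical invocation sketch for the deploy-support subcommand added above.
# Assumes it is run from the repository root, with helm and cloud credentials
# already configured, mirroring what a CI job would do.
import subprocess

def deploy_cluster_support(cluster_name: str) -> None:
    # Roughly equivalent to: python3 deployer deploy-support <cluster_name>
    # which validates config/hubs/<cluster_name>.cluster.yaml and, if a
    # `support:` block is present, installs cert-manager and the support chart.
    subprocess.check_call(["python3", "deployer", "deploy-support", cluster_name])

if __name__ == "__main__":
    deploy_cluster_support("2i2c")  # example cluster name from config/hubs/
```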
diff --git a/deployer/hub.py b/deployer/hub.py index 5c020d1b10..b418839813 100644 --- a/deployer/hub.py +++ b/deployer/hub.py @@ -29,6 +29,7 @@ def __init__(self, spec): Hub(self, hub_yaml) for hub_yaml in self.spec['hubs'] ] + self.support = self.spec.get('support', {}) def build_image(self): self.ensure_docker_credhelpers() @@ -77,6 +78,32 @@ def ensure_docker_credhelpers(self): with open(dockercfg_path, 'w') as f: json.dump(config, f, indent=4) + def deploy_support(self): + cert_manager_version = 'v1.3.1' + + print("Provisioning cert-manager...") + subprocess.check_call([ + 'helm', 'upgrade', '--install', '--create-namespace', + '--namespace', 'cert-manager', + 'cert-manager', 'jetstack/cert-manager', + '--version', cert_manager_version, + '--set', 'installCRDs=true' + ]) + print("Done!") + + print("Support charts...") + + with tempfile.NamedTemporaryFile(mode='w') as f: + yaml.dump(self.support.get('config', {}), f) + f.flush() + subprocess.check_call([ + 'helm', 'upgrade', '--install', '--create-namespace', + '--namespace', 'support', + 'support', 'support', + '-f', f.name, + '--wait' + ]) + print("Done!") def auth_kubeconfig(self): """ diff --git a/docs/conf.py b/docs/conf.py index c731b4d8e3..2f3d86dc18 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -62,43 +62,48 @@ from yaml import safe_load import pandas as pd from pathlib import Path - -# Grab the latest list of clusters defined in pilot-hubs/ -clusters = Path("../config/hubs").glob("*") -# Add list of repos managed outside pilot-hubs -hub_list = [{ - 'name': 'University of Toronto', - 'domain': 'jupyter.utoronto.ca', - 'id': 'utoronto', - 'template': 'base-hub ([deployment repo](https://github.com/utoronto-2i2c/jupyterhub-deploy/))' -}] -for cluster_info in clusters: - if "schema" in cluster_info.name: - continue - # For each cluster, grab it's YAML w/ the config for each hub - yaml = cluster_info.read_text() - cluster = safe_load(yaml) - - # For each hub in cluster, grab its metadata and add it to the list - for hub in cluster['hubs']: - config = hub['config'] - # Config is sometimes nested - if 'basehub' in config: - hub_config = config['basehub']['jupyterhub'] - else: - hub_config = config['jupyterhub'] - # Domain can be a list - if isinstance(hub['domain'], list): - hub['domain'] = hub['domain'][0] - - hub_list.append({ - 'name': hub_config['custom']['homepage']['templateVars']['org']['name'], - 'domain': f"[{hub['domain']}](https://{hub['domain']})", - "id": hub['name'], - "template": hub['template'], - }) -df = pd.DataFrame(hub_list) -path_tmp = Path("tmp") -path_tmp.mkdir(exist_ok=True) -path_table = path_tmp / "hub-table.csv" -df.to_csv(path_table, index=None) \ No newline at end of file +import subprocess + +def render_hubs(): + # Grab the latest list of clusters defined in pilot-hubs/ + clusters = Path("../config/hubs").glob("*") + # Add list of repos managed outside pilot-hubs + hub_list = [{ + 'name': 'University of Toronto', + 'domain': 'jupyter.utoronto.ca', + 'id': 'utoronto', + 'template': 'base-hub ([deployment repo](https://github.com/utoronto-2i2c/jupyterhub-deploy/))' + }] + for cluster_info in clusters: + if "schema" in cluster_info.name: + continue + # For each cluster, grab it's YAML w/ the config for each hub + yaml = cluster_info.read_text() + cluster = safe_load(yaml) + + # For each hub in cluster, grab its metadata and add it to the list + for hub in cluster['hubs']: + config = hub['config'] + # Config is sometimes nested + if 'basehub' in config: + hub_config = config['basehub']['jupyterhub'] + 
else: + hub_config = config['jupyterhub'] + # Domain can be a list + if isinstance(hub['domain'], list): + hub['domain'] = hub['domain'][0] + + hub_list.append({ + 'name': hub_config['custom']['homepage']['templateVars']['org']['name'], + 'domain': f"[{hub['domain']}](https://{hub['domain']})", + "id": hub['name'], + "template": hub['template'], + }) + df = pd.DataFrame(hub_list) + path_tmp = Path("tmp") + path_tmp.mkdir(exist_ok=True) + path_table = path_tmp / "hub-table.csv" + df.to_csv(path_table, index=None) + + +render_hubs() \ No newline at end of file diff --git a/docs/environment.yml b/docs/environment.yml new file mode 100644 index 0000000000..fda88096a9 --- /dev/null +++ b/docs/environment.yml @@ -0,0 +1,14 @@ +channels: +- conda-forge +dependencies: +- go-terraform-docs +- pip +- python=3.8 +- pip: + - myst-parser[sphinx,linkify] + - sphinx-book-theme + - sphinx-panels + - sphinx-autobuild + - pandas + - pyyaml + - requests diff --git a/docs/index.md b/docs/index.md index ec6b28e3c8..75e6818b6e 100644 --- a/docs/index.md +++ b/docs/index.md @@ -42,6 +42,8 @@ Topic guides go more in-depth on a particular topic. topic/config.md topic/hub-templates.md topic/storage-layer.md +topic/terraform.md +topic/cluster-design.md ``` ## Reference diff --git a/docs/requirements.txt b/docs/requirements.txt deleted file mode 100644 index 3883cb447e..0000000000 --- a/docs/requirements.txt +++ /dev/null @@ -1,7 +0,0 @@ -myst-parser[sphinx,linkify] -sphinx-book-theme -sphinx-panels -sphinx-autobuild -pandas -pyyaml -requests diff --git a/docs/topic/cluster-design.md b/docs/topic/cluster-design.md new file mode 100644 index 0000000000..d9e9a5a835 --- /dev/null +++ b/docs/topic/cluster-design.md @@ -0,0 +1,73 @@ +# Cluster design considerations + +## Core node size + +In each cluster, we have a *core node pool* that is fairly static in size +and always running. It needs enough capacity to run: + +1. Kubernetes system components - network policy enforcement, config connector + components, cluster autoscaler, kube-dns, etc. + +2. Per-cluster support components - like prometheus, grafana, cert-manager, + etc. + +3. Hub core components - the hub, proxy, user scheduler, etc. + +4. (Optional) Dask gateway core components - the API gateway, controller, etc. + +Since the core nodes are *always running*, they form a big chunk of the +cluster's *base cost* - the amount of money it costs each day, regardless +of the current number of running users. Picking an appropriate node size and +count here has a big effect. + +### On GKE + +GKE makes sizing this node pool difficult, since `kube-system` components can take up quite +a bit of resources. Even though the kind of clusters we run will most likely +not stress components like `kube-dns` that much, there's no option to give +them smaller resource requests. So this will be our primary limitation in +many ways. + +Adding [Config Connector](https://cloud.google.com/config-connector/docs/overview) +or enabling [Network Policy](https://cloud.google.com/kubernetes-engine/docs/how-to/network-policy) +requires more resources as well. + +Based on some loosely structured experimentation, the current recommendation is to run +3 `g1-small` instances for a cluster without config connector or network policy, +or a single `n1-highmem-4` instance for a cluster with either of those options +turned on. This needs to be better investigated.
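To make the sizing trade-off above more concrete, here is a rough, illustrative sketch (not part of this PR) that estimates how much memory is left on a candidate core node after GKE's documented kubelet/system reservations and eviction threshold. The machine sizes and reservation tiers follow GKE's published formula, but treat the numbers as approximations rather than capacity planning; real allocatable values should be read from `kubectl describe node`.

```python
# Rough estimate of allocatable memory on candidate core nodes, using the
# memory reservation tiers GKE documents for kubelet/system daemons plus the
# 100 MiB hard eviction threshold. Illustrative only.

GIB = 1024  # work in MiB

def gke_reserved_memory_mib(total_mib: float) -> float:
    if total_mib < 1 * GIB:
        return 255
    tiers = [
        (4 * GIB, 0.25),    # 25% of the first 4 GiB
        (4 * GIB, 0.20),    # 20% of the next 4 GiB (up to 8 GiB)
        (8 * GIB, 0.10),    # 10% of the next 8 GiB (up to 16 GiB)
        (112 * GIB, 0.06),  # 6% of the next 112 GiB (up to 128 GiB)
    ]
    reserved, remaining = 0.0, total_mib
    for size, fraction in tiers:
        step = min(remaining, size)
        reserved += step * fraction
        remaining -= step
    reserved += remaining * 0.02  # 2% of anything above 128 GiB
    return reserved + 100         # plus the eviction threshold

# Approximate physical memory of the machine types discussed above
candidates_mib = {"g1-small": 1.7 * GIB, "n1-highmem-4": 26 * GIB}
for machine, total in candidates_mib.items():
    allocatable = total - gke_reserved_memory_mib(total)
    print(f"{machine}: ~{allocatable / GIB:.1f} GiB allocatable for kube-system, "
          "support charts and hub core pods")
```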
+ +## Network Policy + +When hubs belonging to multiple organizations are run on the same cluster, +we **must** enable [NetworkPolicy enforcement](https://cloud.google.com/kubernetes-engine/docs/how-to/network-policy) +to isolate them from each other. + +## Cloud access credentials for hub users + +For hub users to access cloud resources (like storage buckets), they will need +to be authorized via a [GCP ServiceAccount](https://cloud.google.com/iam/docs/service-accounts). +This is different from a [Kubernetes ServiceAccount](https://kubernetes.io/docs/tasks/configure-pod-container/configure-service-account/), +which is used to authenticate and authorize access to kubernetes resources (like spawning pods). + +For dask hubs, we want to provide users with write access to at least one storage +bucket they can use for temporary data storage. User pods need to be given access to +a GCP ServiceAccount that has write permissions to this bucket. There are two ways +to do this: + +1. Provide appropriate permissions to the GCP ServiceAccount used by the node the user + pods are running on. When used with [Metadata Concealment](https://cloud.google.com/kubernetes-engine/docs/how-to/protecting-cluster-metadata#overview), + user pods can read from / write to storage buckets. However, this grants the same permissions + to *all* pods on the cluster, and hence is unsuitable for clusters with multiple + hubs running for different organizations. + +2. Use the [GKE Cloud Config Connector](https://cloud.google.com/config-connector/docs/overview) to + create a GCP ServiceAccount + Storage Bucket for each hub via helm. This requires using + [Workload Identity](https://cloud.google.com/kubernetes-engine/docs/how-to/workload-identity) and + is incompatible with (1). This is required for multi-tenant clusters, since users on a hub + get much more tightly scoped permissions. + +Long-term, (2) is the appropriate way to do this for everyone. However, it affects the size +of the core node pool, since it runs some additional components in the cluster. For now, we use (1) for +single-tenant clusters, and (2) for multi-tenant clusters. If nobody wants a scratch GCS bucket, +neither option is required. diff --git a/docs/topic/terraform.md b/docs/topic/terraform.md new file mode 100644 index 0000000000..3ff13cbc54 --- /dev/null +++ b/docs/topic/terraform.md @@ -0,0 +1,16 @@ +# Terraform + +[Terraform](https://www.terraform.io/) is used to manage our infrastructure +on Google Cloud Platform. The source files are under `terraform/` in this repo, +and variables defining each cluster we manage are under `terraform/projects`. + +## Workspaces + +We use [terraform workspaces](https://www.terraform.io/docs/language/state/workspaces.html) +to maintain separate terraform state for each cluster we manage. +There should be one workspace per cluster, named after the `.tfvars` +file that holds that cluster's variable definitions. + +Workspaces are stored centrally in the `two-eye-two-see-org` GCP project, even +when we use Terraform for projects running on AWS / Azure. You must have +access to this project before you can use terraform for our infrastructure.
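As a concrete illustration of the workspace-per-cluster convention described above, a small helper along these lines (not part of this PR; `meom-ige` is just the example cluster added here) selects the matching workspace before planning or applying a project's `.tfvars` file:

```python
# Illustrative helper for the workspace-per-cluster convention: create/select
# the terraform workspace named after a projects/<cluster>.tfvars file, then
# plan or apply with that variable file. Run from the terraform/ directory.
import subprocess

def terraform_run(cluster: str, apply: bool = False) -> None:
    # Create the workspace if it doesn't exist yet (error is harmless if it does),
    # then switch to it so state stays separate per cluster.
    subprocess.run(["terraform", "workspace", "new", cluster], check=False)
    subprocess.check_call(["terraform", "workspace", "select", cluster])
    action = "apply" if apply else "plan"
    subprocess.check_call(["terraform", action, f"-var-file=projects/{cluster}.tfvars"])

if __name__ == "__main__":
    terraform_run("meom-ige")  # plan only; pass apply=True to make changes
```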
\ No newline at end of file diff --git a/hub-templates/basehub/values.yaml b/hub-templates/basehub/values.yaml index 60a41b75e0..0b9b9a6258 100644 --- a/hub-templates/basehub/values.yaml +++ b/hub-templates/basehub/values.yaml @@ -86,6 +86,8 @@ jupyterhub: limits: memory: 1Gi traefik: + image: + tag: v2.4.8 nodeSelector: hub.jupyter.org/node-purpose: core resources: diff --git a/support/Chart.yaml b/support/Chart.yaml index 0e8d264acf..9d50e68e2a 100644 --- a/support/Chart.yaml +++ b/support/Chart.yaml @@ -7,24 +7,18 @@ dependencies: # Prometheus for collection of metrics. # https://github.com/prometheus-community/helm-charts/tree/main/charts/prometheus - name: prometheus - version: 11.15.0 + version: 14.1.1 repository: https://prometheus-community.github.io/helm-charts # Grafana for dashboarding of metrics. # https://github.com/grafana/helm-charts/tree/main/charts/grafana - name: grafana - version: 5.6.4 + version: 6.11.0 repository: https://grafana.github.io/helm-charts # ingress-nginx for a k8s Ingress resource controller that routes traffic from # a single IP entrypoint to various services exposed via k8s Ingress resources # that references this controller. - name: ingress-nginx - version: 2.15.0 - repository: https://kubernetes.github.io/ingress-nginx - - # cert-manager for acquisition of TLS certificates - # https://github.com/jetstack/cert-manager/tree/master/deploy/charts/cert-manager - - name: cert-manager - version: v1.0.0-beta.1 - repository: https://charts.jetstack.io + version: 3.33.0 + repository: https://kubernetes.github.io/ingress-nginx \ No newline at end of file diff --git a/support/values.yaml b/support/values.yaml index f9f002143a..01aed37a8c 100644 --- a/support/values.yaml +++ b/support/values.yaml @@ -15,18 +15,20 @@ prometheus: # Deploy onto user nodes key: hub.jupyter.org_dedicated value: user + - effect: NoSchedule + # Deploy onto dask worker nodes + key: k8s.dask.org_dedicated + value: worker updateStrategy: type: RollingUpdate pushgateway: enabled: false - rbac: - create: true server: resources: # Without this, prometheus can easily starve users requests: cpu: 0.2 - memory: 768Mi + memory: 512Mi limits: cpu: 1 memory: 2Gi @@ -55,12 +57,7 @@ grafana: annotations: kubernetes.io/ingress.class: nginx cert-manager.io/cluster-issuer: letsencrypt-prod - hosts: - - grafana.pilot.2i2c.cloud - # grafana.ini: - # server: - # root_url: http://grafana.datahub.berkeley.edu/ datasources: datasources.yaml: apiVersion: 1 diff --git a/terraform/.terraform-docs.yml b/terraform/.terraform-docs.yml new file mode 100644 index 0000000000..f6449a2c65 --- /dev/null +++ b/terraform/.terraform-docs.yml @@ -0,0 +1,9 @@ +output: + mode: replace + template: |- + # Reference + + + + {{ .Content }} + \ No newline at end of file diff --git a/terraform/buckets.tf b/terraform/buckets.tf new file mode 100644 index 0000000000..32904d1184 --- /dev/null +++ b/terraform/buckets.tf @@ -0,0 +1,18 @@ +/** +* GCS buckets for use by hub users +*/ + +resource "google_storage_bucket" "user_buckets" { + for_each = var.user_buckets + name = "${var.prefix}-${each.key}" + location = var.region + project = var.project_id +} + +resource "google_storage_bucket_iam_member" "member" { + + for_each = var.user_buckets + bucket = google_storage_bucket.user_buckets[each.key].name + role = "roles/storage.admin" + member = "serviceAccount:${google_service_account.cluster_sa.email}" +} diff --git a/terraform/cd.tf b/terraform/cd.tf new file mode 100644 index 0000000000..7aedfbd6fc --- /dev/null +++ b/terraform/cd.tf @@ -0,0 +1,31
@@ +/** +* Set up Service Accounts for authentication during continuous deployment +*/ + +// Service account used by GitHub Actions to deploy to the cluster +resource "google_service_account" "cd_sa" { + account_id = "${var.prefix}-cd-sa" + display_name = "Continuous Deployment SA for ${var.prefix}" + project = var.project_id +} + +// Roles the service account needs to deploy hubs to the cluster +resource "google_project_iam_member" "cd_sa_roles" { + for_each = var.cd_sa_roles + + project = var.project_id + role = each.value + member = "serviceAccount:${google_service_account.cd_sa.email}" +} + +// JSON encoded private key to be kept in secrets/* for the +// deployment script to authenticate to the cluster +resource "google_service_account_key" "cd_sa" { + service_account_id = google_service_account.cd_sa.name + public_key_type = "TYPE_X509_PEM_FILE" +} + +output "ci_deployer_key" { + value = base64decode(google_service_account_key.cd_sa.private_key) + sensitive = true +} diff --git a/terraform/cluster.tf b/terraform/cluster.tf new file mode 100644 index 0000000000..fa8534f3b3 --- /dev/null +++ b/terraform/cluster.tf @@ -0,0 +1,223 @@ +resource "google_container_cluster" "cluster" { + # config_connector_config is in beta + provider = google-beta + + name = "${var.prefix}-cluster" + location = var.zone + project = var.project_id + + initial_node_count = 1 + remove_default_node_pool = true + + addons_config { + http_load_balancing { + // FIXME: This used to not work well with websockets, and + // cost extra money as well. Let's validate if this is still + // true? + disabled = true + } + horizontal_pod_autoscaling { + // This isn't used anywhere, so let's turn this off + disabled = true + } + config_connector_config { + enabled = var.config_connector_enabled + } + } + + dynamic "workload_identity_config" { + # Set up workload identity only if we're using config connector, otherwise + # just metadata concealment is used + for_each = var.config_connector_enabled == "" ? [] : [1] + content { + identity_namespace = "${var.project_id}.svc.id.goog" + } + } + + release_channel { + # We upgrade clusters manually so we can manage downtime of + # master *and* nodes. When a cluster is in a release channel, + # upgrades (including disruptive node upgrades) happen automatically. + # So we disable it. + channel = "UNSPECIFIED" + } + + cluster_autoscaling { + # This disables node autoprovisioning, not cluster autoscaling! + enabled = false + # Use a scheduler + autoscaling profile optimized for batch workloads like ours + # https://cloud.google.com/kubernetes-engine/docs/concepts/cluster-autoscaler#autoscaling_profiles + autoscaling_profile = "OPTIMIZE_UTILIZATION" + } + + network_policy { + enabled = var.enable_network_policy + } + + node_config { + # DO NOT TOUCH THIS BLOCK - changing it forces the entire cluster to be recreated + service_account = google_service_account.cluster_sa.email + } +} + +resource "google_container_node_pool" "core" { + name = "core-pool" + cluster = google_container_cluster.cluster.name + project = google_container_cluster.cluster.project + location = google_container_cluster.cluster.location + + + initial_node_count = 1 + autoscaling { + min_node_count = 1 + max_node_count = var.core_node_max_count + } + + management { + auto_repair = true + # Auto upgrade will drain and set up nodes without us knowing, + # and this can cause outages when it hits the proxy nodes.
+ auto_upgrade = false + } + + + node_config { + labels = { + "hub.jupyter.org/node-purpose" = "core", + "k8s.dask.org/node-purpose" = "core" + } + machine_type = var.core_node_machine_type + disk_size_gb = 30 + + # Our service account gets all OAuth scopes so it can access + # all APIs, but only fine grained permissions + roles are + # granted via the service account. This follows Google's + # recommendation at https://cloud.google.com/compute/docs/access/service-accounts#associating_a_service_account_to_an_instance + service_account = google_service_account.cluster_sa.email + oauth_scopes = [ + "https://www.googleapis.com/auth/cloud-platform" + ] + } +} + +resource "google_container_node_pool" "notebook" { + name = "nb-${each.key}" + cluster = google_container_cluster.cluster.name + project = google_container_cluster.cluster.project + location = google_container_cluster.cluster.location + + for_each = var.notebook_nodes + + # WARNING: Do not change this value, it will cause the nodepool + # to be destroyed & re-created. If you want to increase number of + # nodes in a node pool, set the min count to that number and then + # scale the pool manually. + initial_node_count = each.value.min + autoscaling { + min_node_count = each.value.min + max_node_count = each.value.max + } + + management { + auto_repair = true + auto_upgrade = false + } + + + node_config { + workload_metadata_config { + # Config Connector requires workload identity to be enabled (via GKE_METADATA_SERVER). + # If config connector is not necessary, we use simple metadata concealment + # (https://cloud.google.com/kubernetes-engine/docs/how-to/protecting-cluster-metadata) + # to expose the node CA to users safely. + # FIXME: This should be a bit more fine-grained - it should be possible to disable + # config connector and completely hide all node metadata from user pods + node_metadata = var.config_connector_enabled ? "GKE_METADATA_SERVER" : "SECURE" + } + labels = { + # Notebook pods and dask schedulers can exist here + "hub.jupyter.org/node-purpose" = "user", + "k8s.dask.org/node-purpose" = "scheduler", + } + + taint = [{ + key = "hub.jupyter.org_dedicated" + value = "user" + effect = "NO_SCHEDULE" + }] + machine_type = each.value.machine_type + + # Our service account gets all OAuth scopes so it can access + # all APIs, but only fine grained permissions + roles are + # granted via the service account. This follows Google's + # recommendation at https://cloud.google.com/compute/docs/access/service-accounts#associating_a_service_account_to_an_instance + service_account = google_service_account.cluster_sa.email + oauth_scopes = [ + "https://www.googleapis.com/auth/cloud-platform" + ] + } +} + +resource "google_container_node_pool" "dask_worker" { + name = "dask-${each.key}" + cluster = google_container_cluster.cluster.name + project = google_container_cluster.cluster.project + location = google_container_cluster.cluster.location + + # Default to same config as notebook nodepools config + for_each = length(var.dask_nodes) == 0 ? var.notebook_nodes : var.dask_nodes + + # WARNING: Do not change this value, it will cause the nodepool + # to be destroyed & re-created. If you want to increase number of + # nodes in a node pool, set the min count to that number and then + # scale the pool manually. 
+ initial_node_count = 0 + autoscaling { + min_node_count = each.value.min + max_node_count = each.value.max + } + + management { + auto_repair = true + auto_upgrade = false + } + + node_config { + + preemptible = true + # SSD Disks for dask workers make image pulls much faster + # Since we might have many dask workers spinning up at the + # same time, the extra cost of using this is probably worth it. + disk_type = "pd-ssd" + + workload_metadata_config { + # Config Connector requires workload identity to be enabled (via GKE_METADATA_SERVER). + # If config connector is not necessary, we use simple metadata concealment + # (https://cloud.google.com/kubernetes-engine/docs/how-to/protecting-cluster-metadata) + # to expose the node CA to users safely. + # FIXME: This should be a bit more fine-grained - it should be possible to disable + # config connector and completely hide all node metadata from user pods + node_metadata = var.config_connector_enabled ? "GKE_METADATA_SERVER" : "SECURE" + } + labels = { + "k8s.dask.org/node-purpose" = "worker", + } + + taint = [{ + key = "k8s.dask.org_dedicated" + value = "worker" + effect = "NO_SCHEDULE" + }] + machine_type = each.value.machine_type + + # Our service account gets all OAuth scopes so it can access + # all APIs, but only fine grained permissions + roles are + # granted via the service account. This follows Google's + # recommendation at https://cloud.google.com/compute/docs/access/service-accounts#associating_a_service_account_to_an_instance + service_account = google_service_account.cluster_sa.email + oauth_scopes = [ + "https://www.googleapis.com/auth/cloud-platform" + ] + } +} + diff --git a/terraform/main.tf b/terraform/main.tf index b9f011a634..b134f1fe7e 100644 --- a/terraform/main.tf +++ b/terraform/main.tf @@ -5,155 +5,37 @@ terraform { } } -module "service_accounts" { - source = "terraform-google-modules/service-accounts/google" - version = "~> 2.0" - project_id = var.project_id - prefix = var.prefix - generate_keys = true - names = ["cd-sa"] - project_roles = [ - "${var.project_id}=>roles/container.admin", - "${var.project_id}=>roles/artifactregistry.writer", - # FIXME: This is way too much perms just to ssh into a node - "${var.project_id}=>roles/compute.instanceAdmin.v1" - ] +// Service account used by all the nodes and pods in our cluster +resource "google_service_account" "cluster_sa" { + account_id = "${var.prefix}-cluster-sa" + display_name = "Cluster SA for ${var.prefix}" + project = var.project_id } -output "ci_deployer_key" { - value = module.service_accounts.keys["cd-sa"] - sensitive = true +// To access GCS buckets with requester pays, the calling code needs +// to have serviceusage.services.use permission. We create a role +// granting just this permission to the cluster SA, so user pods can +// use it. See https://cloud.google.com/storage/docs/requester-pays +// for more info +resource "google_project_iam_custom_role" "identify_project_role" { + // Role names can't contain -, so we swap them out.
BOO + role_id = replace("${var.prefix}_user_sa_role", "-", "_") + project = var.project_id + title = "Identify as project role for users in ${var.prefix}" + description = "Minimal role for hub users on ${var.prefix} to identify as current project" + permissions = ["serviceusage.services.use"] } -resource "google_artifact_registry_repository" "container_repository" { - provider = google-beta - - location = var.region - repository_id = "low-touch-hubs" - format = "DOCKER" - project = var.project_id -} - -// Give the GKE service account access to our artifact registry docker repo -resource "google_project_iam_member" "project" { +resource "google_project_iam_member" "identify_project_binding" { project = var.project_id - role = "roles/artifactregistry.reader" - member = "serviceAccount:${module.gke.service_account}" + role = google_project_iam_custom_role.identify_project_role.name + member = "serviceAccount:${google_service_account.cluster_sa.email}" } +resource "google_project_iam_member" "cluster_sa_roles" { + for_each = var.cluster_sa_roles -module "gke" { - source = "terraform-google-modules/kubernetes-engine/google" - project_id = var.project_id - name = "${var.prefix}-cluster" - regional = var.regional_cluster - region = var.region - zones = [var.zone] - network = "default" - subnetwork = "default" - ip_range_pods = "" - ip_range_services = "" - http_load_balancing = false - horizontal_pod_autoscaling = false - network_policy = true - # We explicitly set up a core pool, so don't need the default - remove_default_node_pool = true - kubernetes_version = "1.19.9-gke.1400" - - - node_pools = [ - { - name = "core-pool" - machine_type = var.core_node_machine_type - min_count = 1 - max_count = var.core_node_max_count - local_ssd_count = 0 - disk_size_gb = var.core_node_disk_size_gb - disk_type = "pd-standard" - image_type = "COS" - auto_repair = true - auto_upgrade = false - preemptible = false - initial_node_count = 1 - # Let's pin this so we don't upgrade each time terraform runs - version = "1.19.9-gke.1400" - }, - { - name = "user-pool" - machine_type = var.user_node_machine_type - min_count = 0 - max_count = var.user_node_max_count - local_ssd_count = 0 - disk_size_gb = 100 - disk_type = "pd-ssd" - image_type = "COS" - auto_repair = true - auto_upgrade = false - preemptible = false - initial_node_count = 0 - # Let's pin this so we don't upgrade each time terraform runs - version = "1.19.9-gke.1400" - }, - { - name = "dask-worker-pool" - machine_type = var.dask_worker_machine_type - min_count = 0 - max_count = 10 - local_ssd_count = 0 - disk_size_gb = 100 - # Fast startup is important here, so we get fast SSD disks - # This pulls in user images much faster - disk_type = "pd-ssd" - image_type = "COS" - auto_repair = true - auto_upgrade = false - preemptible = true - initial_node_count = 0 - # Let's pin this so we don't upgrade each time terraform runs - version = "1.19.9-gke.1400" - }, - ] - - node_pools_oauth_scopes = { - all = [ - # FIXME: Is this the minimal? 
- # - "https://www.googleapis.com/auth/cloud-platform", - ] - } - - node_pools_labels = { - all = {} - - core-pool = { - default-node-pool = true - "hub.jupyter.org/pool-name" = "core-pool", - "hub.jupyter.org/node-purpose" = "core", - "k8s.dask.org/node-purpose" = "core" - } - user-pool = { - "hub.jupyter.org/pool-name" = "user-pool" - "hub.jupyter.org/node-purpose" = "user", - "k8s.dask.org/node-purpose" = "scheduler" - } - dask-worker-pool = { - "hub.jupyter.org/pool-name" = "dask-worker-pool" - "k8s.dask.org/node-purpose" = "worker" - } - } - - node_pools_taints = { - all = [] - - user-pool = [{ - key = "hub.jupyter.org_dedicated" - value = "user" - effect = "NO_SCHEDULE" - }] - dask-worker-pool = [{ - key = "k8s.dask.org_dedicated" - value = "worker" - effect = "NO_SCHEDULE" - }] - } + project = var.project_id + role = each.value + member = "serviceAccount:${google_service_account.cluster_sa.email}" } diff --git a/terraform/cloudbank.tfvars b/terraform/projects/cloudbank.tfvars similarity index 66% rename from terraform/cloudbank.tfvars rename to terraform/projects/cloudbank.tfvars index 7235294676..123d8e54ae 100644 --- a/terraform/cloudbank.tfvars +++ b/terraform/projects/cloudbank.tfvars @@ -3,7 +3,11 @@ project_id = "cb-1003-1696" core_node_machine_type = "n1-highmem-4" +# Multi-tenant cluster, network policy is required to enforce separation between hubs enable_network_policy = true + +# No plans to provide storage buckets to users on this hub, so no need to deploy +# config connector config_connector_enabled = false notebook_nodes = { diff --git a/terraform/hackathon-2i2c-project-alpha.tfvars b/terraform/projects/hackathon-2i2c-project-alpha.tfvars similarity index 100% rename from terraform/hackathon-2i2c-project-alpha.tfvars rename to terraform/projects/hackathon-2i2c-project-alpha.tfvars diff --git a/terraform/projects/meom-ige.tfvars b/terraform/projects/meom-ige.tfvars new file mode 100644 index 0000000000..c95fdd06fa --- /dev/null +++ b/terraform/projects/meom-ige.tfvars @@ -0,0 +1,55 @@ +prefix = "meom-ige" +project_id = "meom-ige-cnrs" + +# Minimum number of nodes required to fit kube-system is either +# 2 n1-highcpu-2 nodes, or 3 g1-small nodes. If you don't enable +# networkpolicy, you can get away with 1 n1-custom-4-3840 node - +# but with that enable, calico-typha wants 2 replicas that +# must run on two nodes since they both want the same hostport. +# 3 g1-small is 13$ a month, wile a single n2-highcpu-2 is +# already 36$ a month. We want very low base price, and +# our core nodes will barely see any CPU usage, so g1-small is +# the way to go +core_node_machine_type = "g1-small" + +# Single-tenant cluster, network policy not needed +enable_network_policy = false + +# Single tenant cluster, so bucket access is provided via +# metadata concealment + node SA. Config Connector not needed. 
+config_connector_enabled = false + +notebook_nodes = { + "small" : { + min : 0, + max : 20, + machine_type : "e2-standard-2" + }, + "medium" : { + min : 0, + max : 20, + machine_type : "e2-standard-8" + }, + "large" : { + min : 0, + max : 20, + machine_type : "e2-standard-16" + }, + "very-large" : { + min : 0, + max : 20, + machine_type : "e2-standard-32" + }, + "huge" : { + min : 0, + max : 20, + # e2 instances only go up to 32 cores + machine_type : "n2-standard-64" + }, + +} + +user_buckets = [ + "scratch", + "data" +] \ No newline at end of file diff --git a/terraform/pilot-hubs.tfvars b/terraform/projects/pilot-hubs.tfvars similarity index 69% rename from terraform/pilot-hubs.tfvars rename to terraform/projects/pilot-hubs.tfvars index 2ed7f3a47e..4d566b6b83 100644 --- a/terraform/pilot-hubs.tfvars +++ b/terraform/projects/pilot-hubs.tfvars @@ -3,7 +3,10 @@ project_id = "two-eye-two-see" core_node_machine_type = "n1-highmem-4" +# Multi-tenant cluster, network policy is required to enforce separation between hubs enable_network_policy = true + +# Some hubs want a storage bucket, so we need to have config connector enabled config_connector_enabled = true notebook_nodes = { diff --git a/terraform/registry.tf b/terraform/registry.tf new file mode 100644 index 0000000000..29c8e3975a --- /dev/null +++ b/terraform/registry.tf @@ -0,0 +1,13 @@ +/** +* Artifact Registry to store user images for this cluster. +* +* Hosting it in the same project makes node startup time faster. +*/ +resource "google_artifact_registry_repository" "registry" { + provider = google-beta + + location = var.region + repository_id = "${var.prefix}-registry" + format = "DOCKER" + project = var.project_id +} diff --git a/terraform/variables.tf b/terraform/variables.tf index eeb8d51a09..2d8704e04b 100644 --- a/terraform/variables.tf +++ b/terraform/variables.tf @@ -1,54 +1,173 @@ variable "prefix" { - type = string + type = string + description = <<-EOT + Prefix for all objects created by terraform. + + Primary identifier to 'group' together resources created by + this terraform module. Prevents clashes with other resources + in the cloud project / account. + + Should not be changed after first terraform apply - doing so + will recreate all resources. + + Should not end with a '-'; that is added automatically. + EOT } variable "project_id" { - type = string - # This is in Toronto! - default = "two-eye-two-see" + type = string + description = <<-EOT + GCP Project ID to create resources in. + + Should be the id, rather than the display name, of the project. + EOT +} + +variable "notebook_nodes" { + type = map(map(string)) + description = "Notebook node pools to create" + default = {} +} + +variable "dask_nodes" { + type = map(map(string)) + description = "Dask node pools to create. Defaults to notebook_nodes" + default = {} +} + +variable "config_connector_enabled" { + type = bool + default = false + description = <<-EOT + Enable GKE Config Connector to manage GCP resources via kubernetes. + + GKE Config Connector (https://cloud.google.com/config-connector/docs/overview) + allows creating GCP resources (like buckets, VMs, etc) by creating Kubernetes + Custom Resources. We use this to create buckets on a per-hub level, + and could use it for other purposes in the future. + + Enabling this increases base cost, as config connector related pods + need to run on the cluster.
+ EOT +} + +variable "cluster_sa_roles" { + type = set(string) + default = [ + "roles/logging.logWriter", + "roles/monitoring.metricWriter", + "roles/monitoring.viewer", + "roles/stackdriver.resourceMetadata.writer", + "roles/artifactregistry.reader" + ] + description = <<-EOT + List of roles granted to the SA assumed by cluster nodes. + + The defaults grant just enough access for the components on the node + to write metrics & logs to stackdriver, and pull images from artifact registry. + + https://cloud.google.com/kubernetes-engine/docs/how-to/hardening-your-cluster + has more information. + EOT +} + +variable "cd_sa_roles" { + type = set(string) + default = [ + "roles/container.admin", + "roles/artifactregistry.writer" + ] + description = <<-EOT + List of roles granted to the SA used by our CI/CD pipeline. + + We want to automatically build / push images, and deploy to + the kubernetes cluster from CI/CD (on GitHub Actions, mostly). + A JSON key for this will be generated (with + `terraform output -raw ci_deployer_key`) and stored in the + repo in encrypted form. + + The default provides *full* access to the entire kubernetes + cluster! This is dangerous, but it is unclear how to tamp + it down. + EOT } variable "region" { - type = string - default = "us-central1" + type = string + default = "us-central1" + description = <<-EOT + GCP Region the cluster & resources will be placed in. + + For research clusters, this should be closest to where + your source data is. + + This does not imply that the cluster will be a regional + cluster. + EOT + } variable "zone" { - type = string - default = "us-central1-b" -} + type = string + default = "us-central1-b" + description = <<-EOT + GCP Zone the cluster & nodes will be set up in. -variable "regional_cluster" { - type = string - default = "false" + Even with a regional cluster, all the cluster nodes will + be in a single zone. NFS and supporting VMs will need to + be in this zone as well. + EOT } variable "core_node_machine_type" { - type = string - default = "n1-highmem-4" + type = string + default = "g1-small" + description = <<-EOT + Machine type to use for core nodes. + + Core nodes will always be on, and count as 'base cost' + for a cluster. We should try to run with as few of them + as possible. + + For single-tenant clusters, a single g1-small node seems + enough - if network policy and config connector are not on. + For others, please experiment to see what fits. + EOT } variable "core_node_max_count" { - type = number - default = 5 -} + type = number + default = 5 + description = <<-EOT + Maximum number of core nodes available. -variable "core_node_disk_size_gb" { - type = number - default = 50 -} + Core nodes can scale up to this many nodes if necessary. + They are part of the 'base cost', and should be kept to a minimum. + This number should be small enough to prevent runaway scaling, + but large enough to support occasional spikes for whatever reason. -variable "user_node_machine_type" { - type = string - default = "n1-standard-4" + Minimum node count is fixed at 1. + EOT } -variable "user_node_max_count" { - type = number - default = 10 +variable "enable_network_policy" { + type = bool + default = true + description = <<-EOT + Enable kubernetes network policy enforcement. + + Our z2jh deploys NetworkPolicies by default - but they are + not enforced unless enforcement is turned on here. This takes + up some cluster resources, so we could turn it off in cases + where we are trying to minimize base cost.
+ + https://cloud.google.com/kubernetes-engine/docs/how-to/network-policy + has more information. + EOT } -variable "dask_worker_machine_type" { - type = string - default = "e2-highmem-2" +variable "user_buckets" { + type = set(any) + default = [] + description = "Buckets to create for the project, they will be prefixed with {var.prefix}-" }
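Finally, to show how the `user_buckets` and `prefix` variables above connect back to the hub configuration earlier in this PR (the meom-ige hub sets `DATA_BUCKET` and `SCRATCH_BUCKET` environment variables pointing at the `meom-ige-data` / `meom-ige-scratch` buckets), here is a small illustrative sketch of how a user might consume those variables inside a notebook pod. The commented xarray call is only an example of a gcsfs-aware client, not something this PR installs.

```python
# Illustrative only: how the buckets created from `user_buckets` (prefixed with
# var.prefix, e.g. meom-ige-scratch / meom-ige-data) surface inside a user pod
# via the env vars set in config/hubs/meom-ige.cluster.yaml.
import os

# JupyterHub expands $(JUPYTERHUB_USER) at spawn time, so SCRATCH_BUCKET already
# points at a per-user prefix such as gcs://meom-ige-scratch/<username>.
scratch = os.environ.get("SCRATCH_BUCKET", "gcs://meom-ige-scratch/example-user")
data = os.environ.get("DATA_BUCKET", "gcs://meom-ige-data")

# Because the node SA is granted roles/storage.admin on these buckets
# (terraform/buckets.tf) and metadata concealment passes that identity through,
# any fsspec/gcsfs-aware library can read and write these locations, e.g.:
#   import xarray as xr
#   ds.to_zarr(f"{scratch}/my-simulation.zarr")
print(f"scratch location: {scratch}")
print(f"shared data location: {data}")
```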