From 6ad1c811bc7fab11966bf77bdca6087480e65fed Mon Sep 17 00:00:00 2001 From: YuviPanda Date: Tue, 25 Aug 2020 14:17:08 +0530 Subject: [PATCH 01/20] Add initial terraform code This time, I'm going to try make *everything* be fully automated - not just the helm setup. This set of terraform scripts shall be run manually by hand - I don't trust it for automation yet. --- main.tf | 177 +++++++++++++++++++++++++++++++++++++++++++++++++++ test.tfvars | 2 + variables.tf | 25 ++++++++ 3 files changed, 204 insertions(+) create mode 100644 main.tf create mode 100644 test.tfvars create mode 100644 variables.tf diff --git a/main.tf b/main.tf new file mode 100644 index 0000000000..8ec85db751 --- /dev/null +++ b/main.tf @@ -0,0 +1,177 @@ +provider "azurerm" { + # whilst the `version` attribute is optional, we recommend pinning to a given version of the Provider + version = "=2.20.0" + features {} +} + +provider "local" { + version = "1.4.0" +} + +resource "azurerm_resource_group" "jupyterhub" { + name = "${var.prefix}-rg" + location = var.region +} + +resource "azurerm_virtual_network" "jupyterhub" { + name = "${var.prefix}-network" + location = azurerm_resource_group.jupyterhub.location + resource_group_name = azurerm_resource_group.jupyterhub.name + address_space = ["10.0.0.0/8"] +} + +resource "azurerm_subnet" "node_subnet" { + name = "${var.prefix}-node-subnet" + virtual_network_name = azurerm_virtual_network.jupyterhub.name + resource_group_name = azurerm_resource_group.jupyterhub.name + address_prefixes = ["10.1.0.0/16"] +} + +resource "azurerm_kubernetes_cluster" "jupyterhub" { + name = "${var.prefix}-cluster" + location = azurerm_resource_group.jupyterhub.location + resource_group_name = azurerm_resource_group.jupyterhub.name + dns_prefix = "${var.prefix}-cluster" + + # Core node-pool + default_node_pool { + name = "core" + node_count = 1 + vm_size = var.core_vm_size + os_disk_size_gb = 100 + enable_auto_scaling = true + min_count = 1 + max_count = 8 + vnet_subnet_id = azurerm_subnet.node_subnet.id + node_labels = { + "hub.jupyter.org/pool-name" = "core-pool" + } + } + + identity { + type = "SystemAssigned" + } + + network_profile { + # I don't trust Azure CNI + network_plugin = "kubenet" + network_policy = "calico" + } + + tags = { + Environment = "Production" + ManagedBy = "2i2c" + } +} + +resource "azurerm_kubernetes_cluster_node_pool" "user_pool" { + name = "user" + kubernetes_cluster_id = azurerm_kubernetes_cluster.jupyterhub.id + vm_size = var.user_vm_size + node_count = 1 + enable_auto_scaling = true + os_disk_size_gb = 200 + node_taints = ["hub.jupyter.org_dedicated=user:NoSchedule"] + vnet_subnet_id = azurerm_subnet.node_subnet.id + node_labels = { + "hub.jupyter.org/pool-name" = "user-alpha-pool" + } + + min_count = 1 + max_count = 100 + tags = { + Environment = "Production" + ManagedBy = "2i2c" + } +} + +# NFS VM +resource "azurerm_network_interface" "nfs_vm" { + name = "${var.prefix}-nfs-vm-inet" + location = azurerm_resource_group.jupyterhub.location + resource_group_name = azurerm_resource_group.jupyterhub.name + + ip_configuration { + name = "internal" + subnet_id = azurerm_subnet.node_subnet.id + private_ip_address_allocation = "Dynamic" + } +} + +resource "azurerm_network_security_group" "nfs_vm" { + name = "${var.prefix}-nfs-vm-nsg" + location = azurerm_resource_group.jupyterhub.location + resource_group_name = azurerm_resource_group.jupyterhub.name + + # SSH from the world + security_rule { + access = "Allow" + direction = "Inbound" + name = "ssh" + priority = 100 + protocol = 
"Tcp" + source_port_range = "*" + source_address_prefix = "*" + destination_port_range = "22" + destination_address_prefix = "*" + } + + # NFS from internal network + security_rule { + access = "Allow" + direction = "Inbound" + name = "nfs" + priority = 101 + protocol = "Tcp" + source_port_range = "*" + source_address_prefix = "*" + destination_port_range = "2049" + destination_address_prefix = azurerm_network_interface.nfs_vm.private_ip_address + } +} + +resource "azurerm_network_interface_security_group_association" "main" { + network_interface_id = azurerm_network_interface.nfs_vm_pub.id + network_security_group_id = azurerm_network_security_group.nfs_vm.id +} + + +resource "azurerm_linux_virtual_machine" "nfs_vm" { + name = "${var.prefix}-nfs-vm" + resource_group_name = azurerm_resource_group.jupyterhub.name + location = azurerm_resource_group.jupyterhub.location + size = "Standard_F2" + admin_username = "hubadmin" + + network_interface_ids = [ + azurerm_network_interface.nfs_vm.id, + azurerm_network_interface.nfs_vm_pub.id + ] + + admin_ssh_key { + username = "hubadmin" + public_key = file("${path.module}/ssh-key.pub") + } + + os_disk { + caching = "ReadWrite" + storage_account_type = "StandardSSD_LRS" + disk_size_gb = 250 + } + + source_image_reference { + publisher = "Canonical" + offer = "0001-com-ubuntu-server-focal" + sku = "20_04-lts" + version = "latest" + } +} + + +output "kubeconfig" { + value = azurerm_kubernetes_cluster.jupyterhub.kube_config_raw +} + +output "nfs_public_ip" { + value = azurerm_public_ip.nfs_vm.ip_address +} diff --git a/test.tfvars b/test.tfvars new file mode 100644 index 0000000000..05630e7b63 --- /dev/null +++ b/test.tfvars @@ -0,0 +1,2 @@ +prefix = "jupyterhub-2i2c" +ssh_pub_key = "ssh-key.pub" \ No newline at end of file diff --git a/variables.tf b/variables.tf new file mode 100644 index 0000000000..1f2a56571e --- /dev/null +++ b/variables.tf @@ -0,0 +1,25 @@ +variable "prefix" { + type = string +} + +variable "region" { + type = string + # This is in Toronto! + default = "Canada Central" +} + +variable "user_vm_size" { + type = string + # VM with 32G of RAM, 8 cores, and ssd base disk + default = "Standard_E8s_v3" +} + +variable "core_vm_size" { + type = string + # 16GB of RAM, 2 cores, ssd base disk + default = "Standard_E2s_v3" +} + +variable "ssh_pub_key" { + type = string +} From 2bf6fc278c6663c138eb53fccfb10f8cb2712f9c Mon Sep 17 00:00:00 2001 From: YuviPanda Date: Tue, 25 Aug 2020 17:38:47 +0530 Subject: [PATCH 02/20] Add disks to NFS machine - Setup a lifecycle hook to make sure our disk does not get destroyed - Remove the public IP setup. 
We just ProxyCommand via the kubernetes cluster instead --- main.tf | 33 +++++++++++++++++++++++++-------- 1 file changed, 25 insertions(+), 8 deletions(-) diff --git a/main.tf b/main.tf index 8ec85db751..c44c3de30e 100644 --- a/main.tf +++ b/main.tf @@ -113,7 +113,7 @@ resource "azurerm_network_security_group" "nfs_vm" { source_port_range = "*" source_address_prefix = "*" destination_port_range = "22" - destination_address_prefix = "*" + destination_address_prefix = azurerm_network_interface.nfs_vm.private_ip_address } # NFS from internal network @@ -131,11 +131,10 @@ resource "azurerm_network_security_group" "nfs_vm" { } resource "azurerm_network_interface_security_group_association" "main" { - network_interface_id = azurerm_network_interface.nfs_vm_pub.id + network_interface_id = azurerm_network_interface.nfs_vm.id network_security_group_id = azurerm_network_security_group.nfs_vm.id } - resource "azurerm_linux_virtual_machine" "nfs_vm" { name = "${var.prefix}-nfs-vm" resource_group_name = azurerm_resource_group.jupyterhub.name @@ -145,7 +144,6 @@ resource "azurerm_linux_virtual_machine" "nfs_vm" { network_interface_ids = [ azurerm_network_interface.nfs_vm.id, - azurerm_network_interface.nfs_vm_pub.id ] admin_ssh_key { @@ -167,11 +165,30 @@ resource "azurerm_linux_virtual_machine" "nfs_vm" { } } +resource "azurerm_managed_disk" "nfs_data_disk_1" { + name = "${var.prefix}-nfs-data-disk-1" + location = azurerm_resource_group.jupyterhub.location + resource_group_name = azurerm_resource_group.jupyterhub.name + storage_account_type = "StandardSSD_LRS" + create_option = "Empty" + disk_size_gb = "100" -output "kubeconfig" { - value = azurerm_kubernetes_cluster.jupyterhub.kube_config_raw + lifecycle { + # Terraform plz never destroy data thx + prevent_destroy = true + } + tags = { + Environment = "Production" + } +} + +resource "azurerm_virtual_machine_data_disk_attachment" "nfs_data_disk_1" { + virtual_machine_id = azurerm_linux_virtual_machine.nfs_vm.id + managed_disk_id = azurerm_managed_disk.nfs_data_disk_1.id + lun = 0 + caching = "None" } -output "nfs_public_ip" { - value = azurerm_public_ip.nfs_vm.ip_address +output "kubeconfig" { + value = azurerm_kubernetes_cluster.jupyterhub.kube_config_raw } From c445b1e77138fc015a3c539f6ad3ba4a4d44a081 Mon Sep 17 00:00:00 2001 From: YuviPanda Date: Tue, 25 Aug 2020 18:36:49 +0530 Subject: [PATCH 03/20] Setup NFS - Add playbook for NFS server, so we don't hand-setup that - Mount NFS on the user pods! 
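The hub-side mount isn't part of this patch; roughly, the z2jh helm values
would look something like the sketch below (the NFS server IP is a placeholder,
and /export/disk1 is the export the playbook below sets up):

    singleuser:
      storage:
        type: none                  # no per-user PVCs, home comes from NFS
        extraVolumes:
          - name: home
            nfs:
              server: 10.1.0.4      # placeholder: private IP of the NFS VM
              path: /export/disk1
        extraVolumeMounts:
          - name: home
            mountPath: /home/jovyan
            subPath: "{username}"   # kubespawner fills this in per user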
--- nfs-playbook.yaml | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 nfs-playbook.yaml diff --git a/nfs-playbook.yaml b/nfs-playbook.yaml new file mode 100644 index 0000000000..e89aca7d60 --- /dev/null +++ b/nfs-playbook.yaml @@ -0,0 +1,35 @@ +- name: nfs server setup + hosts: all + connection: ssh + become: true + handlers: + - name: re-export NFS Shares + command: + cmd: exportfs -ra + tasks: + - name: Install NFS packages + apt: + pkg: + - nfs-kernel-server + - nfs-common + - xfsprogs + - name: Setup XFS + filesystem: + fstype: xfs + dev: /dev/disk/azure/scsi1/lun0 + + - name: Mount disk + mount: + path: /export/disk1 + src: /dev/disk/azure/scsi1/lun0 + state: mounted + fstype: xfs + opts: inode64,prjquota + + - name: setup exports file + notify: + - re-export NFS Shares + copy: + dest: /etc/exports + content: > + /export/disk1 10.0.0.0/8(all_squash,anonuid=1000,anongid=1000,no_subtree_check,rw,sync) From 0e91e6dbc07dec5af1877da2c0a0d3f455df4a9d Mon Sep 17 00:00:00 2001 From: YuviPanda Date: Tue, 25 Aug 2020 20:27:37 +0530 Subject: [PATCH 04/20] Put home directories in a subdir of the disk I don't like putting home directories straight on a disk. Call it superstition --- nfs-playbook.yaml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/nfs-playbook.yaml b/nfs-playbook.yaml index e89aca7d60..fc8c07bccb 100644 --- a/nfs-playbook.yaml +++ b/nfs-playbook.yaml @@ -26,6 +26,14 @@ fstype: xfs opts: inode64,prjquota + - name: Create home container directory + file: + state: directory + owner: "1000" + group: "1000" + path: /export/disk1/homes + mode: 0700 + - name: setup exports file notify: - re-export NFS Shares From dc35b80390f2c7133af47f4ffe1b2e44c5db69e7 Mon Sep 17 00:00:00 2001 From: YuviPanda Date: Tue, 25 Aug 2020 21:14:01 +0530 Subject: [PATCH 05/20] Resize XFS partition if block size is bigger XFS can be live resized up, not down. Ansible will do this for us now, when we run it. 
--- main.tf | 2 +- nfs-playbook.yaml | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/main.tf b/main.tf index c44c3de30e..bd3f2cbdb7 100644 --- a/main.tf +++ b/main.tf @@ -171,7 +171,7 @@ resource "azurerm_managed_disk" "nfs_data_disk_1" { resource_group_name = azurerm_resource_group.jupyterhub.name storage_account_type = "StandardSSD_LRS" create_option = "Empty" - disk_size_gb = "100" + disk_size_gb = "200" lifecycle { # Terraform plz never destroy data thx diff --git a/nfs-playbook.yaml b/nfs-playbook.yaml index fc8c07bccb..44525185a2 100644 --- a/nfs-playbook.yaml +++ b/nfs-playbook.yaml @@ -17,6 +17,7 @@ filesystem: fstype: xfs dev: /dev/disk/azure/scsi1/lun0 + resizefs: true - name: Mount disk mount: From 838f33b0736ec3f1487b01c05b3bdbcefa961774 Mon Sep 17 00:00:00 2001 From: YuviPanda Date: Wed, 26 Aug 2020 14:03:37 +0530 Subject: [PATCH 06/20] Templatize nfs-playbook & generate values from terraform --- ansible-hosts.yaml | 9 +++++++++ main.tf | 23 +++++++++++++++++++++++ nfs-playbook.yaml | 10 +++++----- 3 files changed, 37 insertions(+), 5 deletions(-) create mode 100755 ansible-hosts.yaml diff --git a/ansible-hosts.yaml b/ansible-hosts.yaml new file mode 100755 index 0000000000..d950ea1c53 --- /dev/null +++ b/ansible-hosts.yaml @@ -0,0 +1,9 @@ +"nfs_servers": + "hosts": + "jupyterhub-2i2c-nfs-vm": + "ansible_ssh_common_args": "-o ProxyCommand='../proxycommand.py %h %p'" + "ansible_ssh_private_key_file": "../ssh-key" + "ansible_user": "hubadmin" + "vars": + "disk_lun": 0 + "disk_name": "jupyterhub-2i2c-nfs-data-disk-1" diff --git a/main.tf b/main.tf index bd3f2cbdb7..a838b5fb50 100644 --- a/main.tf +++ b/main.tf @@ -189,6 +189,29 @@ resource "azurerm_virtual_machine_data_disk_attachment" "nfs_data_disk_1" { caching = "None" } +locals { + ansible_hosts = { + "nfs_servers" = { + hosts = { + (azurerm_linux_virtual_machine.nfs_vm.name) = { + ansible_ssh_common_args = "-o ProxyCommand='../proxycommand.py %h %p'" + ansible_user = "hubadmin" + ansible_ssh_private_key_file = "../ssh-key" + } + } + "vars" = { + disk_name = (azurerm_managed_disk.nfs_data_disk_1.name) + disk_lun = (azurerm_virtual_machine_data_disk_attachment.nfs_data_disk_1.lun) + } + } + } +} + +resource "local_file" "ansible_hosts_file" { + content = yamlencode(local.ansible_hosts) + filename = "ansible-hosts.yaml" +} + output "kubeconfig" { value = azurerm_kubernetes_cluster.jupyterhub.kube_config_raw } diff --git a/nfs-playbook.yaml b/nfs-playbook.yaml index 44525185a2..15e1f6c452 100644 --- a/nfs-playbook.yaml +++ b/nfs-playbook.yaml @@ -1,5 +1,5 @@ - name: nfs server setup - hosts: all + hosts: nfs_servers connection: ssh become: true handlers: @@ -16,13 +16,13 @@ - name: Setup XFS filesystem: fstype: xfs - dev: /dev/disk/azure/scsi1/lun0 + dev: /dev/disk/azure/scsi1/lun{{ disk_lun }} resizefs: true - name: Mount disk mount: - path: /export/disk1 - src: /dev/disk/azure/scsi1/lun0 + path: /export/{{ disk_name }} + src: /dev/disk/azure/scsi1/lun{{ disk_lun }} state: mounted fstype: xfs opts: inode64,prjquota @@ -32,7 +32,7 @@ state: directory owner: "1000" group: "1000" - path: /export/disk1/homes + path: /export/{{disk_name}}/homes mode: 0700 - name: setup exports file From 564bb6cea29c6f557bec3f52d89cab9c5af9def1 Mon Sep 17 00:00:00 2001 From: YuviPanda Date: Wed, 26 Aug 2020 15:54:02 +0530 Subject: [PATCH 07/20] Use templated disk name in ansible Missed a spot --- nfs-playbook.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nfs-playbook.yaml b/nfs-playbook.yaml index 
15e1f6c452..1c76304388 100644 --- a/nfs-playbook.yaml +++ b/nfs-playbook.yaml @@ -41,4 +41,4 @@ copy: dest: /etc/exports content: > - /export/disk1 10.0.0.0/8(all_squash,anonuid=1000,anongid=1000,no_subtree_check,rw,sync) + /export/{{disk_name}} 10.0.0.0/8(all_squash,anonuid=1000,anongid=1000,no_subtree_check,rw,sync) From 245fbb4257e0221cfb5173b9da143bee8a008e53 Mon Sep 17 00:00:00 2001 From: YuviPanda Date: Wed, 26 Aug 2020 15:59:16 +0530 Subject: [PATCH 08/20] Setup ssh key for AKS nodes too Very, *very* useful when shit goes bad --- main.tf | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/main.tf b/main.tf index a838b5fb50..ef26933e73 100644 --- a/main.tf +++ b/main.tf @@ -33,6 +33,12 @@ resource "azurerm_kubernetes_cluster" "jupyterhub" { resource_group_name = azurerm_resource_group.jupyterhub.name dns_prefix = "${var.prefix}-cluster" + linux_profile { + admin_username = "hubadmin" + ssh_key { + key_data = file("${path.module}/ssh-key.pub") + } + } # Core node-pool default_node_pool { name = "core" From 553f3eed759fcff35207bd72f539546e594bf776 Mon Sep 17 00:00:00 2001 From: YuviPanda Date: Mon, 31 Aug 2020 21:29:20 +0530 Subject: [PATCH 09/20] Add azurerm state storage backend I lost the previous terraform state because it was just on my laptop... This keeps all state off my laptop, and allows multiple users to do it --- main.tf | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/main.tf b/main.tf index ef26933e73..9f95cee802 100644 --- a/main.tf +++ b/main.tf @@ -4,6 +4,16 @@ provider "azurerm" { features {} } +terraform { + backend "azurerm" { + resource_group_name = "terraform-state" + storage_account_name = "utorontoterraformstate" + container_name = "terraformstate" + key = "prod.terraform.tfstate" + } +} + + provider "local" { version = "1.4.0" } From 8dd694ef416ea24994453c9a1a520743b0969245 Mon Sep 17 00:00:00 2001 From: YuviPanda Date: Mon, 31 Aug 2020 22:31:23 +0530 Subject: [PATCH 10/20] Add proxycommand script + pub key Azure is weird about access to VMs and asks you to do terrible unsecure things (copy a private key? off my machine?! OVER MY DEAD BODY, SIR). 
We hack around it in a cool way that I should really make into a package --- ansible-hosts.yaml | 4 ++-- main.tf | 4 ++-- proxycommand.py | 44 ++++++++++++++++++++++++++++++++++++++++++++ ssh-key.pub | 1 + 4 files changed, 49 insertions(+), 4 deletions(-) create mode 100755 proxycommand.py create mode 120000 ssh-key.pub diff --git a/ansible-hosts.yaml b/ansible-hosts.yaml index d950ea1c53..c4d38f01fc 100755 --- a/ansible-hosts.yaml +++ b/ansible-hosts.yaml @@ -1,8 +1,8 @@ "nfs_servers": "hosts": "jupyterhub-2i2c-nfs-vm": - "ansible_ssh_common_args": "-o ProxyCommand='../proxycommand.py %h %p'" - "ansible_ssh_private_key_file": "../ssh-key" + "ansible_ssh_common_args": "-o ProxyCommand='./proxycommand.py %h %p'" + "ansible_ssh_private_key_file": "../secrets/ssh-key.unsafe" "ansible_user": "hubadmin" "vars": "disk_lun": 0 diff --git a/main.tf b/main.tf index 9f95cee802..797753295e 100644 --- a/main.tf +++ b/main.tf @@ -210,9 +210,9 @@ locals { "nfs_servers" = { hosts = { (azurerm_linux_virtual_machine.nfs_vm.name) = { - ansible_ssh_common_args = "-o ProxyCommand='../proxycommand.py %h %p'" + ansible_ssh_common_args = "-o ProxyCommand='./proxycommand.py %h %p'" ansible_user = "hubadmin" - ansible_ssh_private_key_file = "../ssh-key" + ansible_ssh_private_key_file = "../secrets/ssh-key.unsafe" } } "vars" = { diff --git a/proxycommand.py b/proxycommand.py new file mode 100755 index 0000000000..ffa8849b84 --- /dev/null +++ b/proxycommand.py @@ -0,0 +1,44 @@ +#!/usr/bin/env python3 +import sys +import subprocess +import time + + +POD_NAME = "ssh-proxycommand-pod" +POD_IMAGE = "alpine/socat" +HOST = sys.argv[1] +PORT = sys.argv[2] + +# Just 'sleep infinity' doesn't handle signals properly +SCRIPT = "trap 'trap - INT; kill \"$!\"; exit' INT; exec sleep infinity & wait $!" 
+ +log = open('log', 'w') + +def delete_pod(): + try: + subprocess.check_output([ + 'kubectl', 'delete', 'pod', POD_NAME, '--wait', '--now' + ]) + except subprocess.CalledProcessError as e: + print(e.stdout) +delete_pod() + +try: + subprocess.check_call([ + 'kubectl', 'run', '--image', POD_IMAGE, '--command', '--wait', + POD_NAME, '--', "/bin/sh", "-c", SCRIPT + ]) + + + time.sleep(2) + + print("starting", file=log, flush=True) + subprocess.check_call([ + 'kubectl', 'exec', '-i', POD_NAME, '--', + 'socat', '-', f"tcp:{HOST}:{PORT}" + ]) + print("ending", file=log, flush=True) +finally: + print("deleting", file=log, flush=True) + delete_pod() + diff --git a/ssh-key.pub b/ssh-key.pub new file mode 120000 index 0000000000..da373ad07a --- /dev/null +++ b/ssh-key.pub @@ -0,0 +1 @@ +../secrets/ssh-key.pub \ No newline at end of file From aa1063b00967a7148dcfac9b55bf5d67a76f5fb0 Mon Sep 17 00:00:00 2001 From: YuviPanda Date: Sat, 12 Sep 2020 02:32:16 +0530 Subject: [PATCH 11/20] Cleanup NFS setup - Use a bigger VM size - Use PremiumLRS - we can downgrade later if necessary --- main.tf | 9 +++++---- variables.tf | 12 ++++++++++-- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/main.tf b/main.tf index 797753295e..b51decea3f 100644 --- a/main.tf +++ b/main.tf @@ -53,6 +53,7 @@ resource "azurerm_kubernetes_cluster" "jupyterhub" { default_node_pool { name = "core" node_count = 1 + # Unfortunately, changing anything about VM type / size recreates *whole cluster vm_size = var.core_vm_size os_disk_size_gb = 100 enable_auto_scaling = true @@ -155,7 +156,7 @@ resource "azurerm_linux_virtual_machine" "nfs_vm" { name = "${var.prefix}-nfs-vm" resource_group_name = azurerm_resource_group.jupyterhub.name location = azurerm_resource_group.jupyterhub.location - size = "Standard_F2" + size = var.nfs_vm_size admin_username = "hubadmin" network_interface_ids = [ @@ -168,9 +169,9 @@ resource "azurerm_linux_virtual_machine" "nfs_vm" { } os_disk { - caching = "ReadWrite" + caching = "None" storage_account_type = "StandardSSD_LRS" - disk_size_gb = 250 + disk_size_gb = 100 } source_image_reference { @@ -185,7 +186,7 @@ resource "azurerm_managed_disk" "nfs_data_disk_1" { name = "${var.prefix}-nfs-data-disk-1" location = azurerm_resource_group.jupyterhub.location resource_group_name = azurerm_resource_group.jupyterhub.name - storage_account_type = "StandardSSD_LRS" + storage_account_type = "Premium_LRS" create_option = "Empty" disk_size_gb = "200" diff --git a/variables.tf b/variables.tf index 1f2a56571e..c4db2e6459 100644 --- a/variables.tf +++ b/variables.tf @@ -16,8 +16,16 @@ variable "user_vm_size" { variable "core_vm_size" { type = string - # 16GB of RAM, 2 cores, ssd base disk - default = "Standard_E2s_v3" + # 8GB of RAM, 4 CPU cores, ssd base disk + # UNFORTUNATELY changing this triggers a k8s cluster recreation + # BOOOO + default = "Standard_F4s_v2" +} + +variable "nfs_vm_size" { + type = string + # 8GB of RAM, 4 CPU cores, ssd base disk + default = "Standard_F4s_v2" } variable "ssh_pub_key" { From 1ceabc72226195f6b2cb9434f1fdb35282e6560d Mon Sep 17 00:00:00 2001 From: YuviPanda Date: Sun, 13 Sep 2020 21:39:11 +0530 Subject: [PATCH 12/20] Bump up resources for core node pool Prometheus just *eats* resources --- variables.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/variables.tf b/variables.tf index c4db2e6459..35aa76acaf 100644 --- a/variables.tf +++ b/variables.tf @@ -19,7 +19,7 @@ variable "core_vm_size" { # 8GB of RAM, 4 CPU cores, ssd base disk # UNFORTUNATELY changing 
this triggers a k8s cluster recreation # BOOOO - default = "Standard_F4s_v2" + default = "Standard_E4s_v3" } variable "nfs_vm_size" { From c6df0686dc72f980be3d1197ae33b2a824262e23 Mon Sep 17 00:00:00 2001 From: YuviPanda Date: Mon, 14 Sep 2020 14:09:35 +0530 Subject: [PATCH 13/20] Switch to azure container registry - Faster pulls, we use the 'premium' tier. We might not actually need to, but we can downgrade later. - Service Principals are kinda weird here, so let's just use the admin username / password. This is dangerous and stupif. Also needs a new feature in hubploy to really work --- main.tf | 23 +++++++++++++++++++++++ variables.tf | 6 ++++++ 2 files changed, 29 insertions(+) diff --git a/main.tf b/main.tf index b51decea3f..f9d13d12d9 100644 --- a/main.tf +++ b/main.tf @@ -102,6 +102,16 @@ resource "azurerm_kubernetes_cluster_node_pool" "user_pool" { } } +# AZure container registry + +resource "azurerm_container_registry" "container_registry" { + # meh, only alphanumberic chars. No separators. BE CONSISTENT, AZURE + name = var.global_container_registry_name + resource_group_name = azurerm_resource_group.jupyterhub.name + location = azurerm_resource_group.jupyterhub.location + sku = "premium" + admin_enabled = true +} # NFS VM resource "azurerm_network_interface" "nfs_vm" { name = "${var.prefix}-nfs-vm-inet" @@ -207,6 +217,15 @@ resource "azurerm_virtual_machine_data_disk_attachment" "nfs_data_disk_1" { } locals { + registry_creds = { + "singleuser" = { + "imagePullSecret" = { + "username": azurerm_container_registry.container_registry.admin_username, + "password": azurerm_container_registry.container_registry.admin_password, + "registry": "https://${azurerm_container_registry.container_registry.login_server}" + } + } + } ansible_hosts = { "nfs_servers" = { hosts = { @@ -232,3 +251,7 @@ resource "local_file" "ansible_hosts_file" { output "kubeconfig" { value = azurerm_kubernetes_cluster.jupyterhub.kube_config_raw } + +output "registry_creds_config" { + value = jsonencode(local.registry_creds) +} \ No newline at end of file diff --git a/variables.tf b/variables.tf index 35aa76acaf..768f5f52a7 100644 --- a/variables.tf +++ b/variables.tf @@ -28,6 +28,12 @@ variable "nfs_vm_size" { default = "Standard_F4s_v2" } +variable "global_container_registry_name" { + type = string + # This needs to be globally unique + default = "containerregistry2i2cutoronto" +} + variable "ssh_pub_key" { type = string } From b2c9c3d31859054fcb29a6a9f236eaedb9bc395e Mon Sep 17 00:00:00 2001 From: YuviPanda Date: Mon, 14 Sep 2020 14:19:07 +0530 Subject: [PATCH 14/20] Upgrade version of kubernetes Some of the failures we were seeing - of pod spawns getting 'stuck', might be bugs in AKS versions. See https://github.com/jupyterhub/kubespawner/pull/433. Hopefully upgrading fixes it? 
--- main.tf | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/main.tf b/main.tf index f9d13d12d9..ede4bd4d6c 100644 --- a/main.tf +++ b/main.tf @@ -42,6 +42,7 @@ resource "azurerm_kubernetes_cluster" "jupyterhub" { location = azurerm_resource_group.jupyterhub.location resource_group_name = azurerm_resource_group.jupyterhub.name dns_prefix = "${var.prefix}-cluster" + kubernetes_version = "1.18.8" linux_profile { admin_username = "hubadmin" @@ -63,6 +64,8 @@ resource "azurerm_kubernetes_cluster" "jupyterhub" { node_labels = { "hub.jupyter.org/pool-name" = "core-pool" } + + orchestrator_version = "1.18.8" } identity { @@ -90,6 +93,8 @@ resource "azurerm_kubernetes_cluster_node_pool" "user_pool" { os_disk_size_gb = 200 node_taints = ["hub.jupyter.org_dedicated=user:NoSchedule"] vnet_subnet_id = azurerm_subnet.node_subnet.id + + orchestrator_version = "1.18.8" node_labels = { "hub.jupyter.org/pool-name" = "user-alpha-pool" } From 326855c81a8f9eb5402e93e574627b6574fd664b Mon Sep 17 00:00:00 2001 From: YuviPanda Date: Mon, 14 Sep 2020 14:21:59 +0530 Subject: [PATCH 15/20] Tell autoscaler to get rid of unready nodes fast AKS nodes seem to be unready a lot --- main.tf | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/main.tf b/main.tf index ede4bd4d6c..f371c586e7 100644 --- a/main.tf +++ b/main.tf @@ -68,6 +68,11 @@ resource "azurerm_kubernetes_cluster" "jupyterhub" { orchestrator_version = "1.18.8" } + auto_scaler_profile { + # Let's get rid of unready nodes ASAP + # Azure nodes love being unready + scale_down_unready = "1m" + } identity { type = "SystemAssigned" } From 7db56de297fd73e9b7d2cdf77ad12765b0725bc0 Mon Sep 17 00:00:00 2001 From: YuviPanda Date: Wed, 6 Jan 2021 18:23:49 +0530 Subject: [PATCH 16/20] Don't specify node_count explicitly Terraform will try to downscale the cluster if we set this. If unset, it'll let it be. --- main.tf | 1 - 1 file changed, 1 deletion(-) diff --git a/main.tf b/main.tf index f371c586e7..4a382a3eff 100644 --- a/main.tf +++ b/main.tf @@ -93,7 +93,6 @@ resource "azurerm_kubernetes_cluster_node_pool" "user_pool" { name = "user" kubernetes_cluster_id = azurerm_kubernetes_cluster.jupyterhub.id vm_size = var.user_vm_size - node_count = 1 enable_auto_scaling = true os_disk_size_gb = 200 node_taints = ["hub.jupyter.org_dedicated=user:NoSchedule"] From c6acd32789ae60f0d61d91c66f3b13bf2fff0e58 Mon Sep 17 00:00:00 2001 From: YuviPanda Date: Wed, 6 Jan 2021 18:24:12 +0530 Subject: [PATCH 17/20] Increase NFS disk size to 1T 200G is probably too small for the classes we are planning on. Ref #51 --- main.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.tf b/main.tf index 4a382a3eff..bfe1c80c2e 100644 --- a/main.tf +++ b/main.tf @@ -207,7 +207,7 @@ resource "azurerm_managed_disk" "nfs_data_disk_1" { resource_group_name = azurerm_resource_group.jupyterhub.name storage_account_type = "Premium_LRS" create_option = "Empty" - disk_size_gb = "200" + disk_size_gb = "1024" lifecycle { # Terraform plz never destroy data thx From 42cc84592f447f4bc6e6610ab88439249347283c Mon Sep 17 00:00:00 2001 From: YuviPanda Date: Wed, 6 Jan 2021 18:45:16 +0530 Subject: [PATCH 18/20] Collect prometheus stats from NFS server as well The NFS server is our biggest single point of failure, and we should keep a good eye on it. 
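The scrape config isn't in this repo yet; a rough sketch of the Prometheus
job that would pick up the node exporter installed by the playbook (the VM IP
is a placeholder, port 9100 matches the new NSG rule):

    scrape_configs:
      - job_name: nfs-server
        static_configs:
          - targets: ["10.1.0.4:9100"]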
Ref #51 --- main.tf | 13 +++++++++++++ nfs-playbook.yaml | 5 +++++ 2 files changed, 18 insertions(+) diff --git a/main.tf b/main.tf index bfe1c80c2e..c01e91b48d 100644 --- a/main.tf +++ b/main.tf @@ -164,6 +164,19 @@ resource "azurerm_network_security_group" "nfs_vm" { destination_port_range = "2049" destination_address_prefix = azurerm_network_interface.nfs_vm.private_ip_address } + # + # Prometheus from internal network + security_rule { + access = "Allow" + direction = "Inbound" + name = "prometheus" + priority = 102 + protocol = "Tcp" + source_port_range = "*" + source_address_prefix = "*" + destination_port_range = "9100" + destination_address_prefix = azurerm_network_interface.nfs_vm.private_ip_address + } } resource "azurerm_network_interface_security_group_association" "main" { diff --git a/nfs-playbook.yaml b/nfs-playbook.yaml index 1c76304388..a5a9029662 100644 --- a/nfs-playbook.yaml +++ b/nfs-playbook.yaml @@ -42,3 +42,8 @@ dest: /etc/exports content: > /export/{{disk_name}} 10.0.0.0/8(all_squash,anonuid=1000,anongid=1000,no_subtree_check,rw,sync) + + - name: Install prometheus-node-exporter + apt: + pkg: + - prometheus-node-exporter From f332450ee2f5b3c4be0c8ad41810b7724369ae7c Mon Sep 17 00:00:00 2001 From: GeorgianaElena Date: Wed, 17 Feb 2021 13:15:57 +0200 Subject: [PATCH 19/20] Bump hub version to 0.11 --- main.tf | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/main.tf b/main.tf index c01e91b48d..9a82a8c011 100644 --- a/main.tf +++ b/main.tf @@ -240,12 +240,10 @@ resource "azurerm_virtual_machine_data_disk_attachment" "nfs_data_disk_1" { locals { registry_creds = { - "singleuser" = { - "imagePullSecret" = { - "username": azurerm_container_registry.container_registry.admin_username, - "password": azurerm_container_registry.container_registry.admin_password, - "registry": "https://${azurerm_container_registry.container_registry.login_server}" - } + "imagePullSecret" = { + "username": azurerm_container_registry.container_registry.admin_username, + "password": azurerm_container_registry.container_registry.admin_password, + "registry": "https://${azurerm_container_registry.container_registry.login_server}" } } ansible_hosts = { From 854951fbd6365c8900c19b98cae2c471942a02ee Mon Sep 17 00:00:00 2001 From: YuviPanda Date: Tue, 13 Jul 2021 16:57:35 +0530 Subject: [PATCH 20/20] Remove unused ssh public key --- ssh-key.pub | 1 - 1 file changed, 1 deletion(-) delete mode 120000 ssh-key.pub diff --git a/ssh-key.pub b/ssh-key.pub deleted file mode 120000 index da373ad07a..0000000000 --- a/ssh-key.pub +++ /dev/null @@ -1 +0,0 @@ -../secrets/ssh-key.pub \ No newline at end of file
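For reference, the registry_creds_config output added above decodes into a
values-style block shaped roughly like this (credentials are placeholders;
hubploy is expected to merge it into the hub's helm config):

    imagePullSecret:
      username: "<acr admin username>"
      password: "<acr admin password>"
      registry: https://containerregistry2i2cutoronto.azurecr.io

The flat shape (no singleuser nesting) matches the 0.11 chart layout that the
"Bump hub version to 0.11" patch switches to.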