From 6ad1c811bc7fab11966bf77bdca6087480e65fed Mon Sep 17 00:00:00 2001 From: YuviPanda Date: Tue, 25 Aug 2020 14:17:08 +0530 Subject: [PATCH 01/20] Add initial terraform code This time, I'm going to try make *everything* be fully automated - not just the helm setup. This set of terraform scripts shall be run manually by hand - I don't trust it for automation yet. --- main.tf | 177 +++++++++++++++++++++++++++++++++++++++++++++++++++ test.tfvars | 2 + variables.tf | 25 ++++++++ 3 files changed, 204 insertions(+) create mode 100644 main.tf create mode 100644 test.tfvars create mode 100644 variables.tf diff --git a/main.tf b/main.tf new file mode 100644 index 0000000000..8ec85db751 --- /dev/null +++ b/main.tf @@ -0,0 +1,177 @@ +provider "azurerm" { + # whilst the `version` attribute is optional, we recommend pinning to a given version of the Provider + version = "=2.20.0" + features {} +} + +provider "local" { + version = "1.4.0" +} + +resource "azurerm_resource_group" "jupyterhub" { + name = "${var.prefix}-rg" + location = var.region +} + +resource "azurerm_virtual_network" "jupyterhub" { + name = "${var.prefix}-network" + location = azurerm_resource_group.jupyterhub.location + resource_group_name = azurerm_resource_group.jupyterhub.name + address_space = ["10.0.0.0/8"] +} + +resource "azurerm_subnet" "node_subnet" { + name = "${var.prefix}-node-subnet" + virtual_network_name = azurerm_virtual_network.jupyterhub.name + resource_group_name = azurerm_resource_group.jupyterhub.name + address_prefixes = ["10.1.0.0/16"] +} + +resource "azurerm_kubernetes_cluster" "jupyterhub" { + name = "${var.prefix}-cluster" + location = azurerm_resource_group.jupyterhub.location + resource_group_name = azurerm_resource_group.jupyterhub.name + dns_prefix = "${var.prefix}-cluster" + + # Core node-pool + default_node_pool { + name = "core" + node_count = 1 + vm_size = var.core_vm_size + os_disk_size_gb = 100 + enable_auto_scaling = true + min_count = 1 + max_count = 8 + vnet_subnet_id = azurerm_subnet.node_subnet.id + node_labels = { + "hub.jupyter.org/pool-name" = "core-pool" + } + } + + identity { + type = "SystemAssigned" + } + + network_profile { + # I don't trust Azure CNI + network_plugin = "kubenet" + network_policy = "calico" + } + + tags = { + Environment = "Production" + ManagedBy = "2i2c" + } +} + +resource "azurerm_kubernetes_cluster_node_pool" "user_pool" { + name = "user" + kubernetes_cluster_id = azurerm_kubernetes_cluster.jupyterhub.id + vm_size = var.user_vm_size + node_count = 1 + enable_auto_scaling = true + os_disk_size_gb = 200 + node_taints = ["hub.jupyter.org_dedicated=user:NoSchedule"] + vnet_subnet_id = azurerm_subnet.node_subnet.id + node_labels = { + "hub.jupyter.org/pool-name" = "user-alpha-pool" + } + + min_count = 1 + max_count = 100 + tags = { + Environment = "Production" + ManagedBy = "2i2c" + } +} + +# NFS VM +resource "azurerm_network_interface" "nfs_vm" { + name = "${var.prefix}-nfs-vm-inet" + location = azurerm_resource_group.jupyterhub.location + resource_group_name = azurerm_resource_group.jupyterhub.name + + ip_configuration { + name = "internal" + subnet_id = azurerm_subnet.node_subnet.id + private_ip_address_allocation = "Dynamic" + } +} + +resource "azurerm_network_security_group" "nfs_vm" { + name = "${var.prefix}-nfs-vm-nsg" + location = azurerm_resource_group.jupyterhub.location + resource_group_name = azurerm_resource_group.jupyterhub.name + + # SSH from the world + security_rule { + access = "Allow" + direction = "Inbound" + name = "ssh" + priority = 100 + protocol = 
"Tcp" + source_port_range = "*" + source_address_prefix = "*" + destination_port_range = "22" + destination_address_prefix = "*" + } + + # NFS from internal network + security_rule { + access = "Allow" + direction = "Inbound" + name = "nfs" + priority = 101 + protocol = "Tcp" + source_port_range = "*" + source_address_prefix = "*" + destination_port_range = "2049" + destination_address_prefix = azurerm_network_interface.nfs_vm.private_ip_address + } +} + +resource "azurerm_network_interface_security_group_association" "main" { + network_interface_id = azurerm_network_interface.nfs_vm_pub.id + network_security_group_id = azurerm_network_security_group.nfs_vm.id +} + + +resource "azurerm_linux_virtual_machine" "nfs_vm" { + name = "${var.prefix}-nfs-vm" + resource_group_name = azurerm_resource_group.jupyterhub.name + location = azurerm_resource_group.jupyterhub.location + size = "Standard_F2" + admin_username = "hubadmin" + + network_interface_ids = [ + azurerm_network_interface.nfs_vm.id, + azurerm_network_interface.nfs_vm_pub.id + ] + + admin_ssh_key { + username = "hubadmin" + public_key = file("${path.module}/ssh-key.pub") + } + + os_disk { + caching = "ReadWrite" + storage_account_type = "StandardSSD_LRS" + disk_size_gb = 250 + } + + source_image_reference { + publisher = "Canonical" + offer = "0001-com-ubuntu-server-focal" + sku = "20_04-lts" + version = "latest" + } +} + + +output "kubeconfig" { + value = azurerm_kubernetes_cluster.jupyterhub.kube_config_raw +} + +output "nfs_public_ip" { + value = azurerm_public_ip.nfs_vm.ip_address +} diff --git a/test.tfvars b/test.tfvars new file mode 100644 index 0000000000..05630e7b63 --- /dev/null +++ b/test.tfvars @@ -0,0 +1,2 @@ +prefix = "jupyterhub-2i2c" +ssh_pub_key = "ssh-key.pub" \ No newline at end of file diff --git a/variables.tf b/variables.tf new file mode 100644 index 0000000000..1f2a56571e --- /dev/null +++ b/variables.tf @@ -0,0 +1,25 @@ +variable "prefix" { + type = string +} + +variable "region" { + type = string + # This is in Toronto! + default = "Canada Central" +} + +variable "user_vm_size" { + type = string + # VM with 32G of RAM, 8 cores, and ssd base disk + default = "Standard_E8s_v3" +} + +variable "core_vm_size" { + type = string + # 16GB of RAM, 2 cores, ssd base disk + default = "Standard_E2s_v3" +} + +variable "ssh_pub_key" { + type = string +} From 2bf6fc278c6663c138eb53fccfb10f8cb2712f9c Mon Sep 17 00:00:00 2001 From: YuviPanda Date: Tue, 25 Aug 2020 17:38:47 +0530 Subject: [PATCH 02/20] Add disks to NFS machine - Setup a lifecycle hook to make sure our disk does not get destroyed - Remove the public IP setup. 
We just ProxyCommand via the kubernetes cluster instead --- main.tf | 33 +++++++++++++++++++++++++-------- 1 file changed, 25 insertions(+), 8 deletions(-) diff --git a/main.tf b/main.tf index 8ec85db751..c44c3de30e 100644 --- a/main.tf +++ b/main.tf @@ -113,7 +113,7 @@ resource "azurerm_network_security_group" "nfs_vm" { source_port_range = "*" source_address_prefix = "*" destination_port_range = "22" - destination_address_prefix = "*" + destination_address_prefix = azurerm_network_interface.nfs_vm.private_ip_address } # NFS from internal network @@ -131,11 +131,10 @@ resource "azurerm_network_security_group" "nfs_vm" { } resource "azurerm_network_interface_security_group_association" "main" { - network_interface_id = azurerm_network_interface.nfs_vm_pub.id + network_interface_id = azurerm_network_interface.nfs_vm.id network_security_group_id = azurerm_network_security_group.nfs_vm.id } - resource "azurerm_linux_virtual_machine" "nfs_vm" { name = "${var.prefix}-nfs-vm" resource_group_name = azurerm_resource_group.jupyterhub.name @@ -145,7 +144,6 @@ resource "azurerm_linux_virtual_machine" "nfs_vm" { network_interface_ids = [ azurerm_network_interface.nfs_vm.id, - azurerm_network_interface.nfs_vm_pub.id ] admin_ssh_key { @@ -167,11 +165,30 @@ resource "azurerm_linux_virtual_machine" "nfs_vm" { } } +resource "azurerm_managed_disk" "nfs_data_disk_1" { + name = "${var.prefix}-nfs-data-disk-1" + location = azurerm_resource_group.jupyterhub.location + resource_group_name = azurerm_resource_group.jupyterhub.name + storage_account_type = "StandardSSD_LRS" + create_option = "Empty" + disk_size_gb = "100" -output "kubeconfig" { - value = azurerm_kubernetes_cluster.jupyterhub.kube_config_raw + lifecycle { + # Terraform plz never destroy data thx + prevent_destroy = true + } + tags = { + Environment = "Production" + } +} + +resource "azurerm_virtual_machine_data_disk_attachment" "nfs_data_disk_1" { + virtual_machine_id = azurerm_linux_virtual_machine.nfs_vm.id + managed_disk_id = azurerm_managed_disk.nfs_data_disk_1.id + lun = 0 + caching = "None" } -output "nfs_public_ip" { - value = azurerm_public_ip.nfs_vm.ip_address +output "kubeconfig" { + value = azurerm_kubernetes_cluster.jupyterhub.kube_config_raw } From c445b1e77138fc015a3c539f6ad3ba4a4d44a081 Mon Sep 17 00:00:00 2001 From: YuviPanda Date: Tue, 25 Aug 2020 18:36:49 +0530 Subject: [PATCH 03/20] Setup NFS - Add playbook for NFS server, so we don't hand-setup that - Mount NFS on the user pods! 
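The hub-side mount isn't part of this patch; roughly, the z2jh helm values
would look something like the sketch below (the NFS server IP is a placeholder,
and /export/disk1 is the export the playbook below sets up):

    singleuser:
      storage:
        type: none                  # no per-user PVCs, home comes from NFS
        extraVolumes:
          - name: home
            nfs:
              server: 10.1.0.4      # placeholder: private IP of the NFS VM
              path: /export/disk1
        extraVolumeMounts:
          - name: home
            mountPath: /home/jovyan
            subPath: "{username}"   # kubespawner fills this in per user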
--- nfs-playbook.yaml | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 nfs-playbook.yaml diff --git a/nfs-playbook.yaml b/nfs-playbook.yaml new file mode 100644 index 0000000000..e89aca7d60 --- /dev/null +++ b/nfs-playbook.yaml @@ -0,0 +1,35 @@ +- name: nfs server setup + hosts: all + connection: ssh + become: true + handlers: + - name: re-export NFS Shares + command: + cmd: exportfs -ra + tasks: + - name: Install NFS packages + apt: + pkg: + - nfs-kernel-server + - nfs-common + - xfsprogs + - name: Setup XFS + filesystem: + fstype: xfs + dev: /dev/disk/azure/scsi1/lun0 + + - name: Mount disk + mount: + path: /export/disk1 + src: /dev/disk/azure/scsi1/lun0 + state: mounted + fstype: xfs + opts: inode64,prjquota + + - name: setup exports file + notify: + - re-export NFS Shares + copy: + dest: /etc/exports + content: > + /export/disk1 10.0.0.0/8(all_squash,anonuid=1000,anongid=1000,no_subtree_check,rw,sync) From 0e91e6dbc07dec5af1877da2c0a0d3f455df4a9d Mon Sep 17 00:00:00 2001 From: YuviPanda Date: Tue, 25 Aug 2020 20:27:37 +0530 Subject: [PATCH 04/20] Put home directories in a subdir of the disk I don't like putting home directories straight on a disk. Call it superstition --- nfs-playbook.yaml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/nfs-playbook.yaml b/nfs-playbook.yaml index e89aca7d60..fc8c07bccb 100644 --- a/nfs-playbook.yaml +++ b/nfs-playbook.yaml @@ -26,6 +26,14 @@ fstype: xfs opts: inode64,prjquota + - name: Create home container directory + file: + state: directory + owner: "1000" + group: "1000" + path: /export/disk1/homes + mode: 0700 + - name: setup exports file notify: - re-export NFS Shares From dc35b80390f2c7133af47f4ffe1b2e44c5db69e7 Mon Sep 17 00:00:00 2001 From: YuviPanda Date: Tue, 25 Aug 2020 21:14:01 +0530 Subject: [PATCH 05/20] Resize XFS partition if block size is bigger XFS can be live resized up, not down. Ansible will do this for us now, when we run it. 
--- main.tf | 2 +- nfs-playbook.yaml | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/main.tf b/main.tf index c44c3de30e..bd3f2cbdb7 100644 --- a/main.tf +++ b/main.tf @@ -171,7 +171,7 @@ resource "azurerm_managed_disk" "nfs_data_disk_1" { resource_group_name = azurerm_resource_group.jupyterhub.name storage_account_type = "StandardSSD_LRS" create_option = "Empty" - disk_size_gb = "100" + disk_size_gb = "200" lifecycle { # Terraform plz never destroy data thx diff --git a/nfs-playbook.yaml b/nfs-playbook.yaml index fc8c07bccb..44525185a2 100644 --- a/nfs-playbook.yaml +++ b/nfs-playbook.yaml @@ -17,6 +17,7 @@ filesystem: fstype: xfs dev: /dev/disk/azure/scsi1/lun0 + resizefs: true - name: Mount disk mount: From 838f33b0736ec3f1487b01c05b3bdbcefa961774 Mon Sep 17 00:00:00 2001 From: YuviPanda Date: Wed, 26 Aug 2020 14:03:37 +0530 Subject: [PATCH 06/20] Templatize nfs-playbook & generate values from terraform --- ansible-hosts.yaml | 9 +++++++++ main.tf | 23 +++++++++++++++++++++++ nfs-playbook.yaml | 10 +++++----- 3 files changed, 37 insertions(+), 5 deletions(-) create mode 100755 ansible-hosts.yaml diff --git a/ansible-hosts.yaml b/ansible-hosts.yaml new file mode 100755 index 0000000000..d950ea1c53 --- /dev/null +++ b/ansible-hosts.yaml @@ -0,0 +1,9 @@ +"nfs_servers": + "hosts": + "jupyterhub-2i2c-nfs-vm": + "ansible_ssh_common_args": "-o ProxyCommand='../proxycommand.py %h %p'" + "ansible_ssh_private_key_file": "../ssh-key" + "ansible_user": "hubadmin" + "vars": + "disk_lun": 0 + "disk_name": "jupyterhub-2i2c-nfs-data-disk-1" diff --git a/main.tf b/main.tf index bd3f2cbdb7..a838b5fb50 100644 --- a/main.tf +++ b/main.tf @@ -189,6 +189,29 @@ resource "azurerm_virtual_machine_data_disk_attachment" "nfs_data_disk_1" { caching = "None" } +locals { + ansible_hosts = { + "nfs_servers" = { + hosts = { + (azurerm_linux_virtual_machine.nfs_vm.name) = { + ansible_ssh_common_args = "-o ProxyCommand='../proxycommand.py %h %p'" + ansible_user = "hubadmin" + ansible_ssh_private_key_file = "../ssh-key" + } + } + "vars" = { + disk_name = (azurerm_managed_disk.nfs_data_disk_1.name) + disk_lun = (azurerm_virtual_machine_data_disk_attachment.nfs_data_disk_1.lun) + } + } + } +} + +resource "local_file" "ansible_hosts_file" { + content = yamlencode(local.ansible_hosts) + filename = "ansible-hosts.yaml" +} + output "kubeconfig" { value = azurerm_kubernetes_cluster.jupyterhub.kube_config_raw } diff --git a/nfs-playbook.yaml b/nfs-playbook.yaml index 44525185a2..15e1f6c452 100644 --- a/nfs-playbook.yaml +++ b/nfs-playbook.yaml @@ -1,5 +1,5 @@ - name: nfs server setup - hosts: all + hosts: nfs_servers connection: ssh become: true handlers: @@ -16,13 +16,13 @@ - name: Setup XFS filesystem: fstype: xfs - dev: /dev/disk/azure/scsi1/lun0 + dev: /dev/disk/azure/scsi1/lun{{ disk_lun }} resizefs: true - name: Mount disk mount: - path: /export/disk1 - src: /dev/disk/azure/scsi1/lun0 + path: /export/{{ disk_name }} + src: /dev/disk/azure/scsi1/lun{{ disk_lun }} state: mounted fstype: xfs opts: inode64,prjquota @@ -32,7 +32,7 @@ state: directory owner: "1000" group: "1000" - path: /export/disk1/homes + path: /export/{{disk_name}}/homes mode: 0700 - name: setup exports file From 564bb6cea29c6f557bec3f52d89cab9c5af9def1 Mon Sep 17 00:00:00 2001 From: YuviPanda Date: Wed, 26 Aug 2020 15:54:02 +0530 Subject: [PATCH 07/20] Use templated disk name in ansible Missed a spot --- nfs-playbook.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nfs-playbook.yaml b/nfs-playbook.yaml index 
15e1f6c452..1c76304388 100644 --- a/nfs-playbook.yaml +++ b/nfs-playbook.yaml @@ -41,4 +41,4 @@ copy: dest: /etc/exports content: > - /export/disk1 10.0.0.0/8(all_squash,anonuid=1000,anongid=1000,no_subtree_check,rw,sync) + /export/{{disk_name}} 10.0.0.0/8(all_squash,anonuid=1000,anongid=1000,no_subtree_check,rw,sync) From 245fbb4257e0221cfb5173b9da143bee8a008e53 Mon Sep 17 00:00:00 2001 From: YuviPanda Date: Wed, 26 Aug 2020 15:59:16 +0530 Subject: [PATCH 08/20] Setup ssh key for AKS nodes too Very, *very* useful when shit goes bad --- main.tf | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/main.tf b/main.tf index a838b5fb50..ef26933e73 100644 --- a/main.tf +++ b/main.tf @@ -33,6 +33,12 @@ resource "azurerm_kubernetes_cluster" "jupyterhub" { resource_group_name = azurerm_resource_group.jupyterhub.name dns_prefix = "${var.prefix}-cluster" + linux_profile { + admin_username = "hubadmin" + ssh_key { + key_data = file("${path.module}/ssh-key.pub") + } + } # Core node-pool default_node_pool { name = "core" From 553f3eed759fcff35207bd72f539546e594bf776 Mon Sep 17 00:00:00 2001 From: YuviPanda Date: Mon, 31 Aug 2020 21:29:20 +0530 Subject: [PATCH 09/20] Add azurerm state storage backend I lost the previous terraform state because it was just on my laptop... This keeps all state off my laptop, and allows multiple users to do it --- main.tf | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/main.tf b/main.tf index ef26933e73..9f95cee802 100644 --- a/main.tf +++ b/main.tf @@ -4,6 +4,16 @@ provider "azurerm" { features {} } +terraform { + backend "azurerm" { + resource_group_name = "terraform-state" + storage_account_name = "utorontoterraformstate" + container_name = "terraformstate" + key = "prod.terraform.tfstate" + } +} + + provider "local" { version = "1.4.0" } From 8dd694ef416ea24994453c9a1a520743b0969245 Mon Sep 17 00:00:00 2001 From: YuviPanda Date: Mon, 31 Aug 2020 22:31:23 +0530 Subject: [PATCH 10/20] Add proxycommand script + pub key Azure is weird about access to VMs and asks you to do terrible unsecure things (copy a private key? off my machine?! OVER MY DEAD BODY, SIR). 
We hack around it in a cool way that I should really make into a package --- ansible-hosts.yaml | 4 ++-- main.tf | 4 ++-- proxycommand.py | 44 ++++++++++++++++++++++++++++++++++++++++++++ ssh-key.pub | 1 + 4 files changed, 49 insertions(+), 4 deletions(-) create mode 100755 proxycommand.py create mode 120000 ssh-key.pub diff --git a/ansible-hosts.yaml b/ansible-hosts.yaml index d950ea1c53..c4d38f01fc 100755 --- a/ansible-hosts.yaml +++ b/ansible-hosts.yaml @@ -1,8 +1,8 @@ "nfs_servers": "hosts": "jupyterhub-2i2c-nfs-vm": - "ansible_ssh_common_args": "-o ProxyCommand='../proxycommand.py %h %p'" - "ansible_ssh_private_key_file": "../ssh-key" + "ansible_ssh_common_args": "-o ProxyCommand='./proxycommand.py %h %p'" + "ansible_ssh_private_key_file": "../secrets/ssh-key.unsafe" "ansible_user": "hubadmin" "vars": "disk_lun": 0 diff --git a/main.tf b/main.tf index 9f95cee802..797753295e 100644 --- a/main.tf +++ b/main.tf @@ -210,9 +210,9 @@ locals { "nfs_servers" = { hosts = { (azurerm_linux_virtual_machine.nfs_vm.name) = { - ansible_ssh_common_args = "-o ProxyCommand='../proxycommand.py %h %p'" + ansible_ssh_common_args = "-o ProxyCommand='./proxycommand.py %h %p'" ansible_user = "hubadmin" - ansible_ssh_private_key_file = "../ssh-key" + ansible_ssh_private_key_file = "../secrets/ssh-key.unsafe" } } "vars" = { diff --git a/proxycommand.py b/proxycommand.py new file mode 100755 index 0000000000..ffa8849b84 --- /dev/null +++ b/proxycommand.py @@ -0,0 +1,44 @@ +#!/usr/bin/env python3 +import sys +import subprocess +import time + + +POD_NAME = "ssh-proxycommand-pod" +POD_IMAGE = "alpine/socat" +HOST = sys.argv[1] +PORT = sys.argv[2] + +# Just 'sleep infinity' doesn't handle signals properly +SCRIPT = "trap 'trap - INT; kill \"$!\"; exit' INT; exec sleep infinity & wait $!" 
+ +log = open('log', 'w') + +def delete_pod(): + try: + subprocess.check_output([ + 'kubectl', 'delete', 'pod', POD_NAME, '--wait', '--now' + ]) + except subprocess.CalledProcessError as e: + print(e.stdout) +delete_pod() + +try: + subprocess.check_call([ + 'kubectl', 'run', '--image', POD_IMAGE, '--command', '--wait', + POD_NAME, '--', "/bin/sh", "-c", SCRIPT + ]) + + + time.sleep(2) + + print("starting", file=log, flush=True) + subprocess.check_call([ + 'kubectl', 'exec', '-i', POD_NAME, '--', + 'socat', '-', f"tcp:{HOST}:{PORT}" + ]) + print("ending", file=log, flush=True) +finally: + print("deleting", file=log, flush=True) + delete_pod() + diff --git a/ssh-key.pub b/ssh-key.pub new file mode 120000 index 0000000000..da373ad07a --- /dev/null +++ b/ssh-key.pub @@ -0,0 +1 @@ +../secrets/ssh-key.pub \ No newline at end of file From aa1063b00967a7148dcfac9b55bf5d67a76f5fb0 Mon Sep 17 00:00:00 2001 From: YuviPanda Date: Sat, 12 Sep 2020 02:32:16 +0530 Subject: [PATCH 11/20] Cleanup NFS setup - Use a bigger VM size - Use PremiumLRS - we can downgrade later if necessary --- main.tf | 9 +++++---- variables.tf | 12 ++++++++++-- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/main.tf b/main.tf index 797753295e..b51decea3f 100644 --- a/main.tf +++ b/main.tf @@ -53,6 +53,7 @@ resource "azurerm_kubernetes_cluster" "jupyterhub" { default_node_pool { name = "core" node_count = 1 + # Unfortunately, changing anything about VM type / size recreates *whole cluster vm_size = var.core_vm_size os_disk_size_gb = 100 enable_auto_scaling = true @@ -155,7 +156,7 @@ resource "azurerm_linux_virtual_machine" "nfs_vm" { name = "${var.prefix}-nfs-vm" resource_group_name = azurerm_resource_group.jupyterhub.name location = azurerm_resource_group.jupyterhub.location - size = "Standard_F2" + size = var.nfs_vm_size admin_username = "hubadmin" network_interface_ids = [ @@ -168,9 +169,9 @@ resource "azurerm_linux_virtual_machine" "nfs_vm" { } os_disk { - caching = "ReadWrite" + caching = "None" storage_account_type = "StandardSSD_LRS" - disk_size_gb = 250 + disk_size_gb = 100 } source_image_reference { @@ -185,7 +186,7 @@ resource "azurerm_managed_disk" "nfs_data_disk_1" { name = "${var.prefix}-nfs-data-disk-1" location = azurerm_resource_group.jupyterhub.location resource_group_name = azurerm_resource_group.jupyterhub.name - storage_account_type = "StandardSSD_LRS" + storage_account_type = "Premium_LRS" create_option = "Empty" disk_size_gb = "200" diff --git a/variables.tf b/variables.tf index 1f2a56571e..c4db2e6459 100644 --- a/variables.tf +++ b/variables.tf @@ -16,8 +16,16 @@ variable "user_vm_size" { variable "core_vm_size" { type = string - # 16GB of RAM, 2 cores, ssd base disk - default = "Standard_E2s_v3" + # 8GB of RAM, 4 CPU cores, ssd base disk + # UNFORTUNATELY changing this triggers a k8s cluster recreation + # BOOOO + default = "Standard_F4s_v2" +} + +variable "nfs_vm_size" { + type = string + # 8GB of RAM, 4 CPU cores, ssd base disk + default = "Standard_F4s_v2" } variable "ssh_pub_key" { From 1ceabc72226195f6b2cb9434f1fdb35282e6560d Mon Sep 17 00:00:00 2001 From: YuviPanda Date: Sun, 13 Sep 2020 21:39:11 +0530 Subject: [PATCH 12/20] Bump up resources for core node pool Prometheus just *eats* resources --- variables.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/variables.tf b/variables.tf index c4db2e6459..35aa76acaf 100644 --- a/variables.tf +++ b/variables.tf @@ -19,7 +19,7 @@ variable "core_vm_size" { # 8GB of RAM, 4 CPU cores, ssd base disk # UNFORTUNATELY changing 
this triggers a k8s cluster recreation # BOOOO - default = "Standard_F4s_v2" + default = "Standard_E4s_v3" } variable "nfs_vm_size" { From c6df0686dc72f980be3d1197ae33b2a824262e23 Mon Sep 17 00:00:00 2001 From: YuviPanda Date: Mon, 14 Sep 2020 14:09:35 +0530 Subject: [PATCH 13/20] Switch to azure container registry - Faster pulls, we use the 'premium' tier. We might not actually need to, but we can downgrade later. - Service Principals are kinda weird here, so let's just use the admin username / password. This is dangerous and stupif. Also needs a new feature in hubploy to really work --- main.tf | 23 +++++++++++++++++++++++ variables.tf | 6 ++++++ 2 files changed, 29 insertions(+) diff --git a/main.tf b/main.tf index b51decea3f..f9d13d12d9 100644 --- a/main.tf +++ b/main.tf @@ -102,6 +102,16 @@ resource "azurerm_kubernetes_cluster_node_pool" "user_pool" { } } +# AZure container registry + +resource "azurerm_container_registry" "container_registry" { + # meh, only alphanumberic chars. No separators. BE CONSISTENT, AZURE + name = var.global_container_registry_name + resource_group_name = azurerm_resource_group.jupyterhub.name + location = azurerm_resource_group.jupyterhub.location + sku = "premium" + admin_enabled = true +} # NFS VM resource "azurerm_network_interface" "nfs_vm" { name = "${var.prefix}-nfs-vm-inet" @@ -207,6 +217,15 @@ resource "azurerm_virtual_machine_data_disk_attachment" "nfs_data_disk_1" { } locals { + registry_creds = { + "singleuser" = { + "imagePullSecret" = { + "username": azurerm_container_registry.container_registry.admin_username, + "password": azurerm_container_registry.container_registry.admin_password, + "registry": "https://${azurerm_container_registry.container_registry.login_server}" + } + } + } ansible_hosts = { "nfs_servers" = { hosts = { @@ -232,3 +251,7 @@ resource "local_file" "ansible_hosts_file" { output "kubeconfig" { value = azurerm_kubernetes_cluster.jupyterhub.kube_config_raw } + +output "registry_creds_config" { + value = jsonencode(local.registry_creds) +} \ No newline at end of file diff --git a/variables.tf b/variables.tf index 35aa76acaf..768f5f52a7 100644 --- a/variables.tf +++ b/variables.tf @@ -28,6 +28,12 @@ variable "nfs_vm_size" { default = "Standard_F4s_v2" } +variable "global_container_registry_name" { + type = string + # This needs to be globally unique + default = "containerregistry2i2cutoronto" +} + variable "ssh_pub_key" { type = string } From b2c9c3d31859054fcb29a6a9f236eaedb9bc395e Mon Sep 17 00:00:00 2001 From: YuviPanda Date: Mon, 14 Sep 2020 14:19:07 +0530 Subject: [PATCH 14/20] Upgrade version of kubernetes Some of the failures we were seeing - of pod spawns getting 'stuck', might be bugs in AKS versions. See https://github.com/jupyterhub/kubespawner/pull/433. Hopefully upgrading fixes it? 
--- main.tf | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/main.tf b/main.tf index f9d13d12d9..ede4bd4d6c 100644 --- a/main.tf +++ b/main.tf @@ -42,6 +42,7 @@ resource "azurerm_kubernetes_cluster" "jupyterhub" { location = azurerm_resource_group.jupyterhub.location resource_group_name = azurerm_resource_group.jupyterhub.name dns_prefix = "${var.prefix}-cluster" + kubernetes_version = "1.18.8" linux_profile { admin_username = "hubadmin" @@ -63,6 +64,8 @@ resource "azurerm_kubernetes_cluster" "jupyterhub" { node_labels = { "hub.jupyter.org/pool-name" = "core-pool" } + + orchestrator_version = "1.18.8" } identity { @@ -90,6 +93,8 @@ resource "azurerm_kubernetes_cluster_node_pool" "user_pool" { os_disk_size_gb = 200 node_taints = ["hub.jupyter.org_dedicated=user:NoSchedule"] vnet_subnet_id = azurerm_subnet.node_subnet.id + + orchestrator_version = "1.18.8" node_labels = { "hub.jupyter.org/pool-name" = "user-alpha-pool" } From 326855c81a8f9eb5402e93e574627b6574fd664b Mon Sep 17 00:00:00 2001 From: YuviPanda Date: Mon, 14 Sep 2020 14:21:59 +0530 Subject: [PATCH 15/20] Tell autoscaler to get rid of unready nodes fast AKS nodes seem to be unready a lot --- main.tf | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/main.tf b/main.tf index ede4bd4d6c..f371c586e7 100644 --- a/main.tf +++ b/main.tf @@ -68,6 +68,11 @@ resource "azurerm_kubernetes_cluster" "jupyterhub" { orchestrator_version = "1.18.8" } + auto_scaler_profile { + # Let's get rid of unready nodes ASAP + # Azure nodes love being unready + scale_down_unready = "1m" + } identity { type = "SystemAssigned" } From 7db56de297fd73e9b7d2cdf77ad12765b0725bc0 Mon Sep 17 00:00:00 2001 From: YuviPanda Date: Wed, 6 Jan 2021 18:23:49 +0530 Subject: [PATCH 16/20] Don't specify node_count explicitly Terraform will try to downscale the cluster if we set this. If unset, it'll let it be. --- main.tf | 1 - 1 file changed, 1 deletion(-) diff --git a/main.tf b/main.tf index f371c586e7..4a382a3eff 100644 --- a/main.tf +++ b/main.tf @@ -93,7 +93,6 @@ resource "azurerm_kubernetes_cluster_node_pool" "user_pool" { name = "user" kubernetes_cluster_id = azurerm_kubernetes_cluster.jupyterhub.id vm_size = var.user_vm_size - node_count = 1 enable_auto_scaling = true os_disk_size_gb = 200 node_taints = ["hub.jupyter.org_dedicated=user:NoSchedule"] From c6acd32789ae60f0d61d91c66f3b13bf2fff0e58 Mon Sep 17 00:00:00 2001 From: YuviPanda Date: Wed, 6 Jan 2021 18:24:12 +0530 Subject: [PATCH 17/20] Increase NFS disk size to 1T 200G is probably too small for the classes we are planning on. Ref #51 --- main.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.tf b/main.tf index 4a382a3eff..bfe1c80c2e 100644 --- a/main.tf +++ b/main.tf @@ -207,7 +207,7 @@ resource "azurerm_managed_disk" "nfs_data_disk_1" { resource_group_name = azurerm_resource_group.jupyterhub.name storage_account_type = "Premium_LRS" create_option = "Empty" - disk_size_gb = "200" + disk_size_gb = "1024" lifecycle { # Terraform plz never destroy data thx From 42cc84592f447f4bc6e6610ab88439249347283c Mon Sep 17 00:00:00 2001 From: YuviPanda Date: Wed, 6 Jan 2021 18:45:16 +0530 Subject: [PATCH 18/20] Collect prometheus stats from NFS server as well The NFS server is our biggest single point of failure, and we should keep a good eye on it. 
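The scrape config isn't in this repo yet; a rough sketch of the Prometheus
job that would pick up the node exporter installed by the playbook (the VM IP
is a placeholder, port 9100 matches the new NSG rule):

    scrape_configs:
      - job_name: nfs-server
        static_configs:
          - targets: ["10.1.0.4:9100"]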
Ref #51 --- main.tf | 13 +++++++++++++ nfs-playbook.yaml | 5 +++++ 2 files changed, 18 insertions(+) diff --git a/main.tf b/main.tf index bfe1c80c2e..c01e91b48d 100644 --- a/main.tf +++ b/main.tf @@ -164,6 +164,19 @@ resource "azurerm_network_security_group" "nfs_vm" { destination_port_range = "2049" destination_address_prefix = azurerm_network_interface.nfs_vm.private_ip_address } + # + # Prometheus from internal network + security_rule { + access = "Allow" + direction = "Inbound" + name = "prometheus" + priority = 102 + protocol = "Tcp" + source_port_range = "*" + source_address_prefix = "*" + destination_port_range = "9100" + destination_address_prefix = azurerm_network_interface.nfs_vm.private_ip_address + } } resource "azurerm_network_interface_security_group_association" "main" { diff --git a/nfs-playbook.yaml b/nfs-playbook.yaml index 1c76304388..a5a9029662 100644 --- a/nfs-playbook.yaml +++ b/nfs-playbook.yaml @@ -42,3 +42,8 @@ dest: /etc/exports content: > /export/{{disk_name}} 10.0.0.0/8(all_squash,anonuid=1000,anongid=1000,no_subtree_check,rw,sync) + + - name: Install prometheus-node-exporter + apt: + pkg: + - prometheus-node-exporter From f332450ee2f5b3c4be0c8ad41810b7724369ae7c Mon Sep 17 00:00:00 2001 From: GeorgianaElena Date: Wed, 17 Feb 2021 13:15:57 +0200 Subject: [PATCH 19/20] Bump hub version to 0.11 --- main.tf | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/main.tf b/main.tf index c01e91b48d..9a82a8c011 100644 --- a/main.tf +++ b/main.tf @@ -240,12 +240,10 @@ resource "azurerm_virtual_machine_data_disk_attachment" "nfs_data_disk_1" { locals { registry_creds = { - "singleuser" = { - "imagePullSecret" = { - "username": azurerm_container_registry.container_registry.admin_username, - "password": azurerm_container_registry.container_registry.admin_password, - "registry": "https://${azurerm_container_registry.container_registry.login_server}" - } + "imagePullSecret" = { + "username": azurerm_container_registry.container_registry.admin_username, + "password": azurerm_container_registry.container_registry.admin_password, + "registry": "https://${azurerm_container_registry.container_registry.login_server}" } } ansible_hosts = { From 854951fbd6365c8900c19b98cae2c471942a02ee Mon Sep 17 00:00:00 2001 From: YuviPanda Date: Tue, 13 Jul 2021 16:57:35 +0530 Subject: [PATCH 20/20] Remove unused ssh public key --- ssh-key.pub | 1 - 1 file changed, 1 deletion(-) delete mode 120000 ssh-key.pub diff --git a/ssh-key.pub b/ssh-key.pub deleted file mode 120000 index da373ad07a..0000000000 --- a/ssh-key.pub +++ /dev/null @@ -1 +0,0 @@ -../secrets/ssh-key.pub \ No newline at end of file
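For reference, the registry_creds_config output added above decodes into a
values-style block shaped roughly like this (credentials are placeholders;
hubploy is expected to merge it into the hub's helm config):

    imagePullSecret:
      username: "<acr admin username>"
      password: "<acr admin password>"
      registry: https://containerregistry2i2cutoronto.azurecr.io

The flat shape (no singleuser nesting) matches the 0.11 chart layout that the
"Bump hub version to 0.11" patch switches to.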