Merge Azure TF code from utoronto deploy 2i2c-org#513
yuvipanda committed Jul 13, 2021
2 parents fa2af9d + 854951f commit 72e3092
Showing 6 changed files with 420 additions and 0 deletions.
9 changes: 9 additions & 0 deletions terraform/azure/ansible-hosts.yaml
@@ -0,0 +1,9 @@
"nfs_servers":
"hosts":
"jupyterhub-2i2c-nfs-vm":
"ansible_ssh_common_args": "-o ProxyCommand='./proxycommand.py %h %p'"
"ansible_ssh_private_key_file": "../secrets/ssh-key.unsafe"
"ansible_user": "hubadmin"
"vars":
"disk_lun": 0
"disk_name": "jupyterhub-2i2c-nfs-data-disk-1"
277 changes: 277 additions & 0 deletions terraform/azure/main.tf
@@ -0,0 +1,277 @@
provider "azurerm" {
# whilst the `version` attribute is optional, we recommend pinning to a given version of the Provider
version = "=2.20.0"
features {}
}

terraform {
backend "azurerm" {
resource_group_name = "terraform-state"
storage_account_name = "utorontoterraformstate"
container_name = "terraformstate"
key = "prod.terraform.tfstate"
}
}


provider "local" {
version = "1.4.0"
}

resource "azurerm_resource_group" "jupyterhub" {
name = "${var.prefix}-rg"
location = var.region
}

resource "azurerm_virtual_network" "jupyterhub" {
name = "${var.prefix}-network"
location = azurerm_resource_group.jupyterhub.location
resource_group_name = azurerm_resource_group.jupyterhub.name
address_space = ["10.0.0.0/8"]
}

resource "azurerm_subnet" "node_subnet" {
name = "${var.prefix}-node-subnet"
virtual_network_name = azurerm_virtual_network.jupyterhub.name
resource_group_name = azurerm_resource_group.jupyterhub.name
address_prefixes = ["10.1.0.0/16"]
}

resource "azurerm_kubernetes_cluster" "jupyterhub" {
name = "${var.prefix}-cluster"
location = azurerm_resource_group.jupyterhub.location
resource_group_name = azurerm_resource_group.jupyterhub.name
dns_prefix = "${var.prefix}-cluster"
kubernetes_version = "1.18.8"

linux_profile {
admin_username = "hubadmin"
ssh_key {
key_data = file("${path.module}/ssh-key.pub")
}
}
# Core node-pool
default_node_pool {
name = "core"
node_count = 1
# Unfortunately, changing anything about the VM type / size recreates the *whole cluster*
vm_size = var.core_vm_size
os_disk_size_gb = 100
enable_auto_scaling = true
min_count = 1
max_count = 8
vnet_subnet_id = azurerm_subnet.node_subnet.id
node_labels = {
"hub.jupyter.org/pool-name" = "core-pool"
}

orchestrator_version = "1.18.8"
}

auto_scaler_profile {
# Let's get rid of unready nodes ASAP
# Azure nodes love being unready
scale_down_unready = "1m"
}
identity {
type = "SystemAssigned"
}

network_profile {
# I don't trust Azure CNI
network_plugin = "kubenet"
network_policy = "calico"
}

tags = {
Environment = "Production"
ManagedBy = "2i2c"
}
}

resource "azurerm_kubernetes_cluster_node_pool" "user_pool" {
name = "user"
kubernetes_cluster_id = azurerm_kubernetes_cluster.jupyterhub.id
vm_size = var.user_vm_size
enable_auto_scaling = true
os_disk_size_gb = 200
node_taints = ["hub.jupyter.org_dedicated=user:NoSchedule"]
vnet_subnet_id = azurerm_subnet.node_subnet.id

orchestrator_version = "1.18.8"
node_labels = {
"hub.jupyter.org/pool-name" = "user-alpha-pool"
}

min_count = 1
max_count = 100
tags = {
Environment = "Production"
ManagedBy = "2i2c"
}
}

# Azure container registry

resource "azurerm_container_registry" "container_registry" {
# meh, only alphanumeric chars. No separators. BE CONSISTENT, AZURE
name = var.global_container_registry_name
resource_group_name = azurerm_resource_group.jupyterhub.name
location = azurerm_resource_group.jupyterhub.location
sku = "premium"
admin_enabled = true
}
# NFS VM
resource "azurerm_network_interface" "nfs_vm" {
name = "${var.prefix}-nfs-vm-inet"
location = azurerm_resource_group.jupyterhub.location
resource_group_name = azurerm_resource_group.jupyterhub.name

ip_configuration {
name = "internal"
subnet_id = azurerm_subnet.node_subnet.id
private_ip_address_allocation = "Dynamic"
}
}

resource "azurerm_network_security_group" "nfs_vm" {
name = "${var.prefix}-nfs-vm-nsg"
location = azurerm_resource_group.jupyterhub.location
resource_group_name = azurerm_resource_group.jupyterhub.name

# SSH from the world
security_rule {
access = "Allow"
direction = "Inbound"
name = "ssh"
priority = 100
protocol = "Tcp"
source_port_range = "*"
source_address_prefix = "*"
destination_port_range = "22"
destination_address_prefix = azurerm_network_interface.nfs_vm.private_ip_address
}

# NFS from internal network
security_rule {
access = "Allow"
direction = "Inbound"
name = "nfs"
priority = 101
protocol = "Tcp"
source_port_range = "*"
source_address_prefix = "*"
destination_port_range = "2049"
destination_address_prefix = azurerm_network_interface.nfs_vm.private_ip_address
}
#
# Prometheus from internal network
security_rule {
access = "Allow"
direction = "Inbound"
name = "prometheus"
priority = 102
protocol = "Tcp"
source_port_range = "*"
source_address_prefix = "*"
destination_port_range = "9100"
destination_address_prefix = azurerm_network_interface.nfs_vm.private_ip_address
}
}

resource "azurerm_network_interface_security_group_association" "main" {
network_interface_id = azurerm_network_interface.nfs_vm.id
network_security_group_id = azurerm_network_security_group.nfs_vm.id
}

resource "azurerm_linux_virtual_machine" "nfs_vm" {
name = "${var.prefix}-nfs-vm"
resource_group_name = azurerm_resource_group.jupyterhub.name
location = azurerm_resource_group.jupyterhub.location
size = var.nfs_vm_size
admin_username = "hubadmin"

network_interface_ids = [
azurerm_network_interface.nfs_vm.id,
]

admin_ssh_key {
username = "hubadmin"
public_key = file("${path.module}/ssh-key.pub")
}

os_disk {
caching = "None"
storage_account_type = "StandardSSD_LRS"
disk_size_gb = 100
}

source_image_reference {
publisher = "Canonical"
offer = "0001-com-ubuntu-server-focal"
sku = "20_04-lts"
version = "latest"
}
}

resource "azurerm_managed_disk" "nfs_data_disk_1" {
name = "${var.prefix}-nfs-data-disk-1"
location = azurerm_resource_group.jupyterhub.location
resource_group_name = azurerm_resource_group.jupyterhub.name
storage_account_type = "Premium_LRS"
create_option = "Empty"
disk_size_gb = "1024"

lifecycle {
# Terraform plz never destroy data thx
prevent_destroy = true
}
tags = {
Environment = "Production"
}
}

resource "azurerm_virtual_machine_data_disk_attachment" "nfs_data_disk_1" {
virtual_machine_id = azurerm_linux_virtual_machine.nfs_vm.id
managed_disk_id = azurerm_managed_disk.nfs_data_disk_1.id
lun = 0
caching = "None"
}
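# Note: the LUN set on this attachment is what the NFS playbook uses to find
# the device at /dev/disk/azure/scsi1/lun{{ disk_lun }}; it is passed to
# Ansible through local.ansible_hosts below.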

locals {
registry_creds = {
"imagePullSecret" = {
"username": azurerm_container_registry.container_registry.admin_username,
"password": azurerm_container_registry.container_registry.admin_password,
"registry": "https://${azurerm_container_registry.container_registry.login_server}"
}
}
ansible_hosts = {
"nfs_servers" = {
hosts = {
(azurerm_linux_virtual_machine.nfs_vm.name) = {
ansible_ssh_common_args = "-o ProxyCommand='./proxycommand.py %h %p'"
ansible_user = "hubadmin"
ansible_ssh_private_key_file = "../secrets/ssh-key.unsafe"
}
}
"vars" = {
disk_name = (azurerm_managed_disk.nfs_data_disk_1.name)
disk_lun = (azurerm_virtual_machine_data_disk_attachment.nfs_data_disk_1.lun)
}
}
}
}

resource "local_file" "ansible_hosts_file" {
content = yamlencode(local.ansible_hosts)
filename = "ansible-hosts.yaml"
}

output "kubeconfig" {
value = azurerm_kubernetes_cluster.jupyterhub.kube_config_raw
}

output "registry_creds_config" {
value = jsonencode(local.registry_creds)
}
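
The two outputs expose the cluster and registry credentials for the rest of the deployment tooling. A hedged sketch of consuming them locally (assuming Terraform 0.14+ for the -raw flag; the registry-creds.json file name here is just an example):

    terraform output -raw kubeconfig > kubeconfig
    KUBECONFIG=./kubeconfig kubectl get nodes
    terraform output -raw registry_creds_config > registry-creds.json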
49 changes: 49 additions & 0 deletions terraform/azure/nfs-playbook.yaml
@@ -0,0 +1,49 @@
- name: nfs server setup
hosts: nfs_servers
connection: ssh
become: true
handlers:
- name: re-export NFS Shares
command:
cmd: exportfs -ra
tasks:
- name: Install NFS packages
apt:
pkg:
- nfs-kernel-server
- nfs-common
- xfsprogs
- name: Setup XFS
filesystem:
fstype: xfs
dev: /dev/disk/azure/scsi1/lun{{ disk_lun }}
resizefs: true

- name: Mount disk
mount:
path: /export/{{ disk_name }}
src: /dev/disk/azure/scsi1/lun{{ disk_lun }}
state: mounted
fstype: xfs
opts: inode64,prjquota

- name: Create home container directory
file:
state: directory
owner: "1000"
group: "1000"
path: /export/{{disk_name}}/homes
mode: 0700

- name: setup exports file
notify:
- re-export NFS Shares
copy:
dest: /etc/exports
content: >
/export/{{disk_name}} 10.0.0.0/8(all_squash,anonuid=1000,anongid=1000,no_subtree_check,rw,sync)
- name: Install prometheus-node-exporter
apt:
pkg:
- prometheus-node-exporter
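
Once the playbook has run, the export can be sanity-checked with an ad-hoc Ansible command against the generated inventory (a sketch, not part of this commit):

    ansible -i ansible-hosts.yaml nfs_servers -b -m command -a "exportfs -v"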
44 changes: 44 additions & 0 deletions terraform/azure/proxycommand.py
@@ -0,0 +1,44 @@
#!/usr/bin/env python3
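# SSH ProxyCommand helper: starts a short-lived socat pod in the current
# kubectl context and pipes stdin/stdout to HOST:PORT through it, so machines
# with only private IPs (like the NFS VM) are reachable for Ansible and ssh.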
import sys
import subprocess
import time


POD_NAME = "ssh-proxycommand-pod"
POD_IMAGE = "alpine/socat"
HOST = sys.argv[1]
PORT = sys.argv[2]

# Just 'sleep infinity' doesn't handle signals properly
SCRIPT = "trap 'trap - INT; kill \"$!\"; exit' INT; exec sleep infinity & wait $!"

log = open('log', 'w')

def delete_pod():
try:
subprocess.check_output([
'kubectl', 'delete', 'pod', POD_NAME, '--wait', '--now'
])
except subprocess.CalledProcessError as e:
print(e.stdout)
delete_pod()

try:
subprocess.check_call([
'kubectl', 'run', '--image', POD_IMAGE, '--command', '--wait',
POD_NAME, '--', "/bin/sh", "-c", SCRIPT
])


time.sleep(2)

print("starting", file=log, flush=True)
subprocess.check_call([
'kubectl', 'exec', '-i', POD_NAME, '--',
'socat', '-', f"tcp:{HOST}:{PORT}"
])
print("ending", file=log, flush=True)
finally:
print("deleting", file=log, flush=True)
delete_pod()
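
This script is what the ansible_ssh_common_args setting in ansible-hosts.yaml points at. It can also be used directly for a one-off shell on the NFS VM (assuming a kubectl context for the cluster and the SSH key from the secrets directory):

    ssh -o ProxyCommand='./proxycommand.py %h %p' -i ../secrets/ssh-key.unsafe hubadmin@jupyterhub-2i2c-nfs-vm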

2 changes: 2 additions & 0 deletions terraform/azure/test.tfvars
@@ -0,0 +1,2 @@
prefix = "jupyterhub-2i2c"
ssh_pub_key = "ssh-key.pub"
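
These values are passed with -var-file; a typical run against this configuration would look roughly like the following (a sketch, assuming Azure credentials are already set up for the azurerm provider and backend):

    terraform init
    terraform plan -var-file=test.tfvars
    terraform apply -var-file=test.tfvars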
(diff for the sixth changed file was not loaded)