Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactored the Terraform directory structures. #1046

Merged
merged 8 commits into from
Oct 6, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,15 +1,19 @@
# Input variables of this root configuration. Each one is declared without a
# type or default, so a value has to be supplied by the caller (for example
# through a .tfvars file). All of them are forwarded unchanged to the
# "tpu-gke" module declared further down.
variable "project_id" {}
variable "resource_name_prefix" {}
variable "node_pool_prefix" {}
variable "region" {}
variable "tpu_node_pools" {}
variable "maintenance_interval" {}
variable "is_tpu_node_private" {}


# Instantiates the shared module located at ../../module and hands every
# root-level input variable through to it under the same name.
module "tpu-gke" {
  source = "../../module"

  project_id           = var.project_id
  region               = var.region
  resource_name_prefix = var.resource_name_prefix
  node_pool_prefix     = var.node_pool_prefix
  tpu_node_pools       = var.tpu_node_pools
  is_tpu_node_private  = var.is_tpu_node_private
  maintenance_interval = var.maintenance_interval
}
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,12 @@ output "kubernetes_cluster_name" {
description = "GKE Cluster Name"
}

# Re-exports the "kubernetes_cluster_host" output of the tpu-gke module.
# NOTE(review): the hunk header for this file (-13,12 +13,12) only balances
# if this block is counted as deleted diff lines — confirm the output still
# exists in the merged file before depending on it.
output "kubernetes_cluster_host" {
value = module.tpu-gke.kubernetes_cluster_host
description = "GKE Cluster Host"
}

# Passes the module's list of TPU placement policy names up to the root.
output "placement_policy_names" {
  description = "GKE TPU Placement Policy Names"
  value       = module.tpu-gke.placement_policy_names
}

# Echoes the is_tpu_node_private input back as an output.
output "is_tpu_node_private" {
  description = "whether we want to make TPU node private"
  value       = var.is_tpu_node_private
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Example variable values: two identically configured TPU node pools in
# zone us-east5-b.
project_id           = "project-id"
region               = "us-east5"
resource_name_prefix = "tpu-v5e-test"
node_pool_prefix     = "rp1"
is_tpu_node_private  = false
maintenance_interval = "PERIODIC"

# NOTE(review): both entries name the same placement policy
# ("sb-compact-rp1") — confirm that sharing one policy is intended.
tpu_node_pools = [
  {
    zone         = "us-east5-b"
    node_count   = 32
    machine_type = "ct5lp-hightpu-4t"
    topology     = "8x16"
    policy       = "sb-compact-rp1"
    disk_type    = "pd-balanced"
    disk_size_gb = 120
  },
  {
    zone         = "us-east5-b"
    node_count   = 32
    machine_type = "ct5lp-hightpu-4t"
    topology     = "8x16"
    policy       = "sb-compact-rp1"
    disk_type    = "pd-balanced"
    disk_size_gb = 120
  },
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
/**
* Copyright 2023 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

# GKE versions available in var.region, restricted to the 1.27.x series.
# NOTE(review): no block visible in this file reads this data source —
# confirm whether it is still needed.
data "google_container_engine_versions" "gke_version" {
  version_prefix = "1.27."
  location       = var.region
}

# Default project and region for resources managed through the "google"
# provider.
provider "google" {
  region  = var.region
  project = var.project_id
}

# Separately managed TPU node pools: one pool is created per entry of
# var.tpu_node_pools. The target cluster is addressed by its conventional
# name "<resource_name_prefix>-gke-cluster" rather than by a resource
# reference.
resource "google_container_node_pool" "multihost_tpu" {
  provider = google-beta
  count    = length(var.tpu_node_pools)

  project        = var.project_id
  cluster        = "${var.resource_name_prefix}-gke-cluster"
  location       = var.region
  node_locations = [var.tpu_node_pools[count.index].zone]

  # Pool names end in the list index so that every entry gets its own name.
  name = "${var.resource_name_prefix}-gke-${var.node_pool_prefix}-${count.index}"

  initial_node_count = var.tpu_node_pools[count.index].node_count

  management {
    auto_upgrade = false
  }

  # Compact placement using the policy named in the pool entry.
  placement_policy {
    type        = "COMPACT"
    policy_name = var.tpu_node_pools[count.index].policy
  }

  network_config {
    enable_private_nodes = var.is_tpu_node_private
  }

  node_config {
    image_type   = "COS_CONTAINERD"
    machine_type = var.tpu_node_pools[count.index].machine_type
    disk_type    = var.tpu_node_pools[count.index].disk_type
    disk_size_gb = var.tpu_node_pools[count.index].disk_size_gb
    tags         = ["gke-node"]

    labels = {
      env = var.project_id
    }

    metadata = {
      disable-legacy-endpoints = "true"
    }

    oauth_scopes = [
      "https://www.googleapis.com/auth/logging.write",
      "https://www.googleapis.com/auth/monitoring",
      "https://www.googleapis.com/auth/cloud-platform",
    ]

    host_maintenance_policy {
      maintenance_interval = var.maintenance_interval
    }

    gvnic {
      enabled = true
    }

    gcfs_config {
      enabled = true
    }
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -9,18 +9,18 @@ output "project_id" {
}

# Name of the GKE cluster the TPU node pools are attached to, read back from
# the first node pool. Only one `value` argument is allowed per output; the
# earlier reference to google_container_cluster.tpu_cluster is dropped
# because this module declares node pools only.
# NOTE(review): indexing [0] requires var.tpu_node_pools to contain at least
# one entry — confirm callers never pass an empty list.
output "kubernetes_cluster_name" {
  value       = google_container_node_pool.multihost_tpu[0].cluster
  description = "GKE Cluster Name"
}

# Exposes the endpoint attribute of google_container_cluster.tpu_cluster.
# NOTE(review): the hunk header for this file (-9,18 +9,18) only balances if
# this block is counted as deleted diff lines, and the node-pool module shown
# alongside it declares no google_container_cluster — confirm before reuse.
output "kubernetes_cluster_host" {
value = google_container_cluster.tpu_cluster.endpoint
description = "GKE Cluster Host"
}

# Flat list with the placement policy name of every TPU node pool.
# The `value = flatten([` opener must appear exactly once; a second copy
# leaves the brackets unbalanced and repeats the argument.
output "placement_policy_names" {
  value = flatten([
    google_container_node_pool.multihost_tpu[*].placement_policy[0].policy_name
  ])
  description = "GKE TPU Placement Policy Names"
}

# Echoes the is_tpu_node_private input back as an output.
output "is_tpu_node_private" {
  description = "whether we want to make TPU node private"
  value       = var.is_tpu_node_private
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Example variable values: a single TPU node pool in zone us-east5-b.
project_id           = "project-id"
region               = "us-east5"
resource_name_prefix = "tpu-test"
maintenance_interval = "AS_NEEDED"

tpu_node_pools = [
  {
    zone         = "us-east5-b"
    node_count   = 32
    machine_type = "ct5lp-hightpu-4t"
    topology     = "8x16"
    policy       = "sb-compact-rp1"
  },
]
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,11 @@ variable "resource_name_prefix" {
description = "prefix for all the resouce naming"
}

# Extra name segment for the TPU node pools. The node pool resource builds
# its name as "<resource_name_prefix>-gke-<node_pool_prefix>-<index>", so
# this value only affects node pool names, not every resource.
variable "node_pool_prefix" {
  default     = ""
  description = "prefix inserted into the TPU node pool names"
}

variable "tpu_node_pools" {
description = "tpu podslice config"
type = list(object({
Expand All @@ -35,10 +40,17 @@ variable "tpu_node_pools" {
machine_type = string,
topology = string,
policy = string,
disk_type = optional(string),
disk_size_gb = optional(number),
}))
}

# Toggle for the private-node setting of the TPU node pools; off by default.
variable "is_tpu_node_private" {
  default     = false
  description = "whether we want to make TPU node private"
}

# Host maintenance interval applied to the TPU node pools. The `default`
# argument may be given only once per variable block.
variable "maintenance_interval" {
  default     = "AS_NEEDED"
  description = "maintenance interval for TPU machines."
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Input variables of this root configuration. Each one is declared without a
# type or default, so a value has to be supplied by the caller (for example
# through a .tfvars file). All of them are forwarded unchanged to the
# "tpu-gke" module declared further down.
variable "project_id" {}
variable "resource_name_prefix" {}
variable "region" {}
variable "cpu_node_pool" {}
variable "authorized_cidr_blocks" {}
variable "is_cpu_node_private" {}


# Instantiates the shared module located at ../../module and hands every
# root-level input variable through to it under the same name.
module "tpu-gke" {
  source = "../../module"

  project_id             = var.project_id
  region                 = var.region
  resource_name_prefix   = var.resource_name_prefix
  cpu_node_pool          = var.cpu_node_pool
  authorized_cidr_blocks = var.authorized_cidr_blocks
  is_cpu_node_private    = var.is_cpu_node_private
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Echoes the region input back as an output.
output "region" {
  description = "GCloud Region"
  value       = var.region
}

# Echoes the project_id input back as an output.
output "project_id" {
  description = "GCloud Project ID"
  value       = var.project_id
}

# Re-exports the cluster name reported by the tpu-gke module.
output "kubernetes_cluster_name" {
  description = "GKE Cluster Name"
  value       = module.tpu-gke.kubernetes_cluster_name
}

# Echoes the authorized_cidr_blocks input back as an output.
output "authorized_cidr_blocks" {
  description = "Cluster allowed cidr blocks "
  value       = var.authorized_cidr_blocks
}

# Echoes the is_cpu_node_private input back as an output.
output "is_cpu_node_private" {
  description = "whether we want to make CPU node private"
  value       = var.is_cpu_node_private
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Example variable values for the cluster configuration: no authorized CIDR
# blocks, public CPU nodes, and a CPU node pool spread over three zones.
project_id             = "project-id"
region                 = "us-east5"
resource_name_prefix   = "tpu-v5e-test"
is_cpu_node_private    = false
authorized_cidr_blocks = []

cpu_node_pool = {
  machine_type                = "n2-standard-8"
  zone                        = ["us-east5-a", "us-east5-b", "us-east5-c"]
  initial_node_count_per_zone = 1
  min_node_count_per_zone     = 1
  max_node_count_per_zone     = 30
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
/**
* Copyright 2023 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

# GKE versions available in var.region, restricted to the 1.27.x series.
# NOTE(review): no block visible in this file reads this data source —
# confirm whether it is still needed.
data "google_container_engine_versions" "gke_version" {
  version_prefix = "1.27."
  location       = var.region
}

# Default project and region for resources managed through the "google"
# provider.
provider "google" {
  region  = var.region
  project = var.project_id
}

# VPC
# Dedicated network for the cluster. Automatic subnet creation is turned off
# because the subnetwork is declared explicitly in this file. The flag is
# written as a native boolean rather than the string "false".
resource "google_compute_network" "vpc" {
  name                    = "${var.resource_name_prefix}-vpc"
  auto_create_subnetworks = false
}

# Subnet
# Single regional subnetwork inside the VPC, using the fixed range
# 10.10.0.0/19.
resource "google_compute_subnetwork" "subnet" {
  name          = "${var.resource_name_prefix}-subnet"
  network       = google_compute_network.vpc.name
  region        = var.region
  ip_cidr_range = "10.10.0.0/19"
}

# Regional, VPC-native GKE cluster named "<resource_name_prefix>-gke-cluster",
# placed on the VPC and subnetwork declared in this file.
resource "google_container_cluster" "tpu_cluster" {
  name     = "${var.resource_name_prefix}-gke-cluster"
  location = var.region

  network    = google_compute_network.vpc.name
  subnetwork = google_compute_subnetwork.subnet.name

  # A cluster cannot be created without any node pool, yet only separately
  # managed node pools are wanted here. The smallest possible default pool is
  # therefore created and removed again right away.
  initial_node_count       = 1
  remove_default_node_pool = true

  networking_mode           = "VPC_NATIVE"
  default_max_pods_per_node = 15

  ip_allocation_policy {
    cluster_ipv4_cidr_block  = "/14"
    services_ipv4_cidr_block = "/20"
  }

  release_channel {
    channel = "UNSPECIFIED"
  }

  logging_service    = "logging.googleapis.com/kubernetes"
  monitoring_service = "monitoring.googleapis.com/kubernetes"

  master_authorized_networks_config {
    gcp_public_cidrs_access_enabled = false

    # One cidr_blocks entry is generated per element of
    # var.authorized_cidr_blocks; the element key is used in the display name.
    dynamic "cidr_blocks" {
      for_each = var.authorized_cidr_blocks
      content {
        display_name = "cidr-blocks-group-${cidr_blocks.key}"
        cidr_block   = cidr_blocks.value
      }
    }
  }

  # Has to stay false while a PSC-based GKE cluster is being created.
  # Afterwards it can be set to true to disable the public endpoint of the
  # cluster master.
  private_cluster_config {
    enable_private_endpoint = false
  }

  timeouts {
    create = "120m"
    update = "120m"
  }
}

# Autoscaled CPU node pool named "cpu-node-pool", attached to the cluster
# declared in this file. Zones, machine type and node counts all come from
# var.cpu_node_pool.
resource "google_container_node_pool" "cpu_node_pool" {
  provider = google-beta

  name           = "cpu-node-pool"
  project        = var.project_id
  cluster        = google_container_cluster.tpu_cluster.name
  location       = var.region
  node_locations = var.cpu_node_pool.zone

  initial_node_count = var.cpu_node_pool.initial_node_count_per_zone
  max_pods_per_node  = 63

  autoscaling {
    min_node_count = var.cpu_node_pool.min_node_count_per_zone
    max_node_count = var.cpu_node_pool.max_node_count_per_zone
  }

  network_config {
    enable_private_nodes = var.is_cpu_node_private
  }

  node_config {
    machine_type = var.cpu_node_pool.machine_type

    oauth_scopes = [
      "https://www.googleapis.com/auth/cloud-platform"
    ]

    metadata = {
      disable-legacy-endpoints = "true"
    }

    gcfs_config {
      enabled = true
    }
  }
}
Loading