From 906be5267106a72d51d682d6fda15210118840cf Mon Sep 17 00:00:00 2001 From: Kangmin Date: Fri, 6 Oct 2023 18:47:18 -0400 Subject: [PATCH] Refactored the Terraform directory structures. (#1046) * Reorganize Terraform scripts by moving the sub directories. * Updated the flexible to PSC-based cluster type --- .../add_node_pool}/examples/v5e/main.tf | 4 + .../add_node_pool}/examples/v5e/outputs.tf | 10 +- .../examples/v5e/terraform.tfvars | 23 ++++ .../add_node_pool/module/main.tf | 80 ++++++++++++ .../add_node_pool}/module/outputs.tf | 14 +- .../add_node_pool/module/terraform.tfvars | 11 ++ .../add_node_pool}/module/variables.tf | 14 +- .../create_cluster/examples/v5e/main.tf | 17 +++ .../create_cluster/examples/v5e/outputs.tf | 24 ++++ .../examples/v5e/terraform.tfvars | 12 ++ .../create_cluster/module/main.tf | 121 ++++++++++++++++++ .../create_cluster/module/outputs.tf | 24 ++++ .../create_cluster/module/terraform.tfvars | 11 ++ .../create_cluster/module/variables.tf | 56 ++++++++ .../add_node_pool/examples/v5e}/main.tf | 6 +- .../add_node_pool/examples/v5e}/outputs.tf | 11 +- .../examples/v5e/terraform.tfvars | 14 ++ .../add_node_pool/module/main.tf | 80 ++++++++++++ .../add_node_pool/module/outputs.tf | 19 +++ .../add_node_pool/module/terraform.tfvars | 10 ++ .../add_node_pool/module/variables.tf | 55 ++++++++ .../create_cluster/examples/v5e/main.tf | 17 +++ .../create_cluster/examples/v5e/outputs.tf | 24 ++++ .../examples/v5e/terraform.tfvars | 12 ++ .../create_cluster/module/main.tf | 121 ++++++++++++++++++ .../create_cluster/module/outputs.tf | 24 ++++ .../create_cluster/module/terraform.tfvars | 11 ++ .../create_cluster/module/variables.tf | 56 ++++++++ .../terraform/examples/v4/terraform.tfvars | 9 -- .../terraform/module/terraform.tfvars | 21 --- .../examples/v5e/main.tf | 25 ++++ .../examples/v5e/outputs.tf | 39 ++++++ .../examples/v5e/terraform.tfvars | 83 +++++++----- .../module/main.tf | 69 +++++++++- .../module/outputs.tf | 41 ++++++ 
.../module/terraform.tfvars | 23 ++++ .../module/variables.tf | 84 ++++++++++++ 37 files changed, 1184 insertions(+), 91 deletions(-) rename tools/kubernetes/terraform/{ => batching_with_compact_placement/add_node_pool}/examples/v5e/main.tf (72%) rename tools/kubernetes/terraform/{ => batching_with_compact_placement/add_node_pool}/examples/v5e/outputs.tf (77%) create mode 100644 tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/examples/v5e/terraform.tfvars create mode 100644 tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/module/main.tf rename tools/kubernetes/terraform/{ => batching_with_compact_placement/add_node_pool}/module/outputs.tf (65%) create mode 100644 tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/module/terraform.tfvars rename tools/kubernetes/terraform/{ => batching_with_compact_placement/add_node_pool}/module/variables.tf (78%) create mode 100644 tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/examples/v5e/main.tf create mode 100644 tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/examples/v5e/outputs.tf create mode 100644 tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/examples/v5e/terraform.tfvars create mode 100644 tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/module/main.tf create mode 100644 tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/module/outputs.tf create mode 100644 tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/module/terraform.tfvars create mode 100644 tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/module/variables.tf rename tools/kubernetes/terraform/{examples/v4 => batching_without_compact_placement/add_node_pool/examples/v5e}/main.tf (72%) rename tools/kubernetes/terraform/{examples/v4 => batching_without_compact_placement/add_node_pool/examples/v5e}/outputs.tf (54%) create 
mode 100644 tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/examples/v5e/terraform.tfvars create mode 100644 tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/module/main.tf create mode 100644 tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/module/outputs.tf create mode 100644 tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/module/terraform.tfvars create mode 100644 tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/module/variables.tf create mode 100644 tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/examples/v5e/main.tf create mode 100644 tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/examples/v5e/outputs.tf create mode 100644 tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/examples/v5e/terraform.tfvars create mode 100644 tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/module/main.tf create mode 100644 tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/module/outputs.tf create mode 100644 tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/module/terraform.tfvars create mode 100644 tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/module/variables.tf delete mode 100644 tools/kubernetes/terraform/examples/v4/terraform.tfvars delete mode 100644 tools/kubernetes/terraform/module/terraform.tfvars create mode 100644 tools/kubernetes/terraform/non_batching_with_compact_placement/examples/v5e/main.tf create mode 100644 tools/kubernetes/terraform/non_batching_with_compact_placement/examples/v5e/outputs.tf rename tools/kubernetes/terraform/{ => non_batching_with_compact_placement}/examples/v5e/terraform.tfvars (52%) rename tools/kubernetes/terraform/{ => non_batching_with_compact_placement}/module/main.tf (66%) create mode 100644 
tools/kubernetes/terraform/non_batching_with_compact_placement/module/outputs.tf create mode 100644 tools/kubernetes/terraform/non_batching_with_compact_placement/module/terraform.tfvars create mode 100644 tools/kubernetes/terraform/non_batching_with_compact_placement/module/variables.tf diff --git a/tools/kubernetes/terraform/examples/v5e/main.tf b/tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/examples/v5e/main.tf similarity index 72% rename from tools/kubernetes/terraform/examples/v5e/main.tf rename to tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/examples/v5e/main.tf index c3b6990c..61ac2331 100644 --- a/tools/kubernetes/terraform/examples/v5e/main.tf +++ b/tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/examples/v5e/main.tf @@ -1,15 +1,19 @@ variable "project_id" {} variable "resource_name_prefix" {} +variable "node_pool_prefix" {} variable "region" {} variable "tpu_node_pools" {} variable "maintenance_interval" {} +variable "is_tpu_node_private" {} module "tpu-gke" { source = "../../module" project_id = var.project_id resource_name_prefix = var.resource_name_prefix + node_pool_prefix = var.node_pool_prefix region = var.region tpu_node_pools = var.tpu_node_pools maintenance_interval = var.maintenance_interval + is_tpu_node_private = var.is_tpu_node_private } diff --git a/tools/kubernetes/terraform/examples/v5e/outputs.tf b/tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/examples/v5e/outputs.tf similarity index 77% rename from tools/kubernetes/terraform/examples/v5e/outputs.tf rename to tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/examples/v5e/outputs.tf index 757767c3..ebb1782f 100644 --- a/tools/kubernetes/terraform/examples/v5e/outputs.tf +++ b/tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/examples/v5e/outputs.tf @@ -13,12 +13,12 @@ output "kubernetes_cluster_name" { description = "GKE Cluster Name" } 
-output "kubernetes_cluster_host" { - value = module.tpu-gke.kubernetes_cluster_host - description = "GKE Cluster Host" -} - output "placement_policy_names" { value = module.tpu-gke.placement_policy_names description = "GKE TPU Placement Policy Names" } + +output "is_tpu_node_private" { + value = var.is_tpu_node_private + description = "whether we want to make TPU node private" +} diff --git a/tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/examples/v5e/terraform.tfvars b/tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/examples/v5e/terraform.tfvars new file mode 100644 index 00000000..9148d119 --- /dev/null +++ b/tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/examples/v5e/terraform.tfvars @@ -0,0 +1,23 @@ +project_id = "project-id" +resource_name_prefix = "tpu-v5e-test" +node_pool_prefix = "rp1" +region = "us-east5" +is_tpu_node_private = false +tpu_node_pools = [{ + zone = "us-east5-b" + node_count = 32 + machine_type = "ct5lp-hightpu-4t" + topology = "8x16" + policy = "sb-compact-rp1" + disk_type = "pd-balanced" + disk_size_gb = 120 + }, { + zone = "us-east5-b" + node_count = 32 + machine_type = "ct5lp-hightpu-4t" + topology = "8x16" + policy = "sb-compact-rp1" + disk_type = "pd-balanced" + disk_size_gb = 120 +}] +maintenance_interval = "PERIODIC" diff --git a/tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/module/main.tf b/tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/module/main.tf new file mode 100644 index 00000000..8df883bc --- /dev/null +++ b/tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/module/main.tf @@ -0,0 +1,80 @@ +/** + * Copyright 2023 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +# GKE cluster +data "google_container_engine_versions" "gke_version" { + location = var.region + version_prefix = "1.27." +} + +provider "google" { + project = var.project_id + region = var.region +} + +# Separately Managed Node Pool +resource "google_container_node_pool" "multihost_tpu" { + count = length(var.tpu_node_pools) + name = "${var.resource_name_prefix}-gke-${var.node_pool_prefix}-${count.index}" + provider = google-beta + project = var.project_id + location = var.region + node_locations = [var.tpu_node_pools[count.index].zone] + cluster = "${var.resource_name_prefix}-gke-cluster" + + initial_node_count = var.tpu_node_pools[count.index].node_count + + management { + auto_upgrade = false + } + + node_config { + oauth_scopes = [ + "https://www.googleapis.com/auth/logging.write", + "https://www.googleapis.com/auth/monitoring", + "https://www.googleapis.com/auth/cloud-platform", + ] + host_maintenance_policy { + maintenance_interval = var.maintenance_interval + } + labels = { + env = var.project_id + } + gvnic { + enabled = true + } + gcfs_config { + enabled = true + } + + image_type = "COS_CONTAINERD" + machine_type = var.tpu_node_pools[count.index].machine_type + disk_type = var.tpu_node_pools[count.index].disk_type + disk_size_gb = var.tpu_node_pools[count.index].disk_size_gb + tags = ["gke-node"] + metadata = { + disable-legacy-endpoints = "true" + } + } + placement_policy { + type = "COMPACT" + policy_name = var.tpu_node_pools[count.index].policy + } + + network_config { + enable_private_nodes = var.is_tpu_node_private + } +} 
diff --git a/tools/kubernetes/terraform/module/outputs.tf b/tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/module/outputs.tf similarity index 65% rename from tools/kubernetes/terraform/module/outputs.tf rename to tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/module/outputs.tf index 24cb14dc..68085ceb 100644 --- a/tools/kubernetes/terraform/module/outputs.tf +++ b/tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/module/outputs.tf @@ -9,18 +9,18 @@ output "project_id" { } output "kubernetes_cluster_name" { - value = google_container_cluster.tpu_cluster.name + value = google_container_node_pool.multihost_tpu[0].cluster description = "GKE Cluster Name" } -output "kubernetes_cluster_host" { - value = google_container_cluster.tpu_cluster.endpoint - description = "GKE Cluster Host" -} - output "placement_policy_names" { - value = flatten([ + value = flatten([ google_container_node_pool.multihost_tpu[*].placement_policy[0].policy_name ]) description = "GKE TPU Placement Policy Names" } + +output "is_tpu_node_private" { + value = var.is_tpu_node_private + description = "whether we want to make TPU node private" +} diff --git a/tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/module/terraform.tfvars b/tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/module/terraform.tfvars new file mode 100644 index 00000000..520ff3a1 --- /dev/null +++ b/tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/module/terraform.tfvars @@ -0,0 +1,11 @@ +project_id = "project-id" +resource_name_prefix = "tpu-test" +region = "us-east5" +tpu_node_pools = [{ + zone = "us-east5-b" + node_count = 32 + machine_type = "ct5lp-hightpu-4t" + topology = "8x16" + policy = "sb-compact-rp1" +}] +maintenance_interval = "AS_NEEDED" diff --git a/tools/kubernetes/terraform/module/variables.tf 
b/tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/module/variables.tf similarity index 78% rename from tools/kubernetes/terraform/module/variables.tf rename to tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/module/variables.tf index 35f460aa..fa5d507d 100644 --- a/tools/kubernetes/terraform/module/variables.tf +++ b/tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/module/variables.tf @@ -27,6 +27,11 @@ variable "resource_name_prefix" { description = "prefix for all the resouce naming" } +variable "node_pool_prefix" { + default = "" + description = "prefix for the TPU node pool names" +} + variable "tpu_node_pools" { description = "tpu podslice config" type = list(object({ @@ -35,10 +40,17 @@ variable "tpu_node_pools" { machine_type = string, topology = string, policy = string, + disk_type = optional(string), + disk_size_gb = optional(number), })) } +variable "is_tpu_node_private" { + description = "whether we want to make TPU node private" + default = false +} + variable "maintenance_interval" { - default = "AS_NEEDED" + default = "AS_NEEDED" description = "maintenance interval for TPU machines." 
} diff --git a/tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/examples/v5e/main.tf b/tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/examples/v5e/main.tf new file mode 100644 index 00000000..7cce20f2 --- /dev/null +++ b/tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/examples/v5e/main.tf @@ -0,0 +1,17 @@ +variable "project_id" {} +variable "resource_name_prefix" {} +variable "region" {} +variable "cpu_node_pool" {} +variable "authorized_cidr_blocks" {} +variable "is_cpu_node_private" {} + + +module "tpu-gke" { + source = "../../module" + project_id = var.project_id + resource_name_prefix = var.resource_name_prefix + region = var.region + cpu_node_pool = var.cpu_node_pool + is_cpu_node_private = var.is_cpu_node_private + authorized_cidr_blocks = var.authorized_cidr_blocks +} diff --git a/tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/examples/v5e/outputs.tf b/tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/examples/v5e/outputs.tf new file mode 100644 index 00000000..a5514b1f --- /dev/null +++ b/tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/examples/v5e/outputs.tf @@ -0,0 +1,24 @@ +output "region" { + value = var.region + description = "GCloud Region" +} + +output "project_id" { + value = var.project_id + description = "GCloud Project ID" +} + +output "kubernetes_cluster_name" { + value = module.tpu-gke.kubernetes_cluster_name + description = "GKE Cluster Name" +} + +output "authorized_cidr_blocks" { + value = var.authorized_cidr_blocks + description = "Cluster allowed cidr blocks " +} + +output "is_cpu_node_private" { + value = var.is_cpu_node_private + description = "whether we want to make CPU node private" +} diff --git a/tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/examples/v5e/terraform.tfvars 
b/tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/examples/v5e/terraform.tfvars new file mode 100644 index 00000000..7f9fcb9f --- /dev/null +++ b/tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/examples/v5e/terraform.tfvars @@ -0,0 +1,12 @@ +project_id = "project-id" +resource_name_prefix = "tpu-v5e-test" +region = "us-east5" +authorized_cidr_blocks = [] +is_cpu_node_private = false +cpu_node_pool = { + zone = ["us-east5-a", "us-east5-b", "us-east5-c"] + machine_type = "n2-standard-8", + initial_node_count_per_zone = 1, + min_node_count_per_zone = 1, + max_node_count_per_zone = 30, +} diff --git a/tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/module/main.tf b/tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/module/main.tf new file mode 100644 index 00000000..6596e498 --- /dev/null +++ b/tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/module/main.tf @@ -0,0 +1,121 @@ +/** + * Copyright 2023 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +# GKE cluster +data "google_container_engine_versions" "gke_version" { + location = var.region + version_prefix = "1.27." 
+} + +provider "google" { + project = var.project_id + region = var.region +} + +# VPC +resource "google_compute_network" "vpc" { + name = "${var.resource_name_prefix}-vpc" + auto_create_subnetworks = "false" +} + +# Subnet +resource "google_compute_subnetwork" "subnet" { + name = "${var.resource_name_prefix}-subnet" + region = var.region + network = google_compute_network.vpc.name + ip_cidr_range = "10.10.0.0/19" +} + +resource "google_container_cluster" "tpu_cluster" { + name = "${var.resource_name_prefix}-gke-cluster" + location = var.region + + # We can't create a cluster with no node pool defined, but we want to only use + # separately managed node pools. So we create the smallest possible default + # node pool and immediately delete it. + remove_default_node_pool = true + initial_node_count = 1 + networking_mode = "VPC_NATIVE" + ip_allocation_policy { + cluster_ipv4_cidr_block = "/14" + services_ipv4_cidr_block = "/20" + } + default_max_pods_per_node = 15 + + release_channel { + channel = "UNSPECIFIED" + } + + network = google_compute_network.vpc.name + subnetwork = google_compute_subnetwork.subnet.name + logging_service = "logging.googleapis.com/kubernetes" + monitoring_service = "monitoring.googleapis.com/kubernetes" + + master_authorized_networks_config { + gcp_public_cidrs_access_enabled = false + + dynamic "cidr_blocks" { + for_each = var.authorized_cidr_blocks + content { + cidr_block = cidr_blocks.value + display_name = "cidr-blocks-group-${cidr_blocks.key}" + } + } + } + + // Needs to be false when creating a PSC-based GKE cluster. + // After that, set as true to disable public endpoint of cluster master. 
+ private_cluster_config { + enable_private_endpoint = false + } + + timeouts { + create = "120m" + update = "120m" + } +} + +resource "google_container_node_pool" "cpu_node_pool" { + provider = google-beta + project = var.project_id + name = "cpu-node-pool" + location = var.region + node_locations = var.cpu_node_pool.zone + cluster = google_container_cluster.tpu_cluster.name + initial_node_count = var.cpu_node_pool.initial_node_count_per_zone + autoscaling { + min_node_count = var.cpu_node_pool.min_node_count_per_zone + max_node_count = var.cpu_node_pool.max_node_count_per_zone + } + max_pods_per_node = 63 + node_config { + oauth_scopes = [ + "https://www.googleapis.com/auth/cloud-platform" + ] + machine_type = var.cpu_node_pool.machine_type + + metadata = { + disable-legacy-endpoints = "true" + } + gcfs_config { + enabled = true + } + } + + network_config { + enable_private_nodes = var.is_cpu_node_private + } +} diff --git a/tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/module/outputs.tf b/tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/module/outputs.tf new file mode 100644 index 00000000..3953819c --- /dev/null +++ b/tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/module/outputs.tf @@ -0,0 +1,24 @@ +output "region" { + value = var.region + description = "GCloud Region" +} + +output "project_id" { + value = var.project_id + description = "GCloud Project ID" +} + +output "kubernetes_cluster_name" { + value = google_container_cluster.tpu_cluster.name + description = "GKE Cluster Name" +} + +output "authorized_cidr_blocks" { + value = var.authorized_cidr_blocks + description = "Cluster allowed cidr blocks " +} + +output "is_cpu_node_private" { + value = var.is_cpu_node_private + description = "whether we want to make CPU node private" +} diff --git a/tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/module/terraform.tfvars 
b/tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/module/terraform.tfvars new file mode 100644 index 00000000..bdda5d5e --- /dev/null +++ b/tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/module/terraform.tfvars @@ -0,0 +1,11 @@ +project_id = "project-id" +resource_name_prefix = "tpu-test" +region = "us-east5" +authorized_cidr_blocks = [] +cpu_node_pool = { + zone = ["us-east5-a", "us-east5-b", "us-east5-c"] + machine_type = "n2-standard-64", + initial_node_count_per_zone = 1, + min_node_count_per_zone = 1, + max_node_count_per_zone = 10 +} diff --git a/tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/module/variables.tf b/tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/module/variables.tf new file mode 100644 index 00000000..df05e43d --- /dev/null +++ b/tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/module/variables.tf @@ -0,0 +1,56 @@ +/** + * Copyright 2023 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +variable "project_id" { + description = "project id" +} + +variable "region" { + description = "region" +} + +variable "resource_name_prefix" { + default = "" + description = "prefix for all the resource naming" +} + +variable "authorized_cidr_blocks" { + description = "cluster allowed cidr blocks to access with kubectl CLI" + type = list(string) + default = [] +} + +variable "cpu_node_pool" { + description = "cpu nodepool config" + type = object({ + zone = list(string), + machine_type = string, + initial_node_count_per_zone = number, + min_node_count_per_zone = number, + max_node_count_per_zone = number + }) + validation { + condition = ( + (var.cpu_node_pool.min_node_count_per_zone >= 0 && var.cpu_node_pool.min_node_count_per_zone <= var.cpu_node_pool.max_node_count_per_zone) + ) + error_message = "cpu_node_pool.min_node_count_per_zone must be >= 0 and <= cpu_node_pool.max_node_count_per_zone." + } +} + +variable "is_cpu_node_private" { + description = "whether we want to make CPU node private" + default = false +} diff --git a/tools/kubernetes/terraform/examples/v4/main.tf b/tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/examples/v5e/main.tf similarity index 72% rename from tools/kubernetes/terraform/examples/v4/main.tf rename to tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/examples/v5e/main.tf index e3856948..61ac2331 100644 --- a/tools/kubernetes/terraform/examples/v4/main.tf +++ b/tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/examples/v5e/main.tf @@ -1,15 +1,19 @@ variable "project_id" {} variable "resource_name_prefix" {} +variable "node_pool_prefix" {} variable "region" {} variable "tpu_node_pools" {} variable "maintenance_interval" {} +variable "is_tpu_node_private" {} module "tpu-gke" { source = "../../module" project_id = var.project_id resource_name_prefix = var.resource_name_prefix + node_pool_prefix = var.node_pool_prefix region = var.region tpu_node_pools 
= var.tpu_node_pools maintenance_interval = var.maintenance_interval -} \ No newline at end of file + is_tpu_node_private = var.is_tpu_node_private +} diff --git a/tools/kubernetes/terraform/examples/v4/outputs.tf b/tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/examples/v5e/outputs.tf similarity index 54% rename from tools/kubernetes/terraform/examples/v4/outputs.tf rename to tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/examples/v5e/outputs.tf index eb36535c..846c656e 100644 --- a/tools/kubernetes/terraform/examples/v4/outputs.tf +++ b/tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/examples/v5e/outputs.tf @@ -13,12 +13,7 @@ output "kubernetes_cluster_name" { description = "GKE Cluster Name" } -output "kubernetes_cluster_host" { - value = module.tpu-gke.kubernetes_cluster_host - description = "GKE Cluster Host" +output "is_tpu_node_private" { + value = var.is_tpu_node_private + description = "whether we want to make TPU node private" } - -output "nodepool_tpu_topology" { - value = module.tpu-gke.nodepool_tpu_topology - description = "GKE TPU topology" -} \ No newline at end of file diff --git a/tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/examples/v5e/terraform.tfvars b/tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/examples/v5e/terraform.tfvars new file mode 100644 index 00000000..e18f03ca --- /dev/null +++ b/tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/examples/v5e/terraform.tfvars @@ -0,0 +1,14 @@ +project_id = "project-id" +resource_name_prefix = "tpu-v5e-test" +node_pool_prefix = "batch1" +region = "us-east5" +is_tpu_node_private = false +tpu_node_pools = [{ + zone = "us-east5-b" + node_count = 16 + machine_type = "ct5lp-hightpu-4t" + topology = "8x8" + disk_type = "pd-balanced" + disk_size_gb = 120 +}] +maintenance_interval = "PERIODIC" diff --git 
a/tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/module/main.tf b/tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/module/main.tf new file mode 100644 index 00000000..0ccdbdba --- /dev/null +++ b/tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/module/main.tf @@ -0,0 +1,80 @@ +/** + * Copyright 2023 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +# GKE cluster +data "google_container_engine_versions" "gke_version" { + location = var.region + version_prefix = "1.27." 
+} + +provider "google" { + project = var.project_id + region = var.region +} + +# Separately Managed Node Pool +resource "google_container_node_pool" "multihost_tpu" { + count = length(var.tpu_node_pools) + name = "${var.resource_name_prefix}-gke-${var.node_pool_prefix}-${count.index}" + provider = google-beta + project = var.project_id + location = var.region + node_locations = [var.tpu_node_pools[count.index].zone] + cluster = "${var.resource_name_prefix}-gke-cluster" + + initial_node_count = var.tpu_node_pools[count.index].node_count + + management { + auto_upgrade = false + } + + node_config { + oauth_scopes = [ + "https://www.googleapis.com/auth/logging.write", + "https://www.googleapis.com/auth/monitoring", + "https://www.googleapis.com/auth/cloud-platform", + ] + host_maintenance_policy { + maintenance_interval = var.maintenance_interval + } + labels = { + env = var.project_id + } + gvnic { + enabled = true + } + gcfs_config { + enabled = true + } + + image_type = "COS_CONTAINERD" + machine_type = var.tpu_node_pools[count.index].machine_type + disk_type = var.tpu_node_pools[count.index].disk_type + disk_size_gb = var.tpu_node_pools[count.index].disk_size_gb + tags = ["gke-node"] + metadata = { + disable-legacy-endpoints = "true" + } + } + placement_policy { + type = "COMPACT" + tpu_topology = var.tpu_node_pools[count.index].topology + } + + network_config { + enable_private_nodes = var.is_tpu_node_private + } +} diff --git a/tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/module/outputs.tf b/tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/module/outputs.tf new file mode 100644 index 00000000..06972205 --- /dev/null +++ b/tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/module/outputs.tf @@ -0,0 +1,19 @@ +output "region" { + value = var.region + description = "GCloud Region" +} + +output "project_id" { + value = var.project_id + description = "GCloud Project ID" +} + +output 
"kubernetes_cluster_name" { + value = google_container_node_pool.multihost_tpu[0].cluster + description = "GKE Cluster Name" +} + +output "is_tpu_node_private" { + value = var.is_tpu_node_private + description = "whether we want to make TPU node private" +} diff --git a/tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/module/terraform.tfvars b/tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/module/terraform.tfvars new file mode 100644 index 00000000..a38800da --- /dev/null +++ b/tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/module/terraform.tfvars @@ -0,0 +1,10 @@ +project_id = "project-id" +resource_name_prefix = "tpu-test" +region = "us-east5" +tpu_node_pools = [{ + zone = "us-east5-b" + node_count = 16 + machine_type = "ct5lp-hightpu-4t" + topology = "8x8" +}] +maintenance_interval = "AS_NEEDED" diff --git a/tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/module/variables.tf b/tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/module/variables.tf new file mode 100644 index 00000000..c467e69a --- /dev/null +++ b/tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/module/variables.tf @@ -0,0 +1,55 @@ +/** + * Copyright 2023 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +variable "project_id" { + description = "project id" +} + +variable "region" { + description = "region" +} + +variable "resource_name_prefix" { + default = "" + description = "prefix for all the resource naming" +} + +variable "node_pool_prefix" { + default = "" + description = "prefix inserted into the TPU node pool names" +} + +variable "tpu_node_pools" { + description = "tpu podslice config" + type = list(object({ + zone = string, + node_count = number, + machine_type = string, + topology = string, + disk_type = optional(string), + disk_size_gb = optional(number), + })) +} + +variable "is_tpu_node_private" { + description = "whether we want to make TPU node private" + default = false +} + +variable "maintenance_interval" { + default = "AS_NEEDED" + description = "maintenance interval for TPU machines." +} diff --git a/tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/examples/v5e/main.tf b/tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/examples/v5e/main.tf new file mode 100644 index 00000000..7cce20f2 --- /dev/null +++ b/tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/examples/v5e/main.tf @@ -0,0 +1,17 @@ +variable "project_id" {} +variable "resource_name_prefix" {} +variable "region" {} +variable "cpu_node_pool" {} +variable "authorized_cidr_blocks" {} +variable "is_cpu_node_private" {} + + +module "tpu-gke" { + source = "../../module" + project_id = var.project_id + resource_name_prefix = var.resource_name_prefix + region = var.region + cpu_node_pool = var.cpu_node_pool + is_cpu_node_private = var.is_cpu_node_private + authorized_cidr_blocks = var.authorized_cidr_blocks +} diff --git a/tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/examples/v5e/outputs.tf b/tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/examples/v5e/outputs.tf new file mode 100644 index 00000000..a5514b1f --- /dev/null +++ 
b/tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/examples/v5e/outputs.tf @@ -0,0 +1,24 @@ +output "region" { + description = "GCloud Region" + value = var.region +} + +output "project_id" { + description = "GCloud Project ID" + value = var.project_id +} + +output "kubernetes_cluster_name" { + description = "GKE Cluster Name" + value = module.tpu-gke.kubernetes_cluster_name +} + +output "authorized_cidr_blocks" { + description = "Cluster allowed cidr blocks " + value = var.authorized_cidr_blocks +} + +output "is_cpu_node_private" { + description = "whether we want to make CPU node private" + value = var.is_cpu_node_private +} diff --git a/tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/examples/v5e/terraform.tfvars b/tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/examples/v5e/terraform.tfvars new file mode 100644 index 00000000..7f9fcb9f --- /dev/null +++ b/tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/examples/v5e/terraform.tfvars @@ -0,0 +1,12 @@ +project_id = "project-id" +region = "us-east5" +resource_name_prefix = "tpu-v5e-test" +is_cpu_node_private = false +authorized_cidr_blocks = [] +cpu_node_pool = { + machine_type = "n2-standard-8", + zone = ["us-east5-a", "us-east5-b", "us-east5-c"] + min_node_count_per_zone = 1, + initial_node_count_per_zone = 1, + max_node_count_per_zone = 30, +} diff --git a/tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/module/main.tf b/tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/module/main.tf new file mode 100644 index 00000000..6596e498 --- /dev/null +++ b/tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/module/main.tf @@ -0,0 +1,121 @@ +/** + * Copyright 2023 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +# GKE versions offered in var.region that match the "1.27." prefix +data "google_container_engine_versions" "gke_version" { + version_prefix = "1.27." + location = var.region +} + +provider "google" { + region = var.region + project = var.project_id +} + +# VPC network; subnetworks are declared explicitly rather than auto-created +resource "google_compute_network" "vpc" { + auto_create_subnetworks = "false" + name = "${var.resource_name_prefix}-vpc" +} + +# Subnetwork of the VPC above, referenced by the GKE cluster +resource "google_compute_subnetwork" "subnet" { + name = "${var.resource_name_prefix}-subnet" + network = google_compute_network.vpc.name + region = var.region + ip_cidr_range = "10.10.0.0/19" +} + +resource "google_container_cluster" "tpu_cluster" { + name = "${var.resource_name_prefix}-gke-cluster" + location = var.region + + # A cluster cannot be created without a node pool, yet only separately + # managed node pools are wanted here. So the smallest possible default + # node pool is created and removed right away. 
+ initial_node_count = 1 + remove_default_node_pool = true + networking_mode = "VPC_NATIVE" + ip_allocation_policy { + services_ipv4_cidr_block = "/20" + cluster_ipv4_cidr_block = "/14" + } + default_max_pods_per_node = 15 + + release_channel { + channel = "UNSPECIFIED" + } + + subnetwork = google_compute_subnetwork.subnet.name + network = google_compute_network.vpc.name + monitoring_service = "monitoring.googleapis.com/kubernetes" + logging_service = "logging.googleapis.com/kubernetes" + + master_authorized_networks_config { + gcp_public_cidrs_access_enabled = false + + dynamic "cidr_blocks" { + for_each = var.authorized_cidr_blocks + content { + display_name = "cidr-blocks-group-${cidr_blocks.key}" + cidr_block = cidr_blocks.value + } + } + } + + // Must stay false while the PSC-based GKE cluster is being created. + // Afterwards, set it to true to disable the public endpoint of the cluster master. + private_cluster_config { + enable_private_endpoint = false + } + + timeouts { + update = "120m" + create = "120m" + } +} + +resource "google_container_node_pool" "cpu_node_pool" { + provider = google-beta + name = "cpu-node-pool" + project = var.project_id + cluster = google_container_cluster.tpu_cluster.name + location = var.region + node_locations = var.cpu_node_pool.zone + initial_node_count = var.cpu_node_pool.initial_node_count_per_zone + max_pods_per_node = 63 + autoscaling { + max_node_count = var.cpu_node_pool.max_node_count_per_zone + min_node_count = var.cpu_node_pool.min_node_count_per_zone + } + node_config { + machine_type = var.cpu_node_pool.machine_type + oauth_scopes = [ + "https://www.googleapis.com/auth/cloud-platform" + ] + + gcfs_config { + enabled = true + } + metadata = { + disable-legacy-endpoints = "true" + } + } + + network_config { + enable_private_nodes = var.is_cpu_node_private + } +} diff --git a/tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/module/outputs.tf 
b/tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/module/outputs.tf new file mode 100644 index 00000000..3953819c --- /dev/null +++ b/tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/module/outputs.tf @@ -0,0 +1,24 @@ +output "region" { + description = "GCloud Region" + value = var.region +} + +output "project_id" { + description = "GCloud Project ID" + value = var.project_id +} + +output "kubernetes_cluster_name" { + description = "GKE Cluster Name" + value = google_container_cluster.tpu_cluster.name +} + +output "authorized_cidr_blocks" { + description = "Cluster allowed cidr blocks " + value = var.authorized_cidr_blocks +} + +output "is_cpu_node_private" { + description = "whether we want to make CPU node private" + value = var.is_cpu_node_private +} diff --git a/tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/module/terraform.tfvars b/tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/module/terraform.tfvars new file mode 100644 index 00000000..bdda5d5e --- /dev/null +++ b/tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/module/terraform.tfvars @@ -0,0 +1,11 @@ +project_id = "project-id" +region = "us-east5" +resource_name_prefix = "tpu-test" +authorized_cidr_blocks = [] +cpu_node_pool = { + zone = ["us-east5-a", "us-east5-b", "us-east5-c"] + machine_type = "n2-standard-64", + min_node_count_per_zone = 1, + initial_node_count_per_zone = 1, + max_node_count_per_zone = 10 +} diff --git a/tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/module/variables.tf b/tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/module/variables.tf new file mode 100644 index 00000000..df05e43d --- /dev/null +++ b/tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/module/variables.tf @@ -0,0 +1,56 @@ +/** + * Copyright 2023 Google LLC + * + * Licensed under the Apache 
License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +variable "project_id" { + description = "project id" +} + +variable "region" { + description = "region" +} + +variable "resource_name_prefix" { + default = "" + description = "prefix for all the resource naming" +} + +variable "authorized_cidr_blocks" { + description = "cluster allowed cidr blocks to access with kubectl CLI" + type = list(string) + default = [] +} + +variable "cpu_node_pool" { + description = "cpu nodepool config" + type = object({ + zone = list(string), + machine_type = string, + initial_node_count_per_zone = number, + min_node_count_per_zone = number, + max_node_count_per_zone = number + }) + validation { + condition = ( + (var.cpu_node_pool.min_node_count_per_zone >= 0 && var.cpu_node_pool.min_node_count_per_zone <= var.cpu_node_pool.max_node_count_per_zone) + ) + error_message = "cpu_node_pool.min_node_count_per_zone must be >= 0 and <= cpu_node_pool.max_node_count_per_zone." 
+ } +} + +variable "is_cpu_node_private" { + description = "whether we want to make CPU node private" + default = false +} diff --git a/tools/kubernetes/terraform/examples/v4/terraform.tfvars b/tools/kubernetes/terraform/examples/v4/terraform.tfvars deleted file mode 100644 index c382f146..00000000 --- a/tools/kubernetes/terraform/examples/v4/terraform.tfvars +++ /dev/null @@ -1,9 +0,0 @@ -project_id = "project-id" -resource_name_prefix = "tpu-test" -region = "us-central2" -tpu_node_pools = [{ - zone = "us-central2-b" - node_count = 2 - machine_type = "ct4p-hightpu-4t" - topology = "2x2x2" - }] \ No newline at end of file diff --git a/tools/kubernetes/terraform/module/terraform.tfvars b/tools/kubernetes/terraform/module/terraform.tfvars deleted file mode 100644 index a24e1f9c..00000000 --- a/tools/kubernetes/terraform/module/terraform.tfvars +++ /dev/null @@ -1,21 +0,0 @@ -project_id = "project-id" -resource_name_prefix = "tpu-test" -region = "us-central2" -location = "us-central2-b" -tpu_node_pools = [{ - zone = "us-central2-b" - node_count = 4 - machine_type = "ct4p-hightpu-4t" - topology = "2x2x4" - }, { - zone = "us-central2-b" - node_count = 4 - machine_type = "ct4p-hightpu-4t" - topology = "2x2x4" - }, { - zone = "us-central2-b" - node_count = 2 - machine_type = "ct4p-hightpu-4t" - topology = "2x2x2" -}] -maintenance_interval = "AS_NEEDED" diff --git a/tools/kubernetes/terraform/non_batching_with_compact_placement/examples/v5e/main.tf b/tools/kubernetes/terraform/non_batching_with_compact_placement/examples/v5e/main.tf new file mode 100644 index 00000000..d6970a7d --- /dev/null +++ b/tools/kubernetes/terraform/non_batching_with_compact_placement/examples/v5e/main.tf @@ -0,0 +1,25 @@ +variable "project_id" {} +variable "resource_name_prefix" {} +variable "node_pool_prefix" {} +variable "region" {} +variable "cpu_node_pool" {} +variable "tpu_node_pools" {} +variable "maintenance_interval" {} +variable "authorized_cidr_blocks" {} +variable "is_cpu_node_private" 
{} +variable "is_tpu_node_private" {} + + +module "tpu-gke" { + source = "../../module" + project_id = var.project_id + region = var.region + resource_name_prefix = var.resource_name_prefix + node_pool_prefix = var.node_pool_prefix + tpu_node_pools = var.tpu_node_pools + cpu_node_pool = var.cpu_node_pool + maintenance_interval = var.maintenance_interval + authorized_cidr_blocks = var.authorized_cidr_blocks + is_tpu_node_private = var.is_tpu_node_private + is_cpu_node_private = var.is_cpu_node_private +} diff --git a/tools/kubernetes/terraform/non_batching_with_compact_placement/examples/v5e/outputs.tf b/tools/kubernetes/terraform/non_batching_with_compact_placement/examples/v5e/outputs.tf new file mode 100644 index 00000000..849d16c8 --- /dev/null +++ b/tools/kubernetes/terraform/non_batching_with_compact_placement/examples/v5e/outputs.tf @@ -0,0 +1,39 @@ +output "region" { + description = "GCloud Region" + value = var.region +} + +output "project_id" { + description = "GCloud Project ID" + value = var.project_id +} + +output "kubernetes_cluster_name" { + description = "GKE Cluster Name" + value = module.tpu-gke.kubernetes_cluster_name +} + +output "kubernetes_cluster_host" { + description = "GKE Cluster Host" + value = module.tpu-gke.kubernetes_cluster_host +} + +output "placement_policy_names" { + description = "GKE TPU Placement Policy Names" + value = module.tpu-gke.placement_policy_names +} + +output "authorized_cidr_blocks" { + description = "Cluster allowed cidr blocks " + value = var.authorized_cidr_blocks +} + +output "is_cpu_node_private" { + description = "whether we want to make CPU node private" + value = var.is_cpu_node_private +} + +output "is_tpu_node_private" { + description = "whether we want to make TPU node private" + value = var.is_tpu_node_private +} diff --git a/tools/kubernetes/terraform/examples/v5e/terraform.tfvars b/tools/kubernetes/terraform/non_batching_with_compact_placement/examples/v5e/terraform.tfvars similarity index 52% rename 
from tools/kubernetes/terraform/examples/v5e/terraform.tfvars rename to tools/kubernetes/terraform/non_batching_with_compact_placement/examples/v5e/terraform.tfvars index 73f76abb..45a65aaf 100644 --- a/tools/kubernetes/terraform/examples/v5e/terraform.tfvars +++ b/tools/kubernetes/terraform/non_batching_with_compact_placement/examples/v5e/terraform.tfvars @@ -1,101 +1,120 @@ -project_id = "project_id" +project_id = "project-id" resource_name_prefix = "tpu-v5e-test" region = "us-east5" +node_pool_prefix = "rp1" +authorized_cidr_blocks = [] +is_cpu_node_private = false +is_tpu_node_private = false +cpu_node_pool = { + zone = ["us-east5-a", "us-east5-b", "us-east5-c"] + machine_type = "n2-standard-8", + initial_node_count_per_zone = 1, + min_node_count_per_zone = 1, + max_node_count_per_zone = 30, +} tpu_node_pools = [{ zone = "us-east5-b" node_count = 64 machine_type = "ct5lp-hightpu-4t" topology = "16x16" - policy = "sb-compact-4a" + policy = "sb-compact-rp1" + disk_type = "pd-balanced" + disk_size_gb = 120 }, { zone = "us-east5-b" node_count = 64 machine_type = "ct5lp-hightpu-4t" topology = "16x16" - policy = "sb-compact-4a" + policy = "sb-compact-rp1" + disk_type = "pd-balanced" + disk_size_gb = 120 }, { zone = "us-east5-b" node_count = 64 machine_type = "ct5lp-hightpu-4t" topology = "16x16" - policy = "sb-compact-4a" + policy = "sb-compact-rp1" + disk_type = "pd-balanced" + disk_size_gb = 120 }, { zone = "us-east5-b" node_count = 64 machine_type = "ct5lp-hightpu-4t" topology = "16x16" - policy = "sb-compact-4a" + policy = "sb-compact-rp1" + disk_type = "pd-balanced" + disk_size_gb = 120 }, { zone = "us-east5-b" node_count = 64 machine_type = "ct5lp-hightpu-4t" topology = "16x16" - policy = "sb-compact-4b" + policy = "sb-compact-rp1" + disk_type = "pd-balanced" + disk_size_gb = 120 }, { zone = "us-east5-b" node_count = 64 machine_type = "ct5lp-hightpu-4t" topology = "16x16" - policy = "sb-compact-4b" + policy = "sb-compact-rp1" + disk_type = "pd-balanced" + 
disk_size_gb = 120 }, { zone = "us-east5-b" node_count = 64 machine_type = "ct5lp-hightpu-4t" topology = "16x16" - policy = "sb-compact-4b" + policy = "sb-compact-rp1" + disk_type = "pd-balanced" + disk_size_gb = 120 }, { zone = "us-east5-b" node_count = 64 machine_type = "ct5lp-hightpu-4t" topology = "16x16" - policy = "sb-compact-4b" + policy = "sb-compact-rp1" + disk_type = "pd-balanced" + disk_size_gb = 120 }, { zone = "us-east5-b" node_count = 64 machine_type = "ct5lp-hightpu-4t" topology = "16x16" - policy = "sb-compact-4c" + policy = "sb-compact-rp1" + disk_type = "pd-balanced" + disk_size_gb = 120 }, { zone = "us-east5-b" node_count = 64 machine_type = "ct5lp-hightpu-4t" topology = "16x16" - policy = "sb-compact-4c" + policy = "sb-compact-rp1" + disk_type = "pd-balanced" + disk_size_gb = 120 }, { zone = "us-east5-b" node_count = 64 machine_type = "ct5lp-hightpu-4t" topology = "16x16" - policy = "sb-compact-4c" + policy = "sb-compact-rp1" + disk_type = "pd-balanced" + disk_size_gb = 120 }, { zone = "us-east5-b" node_count = 64 machine_type = "ct5lp-hightpu-4t" topology = "16x16" - policy = "sb-compact-4c" + policy = "sb-compact-rp1" + disk_type = "pd-balanced" + disk_size_gb = 120 }, { zone = "us-east5-b" node_count = 64 machine_type = "ct5lp-hightpu-4t" topology = "16x16" - policy = "sb-compact-4d" - }, { - zone = "us-east5-b" - node_count = 64 - machine_type = "ct5lp-hightpu-4t" - topology = "16x16" - policy = "sb-compact-4d" - }, { - zone = "us-east5-b" - node_count = 64 - machine_type = "ct5lp-hightpu-4t" - topology = "16x16" - policy = "sb-compact-4d" - }, { - zone = "us-east5-b" - node_count = 64 - machine_type = "ct5lp-hightpu-4t" - topology = "16x16" - policy = "sb-compact-4d" + policy = "sb-compact-rp1" + disk_type = "pd-balanced" + disk_size_gb = 120 }] maintenance_interval = "PERIODIC" diff --git a/tools/kubernetes/terraform/module/main.tf b/tools/kubernetes/terraform/non_batching_with_compact_placement/module/main.tf similarity index 66% rename 
from tools/kubernetes/terraform/module/main.tf rename to tools/kubernetes/terraform/non_batching_with_compact_placement/module/main.tf index 6c551df3..02ef424c 100644 --- a/tools/kubernetes/terraform/module/main.tf +++ b/tools/kubernetes/terraform/non_batching_with_compact_placement/module/main.tf @@ -53,17 +53,35 @@ resource "google_container_cluster" "tpu_cluster" { cluster_ipv4_cidr_block = "/14" services_ipv4_cidr_block = "/20" } - default_max_pods_per_node = 50 + default_max_pods_per_node = 15 release_channel { channel = "UNSPECIFIED" } - + network = google_compute_network.vpc.name subnetwork = google_compute_subnetwork.subnet.name logging_service = "logging.googleapis.com/kubernetes" monitoring_service = "monitoring.googleapis.com/kubernetes" + master_authorized_networks_config { + gcp_public_cidrs_access_enabled = false + + dynamic "cidr_blocks" { + for_each = var.authorized_cidr_blocks + content { + cidr_block = cidr_blocks.value + display_name = "cidr-blocks-group-${cidr_blocks.key}" + } + } + } + + // Needs to be false when creating a PSC-based GKE cluster. + // After that, set as true to disable public endpoint of cluster master. 
+ private_cluster_config { + enable_private_endpoint = false + } + timeouts { create = "120m" update = "120m" @@ -81,7 +99,7 @@ resource "google_container_node_pool" "multihost_tpu" { cluster = google_container_cluster.tpu_cluster.name initial_node_count = var.tpu_node_pools[count.index].node_count - + management { auto_upgrade = false } @@ -104,16 +122,53 @@ resource "google_container_node_pool" "multihost_tpu" { gcfs_config { enabled = true } - - image_type = "COS_CONTAINERD" + + image_type = "COS_CONTAINERD" machine_type = var.tpu_node_pools[count.index].machine_type tags = ["gke-node"] metadata = { disable-legacy-endpoints = "true" } } + placement_policy { - type = "COMPACT" - policy_name = var.tpu_node_pools[count.index].policy + type = "COMPACT" + policy_name = var.tpu_node_pools[count.index].policy + } + + network_config { + enable_private_nodes = var.is_tpu_node_private + } +} + +resource "google_container_node_pool" "cpu_node_pool" { + provider = google-beta + project = var.project_id + name = "cpu-node-pool" + location = var.region + node_locations = var.cpu_node_pool.zone + cluster = google_container_cluster.tpu_cluster.name + initial_node_count = var.cpu_node_pool.initial_node_count_per_zone + autoscaling { + min_node_count = var.cpu_node_pool.min_node_count_per_zone + max_node_count = var.cpu_node_pool.max_node_count_per_zone + } + max_pods_per_node = 63 + node_config { + oauth_scopes = [ + "https://www.googleapis.com/auth/cloud-platform" + ] + machine_type = var.cpu_node_pool.machine_type + + metadata = { + disable-legacy-endpoints = "true" + } + gcfs_config { + enabled = true + } + } + + network_config { + enable_private_nodes = var.is_cpu_node_private } } diff --git a/tools/kubernetes/terraform/non_batching_with_compact_placement/module/outputs.tf b/tools/kubernetes/terraform/non_batching_with_compact_placement/module/outputs.tf new file mode 100644 index 00000000..4530fefa --- /dev/null +++ 
b/tools/kubernetes/terraform/non_batching_with_compact_placement/module/outputs.tf @@ -0,0 +1,41 @@ +output "region" { + value = var.region + description = "GCloud Region" +} + +output "project_id" { + value = var.project_id + description = "GCloud Project ID" +} + +output "kubernetes_cluster_name" { + value = google_container_cluster.tpu_cluster.name + description = "GKE Cluster Name" +} + +output "kubernetes_cluster_host" { + value = google_container_cluster.tpu_cluster.endpoint + description = "GKE Cluster Host" +} + +output "placement_policy_names" { + value = flatten([ + google_container_node_pool.multihost_tpu[*].placement_policy[0].policy_name + ]) + description = "GKE TPU Placement Policy Names" +} + +output "authorized_cidr_blocks" { + value = var.authorized_cidr_blocks + description = "Cluster allowed cidr blocks " +} + +output "is_cpu_node_private" { + value = var.is_cpu_node_private + description = "whether we want to make CPU node private" +} + +output "is_tpu_node_private" { + value = var.is_tpu_node_private + description = "whether we want to make TPU node private" +} diff --git a/tools/kubernetes/terraform/non_batching_with_compact_placement/module/terraform.tfvars b/tools/kubernetes/terraform/non_batching_with_compact_placement/module/terraform.tfvars new file mode 100644 index 00000000..75e43377 --- /dev/null +++ b/tools/kubernetes/terraform/non_batching_with_compact_placement/module/terraform.tfvars @@ -0,0 +1,25 @@ +project_id = "project-id" +resource_name_prefix = "tpu-test" +region = "us-east5" /* a region, not a zone: it feeds the provider region and the subnetwork region */ +authorized_cidr_blocks = [] +cpu_node_pool = { + zone = ["us-east5-a", "us-east5-b", "us-east5-c"] + machine_type = "n2-standard-64", + initial_node_count_per_zone = 1, + min_node_count_per_zone = 1, + max_node_count_per_zone = 10 +} +tpu_node_pools = [{ + zone = "us-east5-b" + node_count = 64 + machine_type = "ct5lp-hightpu-4t" + topology = "16x16" + policy = "sb-compact-rp1" /* placeholder placement policy name; the attribute is required by variables.tf */ + },{ + zone = "us-east5-b" + node_count = 64 + machine_type = "ct5lp-hightpu-4t" + topology = "16x16" + policy = "sb-compact-rp1" /* placeholder placement policy name; the attribute is required by variables.tf */ 
+}] +maintenance_interval = "AS_NEEDED" diff --git a/tools/kubernetes/terraform/non_batching_with_compact_placement/module/variables.tf b/tools/kubernetes/terraform/non_batching_with_compact_placement/module/variables.tf new file mode 100644 index 00000000..bfdc96e7 --- /dev/null +++ b/tools/kubernetes/terraform/non_batching_with_compact_placement/module/variables.tf @@ -0,0 +1,84 @@ +/** + * Copyright 2023 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +variable "project_id" { + description = "project id" +} + +variable "region" { + description = "region" +} + +variable "resource_name_prefix" { + default = "" + description = "prefix for all the resource naming" +} + +variable "node_pool_prefix" { + default = "" + description = "prefix for the node pool naming" +} + +variable "tpu_node_pools" { + description = "tpu podslice config" + type = list(object({ + zone = string, + node_count = number, + machine_type = string, + topology = string, + policy = string, + disk_type = optional(string), + disk_size_gb = optional(number), + })) +} + +variable "authorized_cidr_blocks" { + description = "cluster allowed cidr blocks to access with kubectl CLI" + type = list(string) + default = [] +} + +variable "cpu_node_pool" { + description = "cpu nodepool config" + type = object({ + zone = list(string), + machine_type = string, + initial_node_count_per_zone = number, + min_node_count_per_zone = number, + max_node_count_per_zone = number + }) + 
validation { + condition = ( + (var.cpu_node_pool.min_node_count_per_zone >= 0 && var.cpu_node_pool.min_node_count_per_zone <= var.cpu_node_pool.max_node_count_per_zone) + ) + error_message = "cpu_node_pool.min_node_count_per_zone must be >= 0 and <= cpu_node_pool.max_node_count_per_zone." + } +} + +variable "is_cpu_node_private" { + description = "whether we want to make CPU node private" + default = false +} + +variable "is_tpu_node_private" { + description = "whether we want to make TPU node private" + default = false +} + +variable "maintenance_interval" { + default = "AS_NEEDED" + description = "maintenance interval for TPU machines." +}