From 87645e6498bbc1af056d64081c0afbe4fc077f20 Mon Sep 17 00:00:00 2001 From: Kangmin Xie Date: Mon, 2 Oct 2023 03:28:19 +0000 Subject: [PATCH 1/8] Reorganize Terraform scripts by moving the sub directories. --- .../add_node_pool/examples/v4/main.tf | 15 ++ .../add_node_pool/examples/v4/outputs.tf | 24 +++ .../examples/v4/terraform.tfvars | 9 + .../add_node_pool/examples/v5e/main.tf | 19 ++ .../add_node_pool/examples/v5e/outputs.tf | 24 +++ .../examples/v5e/terraform.tfvars | 15 ++ .../add_node_pool/module/main.tf | 80 ++++++++ .../add_node_pool/module/outputs.tf | 26 +++ .../add_node_pool/module/terraform.tfvars | 20 ++ .../add_node_pool/module/variables.tf | 56 ++++++ .../create_cluster/examples/v4/main.tf | 17 ++ .../create_cluster/examples/v4/outputs.tf | 24 +++ .../examples/v4/terraform.tfvars | 16 ++ .../create_cluster/examples/v5e/main.tf | 17 ++ .../create_cluster/examples/v5e/outputs.tf | 24 +++ .../examples/v5e/terraform.tfvars | 12 ++ .../create_cluster/module/main.tf | 121 ++++++++++++ .../create_cluster/module/outputs.tf | 24 +++ .../create_cluster/module/terraform.tfvars | 11 ++ .../create_cluster/module/variables.tf | 56 ++++++ .../add_node_pool/examples/v4/main.tf | 15 ++ .../add_node_pool/examples/v4/outputs.tf | 19 ++ .../examples/v4/terraform.tfvars | 9 + .../add_node_pool/examples/v5e/main.tf | 19 ++ .../add_node_pool/examples/v5e/outputs.tf | 19 ++ .../examples/v5e/terraform.tfvars | 14 ++ .../add_node_pool/module/main.tf | 80 ++++++++ .../add_node_pool/module/outputs.tf | 19 ++ .../add_node_pool/module/terraform.tfvars | 20 ++ .../add_node_pool/module/variables.tf | 55 ++++++ .../create_cluster/examples/v4/main.tf | 17 ++ .../create_cluster/examples/v4/outputs.tf | 19 ++ .../examples/v4/terraform.tfvars | 16 ++ .../create_cluster/examples/v5e/main.tf | 17 ++ .../create_cluster/examples/v5e/outputs.tf | 24 +++ .../examples/v5e/terraform.tfvars | 12 ++ .../create_cluster/module/main.tf | 121 ++++++++++++ .../create_cluster/module/outputs.tf | 24 
+++ .../create_cluster/module/terraform.tf | 11 ++ .../create_cluster/module/variables.tf | 56 ++++++ .../examples/v4/main.tf | 15 ++ .../examples/v4/outputs.tf | 24 +++ .../examples/v4/terraform.tfvars | 9 + .../module/main.tf | 174 ++++++++++++++++++ .../module/outputs.tf | 41 +++++ .../module/terraform.tfvars | 28 +++ .../module/variables.tf | 84 +++++++++ 47 files changed, 1571 insertions(+) create mode 100644 tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/examples/v4/main.tf create mode 100644 tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/examples/v4/outputs.tf create mode 100644 tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/examples/v4/terraform.tfvars create mode 100644 tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/examples/v5e/main.tf create mode 100644 tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/examples/v5e/outputs.tf create mode 100644 tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/examples/v5e/terraform.tfvars create mode 100644 tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/module/main.tf create mode 100644 tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/module/outputs.tf create mode 100644 tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/module/terraform.tfvars create mode 100644 tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/module/variables.tf create mode 100644 tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/examples/v4/main.tf create mode 100644 tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/examples/v4/outputs.tf create mode 100644 tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/examples/v4/terraform.tfvars create mode 100644 
tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/examples/v5e/main.tf create mode 100644 tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/examples/v5e/outputs.tf create mode 100644 tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/examples/v5e/terraform.tfvars create mode 100644 tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/module/main.tf create mode 100644 tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/module/outputs.tf create mode 100644 tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/module/terraform.tfvars create mode 100644 tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/module/variables.tf create mode 100644 tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/examples/v4/main.tf create mode 100644 tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/examples/v4/outputs.tf create mode 100644 tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/examples/v4/terraform.tfvars create mode 100644 tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/examples/v5e/main.tf create mode 100644 tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/examples/v5e/outputs.tf create mode 100644 tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/examples/v5e/terraform.tfvars create mode 100644 tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/module/main.tf create mode 100644 tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/module/outputs.tf create mode 100644 tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/module/terraform.tfvars create mode 100644 tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/module/variables.tf create mode 100644 
tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/examples/v4/main.tf create mode 100644 tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/examples/v4/outputs.tf create mode 100644 tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/examples/v4/terraform.tfvars create mode 100644 tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/examples/v5e/main.tf create mode 100644 tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/examples/v5e/outputs.tf create mode 100644 tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/examples/v5e/terraform.tfvars create mode 100644 tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/module/main.tf create mode 100644 tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/module/outputs.tf create mode 100644 tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/module/terraform.tf create mode 100644 tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/module/variables.tf create mode 100644 tools/kubernetes/terraform/non_batching_with_compact_placement/examples/v4/main.tf create mode 100644 tools/kubernetes/terraform/non_batching_with_compact_placement/examples/v4/outputs.tf create mode 100644 tools/kubernetes/terraform/non_batching_with_compact_placement/examples/v4/terraform.tfvars create mode 100644 tools/kubernetes/terraform/non_batching_with_compact_placement/module/main.tf create mode 100644 tools/kubernetes/terraform/non_batching_with_compact_placement/module/outputs.tf create mode 100644 tools/kubernetes/terraform/non_batching_with_compact_placement/module/terraform.tfvars create mode 100644 tools/kubernetes/terraform/non_batching_with_compact_placement/module/variables.tf diff --git a/tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/examples/v4/main.tf 
b/tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/examples/v4/main.tf
new file mode 100644
index 00000000..c3b6990c
--- /dev/null
+++ b/tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/examples/v4/main.tf
@@ -0,0 +1,15 @@
+variable "project_id" {}
+variable "resource_name_prefix" {}
+variable "region" {}
+variable "tpu_node_pools" {} # each entry needs zone, node_count, machine_type, topology and policy (see ../../module/variables.tf)
+variable "maintenance_interval" { default = "AS_NEEDED" } # same default as the module; this example's tfvars does not set it
+
+
+module "tpu-gke" {
+  source               = "../../module"
+  project_id           = var.project_id
+  resource_name_prefix = var.resource_name_prefix
+  region               = var.region
+  tpu_node_pools       = var.tpu_node_pools
+  maintenance_interval = var.maintenance_interval
+}
diff --git a/tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/examples/v4/outputs.tf b/tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/examples/v4/outputs.tf
new file mode 100644
index 00000000..78a05be8
--- /dev/null
+++ b/tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/examples/v4/outputs.tf
@@ -0,0 +1,24 @@
+output "region" {
+  value       = var.region
+  description = "GCloud Region"
+}
+
+output "project_id" {
+  value       = var.project_id
+  description = "GCloud Project ID"
+}
+
+output "kubernetes_cluster_name" {
+  value       = module.tpu-gke.kubernetes_cluster_name
+  description = "GKE Cluster Name"
+}
+
+output "placement_policy_names" {
+  value       = module.tpu-gke.placement_policy_names
+  description = "GKE TPU Placement Policy Names"
+}
+
+output "is_tpu_node_private" {
+  value       = module.tpu-gke.is_tpu_node_private
+  description = "whether we want to make TPU node private"
+}
diff --git a/tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/examples/v4/terraform.tfvars b/tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/examples/v4/terraform.tfvars
new file mode 100644
index 00000000..84f60850
--- /dev/null
+++ b/tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/examples/v4/terraform.tfvars
@@ -0,0 +1,9 @@
+project_id
= "project-id" +resource_name_prefix = "tpu-test" +region = "us-central2" +tpu_node_pools = [{ + zone = "us-central2-b" + node_count = 2 + machine_type = "ct4p-hightpu-4t" + topology = "2x2x2" +}] diff --git a/tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/examples/v5e/main.tf b/tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/examples/v5e/main.tf new file mode 100644 index 00000000..61ac2331 --- /dev/null +++ b/tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/examples/v5e/main.tf @@ -0,0 +1,19 @@ +variable "project_id" {} +variable "resource_name_prefix" {} +variable "node_pool_prefix" {} +variable "region" {} +variable "tpu_node_pools" {} +variable "maintenance_interval" {} +variable "is_tpu_node_private" {} + + +module "tpu-gke" { + source = "../../module" + project_id = var.project_id + resource_name_prefix = var.resource_name_prefix + node_pool_prefix = var.node_pool_prefix + region = var.region + tpu_node_pools = var.tpu_node_pools + maintenance_interval = var.maintenance_interval + is_tpu_node_private = var.is_tpu_node_private +} diff --git a/tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/examples/v5e/outputs.tf b/tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/examples/v5e/outputs.tf new file mode 100644 index 00000000..ebb1782f --- /dev/null +++ b/tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/examples/v5e/outputs.tf @@ -0,0 +1,24 @@ +output "region" { + value = var.region + description = "GCloud Region" +} + +output "project_id" { + value = var.project_id + description = "GCloud Project ID" +} + +output "kubernetes_cluster_name" { + value = module.tpu-gke.kubernetes_cluster_name + description = "GKE Cluster Name" +} + +output "placement_policy_names" { + value = module.tpu-gke.placement_policy_names + description = "GKE TPU Placement Policy Names" +} + +output "is_tpu_node_private" { + value = 
var.is_tpu_node_private + description = "whether we want to make TPU node private" +} diff --git a/tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/examples/v5e/terraform.tfvars b/tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/examples/v5e/terraform.tfvars new file mode 100644 index 00000000..ac742690 --- /dev/null +++ b/tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/examples/v5e/terraform.tfvars @@ -0,0 +1,15 @@ +project_id = "project-id" +resource_name_prefix = "tpu-v5e-test" +node_pool_prefix = "rp1" +region = "us-east5" +is_tpu_node_private = false +tpu_node_pools = [{ + zone = "us-east5-b" + node_count = 32 + machine_type = "ct5lp-hightpu-4t" + topology = "8x16" + policy = "sb-compact-rp1" + disk_type = "pd-balanced" + disk_size_gb = 120 +}] +maintenance_interval = "PERIODIC" diff --git a/tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/module/main.tf b/tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/module/main.tf new file mode 100644 index 00000000..8df883bc --- /dev/null +++ b/tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/module/main.tf @@ -0,0 +1,80 @@ +/** + * Copyright 2023 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +# GKE cluster +data "google_container_engine_versions" "gke_version" { + location = var.region + version_prefix = "1.27." 
+} + +provider "google" { + project = var.project_id + region = var.region +} + +# Separately Managed Node Pool +resource "google_container_node_pool" "multihost_tpu" { + count = length(var.tpu_node_pools) + name = "${var.resource_name_prefix}-gke-${var.node_pool_prefix}-${count.index}" + provider = google-beta + project = var.project_id + location = var.region + node_locations = [var.tpu_node_pools[count.index].zone] + cluster = "${var.resource_name_prefix}-gke-cluster" + + initial_node_count = var.tpu_node_pools[count.index].node_count + + management { + auto_upgrade = false + } + + node_config { + oauth_scopes = [ + "https://www.googleapis.com/auth/logging.write", + "https://www.googleapis.com/auth/monitoring", + "https://www.googleapis.com/auth/cloud-platform", + ] + host_maintenance_policy { + maintenance_interval = var.maintenance_interval + } + labels = { + env = var.project_id + } + gvnic { + enabled = true + } + gcfs_config { + enabled = true + } + + image_type = "COS_CONTAINERD" + machine_type = var.tpu_node_pools[count.index].machine_type + disk_type = var.tpu_node_pools[count.index].disk_type + disk_size_gb = var.tpu_node_pools[count.index].disk_size_gb + tags = ["gke-node"] + metadata = { + disable-legacy-endpoints = "true" + } + } + placement_policy { + type = "COMPACT" + policy_name = var.tpu_node_pools[count.index].policy + } + + network_config { + enable_private_nodes = var.is_tpu_node_private + } +} diff --git a/tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/module/outputs.tf b/tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/module/outputs.tf new file mode 100644 index 00000000..68085ceb --- /dev/null +++ b/tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/module/outputs.tf @@ -0,0 +1,26 @@ +output "region" { + value = var.region + description = "GCloud Region" +} + +output "project_id" { + value = var.project_id + description = "GCloud Project ID" +} + +output 
"kubernetes_cluster_name" { + value = google_container_node_pool.multihost_tpu[0].cluster + description = "GKE Cluster Name" +} + +output "placement_policy_names" { + value = flatten([ + google_container_node_pool.multihost_tpu[*].placement_policy[0].policy_name + ]) + description = "GKE TPU Placement Policy Names" +} + +output "is_tpu_node_private" { + value = var.is_tpu_node_private + description = "whether we want to make TPU node private" +} diff --git a/tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/module/terraform.tfvars b/tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/module/terraform.tfvars new file mode 100644 index 00000000..e44d8dec --- /dev/null +++ b/tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/module/terraform.tfvars @@ -0,0 +1,20 @@ +project_id = "project-id" +resource_name_prefix = "tpu-test" +region = "us-central2" +tpu_node_pools = [{ + zone = "us-central2-b" + node_count = 4 + machine_type = "ct4p-hightpu-4t" + topology = "2x2x4" + }, { + zone = "us-central2-b" + node_count = 4 + machine_type = "ct4p-hightpu-4t" + topology = "2x2x4" + }, { + zone = "us-central2-b" + node_count = 2 + machine_type = "ct4p-hightpu-4t" + topology = "2x2x2" +}] +maintenance_interval = "AS_NEEDED" diff --git a/tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/module/variables.tf b/tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/module/variables.tf new file mode 100644 index 00000000..fa5d507d --- /dev/null +++ b/tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/module/variables.tf @@ -0,0 +1,56 @@ +/** + * Copyright 2023 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+variable "project_id" {
+  description = "project id"
+}
+
+variable "region" {
+  description = "region"
+}
+
+variable "resource_name_prefix" {
+  default     = ""
+  description = "prefix for all the resource naming"
+}
+
+variable "node_pool_prefix" {
+  default     = ""
+  description = "prefix inserted into each TPU node pool name, after the resource name prefix"
+}
+
+variable "tpu_node_pools" {
+  description = "tpu podslice config"
+  type = list(object({
+    zone         = string,
+    node_count   = number,
+    machine_type = string,
+    topology     = string,
+    policy       = string,
+    disk_type    = optional(string),
+    disk_size_gb = optional(number),
+  }))
+}
+
+variable "is_tpu_node_private" {
+  description = "whether we want to make TPU node private"
+  default     = false
+}
+
+variable "maintenance_interval" {
+  default     = "AS_NEEDED"
+  description = "maintenance interval for TPU machines."
+}
diff --git a/tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/examples/v4/main.tf b/tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/examples/v4/main.tf
new file mode 100644
index 00000000..304251dc
--- /dev/null
+++ b/tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/examples/v4/main.tf
@@ -0,0 +1,17 @@
+variable "project_id" {}
+variable "resource_name_prefix" {}
+variable "region" {}
+variable "tpu_node_pools" {} # still set in terraform.tfvars; unused here because ../../module declares no TPU inputs
+variable "cpu_node_pool" {}
+variable "authorized_cidr_blocks" { default = [] }
+variable "is_cpu_node_private" { default = false }
+
+module "tpu-gke" {
+  source                 = "../../module"
+  project_id             = var.project_id
+  resource_name_prefix   = var.resource_name_prefix
+  region                 = var.region
+  cpu_node_pool          = var.cpu_node_pool
+  is_cpu_node_private    = var.is_cpu_node_private
+  authorized_cidr_blocks = var.authorized_cidr_blocks
+}
diff --git a/tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/examples/v4/outputs.tf b/tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/examples/v4/outputs.tf
new file mode 100644
index 00000000..78a05be8
--- /dev/null
+++ b/tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/examples/v4/outputs.tf
@@ -0,0 +1,24 @@
+output "region" {
+  value       = var.region
+  description = "GCloud Region"
+}
+
+output "project_id" {
+  value       = var.project_id
+  description = "GCloud Project ID"
+}
+
+output "kubernetes_cluster_name" {
+  value       = module.tpu-gke.kubernetes_cluster_name
+  description = "GKE Cluster Name"
+}
+
+output "authorized_cidr_blocks" {
+  value       = var.authorized_cidr_blocks
+  description = "Cluster allowed cidr blocks "
+}
+
+output "is_cpu_node_private" {
+  value       = var.is_cpu_node_private
+  description = "whether we want to make CPU node private"
+}
diff --git a/tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/examples/v4/terraform.tfvars b/tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/examples/v4/terraform.tfvars
new file mode
100644 index 00000000..20ecf2ca --- /dev/null +++ b/tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/examples/v4/terraform.tfvars @@ -0,0 +1,16 @@ +project_id = "project-id" +resource_name_prefix = "tpu-test" +region = "us-central2" +tpu_node_pools = [{ + zone = "us-central2-b" + node_count = 2 + machine_type = "ct4p-hightpu-4t" + topology = "2x2x2" +}] +cpu_node_pool = { + zone = ["us-central2-a", "us-central2-b", "us-central2-c"] + machine_type = "n2-standard-8", + initial_node_count_per_zone = 1, + min_node_count_per_zone = 1, + max_node_count_per_zone = 30, +} diff --git a/tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/examples/v5e/main.tf b/tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/examples/v5e/main.tf new file mode 100644 index 00000000..7cce20f2 --- /dev/null +++ b/tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/examples/v5e/main.tf @@ -0,0 +1,17 @@ +variable "project_id" {} +variable "resource_name_prefix" {} +variable "region" {} +variable "cpu_node_pool" {} +variable "authorized_cidr_blocks" {} +variable "is_cpu_node_private" {} + + +module "tpu-gke" { + source = "../../module" + project_id = var.project_id + resource_name_prefix = var.resource_name_prefix + region = var.region + cpu_node_pool = var.cpu_node_pool + is_cpu_node_private = var.is_cpu_node_private + authorized_cidr_blocks = var.authorized_cidr_blocks +} diff --git a/tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/examples/v5e/outputs.tf b/tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/examples/v5e/outputs.tf new file mode 100644 index 00000000..a5514b1f --- /dev/null +++ b/tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/examples/v5e/outputs.tf @@ -0,0 +1,24 @@ +output "region" { + value = var.region + description = "GCloud Region" +} + +output "project_id" { + value = var.project_id + description = "GCloud 
Project ID" +} + +output "kubernetes_cluster_name" { + value = module.tpu-gke.kubernetes_cluster_name + description = "GKE Cluster Name" +} + +output "authorized_cidr_blocks" { + value = var.authorized_cidr_blocks + description = "Cluster allowed cidr blocks " +} + +output "is_cpu_node_private" { + value = var.is_cpu_node_private + description = "whether we want to make CPU node private" +} diff --git a/tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/examples/v5e/terraform.tfvars b/tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/examples/v5e/terraform.tfvars new file mode 100644 index 00000000..7f9fcb9f --- /dev/null +++ b/tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/examples/v5e/terraform.tfvars @@ -0,0 +1,12 @@ +project_id = "project-id" +resource_name_prefix = "tpu-v5e-test" +region = "us-east5" +authorized_cidr_blocks = [] +is_cpu_node_private = false +cpu_node_pool = { + zone = ["us-east5-a", "us-east5-b", "us-east5-c"] + machine_type = "n2-standard-8", + initial_node_count_per_zone = 1, + min_node_count_per_zone = 1, + max_node_count_per_zone = 30, +} diff --git a/tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/module/main.tf b/tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/module/main.tf new file mode 100644 index 00000000..81e216be --- /dev/null +++ b/tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/module/main.tf @@ -0,0 +1,121 @@ +/** + * Copyright 2023 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +# GKE cluster +data "google_container_engine_versions" "gke_version" { + location = var.region + version_prefix = "1.27." +} + +provider "google" { + project = var.project_id + region = var.region +} + +# VPC +resource "google_compute_network" "vpc" { + name = "${var.resource_name_prefix}-vpc" + auto_create_subnetworks = "false" +} + +# Subnet +resource "google_compute_subnetwork" "subnet" { + name = "${var.resource_name_prefix}-subnet" + region = var.region + network = google_compute_network.vpc.name + ip_cidr_range = "10.10.0.0/19" +} + +resource "google_container_cluster" "tpu_cluster" { + name = "${var.resource_name_prefix}-gke-cluster" + location = var.region + + # We can't create a cluster with no node pool defined, but we want to only use + # separately managed node pools. So we create the smallest possible default + # node pool and immediately delete it. 
+ remove_default_node_pool = true + initial_node_count = 1 + networking_mode = "VPC_NATIVE" + ip_allocation_policy { + cluster_ipv4_cidr_block = "/14" + services_ipv4_cidr_block = "/20" + } + default_max_pods_per_node = 15 + + release_channel { + channel = "UNSPECIFIED" + } + + network = google_compute_network.vpc.name + subnetwork = google_compute_subnetwork.subnet.name + logging_service = "logging.googleapis.com/kubernetes" + monitoring_service = "monitoring.googleapis.com/kubernetes" + + master_authorized_networks_config { + gcp_public_cidrs_access_enabled = false + + dynamic "cidr_blocks" { + for_each = var.authorized_cidr_blocks + content { + cidr_block = cidr_blocks.value + display_name = "cidr-blocks-group-${cidr_blocks.key}" + } + } + } + + // Needs to be false when creating a GKE flexible cluster. + // After that, set as true to disable public endpoint of cluster master. + private_cluster_config { + enable_private_endpoint = false + } + + timeouts { + create = "120m" + update = "120m" + } +} + +resource "google_container_node_pool" "cpu_node_pool" { + provider = google-beta + project = var.project_id + name = "cpu-node-pool" + location = var.region + node_locations = var.cpu_node_pool.zone + cluster = google_container_cluster.tpu_cluster.name + initial_node_count = var.cpu_node_pool.initial_node_count_per_zone + autoscaling { + min_node_count = var.cpu_node_pool.min_node_count_per_zone + max_node_count = var.cpu_node_pool.max_node_count_per_zone + } + max_pods_per_node = 63 + node_config { + oauth_scopes = [ + "https://www.googleapis.com/auth/cloud-platform" + ] + machine_type = var.cpu_node_pool.machine_type + + metadata = { + disable-legacy-endpoints = "true" + } + gcfs_config { + enabled = true + } + } + + network_config { + enable_private_nodes = var.is_cpu_node_private + } +} diff --git a/tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/module/outputs.tf 
b/tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/module/outputs.tf new file mode 100644 index 00000000..3953819c --- /dev/null +++ b/tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/module/outputs.tf @@ -0,0 +1,24 @@ +output "region" { + value = var.region + description = "GCloud Region" +} + +output "project_id" { + value = var.project_id + description = "GCloud Project ID" +} + +output "kubernetes_cluster_name" { + value = google_container_cluster.tpu_cluster.name + description = "GKE Cluster Name" +} + +output "authorized_cidr_blocks" { + value = var.authorized_cidr_blocks + description = "Cluster allowed cidr blocks " +} + +output "is_cpu_node_private" { + value = var.is_cpu_node_private + description = "whether we want to make CPU node private" +} diff --git a/tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/module/terraform.tfvars b/tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/module/terraform.tfvars new file mode 100644 index 00000000..f3f4e7be --- /dev/null +++ b/tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/module/terraform.tfvars @@ -0,0 +1,11 @@ +project_id = "project-id" +resource_name_prefix = "tpu-test" +region = "us-central2" +authorized_cidr_blocks = [] +cpu_node_pool = { + zone = ["us-central2-a", "us-central2-b", "us-central2-c"] + machine_type = "n2-standard-64", + initial_node_count_per_zone = 1, + min_node_count_per_zone = 1, + max_node_count_per_zone = 10 +} diff --git a/tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/module/variables.tf b/tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/module/variables.tf new file mode 100644 index 00000000..df05e43d --- /dev/null +++ b/tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/module/variables.tf @@ -0,0 +1,56 @@ +/** + * Copyright 2023 Google LLC + * + * Licensed under the Apache License, 
Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +variable "project_id" { + description = "project id" +} + +variable "region" { + description = "region" +} + +variable "resource_name_prefix" { + default = "" + description = "prefix for all the resouce naming" +} + +variable "authorized_cidr_blocks" { + description = "cluster allowed cidr blocks to access with kubectl CLI" + type = list(string) + default = [] +} + +variable "cpu_node_pool" { + description = "cpu nodepool config" + type = object({ + zone = list(string), + machine_type = string, + initial_node_count_per_zone = number, + min_node_count_per_zone = number, + max_node_count_per_zone = number + }) + validation { + condition = ( + (var.cpu_node_pool.min_node_count_per_zone >= 0 && var.cpu_node_pool.min_node_count_per_zone <= var.cpu_node_pool.max_node_count_per_zone) + ) + error_message = "cpu_node_pool.min_node_count_per_zone must be >= 0 and <= cpu_node_pool.max_node_count_per_zone." 
+ } +} + +variable "is_cpu_node_private" { + description = "whether we want to make CPU node private" + default = false +} diff --git a/tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/examples/v4/main.tf b/tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/examples/v4/main.tf new file mode 100644 index 00000000..c3b6990c --- /dev/null +++ b/tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/examples/v4/main.tf @@ -0,0 +1,15 @@ +variable "project_id" {} +variable "resource_name_prefix" {} +variable "region" {} +variable "tpu_node_pools" {} +variable "maintenance_interval" {} + + +module "tpu-gke" { + source = "../../module" + project_id = var.project_id + resource_name_prefix = var.resource_name_prefix + region = var.region + tpu_node_pools = var.tpu_node_pools + maintenance_interval = var.maintenance_interval +} diff --git a/tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/examples/v4/outputs.tf b/tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/examples/v4/outputs.tf new file mode 100644 index 00000000..44d8350f --- /dev/null +++ b/tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/examples/v4/outputs.tf @@ -0,0 +1,19 @@ +output "region" { + value = var.region + description = "GCloud Region" +} + +output "project_id" { + value = var.project_id + description = "GCloud Project ID" +} + +output "kubernetes_cluster_name" { + value = module.tpu-gke.kubernetes_cluster_name + description = "GKE Cluster Name" +} + +output "kubernetes_cluster_host" { + value = module.tpu-gke.kubernetes_cluster_host + description = "GKE Cluster Host" +} diff --git a/tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/examples/v4/terraform.tfvars b/tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/examples/v4/terraform.tfvars new file mode 100644 index 00000000..84f60850 --- /dev/null +++ 
b/tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/examples/v4/terraform.tfvars @@ -0,0 +1,9 @@ +project_id = "project-id" +resource_name_prefix = "tpu-test" +region = "us-central2" +tpu_node_pools = [{ + zone = "us-central2-b" + node_count = 2 + machine_type = "ct4p-hightpu-4t" + topology = "2x2x2" +}] diff --git a/tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/examples/v5e/main.tf b/tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/examples/v5e/main.tf new file mode 100644 index 00000000..61ac2331 --- /dev/null +++ b/tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/examples/v5e/main.tf @@ -0,0 +1,19 @@ +variable "project_id" {} +variable "resource_name_prefix" {} +variable "node_pool_prefix" {} +variable "region" {} +variable "tpu_node_pools" {} +variable "maintenance_interval" {} +variable "is_tpu_node_private" {} + + +module "tpu-gke" { + source = "../../module" + project_id = var.project_id + resource_name_prefix = var.resource_name_prefix + node_pool_prefix = var.node_pool_prefix + region = var.region + tpu_node_pools = var.tpu_node_pools + maintenance_interval = var.maintenance_interval + is_tpu_node_private = var.is_tpu_node_private +} diff --git a/tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/examples/v5e/outputs.tf b/tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/examples/v5e/outputs.tf new file mode 100644 index 00000000..846c656e --- /dev/null +++ b/tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/examples/v5e/outputs.tf @@ -0,0 +1,19 @@ +output "region" { + value = var.region + description = "GCloud Region" +} + +output "project_id" { + value = var.project_id + description = "GCloud Project ID" +} + +output "kubernetes_cluster_name" { + value = module.tpu-gke.kubernetes_cluster_name + description = "GKE Cluster Name" +} + +output "is_tpu_node_private" { + value 
= var.is_tpu_node_private + description = "whether we want to make TPU node private" +} diff --git a/tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/examples/v5e/terraform.tfvars b/tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/examples/v5e/terraform.tfvars new file mode 100644 index 00000000..e18f03ca --- /dev/null +++ b/tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/examples/v5e/terraform.tfvars @@ -0,0 +1,14 @@ +project_id = "project-id" +resource_name_prefix = "tpu-v5e-test" +node_pool_prefix = "batch1" +region = "us-east5" +is_tpu_node_private = false +tpu_node_pools = [{ + zone = "us-east5-b" + node_count = 16 + machine_type = "ct5lp-hightpu-4t" + topology = "8x8" + disk_type = "pd-balanced" + disk_size_gb = 120 +}] +maintenance_interval = "PERIODIC" diff --git a/tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/module/main.tf b/tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/module/main.tf new file mode 100644 index 00000000..0ccdbdba --- /dev/null +++ b/tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/module/main.tf @@ -0,0 +1,80 @@ +/** + * Copyright 2023 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +# GKE cluster +data "google_container_engine_versions" "gke_version" { + location = var.region + version_prefix = "1.27." 
+} + +provider "google" { + project = var.project_id + region = var.region +} + +# Separately Managed Node Pool +resource "google_container_node_pool" "multihost_tpu" { + count = length(var.tpu_node_pools) + name = "${var.resource_name_prefix}-gke-${var.node_pool_prefix}-${count.index}" + provider = google-beta + project = var.project_id + location = var.region + node_locations = [var.tpu_node_pools[count.index].zone] + cluster = "${var.resource_name_prefix}-gke-cluster" + + initial_node_count = var.tpu_node_pools[count.index].node_count + + management { + auto_upgrade = false + } + + node_config { + oauth_scopes = [ + "https://www.googleapis.com/auth/logging.write", + "https://www.googleapis.com/auth/monitoring", + "https://www.googleapis.com/auth/cloud-platform", + ] + host_maintenance_policy { + maintenance_interval = var.maintenance_interval + } + labels = { + env = var.project_id + } + gvnic { + enabled = true + } + gcfs_config { + enabled = true + } + + image_type = "COS_CONTAINERD" + machine_type = var.tpu_node_pools[count.index].machine_type + disk_type = var.tpu_node_pools[count.index].disk_type + disk_size_gb = var.tpu_node_pools[count.index].disk_size_gb + tags = ["gke-node"] + metadata = { + disable-legacy-endpoints = "true" + } + } + placement_policy { + type = "COMPACT" + tpu_topology = var.tpu_node_pools[count.index].topology + } + + network_config { + enable_private_nodes = var.is_tpu_node_private + } +} diff --git a/tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/module/outputs.tf b/tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/module/outputs.tf new file mode 100644 index 00000000..06972205 --- /dev/null +++ b/tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/module/outputs.tf @@ -0,0 +1,19 @@ +output "region" { + value = var.region + description = "GCloud Region" +} + +output "project_id" { + value = var.project_id + description = "GCloud Project ID" +} + +output 
"kubernetes_cluster_name" { + value = google_container_node_pool.multihost_tpu[0].cluster + description = "GKE Cluster Name" +} + +output "is_tpu_node_private" { + value = var.is_tpu_node_private + description = "whether we want to make TPU node private" +} diff --git a/tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/module/terraform.tfvars b/tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/module/terraform.tfvars new file mode 100644 index 00000000..e44d8dec --- /dev/null +++ b/tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/module/terraform.tfvars @@ -0,0 +1,20 @@ +project_id = "project-id" +resource_name_prefix = "tpu-test" +region = "us-central2" +tpu_node_pools = [{ + zone = "us-central2-b" + node_count = 4 + machine_type = "ct4p-hightpu-4t" + topology = "2x2x4" + }, { + zone = "us-central2-b" + node_count = 4 + machine_type = "ct4p-hightpu-4t" + topology = "2x2x4" + }, { + zone = "us-central2-b" + node_count = 2 + machine_type = "ct4p-hightpu-4t" + topology = "2x2x2" +}] +maintenance_interval = "AS_NEEDED" diff --git a/tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/module/variables.tf b/tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/module/variables.tf new file mode 100644 index 00000000..c467e69a --- /dev/null +++ b/tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/module/variables.tf @@ -0,0 +1,55 @@ +/** + * Copyright 2023 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +variable "project_id" { + description = "project id" +} + +variable "region" { + description = "region" +} + +variable "resource_name_prefix" { + default = "" + description = "prefix for all the resource naming" +} + +variable "node_pool_prefix" { + default = "" + description = "prefix for the node pool naming" +} + +variable "tpu_node_pools" { + description = "tpu podslice config" + type = list(object({ + zone = string, + node_count = number, + machine_type = string, + topology = string, + disk_type = optional(string), + disk_size_gb = optional(number), + })) +} + +variable "is_tpu_node_private" { + description = "whether we want to make TPU node private" + default = false +} + +variable "maintenance_interval" { + default = "AS_NEEDED" + description = "maintenance interval for TPU machines." +} diff --git a/tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/examples/v4/main.tf b/tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/examples/v4/main.tf new file mode 100644 index 00000000..304251dc --- /dev/null +++ b/tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/examples/v4/main.tf @@ -0,0 +1,13 @@ +variable "project_id" {} +variable "resource_name_prefix" {} +variable "region" {} +variable "cpu_node_pool" {} + + +module "tpu-gke" { + source = "../../module" + project_id = var.project_id + resource_name_prefix = var.resource_name_prefix + region = var.region + cpu_node_pool = var.cpu_node_pool # the create_cluster module only declares CPU node pool inputs +} diff --git a/tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/examples/v4/outputs.tf b/tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/examples/v4/outputs.tf new file mode 
100644 index 00000000..44d8350f --- /dev/null +++ b/tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/examples/v4/outputs.tf @@ -0,0 +1,19 @@ +output "region" { + value = var.region + description = "GCloud Region" +} + +output "project_id" { + value = var.project_id + description = "GCloud Project ID" +} + +output "kubernetes_cluster_name" { + value = module.tpu-gke.kubernetes_cluster_name + description = "GKE Cluster Name" +} + +output "is_cpu_node_private" { + value = module.tpu-gke.is_cpu_node_private # exported by ../../module/outputs.tf + description = "whether we want to make CPU node private" +} diff --git a/tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/examples/v4/terraform.tfvars b/tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/examples/v4/terraform.tfvars new file mode 100644 index 00000000..1e6c096f --- /dev/null +++ b/tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/examples/v4/terraform.tfvars @@ -0,0 +1,10 @@ +project_id = "project-id" +resource_name_prefix = "tpu-test" +region = "us-central2" +cpu_node_pool = { + zone = ["us-central2-a", "us-central2-b", "us-central2-c"] + machine_type = "n2-standard-8", + initial_node_count_per_zone = 1, + min_node_count_per_zone = 1, + max_node_count_per_zone = 30, +} diff --git a/tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/examples/v5e/main.tf b/tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/examples/v5e/main.tf new file mode 100644 index 00000000..7cce20f2 --- /dev/null +++ b/tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/examples/v5e/main.tf @@ -0,0 +1,17 @@ +variable "project_id" {} +variable "resource_name_prefix" {} +variable "region" {} +variable "cpu_node_pool" {} +variable "authorized_cidr_blocks" {} +variable "is_cpu_node_private" 
{} + + +module "tpu-gke" { + source = "../../module" + project_id = var.project_id + resource_name_prefix = var.resource_name_prefix + region = var.region + cpu_node_pool = var.cpu_node_pool + is_cpu_node_private = var.is_cpu_node_private + authorized_cidr_blocks = var.authorized_cidr_blocks +} diff --git a/tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/examples/v5e/outputs.tf b/tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/examples/v5e/outputs.tf new file mode 100644 index 00000000..a5514b1f --- /dev/null +++ b/tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/examples/v5e/outputs.tf @@ -0,0 +1,24 @@ +output "region" { + value = var.region + description = "GCloud Region" +} + +output "project_id" { + value = var.project_id + description = "GCloud Project ID" +} + +output "kubernetes_cluster_name" { + value = module.tpu-gke.kubernetes_cluster_name + description = "GKE Cluster Name" +} + +output "authorized_cidr_blocks" { + value = var.authorized_cidr_blocks + description = "Cluster allowed cidr blocks " +} + +output "is_cpu_node_private" { + value = var.is_cpu_node_private + description = "whether we want to make CPU node private" +} diff --git a/tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/examples/v5e/terraform.tfvars b/tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/examples/v5e/terraform.tfvars new file mode 100644 index 00000000..7f9fcb9f --- /dev/null +++ b/tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/examples/v5e/terraform.tfvars @@ -0,0 +1,12 @@ +project_id = "project-id" +resource_name_prefix = "tpu-v5e-test" +region = "us-east5" +authorized_cidr_blocks = [] +is_cpu_node_private = false +cpu_node_pool = { + zone = ["us-east5-a", "us-east5-b", "us-east5-c"] + machine_type = "n2-standard-8", + initial_node_count_per_zone = 1, + min_node_count_per_zone = 1, + 
max_node_count_per_zone = 30, +} diff --git a/tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/module/main.tf b/tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/module/main.tf new file mode 100644 index 00000000..81e216be --- /dev/null +++ b/tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/module/main.tf @@ -0,0 +1,121 @@ +/** + * Copyright 2023 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +# GKE cluster +data "google_container_engine_versions" "gke_version" { + location = var.region + version_prefix = "1.27." +} + +provider "google" { + project = var.project_id + region = var.region +} + +# VPC +resource "google_compute_network" "vpc" { + name = "${var.resource_name_prefix}-vpc" + auto_create_subnetworks = "false" +} + +# Subnet +resource "google_compute_subnetwork" "subnet" { + name = "${var.resource_name_prefix}-subnet" + region = var.region + network = google_compute_network.vpc.name + ip_cidr_range = "10.10.0.0/19" +} + +resource "google_container_cluster" "tpu_cluster" { + name = "${var.resource_name_prefix}-gke-cluster" + location = var.region + + # We can't create a cluster with no node pool defined, but we want to only use + # separately managed node pools. So we create the smallest possible default + # node pool and immediately delete it. 
+ remove_default_node_pool = true + initial_node_count = 1 + networking_mode = "VPC_NATIVE" + ip_allocation_policy { + cluster_ipv4_cidr_block = "/14" + services_ipv4_cidr_block = "/20" + } + default_max_pods_per_node = 15 + + release_channel { + channel = "UNSPECIFIED" + } + + network = google_compute_network.vpc.name + subnetwork = google_compute_subnetwork.subnet.name + logging_service = "logging.googleapis.com/kubernetes" + monitoring_service = "monitoring.googleapis.com/kubernetes" + + master_authorized_networks_config { + gcp_public_cidrs_access_enabled = false + + dynamic "cidr_blocks" { + for_each = var.authorized_cidr_blocks + content { + cidr_block = cidr_blocks.value + display_name = "cidr-blocks-group-${cidr_blocks.key}" + } + } + } + + // Needs to be false when creating a GKE flexible cluster. + // After that, set as true to disable public endpoint of cluster master. + private_cluster_config { + enable_private_endpoint = false + } + + timeouts { + create = "120m" + update = "120m" + } +} + +resource "google_container_node_pool" "cpu_node_pool" { + provider = google-beta + project = var.project_id + name = "cpu-node-pool" + location = var.region + node_locations = var.cpu_node_pool.zone + cluster = google_container_cluster.tpu_cluster.name + initial_node_count = var.cpu_node_pool.initial_node_count_per_zone + autoscaling { + min_node_count = var.cpu_node_pool.min_node_count_per_zone + max_node_count = var.cpu_node_pool.max_node_count_per_zone + } + max_pods_per_node = 63 + node_config { + oauth_scopes = [ + "https://www.googleapis.com/auth/cloud-platform" + ] + machine_type = var.cpu_node_pool.machine_type + + metadata = { + disable-legacy-endpoints = "true" + } + gcfs_config { + enabled = true + } + } + + network_config { + enable_private_nodes = var.is_cpu_node_private + } +} diff --git a/tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/module/outputs.tf 
b/tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/module/outputs.tf new file mode 100644 index 00000000..3953819c --- /dev/null +++ b/tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/module/outputs.tf @@ -0,0 +1,24 @@ +output "region" { + value = var.region + description = "GCloud Region" +} + +output "project_id" { + value = var.project_id + description = "GCloud Project ID" +} + +output "kubernetes_cluster_name" { + value = google_container_cluster.tpu_cluster.name + description = "GKE Cluster Name" +} + +output "authorized_cidr_blocks" { + value = var.authorized_cidr_blocks + description = "Cluster allowed cidr blocks " +} + +output "is_cpu_node_private" { + value = var.is_cpu_node_private + description = "whether we want to make CPU node private" +} diff --git a/tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/module/terraform.tfvars b/tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/module/terraform.tfvars new file mode 100644 index 00000000..f3f4e7be --- /dev/null +++ b/tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/module/terraform.tfvars @@ -0,0 +1,11 @@ +project_id = "project-id" +resource_name_prefix = "tpu-test" +region = "us-central2" +authorized_cidr_blocks = [] +cpu_node_pool = { + zone = ["us-central2-a", "us-central2-b", "us-central2-c"] + machine_type = "n2-standard-64", + initial_node_count_per_zone = 1, + min_node_count_per_zone = 1, + max_node_count_per_zone = 10 +} diff --git a/tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/module/variables.tf b/tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/module/variables.tf new file mode 100644 index 00000000..df05e43d --- /dev/null +++ b/tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/module/variables.tf @@ -0,0 +1,56 @@ +/** + * Copyright 2023 Google LLC + * + * Licensed under the Apache 
License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +variable "project_id" { + description = "project id" +} + +variable "region" { + description = "region" +} + +variable "resource_name_prefix" { + default = "" + description = "prefix for all the resource naming" +} + +variable "authorized_cidr_blocks" { + description = "cluster allowed cidr blocks to access with kubectl CLI" + type = list(string) + default = [] +} + +variable "cpu_node_pool" { + description = "cpu nodepool config" + type = object({ + zone = list(string), + machine_type = string, + initial_node_count_per_zone = number, + min_node_count_per_zone = number, + max_node_count_per_zone = number + }) + validation { + condition = ( + (var.cpu_node_pool.min_node_count_per_zone >= 0 && var.cpu_node_pool.min_node_count_per_zone <= var.cpu_node_pool.max_node_count_per_zone) + ) + error_message = "cpu_node_pool.min_node_count_per_zone must be >= 0 and <= cpu_node_pool.max_node_count_per_zone." 
+ } +} + +variable "is_cpu_node_private" { + description = "whether we want to make CPU node private" + default = false +} diff --git a/tools/kubernetes/terraform/non_batching_with_compact_placement/examples/v4/main.tf b/tools/kubernetes/terraform/non_batching_with_compact_placement/examples/v4/main.tf new file mode 100644 index 00000000..c3b6990c --- /dev/null +++ b/tools/kubernetes/terraform/non_batching_with_compact_placement/examples/v4/main.tf @@ -0,0 +1,17 @@ +variable "project_id" {} +variable "resource_name_prefix" {} +variable "region" {} +variable "tpu_node_pools" {} +variable "cpu_node_pool" {} +variable "maintenance_interval" {} + + +module "tpu-gke" { + source = "../../module" + project_id = var.project_id + resource_name_prefix = var.resource_name_prefix + region = var.region + tpu_node_pools = var.tpu_node_pools + cpu_node_pool = var.cpu_node_pool # required: the module declares cpu_node_pool without a default + maintenance_interval = var.maintenance_interval +} diff --git a/tools/kubernetes/terraform/non_batching_with_compact_placement/examples/v4/outputs.tf b/tools/kubernetes/terraform/non_batching_with_compact_placement/examples/v4/outputs.tf new file mode 100644 index 00000000..78a05be8 --- /dev/null +++ b/tools/kubernetes/terraform/non_batching_with_compact_placement/examples/v4/outputs.tf @@ -0,0 +1,24 @@ +output "region" { + value = var.region + description = "GCloud Region" +} + +output "project_id" { + value = var.project_id + description = "GCloud Project ID" +} + +output "kubernetes_cluster_name" { + value = module.tpu-gke.kubernetes_cluster_name + description = "GKE Cluster Name" +} + +output "kubernetes_cluster_host" { + value = module.tpu-gke.kubernetes_cluster_host + description = "GKE Cluster Host" +} + +output "placement_policy_names" { + value = module.tpu-gke.placement_policy_names # exported by ../../module/outputs.tf + description = "GKE TPU Placement Policy Names" +} diff --git a/tools/kubernetes/terraform/non_batching_with_compact_placement/examples/v4/terraform.tfvars b/tools/kubernetes/terraform/non_batching_with_compact_placement/examples/v4/terraform.tfvars new file mode 100644 index 00000000..84f60850 
--- /dev/null +++ b/tools/kubernetes/terraform/non_batching_with_compact_placement/examples/v4/terraform.tfvars @@ -0,0 +1,9 @@ +project_id = "project-id" +resource_name_prefix = "tpu-test" +region = "us-central2" +tpu_node_pools = [{ + zone = "us-central2-b" + node_count = 2 + machine_type = "ct4p-hightpu-4t" + topology = "2x2x2" +}] diff --git a/tools/kubernetes/terraform/non_batching_with_compact_placement/module/main.tf b/tools/kubernetes/terraform/non_batching_with_compact_placement/module/main.tf new file mode 100644 index 00000000..147f222f --- /dev/null +++ b/tools/kubernetes/terraform/non_batching_with_compact_placement/module/main.tf @@ -0,0 +1,174 @@ +/** + * Copyright 2023 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +# GKE cluster +data "google_container_engine_versions" "gke_version" { + location = var.region + version_prefix = "1.27." 
+} + +provider "google" { + project = var.project_id + region = var.region +} + +# VPC +resource "google_compute_network" "vpc" { + name = "${var.resource_name_prefix}-vpc" + auto_create_subnetworks = "false" +} + +# Subnet +resource "google_compute_subnetwork" "subnet" { + name = "${var.resource_name_prefix}-subnet" + region = var.region + network = google_compute_network.vpc.name + ip_cidr_range = "10.10.0.0/19" +} + +resource "google_container_cluster" "tpu_cluster" { + name = "${var.resource_name_prefix}-gke-cluster" + location = var.region + + # We can't create a cluster with no node pool defined, but we want to only use + # separately managed node pools. So we create the smallest possible default + # node pool and immediately delete it. + remove_default_node_pool = true + initial_node_count = 1 + networking_mode = "VPC_NATIVE" + ip_allocation_policy { + cluster_ipv4_cidr_block = "/14" + services_ipv4_cidr_block = "/20" + } + default_max_pods_per_node = 15 + + release_channel { + channel = "UNSPECIFIED" + } + + network = google_compute_network.vpc.name + subnetwork = google_compute_subnetwork.subnet.name + logging_service = "logging.googleapis.com/kubernetes" + monitoring_service = "monitoring.googleapis.com/kubernetes" + + master_authorized_networks_config { + gcp_public_cidrs_access_enabled = false + + dynamic "cidr_blocks" { + for_each = var.authorized_cidr_blocks + content { + cidr_block = cidr_blocks.value + display_name = "cidr-blocks-group-${cidr_blocks.key}" + } + } + } + + // Needs to be false when creating a GKE flexible cluster. + // After that, set as true to disable public endpoint of cluster master. 
+ private_cluster_config { + enable_private_endpoint = false + } + + timeouts { + create = "120m" + update = "120m" + } +} + +# Separately Managed Node Pool +resource "google_container_node_pool" "multihost_tpu" { + count = length(var.tpu_node_pools) + name = "${google_container_cluster.tpu_cluster.name}-${count.index}" + provider = google-beta + project = var.project_id + location = var.region + node_locations = [var.tpu_node_pools[count.index].zone] + cluster = google_container_cluster.tpu_cluster.name + + initial_node_count = var.tpu_node_pools[count.index].node_count + + management { + auto_upgrade = false + } + + node_config { + oauth_scopes = [ + "https://www.googleapis.com/auth/logging.write", + "https://www.googleapis.com/auth/monitoring", + "https://www.googleapis.com/auth/cloud-platform", + ] + host_maintenance_policy { + maintenance_interval = var.maintenance_interval + } + labels = { + env = var.project_id + } + gvnic { + enabled = true + } + gcfs_config { + enabled = true + } + + image_type = "COS_CONTAINERD" + machine_type = var.tpu_node_pools[count.index].machine_type + tags = ["gke-node"] + metadata = { + disable-legacy-endpoints = "true" + } + } + + placement_policy { + type = "COMPACT" + policy_name = var.tpu_node_pools[count.index].policy + } + + network_config { + enable_private_nodes = var.is_tpu_node_private + } +} + +resource "google_container_node_pool" "cpu_node_pool" { + provider = google-beta + project = var.project_id + name = "cpu-node-pool" + location = var.region + node_locations = var.cpu_node_pool.zone + cluster = google_container_cluster.tpu_cluster.name + initial_node_count = var.cpu_node_pool.initial_node_count_per_zone + autoscaling { + min_node_count = var.cpu_node_pool.min_node_count_per_zone + max_node_count = var.cpu_node_pool.max_node_count_per_zone + } + max_pods_per_node = 63 + node_config { + oauth_scopes = [ + "https://www.googleapis.com/auth/cloud-platform" + ] + machine_type = var.cpu_node_pool.machine_type + + 
metadata = { + disable-legacy-endpoints = "true" + } + gcfs_config { + enabled = true + } + } + + network_config { + enable_private_nodes = var.is_cpu_node_private + } +} diff --git a/tools/kubernetes/terraform/non_batching_with_compact_placement/module/outputs.tf b/tools/kubernetes/terraform/non_batching_with_compact_placement/module/outputs.tf new file mode 100644 index 00000000..4530fefa --- /dev/null +++ b/tools/kubernetes/terraform/non_batching_with_compact_placement/module/outputs.tf @@ -0,0 +1,41 @@ +output "region" { + value = var.region + description = "GCloud Region" +} + +output "project_id" { + value = var.project_id + description = "GCloud Project ID" +} + +output "kubernetes_cluster_name" { + value = google_container_cluster.tpu_cluster.name + description = "GKE Cluster Name" +} + +output "kubernetes_cluster_host" { + value = google_container_cluster.tpu_cluster.endpoint + description = "GKE Cluster Host" +} + +output "placement_policy_names" { + value = flatten([ + google_container_node_pool.multihost_tpu[*].placement_policy[0].policy_name + ]) + description = "GKE TPU Placement Policy Names" +} + +output "authorized_cidr_blocks" { + value = var.authorized_cidr_blocks + description = "Cluster allowed cidr blocks " +} + +output "is_cpu_node_private" { + value = var.is_cpu_node_private + description = "whether we want to make CPU node private" +} + +output "is_tpu_node_private" { + value = var.is_tpu_node_private + description = "whether we want to make TPU node private" +} diff --git a/tools/kubernetes/terraform/non_batching_with_compact_placement/module/terraform.tfvars b/tools/kubernetes/terraform/non_batching_with_compact_placement/module/terraform.tfvars new file mode 100644 index 00000000..8f63265f --- /dev/null +++ b/tools/kubernetes/terraform/non_batching_with_compact_placement/module/terraform.tfvars @@ -0,0 +1,28 @@ +project_id = "project-id" +resource_name_prefix = "tpu-test" +region = "us-central2" +authorized_cidr_blocks = [] 
+cpu_node_pool = { + zone = ["us-central2-a", "us-central2-b", "us-central2-c"] + machine_type = "n2-standard-64", + initial_node_count_per_zone = 1, + min_node_count_per_zone = 1, + max_node_count_per_zone = 10 +} +tpu_node_pools = [{ + zone = "us-central2-b" + node_count = 4 + machine_type = "ct4p-hightpu-4t" + topology = "2x2x4" + }, { + zone = "us-central2-b" + node_count = 4 + machine_type = "ct4p-hightpu-4t" + topology = "2x2x4" + }, { + zone = "us-central2-b" + node_count = 2 + machine_type = "ct4p-hightpu-4t" + topology = "2x2x2" +}] +maintenance_interval = "AS_NEEDED" diff --git a/tools/kubernetes/terraform/non_batching_with_compact_placement/module/variables.tf b/tools/kubernetes/terraform/non_batching_with_compact_placement/module/variables.tf new file mode 100644 index 00000000..bfdc96e7 --- /dev/null +++ b/tools/kubernetes/terraform/non_batching_with_compact_placement/module/variables.tf @@ -0,0 +1,84 @@ +/** + * Copyright 2023 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +variable "project_id" { + description = "project id" +} + +variable "region" { + description = "region" +} + +variable "resource_name_prefix" { + default = "" + description = "prefix for all the resource naming" +} + +variable "node_pool_prefix" { + default = "" + description = "prefix for all the resource naming" +} + +variable "tpu_node_pools" { + description = "tpu podslice config" + type = list(object({ + zone = string, + node_count = number, + machine_type = string, + topology = string, + policy = string, + disk_type = optional(string), + disk_size_gb = optional(number), + })) +} + +variable "authorized_cidr_blocks" { + description = "cluster allowed cidr blocks to access with kubectl CLI" + type = list(string) + default = [] +} + +variable "cpu_node_pool" { + description = "cpu nodepool config" + type = object({ + zone = list(string), + machine_type = string, + initial_node_count_per_zone = number, + min_node_count_per_zone = number, + max_node_count_per_zone = number + }) + validation { + condition = ( + (var.cpu_node_pool.min_node_count_per_zone >= 0 && var.cpu_node_pool.min_node_count_per_zone <= var.cpu_node_pool.max_node_count_per_zone) + ) + error_message = "cpu_node_pool.min_node_count_per_zone must be >= 0 and <= cpu_node_pool.max_node_count_per_zone." + } +} + +variable "is_cpu_node_private" { + description = "whether we want to make CPU node private" + default = false +} + +variable "is_tpu_node_private" { + description = "whether we want to make TPU node private" + default = false +} + +variable "maintenance_interval" { + default = "AS_NEEDED" + description = "maintenance interval for TPU machines." 
+} From e3efb57e190a55605ffc1c2d7871ddb0376d3f61 Mon Sep 17 00:00:00 2001 From: Kangmin Xie Date: Mon, 2 Oct 2023 19:58:39 +0000 Subject: [PATCH 2/8] Updated the flexible to PSC-based cluster type --- .../create_cluster/module/main.tf | 2 +- .../create_cluster/module/main.tf | 2 +- .../non_batching_with_compact_placement/module/main.tf | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/module/main.tf b/tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/module/main.tf index 81e216be..6596e498 100644 --- a/tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/module/main.tf +++ b/tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/module/main.tf @@ -76,7 +76,7 @@ resource "google_container_cluster" "tpu_cluster" { } } - // Needs to be false when creating a GKE flexible cluster. + // Needs to be false when creating a PSC-based GKE cluster. // After that, set as true to disable public endpoint of cluster master. private_cluster_config { enable_private_endpoint = false diff --git a/tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/module/main.tf b/tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/module/main.tf index 81e216be..6596e498 100644 --- a/tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/module/main.tf +++ b/tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/module/main.tf @@ -76,7 +76,7 @@ resource "google_container_cluster" "tpu_cluster" { } } - // Needs to be false when creating a GKE flexible cluster. + // Needs to be false when creating a PSC-based GKE cluster. // After that, set as true to disable public endpoint of cluster master. 
private_cluster_config { enable_private_endpoint = false diff --git a/tools/kubernetes/terraform/non_batching_with_compact_placement/module/main.tf b/tools/kubernetes/terraform/non_batching_with_compact_placement/module/main.tf index 147f222f..02ef424c 100644 --- a/tools/kubernetes/terraform/non_batching_with_compact_placement/module/main.tf +++ b/tools/kubernetes/terraform/non_batching_with_compact_placement/module/main.tf @@ -76,7 +76,7 @@ resource "google_container_cluster" "tpu_cluster" { } } - // Needs to be false when creating a GKE flexible cluster. + // Needs to be false when creating a PSC-based GKE cluster. // After that, set as true to disable public endpoint of cluster master. private_cluster_config { enable_private_endpoint = false From 158503b4c4a9f823f76d07b06f64ea6e7ba3a531 Mon Sep 17 00:00:00 2001 From: Kangmin Xie Date: Fri, 6 Oct 2023 15:11:41 +0000 Subject: [PATCH 3/8] Refactored current content to inference directory. --- .../add_node_pool/examples/v5e/main.tf | 19 +++ .../add_node_pool/examples/v5e/outputs.tf | 24 ++++ .../examples/v5e/terraform.tfvars | 23 ++++ .../add_node_pool/module/main.tf | 80 ++++++++++++ .../add_node_pool/module/outputs.tf | 26 ++++ .../add_node_pool/module/terraform.tfvars | 11 ++ .../add_node_pool/module/variables.tf | 56 ++++++++ .../create_cluster/examples/v5e/main.tf | 17 +++ .../create_cluster/examples/v5e/outputs.tf | 24 ++++ .../examples/v5e/terraform.tfvars | 12 ++ .../create_cluster/module/main.tf | 121 ++++++++++++++++++ .../create_cluster/module/outputs.tf | 24 ++++ .../create_cluster/module/terraform.tfvars | 11 ++ .../create_cluster/module/variables.tf | 56 ++++++++ .../add_node_pool/examples/v5e/main.tf | 19 +++ .../add_node_pool/examples/v5e/outputs.tf | 19 +++ .../examples/v5e/terraform.tfvars | 14 ++ .../add_node_pool/module/main.tf | 80 ++++++++++++ .../add_node_pool/module/outputs.tf | 19 +++ .../add_node_pool/module/terraform.tfvars | 10 ++ .../add_node_pool/module/variables.tf | 55 ++++++++ 
.../create_cluster/examples/v5e/main.tf | 17 +++ .../create_cluster/examples/v5e/outputs.tf | 24 ++++ .../examples/v5e/terraform.tfvars | 12 ++ .../create_cluster/module/main.tf | 121 ++++++++++++++++++ .../create_cluster/module/outputs.tf | 24 ++++ .../create_cluster/module/terraform.tfvars | 11 ++ .../create_cluster/module/variables.tf | 56 ++++++++ 28 files changed, 985 insertions(+) create mode 100644 tools/kubernetes/terraform/inference/batching_with_compact_placement/add_node_pool/examples/v5e/main.tf create mode 100644 tools/kubernetes/terraform/inference/batching_with_compact_placement/add_node_pool/examples/v5e/outputs.tf create mode 100644 tools/kubernetes/terraform/inference/batching_with_compact_placement/add_node_pool/examples/v5e/terraform.tfvars create mode 100644 tools/kubernetes/terraform/inference/batching_with_compact_placement/add_node_pool/module/main.tf create mode 100644 tools/kubernetes/terraform/inference/batching_with_compact_placement/add_node_pool/module/outputs.tf create mode 100644 tools/kubernetes/terraform/inference/batching_with_compact_placement/add_node_pool/module/terraform.tfvars create mode 100644 tools/kubernetes/terraform/inference/batching_with_compact_placement/add_node_pool/module/variables.tf create mode 100644 tools/kubernetes/terraform/inference/batching_with_compact_placement/create_cluster/examples/v5e/main.tf create mode 100644 tools/kubernetes/terraform/inference/batching_with_compact_placement/create_cluster/examples/v5e/outputs.tf create mode 100644 tools/kubernetes/terraform/inference/batching_with_compact_placement/create_cluster/examples/v5e/terraform.tfvars create mode 100644 tools/kubernetes/terraform/inference/batching_with_compact_placement/create_cluster/module/main.tf create mode 100644 tools/kubernetes/terraform/inference/batching_with_compact_placement/create_cluster/module/outputs.tf create mode 100644 
tools/kubernetes/terraform/inference/batching_with_compact_placement/create_cluster/module/terraform.tfvars create mode 100644 tools/kubernetes/terraform/inference/batching_with_compact_placement/create_cluster/module/variables.tf create mode 100644 tools/kubernetes/terraform/inference/batching_without_compact_placement/add_node_pool/examples/v5e/main.tf create mode 100644 tools/kubernetes/terraform/inference/batching_without_compact_placement/add_node_pool/examples/v5e/outputs.tf create mode 100644 tools/kubernetes/terraform/inference/batching_without_compact_placement/add_node_pool/examples/v5e/terraform.tfvars create mode 100644 tools/kubernetes/terraform/inference/batching_without_compact_placement/add_node_pool/module/main.tf create mode 100644 tools/kubernetes/terraform/inference/batching_without_compact_placement/add_node_pool/module/outputs.tf create mode 100644 tools/kubernetes/terraform/inference/batching_without_compact_placement/add_node_pool/module/terraform.tfvars create mode 100644 tools/kubernetes/terraform/inference/batching_without_compact_placement/add_node_pool/module/variables.tf create mode 100644 tools/kubernetes/terraform/inference/batching_without_compact_placement/create_cluster/examples/v5e/main.tf create mode 100644 tools/kubernetes/terraform/inference/batching_without_compact_placement/create_cluster/examples/v5e/outputs.tf create mode 100644 tools/kubernetes/terraform/inference/batching_without_compact_placement/create_cluster/examples/v5e/terraform.tfvars create mode 100644 tools/kubernetes/terraform/inference/batching_without_compact_placement/create_cluster/module/main.tf create mode 100644 tools/kubernetes/terraform/inference/batching_without_compact_placement/create_cluster/module/outputs.tf create mode 100644 tools/kubernetes/terraform/inference/batching_without_compact_placement/create_cluster/module/terraform.tfvars create mode 100644 
tools/kubernetes/terraform/inference/batching_without_compact_placement/create_cluster/module/variables.tf diff --git a/tools/kubernetes/terraform/inference/batching_with_compact_placement/add_node_pool/examples/v5e/main.tf b/tools/kubernetes/terraform/inference/batching_with_compact_placement/add_node_pool/examples/v5e/main.tf new file mode 100644 index 00000000..61ac2331 --- /dev/null +++ b/tools/kubernetes/terraform/inference/batching_with_compact_placement/add_node_pool/examples/v5e/main.tf @@ -0,0 +1,19 @@ +variable "project_id" {} +variable "resource_name_prefix" {} +variable "node_pool_prefix" {} +variable "region" {} +variable "tpu_node_pools" {} +variable "maintenance_interval" {} +variable "is_tpu_node_private" {} + + +module "tpu-gke" { + source = "../../module" + project_id = var.project_id + resource_name_prefix = var.resource_name_prefix + node_pool_prefix = var.node_pool_prefix + region = var.region + tpu_node_pools = var.tpu_node_pools + maintenance_interval = var.maintenance_interval + is_tpu_node_private = var.is_tpu_node_private +} diff --git a/tools/kubernetes/terraform/inference/batching_with_compact_placement/add_node_pool/examples/v5e/outputs.tf b/tools/kubernetes/terraform/inference/batching_with_compact_placement/add_node_pool/examples/v5e/outputs.tf new file mode 100644 index 00000000..ebb1782f --- /dev/null +++ b/tools/kubernetes/terraform/inference/batching_with_compact_placement/add_node_pool/examples/v5e/outputs.tf @@ -0,0 +1,24 @@ +output "region" { + value = var.region + description = "GCloud Region" +} + +output "project_id" { + value = var.project_id + description = "GCloud Project ID" +} + +output "kubernetes_cluster_name" { + value = module.tpu-gke.kubernetes_cluster_name + description = "GKE Cluster Name" +} + +output "placement_policy_names" { + value = module.tpu-gke.placement_policy_names + description = "GKE TPU Placement Policy Names" +} + +output "is_tpu_node_private" { + value = var.is_tpu_node_private + description = 
"whether we want to make TPU node private" +} diff --git a/tools/kubernetes/terraform/inference/batching_with_compact_placement/add_node_pool/examples/v5e/terraform.tfvars b/tools/kubernetes/terraform/inference/batching_with_compact_placement/add_node_pool/examples/v5e/terraform.tfvars new file mode 100644 index 00000000..9148d119 --- /dev/null +++ b/tools/kubernetes/terraform/inference/batching_with_compact_placement/add_node_pool/examples/v5e/terraform.tfvars @@ -0,0 +1,23 @@ +project_id = "project-id" +resource_name_prefix = "tpu-v5e-test" +node_pool_prefix = "rp1" +region = "us-east5" +is_tpu_node_private = false +tpu_node_pools = [{ + zone = "us-east5-b" + node_count = 32 + machine_type = "ct5lp-hightpu-4t" + topology = "8x16" + policy = "sb-compact-rp1" + disk_type = "pd-balanced" + disk_size_gb = 120 + }, { + zone = "us-east5-b" + node_count = 32 + machine_type = "ct5lp-hightpu-4t" + topology = "8x16" + policy = "sb-compact-rp1" + disk_type = "pd-balanced" + disk_size_gb = 120 +}] +maintenance_interval = "PERIODIC" diff --git a/tools/kubernetes/terraform/inference/batching_with_compact_placement/add_node_pool/module/main.tf b/tools/kubernetes/terraform/inference/batching_with_compact_placement/add_node_pool/module/main.tf new file mode 100644 index 00000000..8df883bc --- /dev/null +++ b/tools/kubernetes/terraform/inference/batching_with_compact_placement/add_node_pool/module/main.tf @@ -0,0 +1,80 @@ +/** + * Copyright 2023 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +# GKE cluster +data "google_container_engine_versions" "gke_version" { + location = var.region + version_prefix = "1.27." +} + +provider "google" { + project = var.project_id + region = var.region +} + +# Separately Managed Node Pool +resource "google_container_node_pool" "multihost_tpu" { + count = length(var.tpu_node_pools) + name = "${var.resource_name_prefix}-gke-${var.node_pool_prefix}-${count.index}" + provider = google-beta + project = var.project_id + location = var.region + node_locations = [var.tpu_node_pools[count.index].zone] + cluster = "${var.resource_name_prefix}-gke-cluster" + + initial_node_count = var.tpu_node_pools[count.index].node_count + + management { + auto_upgrade = false + } + + node_config { + oauth_scopes = [ + "https://www.googleapis.com/auth/logging.write", + "https://www.googleapis.com/auth/monitoring", + "https://www.googleapis.com/auth/cloud-platform", + ] + host_maintenance_policy { + maintenance_interval = var.maintenance_interval + } + labels = { + env = var.project_id + } + gvnic { + enabled = true + } + gcfs_config { + enabled = true + } + + image_type = "COS_CONTAINERD" + machine_type = var.tpu_node_pools[count.index].machine_type + disk_type = var.tpu_node_pools[count.index].disk_type + disk_size_gb = var.tpu_node_pools[count.index].disk_size_gb + tags = ["gke-node"] + metadata = { + disable-legacy-endpoints = "true" + } + } + placement_policy { + type = "COMPACT" + policy_name = var.tpu_node_pools[count.index].policy + } + + network_config { + enable_private_nodes = var.is_tpu_node_private + } +} diff --git a/tools/kubernetes/terraform/inference/batching_with_compact_placement/add_node_pool/module/outputs.tf b/tools/kubernetes/terraform/inference/batching_with_compact_placement/add_node_pool/module/outputs.tf new file mode 100644 index 00000000..68085ceb --- /dev/null +++ 
b/tools/kubernetes/terraform/inference/batching_with_compact_placement/add_node_pool/module/outputs.tf @@ -0,0 +1,26 @@ +output "region" { + value = var.region + description = "GCloud Region" +} + +output "project_id" { + value = var.project_id + description = "GCloud Project ID" +} + +output "kubernetes_cluster_name" { + value = try(google_container_node_pool.multihost_tpu[0].cluster, null) /* null when tpu_node_pools is empty, instead of an invalid-index error */ + description = "GKE Cluster Name" +} + +output "placement_policy_names" { + value = flatten([ + google_container_node_pool.multihost_tpu[*].placement_policy[0].policy_name + ]) + description = "GKE TPU Placement Policy Names" +} + +output "is_tpu_node_private" { + value = var.is_tpu_node_private + description = "whether we want to make TPU node private" +} diff --git a/tools/kubernetes/terraform/inference/batching_with_compact_placement/add_node_pool/module/terraform.tfvars b/tools/kubernetes/terraform/inference/batching_with_compact_placement/add_node_pool/module/terraform.tfvars new file mode 100644 index 00000000..520ff3a1 --- /dev/null +++ b/tools/kubernetes/terraform/inference/batching_with_compact_placement/add_node_pool/module/terraform.tfvars @@ -0,0 +1,11 @@ +project_id = "project-id" +resource_name_prefix = "tpu-test" +region = "us-east5" +tpu_node_pools = [{ + zone = "us-east5-b" + node_count = 32 + machine_type = "ct5lp-hightpu-4t" + topology = "8x16" + policy = "sb-compact-rp1" +}] +maintenance_interval = "AS_NEEDED" diff --git a/tools/kubernetes/terraform/inference/batching_with_compact_placement/add_node_pool/module/variables.tf b/tools/kubernetes/terraform/inference/batching_with_compact_placement/add_node_pool/module/variables.tf new file mode 100644 index 00000000..fa5d507d --- /dev/null +++ b/tools/kubernetes/terraform/inference/batching_with_compact_placement/add_node_pool/module/variables.tf @@ -0,0 +1,56 @@ +/** + * Copyright 2023 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance 
with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +variable "project_id" { + description = "project id" +} + +variable "region" { + description = "region" +} + +variable "resource_name_prefix" { + default = "" + description = "prefix for all the resource naming" +} + +variable "node_pool_prefix" { + default = "" + description = "prefix for the node pool naming" +} + +variable "tpu_node_pools" { + description = "tpu podslice config" + type = list(object({ + zone = string, + node_count = number, + machine_type = string, + topology = string, + policy = string, + disk_type = optional(string), + disk_size_gb = optional(number), + })) +} + +variable "is_tpu_node_private" { + description = "whether we want to make TPU node private" + default = false +} + +variable "maintenance_interval" { + default = "AS_NEEDED" + description = "maintenance interval for TPU machines." 
+} diff --git a/tools/kubernetes/terraform/inference/batching_with_compact_placement/create_cluster/examples/v5e/main.tf b/tools/kubernetes/terraform/inference/batching_with_compact_placement/create_cluster/examples/v5e/main.tf new file mode 100644 index 00000000..7cce20f2 --- /dev/null +++ b/tools/kubernetes/terraform/inference/batching_with_compact_placement/create_cluster/examples/v5e/main.tf @@ -0,0 +1,17 @@ +variable "project_id" {} +variable "resource_name_prefix" {} +variable "region" {} +variable "cpu_node_pool" {} +variable "authorized_cidr_blocks" {} +variable "is_cpu_node_private" {} + + +module "tpu-gke" { + source = "../../module" + project_id = var.project_id + resource_name_prefix = var.resource_name_prefix + region = var.region + cpu_node_pool = var.cpu_node_pool + is_cpu_node_private = var.is_cpu_node_private + authorized_cidr_blocks = var.authorized_cidr_blocks +} diff --git a/tools/kubernetes/terraform/inference/batching_with_compact_placement/create_cluster/examples/v5e/outputs.tf b/tools/kubernetes/terraform/inference/batching_with_compact_placement/create_cluster/examples/v5e/outputs.tf new file mode 100644 index 00000000..a5514b1f --- /dev/null +++ b/tools/kubernetes/terraform/inference/batching_with_compact_placement/create_cluster/examples/v5e/outputs.tf @@ -0,0 +1,24 @@ +output "region" { + value = var.region + description = "GCloud Region" +} + +output "project_id" { + value = var.project_id + description = "GCloud Project ID" +} + +output "kubernetes_cluster_name" { + value = module.tpu-gke.kubernetes_cluster_name + description = "GKE Cluster Name" +} + +output "authorized_cidr_blocks" { + value = var.authorized_cidr_blocks + description = "Cluster allowed cidr blocks " +} + +output "is_cpu_node_private" { + value = var.is_cpu_node_private + description = "whether we want to make CPU node private" +} diff --git a/tools/kubernetes/terraform/inference/batching_with_compact_placement/create_cluster/examples/v5e/terraform.tfvars 
b/tools/kubernetes/terraform/inference/batching_with_compact_placement/create_cluster/examples/v5e/terraform.tfvars new file mode 100644 index 00000000..7f9fcb9f --- /dev/null +++ b/tools/kubernetes/terraform/inference/batching_with_compact_placement/create_cluster/examples/v5e/terraform.tfvars @@ -0,0 +1,12 @@ +project_id = "project-id" +resource_name_prefix = "tpu-v5e-test" +region = "us-east5" +authorized_cidr_blocks = [] +is_cpu_node_private = false +cpu_node_pool = { + zone = ["us-east5-a", "us-east5-b", "us-east5-c"] + machine_type = "n2-standard-8", + initial_node_count_per_zone = 1, + min_node_count_per_zone = 1, + max_node_count_per_zone = 30, +} diff --git a/tools/kubernetes/terraform/inference/batching_with_compact_placement/create_cluster/module/main.tf b/tools/kubernetes/terraform/inference/batching_with_compact_placement/create_cluster/module/main.tf new file mode 100644 index 00000000..6596e498 --- /dev/null +++ b/tools/kubernetes/terraform/inference/batching_with_compact_placement/create_cluster/module/main.tf @@ -0,0 +1,121 @@ +/** + * Copyright 2023 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +# GKE cluster +data "google_container_engine_versions" "gke_version" { + location = var.region + version_prefix = "1.27." 
+} + +provider "google" { + project = var.project_id + region = var.region +} + +# VPC +resource "google_compute_network" "vpc" { + name = "${var.resource_name_prefix}-vpc" + auto_create_subnetworks = "false" +} + +# Subnet +resource "google_compute_subnetwork" "subnet" { + name = "${var.resource_name_prefix}-subnet" + region = var.region + network = google_compute_network.vpc.name + ip_cidr_range = "10.10.0.0/19" +} + +resource "google_container_cluster" "tpu_cluster" { + name = "${var.resource_name_prefix}-gke-cluster" + location = var.region + + # We can't create a cluster with no node pool defined, but we want to only use + # separately managed node pools. So we create the smallest possible default + # node pool and immediately delete it. + remove_default_node_pool = true + initial_node_count = 1 + networking_mode = "VPC_NATIVE" + ip_allocation_policy { + cluster_ipv4_cidr_block = "/14" + services_ipv4_cidr_block = "/20" + } + default_max_pods_per_node = 15 + + release_channel { + channel = "UNSPECIFIED" + } + + network = google_compute_network.vpc.name + subnetwork = google_compute_subnetwork.subnet.name + logging_service = "logging.googleapis.com/kubernetes" + monitoring_service = "monitoring.googleapis.com/kubernetes" + + master_authorized_networks_config { + gcp_public_cidrs_access_enabled = false + + dynamic "cidr_blocks" { + for_each = var.authorized_cidr_blocks + content { + cidr_block = cidr_blocks.value + display_name = "cidr-blocks-group-${cidr_blocks.key}" + } + } + } + + // Needs to be false when creating a PSC-based GKE cluster. + // After that, set as true to disable public endpoint of cluster master. 
+ private_cluster_config { + enable_private_endpoint = false + } + + timeouts { + create = "120m" + update = "120m" + } +} + +resource "google_container_node_pool" "cpu_node_pool" { + provider = google-beta + project = var.project_id + name = "cpu-node-pool" + location = var.region + node_locations = var.cpu_node_pool.zone + cluster = google_container_cluster.tpu_cluster.name + initial_node_count = var.cpu_node_pool.initial_node_count_per_zone + autoscaling { + min_node_count = var.cpu_node_pool.min_node_count_per_zone + max_node_count = var.cpu_node_pool.max_node_count_per_zone + } + max_pods_per_node = 63 + node_config { + oauth_scopes = [ + "https://www.googleapis.com/auth/cloud-platform" + ] + machine_type = var.cpu_node_pool.machine_type + + metadata = { + disable-legacy-endpoints = "true" + } + gcfs_config { + enabled = true + } + } + + network_config { + enable_private_nodes = var.is_cpu_node_private + } +} diff --git a/tools/kubernetes/terraform/inference/batching_with_compact_placement/create_cluster/module/outputs.tf b/tools/kubernetes/terraform/inference/batching_with_compact_placement/create_cluster/module/outputs.tf new file mode 100644 index 00000000..3953819c --- /dev/null +++ b/tools/kubernetes/terraform/inference/batching_with_compact_placement/create_cluster/module/outputs.tf @@ -0,0 +1,24 @@ +output "region" { + value = var.region + description = "GCloud Region" +} + +output "project_id" { + value = var.project_id + description = "GCloud Project ID" +} + +output "kubernetes_cluster_name" { + value = google_container_cluster.tpu_cluster.name + description = "GKE Cluster Name" +} + +output "authorized_cidr_blocks" { + value = var.authorized_cidr_blocks + description = "Cluster allowed cidr blocks " +} + +output "is_cpu_node_private" { + value = var.is_cpu_node_private + description = "whether we want to make CPU node private" +} diff --git a/tools/kubernetes/terraform/inference/batching_with_compact_placement/create_cluster/module/terraform.tfvars 
b/tools/kubernetes/terraform/inference/batching_with_compact_placement/create_cluster/module/terraform.tfvars new file mode 100644 index 00000000..bdda5d5e --- /dev/null +++ b/tools/kubernetes/terraform/inference/batching_with_compact_placement/create_cluster/module/terraform.tfvars @@ -0,0 +1,11 @@ +project_id = "project-id" +resource_name_prefix = "tpu-test" +region = "us-east5" +authorized_cidr_blocks = [] +cpu_node_pool = { + zone = ["us-east5-a", "us-east5-b", "us-east5-c"] + machine_type = "n2-standard-64", + initial_node_count_per_zone = 1, + min_node_count_per_zone = 1, + max_node_count_per_zone = 10 +} diff --git a/tools/kubernetes/terraform/inference/batching_with_compact_placement/create_cluster/module/variables.tf b/tools/kubernetes/terraform/inference/batching_with_compact_placement/create_cluster/module/variables.tf new file mode 100644 index 00000000..df05e43d --- /dev/null +++ b/tools/kubernetes/terraform/inference/batching_with_compact_placement/create_cluster/module/variables.tf @@ -0,0 +1,56 @@ +/** + * Copyright 2023 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +variable "project_id" { + description = "project id" +} + +variable "region" { + description = "region" +} + +variable "resource_name_prefix" { + default = "" + description = "prefix for all the resource naming" +} + +variable "authorized_cidr_blocks" { + description = "cluster allowed cidr blocks to access with kubectl CLI" + type = list(string) + default = [] +} + +variable "cpu_node_pool" { + description = "cpu nodepool config" + type = object({ + zone = list(string), + machine_type = string, + initial_node_count_per_zone = number, + min_node_count_per_zone = number, + max_node_count_per_zone = number + }) + validation { + condition = ( + (var.cpu_node_pool.min_node_count_per_zone >= 0 && var.cpu_node_pool.min_node_count_per_zone <= var.cpu_node_pool.max_node_count_per_zone) + ) + error_message = "cpu_node_pool.min_node_count_per_zone must be >= 0 and <= cpu_node_pool.max_node_count_per_zone." + } +} + +variable "is_cpu_node_private" { + description = "whether we want to make CPU node private" + default = false +} diff --git a/tools/kubernetes/terraform/inference/batching_without_compact_placement/add_node_pool/examples/v5e/main.tf b/tools/kubernetes/terraform/inference/batching_without_compact_placement/add_node_pool/examples/v5e/main.tf new file mode 100644 index 00000000..61ac2331 --- /dev/null +++ b/tools/kubernetes/terraform/inference/batching_without_compact_placement/add_node_pool/examples/v5e/main.tf @@ -0,0 +1,19 @@ +variable "project_id" {} +variable "resource_name_prefix" {} +variable "node_pool_prefix" {} +variable "region" {} +variable "tpu_node_pools" {} +variable "maintenance_interval" {} +variable "is_tpu_node_private" {} + + +module "tpu-gke" { + source = "../../module" + project_id = var.project_id + resource_name_prefix = var.resource_name_prefix + node_pool_prefix = var.node_pool_prefix + region = var.region + tpu_node_pools = var.tpu_node_pools + maintenance_interval = var.maintenance_interval + is_tpu_node_private = 
var.is_tpu_node_private +} diff --git a/tools/kubernetes/terraform/inference/batching_without_compact_placement/add_node_pool/examples/v5e/outputs.tf b/tools/kubernetes/terraform/inference/batching_without_compact_placement/add_node_pool/examples/v5e/outputs.tf new file mode 100644 index 00000000..846c656e --- /dev/null +++ b/tools/kubernetes/terraform/inference/batching_without_compact_placement/add_node_pool/examples/v5e/outputs.tf @@ -0,0 +1,19 @@ +output "region" { + value = var.region + description = "GCloud Region" +} + +output "project_id" { + value = var.project_id + description = "GCloud Project ID" +} + +output "kubernetes_cluster_name" { + value = module.tpu-gke.kubernetes_cluster_name + description = "GKE Cluster Name" +} + +output "is_tpu_node_private" { + value = var.is_tpu_node_private + description = "whether we want to make TPU node private" +} diff --git a/tools/kubernetes/terraform/inference/batching_without_compact_placement/add_node_pool/examples/v5e/terraform.tfvars b/tools/kubernetes/terraform/inference/batching_without_compact_placement/add_node_pool/examples/v5e/terraform.tfvars new file mode 100644 index 00000000..e18f03ca --- /dev/null +++ b/tools/kubernetes/terraform/inference/batching_without_compact_placement/add_node_pool/examples/v5e/terraform.tfvars @@ -0,0 +1,14 @@ +project_id = "project-id" +resource_name_prefix = "tpu-v5e-test" +node_pool_prefix = "batch1" +region = "us-east5" +is_tpu_node_private = false +tpu_node_pools = [{ + zone = "us-east5-b" + node_count = 16 + machine_type = "ct5lp-hightpu-4t" + topology = "8x8" + disk_type = "pd-balanced" + disk_size_gb = 120 +}] +maintenance_interval = "PERIODIC" diff --git a/tools/kubernetes/terraform/inference/batching_without_compact_placement/add_node_pool/module/main.tf b/tools/kubernetes/terraform/inference/batching_without_compact_placement/add_node_pool/module/main.tf new file mode 100644 index 00000000..0ccdbdba --- /dev/null +++ 
b/tools/kubernetes/terraform/inference/batching_without_compact_placement/add_node_pool/module/main.tf @@ -0,0 +1,80 @@ +/** + * Copyright 2023 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +# GKE cluster +data "google_container_engine_versions" "gke_version" { + location = var.region + version_prefix = "1.27." +} + +provider "google" { + project = var.project_id + region = var.region +} + +# Separately Managed Node Pool +resource "google_container_node_pool" "multihost_tpu" { + count = length(var.tpu_node_pools) + name = "${var.resource_name_prefix}-gke-${var.node_pool_prefix}-${count.index}" + provider = google-beta + project = var.project_id + location = var.region + node_locations = [var.tpu_node_pools[count.index].zone] + cluster = "${var.resource_name_prefix}-gke-cluster" + + initial_node_count = var.tpu_node_pools[count.index].node_count + + management { + auto_upgrade = false + } + + node_config { + oauth_scopes = [ + "https://www.googleapis.com/auth/logging.write", + "https://www.googleapis.com/auth/monitoring", + "https://www.googleapis.com/auth/cloud-platform", + ] + host_maintenance_policy { + maintenance_interval = var.maintenance_interval + } + labels = { + env = var.project_id + } + gvnic { + enabled = true + } + gcfs_config { + enabled = true + } + + image_type = "COS_CONTAINERD" + machine_type = var.tpu_node_pools[count.index].machine_type + disk_type = var.tpu_node_pools[count.index].disk_type + disk_size_gb = 
var.tpu_node_pools[count.index].disk_size_gb + tags = ["gke-node"] + metadata = { + disable-legacy-endpoints = "true" + } + } + placement_policy { + type = "COMPACT" + tpu_topology = var.tpu_node_pools[count.index].topology + } + + network_config { + enable_private_nodes = var.is_tpu_node_private + } +} diff --git a/tools/kubernetes/terraform/inference/batching_without_compact_placement/add_node_pool/module/outputs.tf b/tools/kubernetes/terraform/inference/batching_without_compact_placement/add_node_pool/module/outputs.tf new file mode 100644 index 00000000..06972205 --- /dev/null +++ b/tools/kubernetes/terraform/inference/batching_without_compact_placement/add_node_pool/module/outputs.tf @@ -0,0 +1,19 @@ +output "region" { + value = var.region + description = "GCloud Region" +} + +output "project_id" { + value = var.project_id + description = "GCloud Project ID" +} + +output "kubernetes_cluster_name" { + value = try(google_container_node_pool.multihost_tpu[0].cluster, null) /* null when tpu_node_pools is empty, instead of an invalid-index error */ + description = "GKE Cluster Name" +} + +output "is_tpu_node_private" { + value = var.is_tpu_node_private + description = "whether we want to make TPU node private" +} diff --git a/tools/kubernetes/terraform/inference/batching_without_compact_placement/add_node_pool/module/terraform.tfvars b/tools/kubernetes/terraform/inference/batching_without_compact_placement/add_node_pool/module/terraform.tfvars new file mode 100644 index 00000000..a38800da --- /dev/null +++ b/tools/kubernetes/terraform/inference/batching_without_compact_placement/add_node_pool/module/terraform.tfvars @@ -0,0 +1,10 @@ +project_id = "project-id" +resource_name_prefix = "tpu-test" +region = "us-east5" +tpu_node_pools = [{ + zone = "us-east5-b" + node_count = 16 + machine_type = "ct5lp-hightpu-4t" + topology = "8x8" +}] +maintenance_interval = "AS_NEEDED" diff --git a/tools/kubernetes/terraform/inference/batching_without_compact_placement/add_node_pool/module/variables.tf 
b/tools/kubernetes/terraform/inference/batching_without_compact_placement/add_node_pool/module/variables.tf new file mode 100644 index 00000000..c467e69a --- /dev/null +++ b/tools/kubernetes/terraform/inference/batching_without_compact_placement/add_node_pool/module/variables.tf @@ -0,0 +1,55 @@ +/** + * Copyright 2023 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +variable "project_id" { + description = "project id" +} + +variable "region" { + description = "region" +} + +variable "resource_name_prefix" { + default = "" + description = "prefix for all the resouce naming" +} + +variable "node_pool_prefix" { + default = "" + description = "prefix for all the resouce naming" +} + +variable "tpu_node_pools" { + description = "tpu podslice config" + type = list(object({ + zone = string, + node_count = number, + machine_type = string, + topology = string, + disk_type = optional(string), + disk_size_gb = optional(number), + })) +} + +variable "is_tpu_node_private" { + description = "whether we want to make TPU node private" + default = false +} + +variable "maintenance_interval" { + default = "AS_NEEDED" + description = "maintenance interval for TPU machines." 
+} diff --git a/tools/kubernetes/terraform/inference/batching_without_compact_placement/create_cluster/examples/v5e/main.tf b/tools/kubernetes/terraform/inference/batching_without_compact_placement/create_cluster/examples/v5e/main.tf new file mode 100644 index 00000000..7cce20f2 --- /dev/null +++ b/tools/kubernetes/terraform/inference/batching_without_compact_placement/create_cluster/examples/v5e/main.tf @@ -0,0 +1,17 @@ +variable "project_id" {} +variable "resource_name_prefix" {} +variable "region" {} +variable "cpu_node_pool" {} +variable "authorized_cidr_blocks" {} +variable "is_cpu_node_private" {} + + +module "tpu-gke" { + source = "../../module" + project_id = var.project_id + resource_name_prefix = var.resource_name_prefix + region = var.region + cpu_node_pool = var.cpu_node_pool + is_cpu_node_private = var.is_cpu_node_private + authorized_cidr_blocks = var.authorized_cidr_blocks +} diff --git a/tools/kubernetes/terraform/inference/batching_without_compact_placement/create_cluster/examples/v5e/outputs.tf b/tools/kubernetes/terraform/inference/batching_without_compact_placement/create_cluster/examples/v5e/outputs.tf new file mode 100644 index 00000000..a5514b1f --- /dev/null +++ b/tools/kubernetes/terraform/inference/batching_without_compact_placement/create_cluster/examples/v5e/outputs.tf @@ -0,0 +1,24 @@ +output "region" { + value = var.region + description = "GCloud Region" +} + +output "project_id" { + value = var.project_id + description = "GCloud Project ID" +} + +output "kubernetes_cluster_name" { + value = module.tpu-gke.kubernetes_cluster_name + description = "GKE Cluster Name" +} + +output "authorized_cidr_blocks" { + value = var.authorized_cidr_blocks + description = "Cluster allowed cidr blocks " +} + +output "is_cpu_node_private" { + value = var.is_cpu_node_private + description = "whether we want to make CPU node private" +} diff --git 
a/tools/kubernetes/terraform/inference/batching_without_compact_placement/create_cluster/examples/v5e/terraform.tfvars b/tools/kubernetes/terraform/inference/batching_without_compact_placement/create_cluster/examples/v5e/terraform.tfvars new file mode 100644 index 00000000..7f9fcb9f --- /dev/null +++ b/tools/kubernetes/terraform/inference/batching_without_compact_placement/create_cluster/examples/v5e/terraform.tfvars @@ -0,0 +1,12 @@ +project_id = "project-id" +resource_name_prefix = "tpu-v5e-test" +region = "us-east5" +authorized_cidr_blocks = [] +is_cpu_node_private = false +cpu_node_pool = { + zone = ["us-east5-a", "us-east5-b", "us-east5-c"] + machine_type = "n2-standard-8", + initial_node_count_per_zone = 1, + min_node_count_per_zone = 1, + max_node_count_per_zone = 30, +} diff --git a/tools/kubernetes/terraform/inference/batching_without_compact_placement/create_cluster/module/main.tf b/tools/kubernetes/terraform/inference/batching_without_compact_placement/create_cluster/module/main.tf new file mode 100644 index 00000000..6596e498 --- /dev/null +++ b/tools/kubernetes/terraform/inference/batching_without_compact_placement/create_cluster/module/main.tf @@ -0,0 +1,121 @@ +/** + * Copyright 2023 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +# GKE cluster +data "google_container_engine_versions" "gke_version" { + location = var.region + version_prefix = "1.27." 
+} + +provider "google" { + project = var.project_id + region = var.region +} + +# VPC +resource "google_compute_network" "vpc" { + name = "${var.resource_name_prefix}-vpc" + auto_create_subnetworks = "false" +} + +# Subnet +resource "google_compute_subnetwork" "subnet" { + name = "${var.resource_name_prefix}-subnet" + region = var.region + network = google_compute_network.vpc.name + ip_cidr_range = "10.10.0.0/19" +} + +resource "google_container_cluster" "tpu_cluster" { + name = "${var.resource_name_prefix}-gke-cluster" + location = var.region + + # We can't create a cluster with no node pool defined, but we want to only use + # separately managed node pools. So we create the smallest possible default + # node pool and immediately delete it. + remove_default_node_pool = true + initial_node_count = 1 + networking_mode = "VPC_NATIVE" + ip_allocation_policy { + cluster_ipv4_cidr_block = "/14" + services_ipv4_cidr_block = "/20" + } + default_max_pods_per_node = 15 + + release_channel { + channel = "UNSPECIFIED" + } + + network = google_compute_network.vpc.name + subnetwork = google_compute_subnetwork.subnet.name + logging_service = "logging.googleapis.com/kubernetes" + monitoring_service = "monitoring.googleapis.com/kubernetes" + + master_authorized_networks_config { + gcp_public_cidrs_access_enabled = false + + dynamic "cidr_blocks" { + for_each = var.authorized_cidr_blocks + content { + cidr_block = cidr_blocks.value + display_name = "cidr-blocks-group-${cidr_blocks.key}" + } + } + } + + // Needs to be false when creating a PSC-based GKE cluster. + // After that, set as true to disable public endpoint of cluster master. 
+ private_cluster_config { + enable_private_endpoint = false + } + + timeouts { + create = "120m" + update = "120m" + } +} + +resource "google_container_node_pool" "cpu_node_pool" { + provider = google-beta + project = var.project_id + name = "cpu-node-pool" + location = var.region + node_locations = var.cpu_node_pool.zone + cluster = google_container_cluster.tpu_cluster.name + initial_node_count = var.cpu_node_pool.initial_node_count_per_zone + autoscaling { + min_node_count = var.cpu_node_pool.min_node_count_per_zone + max_node_count = var.cpu_node_pool.max_node_count_per_zone + } + max_pods_per_node = 63 + node_config { + oauth_scopes = [ + "https://www.googleapis.com/auth/cloud-platform" + ] + machine_type = var.cpu_node_pool.machine_type + + metadata = { + disable-legacy-endpoints = "true" + } + gcfs_config { + enabled = true + } + } + + network_config { + enable_private_nodes = var.is_cpu_node_private + } +} diff --git a/tools/kubernetes/terraform/inference/batching_without_compact_placement/create_cluster/module/outputs.tf b/tools/kubernetes/terraform/inference/batching_without_compact_placement/create_cluster/module/outputs.tf new file mode 100644 index 00000000..3953819c --- /dev/null +++ b/tools/kubernetes/terraform/inference/batching_without_compact_placement/create_cluster/module/outputs.tf @@ -0,0 +1,24 @@ +output "region" { + value = var.region + description = "GCloud Region" +} + +output "project_id" { + value = var.project_id + description = "GCloud Project ID" +} + +output "kubernetes_cluster_name" { + value = google_container_cluster.tpu_cluster.name + description = "GKE Cluster Name" +} + +output "authorized_cidr_blocks" { + value = var.authorized_cidr_blocks + description = "Cluster allowed cidr blocks " +} + +output "is_cpu_node_private" { + value = var.is_cpu_node_private + description = "whether we want to make CPU node private" +} diff --git 
a/tools/kubernetes/terraform/inference/batching_without_compact_placement/create_cluster/module/terraform.tfvars b/tools/kubernetes/terraform/inference/batching_without_compact_placement/create_cluster/module/terraform.tfvars new file mode 100644 index 00000000..bdda5d5e --- /dev/null +++ b/tools/kubernetes/terraform/inference/batching_without_compact_placement/create_cluster/module/terraform.tfvars @@ -0,0 +1,11 @@ +project_id = "project-id" +resource_name_prefix = "tpu-test" +region = "us-east5" +authorized_cidr_blocks = [] +cpu_node_pool = { + zone = ["us-east5-a", "us-east5-b", "us-east5-c"] + machine_type = "n2-standard-64", + initial_node_count_per_zone = 1, + min_node_count_per_zone = 1, + max_node_count_per_zone = 10 +} diff --git a/tools/kubernetes/terraform/inference/batching_without_compact_placement/create_cluster/module/variables.tf b/tools/kubernetes/terraform/inference/batching_without_compact_placement/create_cluster/module/variables.tf new file mode 100644 index 00000000..df05e43d --- /dev/null +++ b/tools/kubernetes/terraform/inference/batching_without_compact_placement/create_cluster/module/variables.tf @@ -0,0 +1,56 @@ +/** + * Copyright 2023 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +variable "project_id" { + description = "project id" +} + +variable "region" { + description = "region" +} + +variable "resource_name_prefix" { + default = "" + description = "prefix for all the resouce naming" +} + +variable "authorized_cidr_blocks" { + description = "cluster allowed cidr blocks to access with kubectl CLI" + type = list(string) + default = [] +} + +variable "cpu_node_pool" { + description = "cpu nodepool config" + type = object({ + zone = list(string), + machine_type = string, + initial_node_count_per_zone = number, + min_node_count_per_zone = number, + max_node_count_per_zone = number + }) + validation { + condition = ( + (var.cpu_node_pool.min_node_count_per_zone >= 0 && var.cpu_node_pool.min_node_count_per_zone <= var.cpu_node_pool.max_node_count_per_zone) + ) + error_message = "cpu_node_pool.min_node_count_per_zone must be >= 0 and <= cpu_node_pool.max_node_count_per_zone." + } +} + +variable "is_cpu_node_private" { + description = "whether we want to make CPU node private" + default = false +} From 9c4b82d9a1d6897c002fa243b03287570ce451ff Mon Sep 17 00:00:00 2001 From: Kangmin Xie Date: Fri, 6 Oct 2023 15:47:49 +0000 Subject: [PATCH 4/8] Refactored current content to inference directory - continue. 
--- .../add_node_pool/examples/v4/main.tf | 15 --- .../add_node_pool/examples/v4/outputs.tf | 24 ---- .../examples/v4/terraform.tfvars | 9 -- .../add_node_pool/examples/v5e/main.tf | 19 --- .../add_node_pool/examples/v5e/outputs.tf | 24 ---- .../examples/v5e/terraform.tfvars | 15 --- .../add_node_pool/module/main.tf | 80 ------------ .../add_node_pool/module/outputs.tf | 26 ---- .../add_node_pool/module/terraform.tfvars | 20 --- .../add_node_pool/module/variables.tf | 56 -------- .../create_cluster/examples/v4/main.tf | 17 --- .../create_cluster/examples/v4/outputs.tf | 24 ---- .../examples/v4/terraform.tfvars | 16 --- .../create_cluster/examples/v5e/main.tf | 17 --- .../create_cluster/examples/v5e/outputs.tf | 24 ---- .../examples/v5e/terraform.tfvars | 12 -- .../create_cluster/module/main.tf | 121 ------------------ .../create_cluster/module/outputs.tf | 24 ---- .../create_cluster/module/terraform.tfvars | 11 -- .../create_cluster/module/variables.tf | 56 -------- .../add_node_pool/examples/v4/main.tf | 15 --- .../add_node_pool/examples/v4/outputs.tf | 19 --- .../examples/v4/terraform.tfvars | 9 -- .../add_node_pool/examples/v5e/main.tf | 19 --- .../add_node_pool/examples/v5e/outputs.tf | 19 --- .../examples/v5e/terraform.tfvars | 14 -- .../add_node_pool/module/main.tf | 80 ------------ .../add_node_pool/module/outputs.tf | 19 --- .../add_node_pool/module/terraform.tfvars | 20 --- .../add_node_pool/module/variables.tf | 55 -------- .../create_cluster/examples/v4/main.tf | 17 --- .../create_cluster/examples/v4/outputs.tf | 19 --- .../examples/v4/terraform.tfvars | 16 --- .../create_cluster/examples/v5e/main.tf | 17 --- .../create_cluster/examples/v5e/outputs.tf | 24 ---- .../examples/v5e/terraform.tfvars | 12 -- .../create_cluster/module/main.tf | 121 ------------------ .../create_cluster/module/outputs.tf | 24 ---- .../create_cluster/module/terraform.tf | 11 -- .../create_cluster/module/variables.tf | 56 -------- .../kubernetes/terraform/examples/v4/main.tf | 15 
--- .../terraform/examples/v4/outputs.tf | 24 ---- .../terraform/examples/v4/terraform.tfvars | 9 -- .../kubernetes/terraform/examples/v5e/main.tf | 15 --- .../terraform/examples/v5e/outputs.tf | 24 ---- .../terraform/examples/v5e/terraform.tfvars | 101 --------------- .../module/main.tf | 0 .../module/outputs.tf | 0 .../module/terraform.tfvars | 23 ++++ .../module/variables.tf | 0 tools/kubernetes/terraform/module/main.tf | 119 ----------------- tools/kubernetes/terraform/module/outputs.tf | 26 ---- .../terraform/module/terraform.tfvars | 21 --- .../kubernetes/terraform/module/variables.tf | 44 ------- .../examples/v4/main.tf | 15 --- .../examples/v4/outputs.tf | 24 ---- .../examples/v4/terraform.tfvars | 9 -- .../module/terraform.tfvars | 28 ---- 58 files changed, 23 insertions(+), 1670 deletions(-) delete mode 100644 tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/examples/v4/main.tf delete mode 100644 tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/examples/v4/outputs.tf delete mode 100644 tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/examples/v4/terraform.tfvars delete mode 100644 tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/examples/v5e/main.tf delete mode 100644 tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/examples/v5e/outputs.tf delete mode 100644 tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/examples/v5e/terraform.tfvars delete mode 100644 tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/module/main.tf delete mode 100644 tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/module/outputs.tf delete mode 100644 tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/module/terraform.tfvars delete mode 100644 tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/module/variables.tf delete mode 100644 
tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/examples/v4/main.tf delete mode 100644 tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/examples/v4/outputs.tf delete mode 100644 tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/examples/v4/terraform.tfvars delete mode 100644 tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/examples/v5e/main.tf delete mode 100644 tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/examples/v5e/outputs.tf delete mode 100644 tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/examples/v5e/terraform.tfvars delete mode 100644 tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/module/main.tf delete mode 100644 tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/module/outputs.tf delete mode 100644 tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/module/terraform.tfvars delete mode 100644 tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/module/variables.tf delete mode 100644 tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/examples/v4/main.tf delete mode 100644 tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/examples/v4/outputs.tf delete mode 100644 tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/examples/v4/terraform.tfvars delete mode 100644 tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/examples/v5e/main.tf delete mode 100644 tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/examples/v5e/outputs.tf delete mode 100644 tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/examples/v5e/terraform.tfvars delete mode 100644 tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/module/main.tf delete mode 100644 
tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/module/outputs.tf delete mode 100644 tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/module/terraform.tfvars delete mode 100644 tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/module/variables.tf delete mode 100644 tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/examples/v4/main.tf delete mode 100644 tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/examples/v4/outputs.tf delete mode 100644 tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/examples/v4/terraform.tfvars delete mode 100644 tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/examples/v5e/main.tf delete mode 100644 tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/examples/v5e/outputs.tf delete mode 100644 tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/examples/v5e/terraform.tfvars delete mode 100644 tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/module/main.tf delete mode 100644 tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/module/outputs.tf delete mode 100644 tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/module/terraform.tf delete mode 100644 tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/module/variables.tf delete mode 100644 tools/kubernetes/terraform/examples/v4/main.tf delete mode 100644 tools/kubernetes/terraform/examples/v4/outputs.tf delete mode 100644 tools/kubernetes/terraform/examples/v4/terraform.tfvars delete mode 100644 tools/kubernetes/terraform/examples/v5e/main.tf delete mode 100644 tools/kubernetes/terraform/examples/v5e/outputs.tf delete mode 100644 tools/kubernetes/terraform/examples/v5e/terraform.tfvars rename tools/kubernetes/terraform/{ => 
inference}/non_batching_with_compact_placement/module/main.tf (100%) rename tools/kubernetes/terraform/{ => inference}/non_batching_with_compact_placement/module/outputs.tf (100%) create mode 100644 tools/kubernetes/terraform/inference/non_batching_with_compact_placement/module/terraform.tfvars rename tools/kubernetes/terraform/{ => inference}/non_batching_with_compact_placement/module/variables.tf (100%) delete mode 100644 tools/kubernetes/terraform/module/main.tf delete mode 100644 tools/kubernetes/terraform/module/outputs.tf delete mode 100644 tools/kubernetes/terraform/module/terraform.tfvars delete mode 100644 tools/kubernetes/terraform/module/variables.tf delete mode 100644 tools/kubernetes/terraform/non_batching_with_compact_placement/examples/v4/main.tf delete mode 100644 tools/kubernetes/terraform/non_batching_with_compact_placement/examples/v4/outputs.tf delete mode 100644 tools/kubernetes/terraform/non_batching_with_compact_placement/examples/v4/terraform.tfvars delete mode 100644 tools/kubernetes/terraform/non_batching_with_compact_placement/module/terraform.tfvars diff --git a/tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/examples/v4/main.tf b/tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/examples/v4/main.tf deleted file mode 100644 index c3b6990c..00000000 --- a/tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/examples/v4/main.tf +++ /dev/null @@ -1,15 +0,0 @@ -variable "project_id" {} -variable "resource_name_prefix" {} -variable "region" {} -variable "tpu_node_pools" {} -variable "maintenance_interval" {} - - -module "tpu-gke" { - source = "../../module" - project_id = var.project_id - resource_name_prefix = var.resource_name_prefix - region = var.region - tpu_node_pools = var.tpu_node_pools - maintenance_interval = var.maintenance_interval -} diff --git a/tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/examples/v4/outputs.tf 
b/tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/examples/v4/outputs.tf deleted file mode 100644 index 78a05be8..00000000 --- a/tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/examples/v4/outputs.tf +++ /dev/null @@ -1,24 +0,0 @@ -output "region" { - value = var.region - description = "GCloud Region" -} - -output "project_id" { - value = var.project_id - description = "GCloud Project ID" -} - -output "kubernetes_cluster_name" { - value = module.tpu-gke.kubernetes_cluster_name - description = "GKE Cluster Name" -} - -output "kubernetes_cluster_host" { - value = module.tpu-gke.kubernetes_cluster_host - description = "GKE Cluster Host" -} - -output "nodepool_tpu_topology" { - value = module.tpu-gke.nodepool_tpu_topology - description = "GKE TPU topology" -} diff --git a/tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/examples/v4/terraform.tfvars b/tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/examples/v4/terraform.tfvars deleted file mode 100644 index 84f60850..00000000 --- a/tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/examples/v4/terraform.tfvars +++ /dev/null @@ -1,9 +0,0 @@ -project_id = "project-id" -resource_name_prefix = "tpu-test" -region = "us-central2" -tpu_node_pools = [{ - zone = "us-central2-b" - node_count = 2 - machine_type = "ct4p-hightpu-4t" - topology = "2x2x2" -}] diff --git a/tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/examples/v5e/main.tf b/tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/examples/v5e/main.tf deleted file mode 100644 index 61ac2331..00000000 --- a/tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/examples/v5e/main.tf +++ /dev/null @@ -1,19 +0,0 @@ -variable "project_id" {} -variable "resource_name_prefix" {} -variable "node_pool_prefix" {} -variable "region" {} -variable "tpu_node_pools" {} -variable "maintenance_interval" {} 
-variable "is_tpu_node_private" {} - - -module "tpu-gke" { - source = "../../module" - project_id = var.project_id - resource_name_prefix = var.resource_name_prefix - node_pool_prefix = var.node_pool_prefix - region = var.region - tpu_node_pools = var.tpu_node_pools - maintenance_interval = var.maintenance_interval - is_tpu_node_private = var.is_tpu_node_private -} diff --git a/tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/examples/v5e/outputs.tf b/tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/examples/v5e/outputs.tf deleted file mode 100644 index ebb1782f..00000000 --- a/tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/examples/v5e/outputs.tf +++ /dev/null @@ -1,24 +0,0 @@ -output "region" { - value = var.region - description = "GCloud Region" -} - -output "project_id" { - value = var.project_id - description = "GCloud Project ID" -} - -output "kubernetes_cluster_name" { - value = module.tpu-gke.kubernetes_cluster_name - description = "GKE Cluster Name" -} - -output "placement_policy_names" { - value = module.tpu-gke.placement_policy_names - description = "GKE TPU Placement Policy Names" -} - -output "is_tpu_node_private" { - value = var.is_tpu_node_private - description = "whether we want to make TPU node private" -} diff --git a/tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/examples/v5e/terraform.tfvars b/tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/examples/v5e/terraform.tfvars deleted file mode 100644 index ac742690..00000000 --- a/tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/examples/v5e/terraform.tfvars +++ /dev/null @@ -1,15 +0,0 @@ -project_id = "project-id" -resource_name_prefix = "tpu-v5e-test" -node_pool_prefix = "rp1" -region = "us-east5" -is_tpu_node_private = false -tpu_node_pools = [{ - zone = "us-east5-b" - node_count = 32 - machine_type = "ct5lp-hightpu-4t" - topology = "8x16" - policy = 
"sb-compact-rp1" - disk_type = "pd-balanced" - disk_size_gb = 120 -}] -maintenance_interval = "PERIODIC" diff --git a/tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/module/main.tf b/tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/module/main.tf deleted file mode 100644 index 8df883bc..00000000 --- a/tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/module/main.tf +++ /dev/null @@ -1,80 +0,0 @@ -/** - * Copyright 2023 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -# GKE cluster -data "google_container_engine_versions" "gke_version" { - location = var.region - version_prefix = "1.27." 
-} - -provider "google" { - project = var.project_id - region = var.region -} - -# Separately Managed Node Pool -resource "google_container_node_pool" "multihost_tpu" { - count = length(var.tpu_node_pools) - name = "${var.resource_name_prefix}-gke-${var.node_pool_prefix}-${count.index}" - provider = google-beta - project = var.project_id - location = var.region - node_locations = [var.tpu_node_pools[count.index].zone] - cluster = "${var.resource_name_prefix}-gke-cluster" - - initial_node_count = var.tpu_node_pools[count.index].node_count - - management { - auto_upgrade = false - } - - node_config { - oauth_scopes = [ - "https://www.googleapis.com/auth/logging.write", - "https://www.googleapis.com/auth/monitoring", - "https://www.googleapis.com/auth/cloud-platform", - ] - host_maintenance_policy { - maintenance_interval = var.maintenance_interval - } - labels = { - env = var.project_id - } - gvnic { - enabled = true - } - gcfs_config { - enabled = true - } - - image_type = "COS_CONTAINERD" - machine_type = var.tpu_node_pools[count.index].machine_type - disk_type = var.tpu_node_pools[count.index].disk_type - disk_size_gb = var.tpu_node_pools[count.index].disk_size_gb - tags = ["gke-node"] - metadata = { - disable-legacy-endpoints = "true" - } - } - placement_policy { - type = "COMPACT" - policy_name = var.tpu_node_pools[count.index].policy - } - - network_config { - enable_private_nodes = var.is_tpu_node_private - } -} diff --git a/tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/module/outputs.tf b/tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/module/outputs.tf deleted file mode 100644 index 68085ceb..00000000 --- a/tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/module/outputs.tf +++ /dev/null @@ -1,26 +0,0 @@ -output "region" { - value = var.region - description = "GCloud Region" -} - -output "project_id" { - value = var.project_id - description = "GCloud Project ID" -} - -output 
"kubernetes_cluster_name" { - value = google_container_node_pool.multihost_tpu[0].cluster - description = "GKE Cluster Name" -} - -output "placement_policy_names" { - value = flatten([ - google_container_node_pool.multihost_tpu[*].placement_policy[0].policy_name - ]) - description = "GKE TPU Placement Policy Names" -} - -output "is_tpu_node_private" { - value = var.is_tpu_node_private - description = "whether we want to make TPU node private" -} diff --git a/tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/module/terraform.tfvars b/tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/module/terraform.tfvars deleted file mode 100644 index e44d8dec..00000000 --- a/tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/module/terraform.tfvars +++ /dev/null @@ -1,20 +0,0 @@ -project_id = "project-id" -resource_name_prefix = "tpu-test" -region = "us-central2" -tpu_node_pools = [{ - zone = "us-central2-b" - node_count = 4 - machine_type = "ct4p-hightpu-4t" - topology = "2x2x4" - }, { - zone = "us-central2-b" - node_count = 4 - machine_type = "ct4p-hightpu-4t" - topology = "2x2x4" - }, { - zone = "us-central2-b" - node_count = 2 - machine_type = "ct4p-hightpu-4t" - topology = "2x2x2" -}] -maintenance_interval = "AS_NEEDED" diff --git a/tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/module/variables.tf b/tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/module/variables.tf deleted file mode 100644 index fa5d507d..00000000 --- a/tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/module/variables.tf +++ /dev/null @@ -1,56 +0,0 @@ -/** - * Copyright 2023 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -variable "project_id" { - description = "project id" -} - -variable "region" { - description = "region" -} - -variable "resource_name_prefix" { - default = "" - description = "prefix for all the resouce naming" -} - -variable "node_pool_prefix" { - default = "" - description = "prefix for all the resouce naming" -} - -variable "tpu_node_pools" { - description = "tpu podslice config" - type = list(object({ - zone = string, - node_count = number, - machine_type = string, - topology = string, - policy = string, - disk_type = optional(string), - disk_size_gb = optional(number), - })) -} - -variable "is_tpu_node_private" { - description = "whether we want to make TPU node private" - default = false -} - -variable "maintenance_interval" { - default = "AS_NEEDED" - description = "maintenance interval for TPU machines." 
-} diff --git a/tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/examples/v4/main.tf b/tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/examples/v4/main.tf deleted file mode 100644 index 304251dc..00000000 --- a/tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/examples/v4/main.tf +++ /dev/null @@ -1,17 +0,0 @@ -variable "project_id" {} -variable "resource_name_prefix" {} -variable "region" {} -variable "tpu_node_pools" {} -variable "cpu_node_pool" {} -variable "maintenance_interval" {} - - -module "tpu-gke" { - source = "../../module" - project_id = var.project_id - resource_name_prefix = var.resource_name_prefix - region = var.region - tpu_node_pools = var.tpu_node_pools - cpu_node_pool = var.cpu_node_pool - maintenance_interval = var.maintenance_interval -} diff --git a/tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/examples/v4/outputs.tf b/tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/examples/v4/outputs.tf deleted file mode 100644 index 78a05be8..00000000 --- a/tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/examples/v4/outputs.tf +++ /dev/null @@ -1,24 +0,0 @@ -output "region" { - value = var.region - description = "GCloud Region" -} - -output "project_id" { - value = var.project_id - description = "GCloud Project ID" -} - -output "kubernetes_cluster_name" { - value = module.tpu-gke.kubernetes_cluster_name - description = "GKE Cluster Name" -} - -output "kubernetes_cluster_host" { - value = module.tpu-gke.kubernetes_cluster_host - description = "GKE Cluster Host" -} - -output "nodepool_tpu_topology" { - value = module.tpu-gke.nodepool_tpu_topology - description = "GKE TPU topology" -} diff --git a/tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/examples/v4/terraform.tfvars b/tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/examples/v4/terraform.tfvars 
deleted file mode 100644 index 20ecf2ca..00000000 --- a/tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/examples/v4/terraform.tfvars +++ /dev/null @@ -1,16 +0,0 @@ -project_id = "project-id" -resource_name_prefix = "tpu-test" -region = "us-central2" -tpu_node_pools = [{ - zone = "us-central2-b" - node_count = 2 - machine_type = "ct4p-hightpu-4t" - topology = "2x2x2" -}] -cpu_node_pool = { - zone = ["us-central2-a", "us-central2-b", "us-central2-c"] - machine_type = "n2-standard-8", - initial_node_count_per_zone = 1, - min_node_count_per_zone = 1, - max_node_count_per_zone = 30, -} diff --git a/tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/examples/v5e/main.tf b/tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/examples/v5e/main.tf deleted file mode 100644 index 7cce20f2..00000000 --- a/tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/examples/v5e/main.tf +++ /dev/null @@ -1,17 +0,0 @@ -variable "project_id" {} -variable "resource_name_prefix" {} -variable "region" {} -variable "cpu_node_pool" {} -variable "authorized_cidr_blocks" {} -variable "is_cpu_node_private" {} - - -module "tpu-gke" { - source = "../../module" - project_id = var.project_id - resource_name_prefix = var.resource_name_prefix - region = var.region - cpu_node_pool = var.cpu_node_pool - is_cpu_node_private = var.is_cpu_node_private - authorized_cidr_blocks = var.authorized_cidr_blocks -} diff --git a/tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/examples/v5e/outputs.tf b/tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/examples/v5e/outputs.tf deleted file mode 100644 index a5514b1f..00000000 --- a/tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/examples/v5e/outputs.tf +++ /dev/null @@ -1,24 +0,0 @@ -output "region" { - value = var.region - description = "GCloud Region" -} - -output "project_id" { - value = 
var.project_id - description = "GCloud Project ID" -} - -output "kubernetes_cluster_name" { - value = module.tpu-gke.kubernetes_cluster_name - description = "GKE Cluster Name" -} - -output "authorized_cidr_blocks" { - value = var.authorized_cidr_blocks - description = "Cluster allowed cidr blocks " -} - -output "is_cpu_node_private" { - value = var.is_cpu_node_private - description = "whether we want to make CPU node private" -} diff --git a/tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/examples/v5e/terraform.tfvars b/tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/examples/v5e/terraform.tfvars deleted file mode 100644 index 7f9fcb9f..00000000 --- a/tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/examples/v5e/terraform.tfvars +++ /dev/null @@ -1,12 +0,0 @@ -project_id = "project-id" -resource_name_prefix = "tpu-v5e-test" -region = "us-east5" -authorized_cidr_blocks = [] -is_cpu_node_private = false -cpu_node_pool = { - zone = ["us-east5-a", "us-east5-b", "us-east5-c"] - machine_type = "n2-standard-8", - initial_node_count_per_zone = 1, - min_node_count_per_zone = 1, - max_node_count_per_zone = 30, -} diff --git a/tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/module/main.tf b/tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/module/main.tf deleted file mode 100644 index 6596e498..00000000 --- a/tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/module/main.tf +++ /dev/null @@ -1,121 +0,0 @@ -/** - * Copyright 2023 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -# GKE cluster -data "google_container_engine_versions" "gke_version" { - location = var.region - version_prefix = "1.27." -} - -provider "google" { - project = var.project_id - region = var.region -} - -# VPC -resource "google_compute_network" "vpc" { - name = "${var.resource_name_prefix}-vpc" - auto_create_subnetworks = "false" -} - -# Subnet -resource "google_compute_subnetwork" "subnet" { - name = "${var.resource_name_prefix}-subnet" - region = var.region - network = google_compute_network.vpc.name - ip_cidr_range = "10.10.0.0/19" -} - -resource "google_container_cluster" "tpu_cluster" { - name = "${var.resource_name_prefix}-gke-cluster" - location = var.region - - # We can't create a cluster with no node pool defined, but we want to only use - # separately managed node pools. So we create the smallest possible default - # node pool and immediately delete it. 
- remove_default_node_pool = true - initial_node_count = 1 - networking_mode = "VPC_NATIVE" - ip_allocation_policy { - cluster_ipv4_cidr_block = "/14" - services_ipv4_cidr_block = "/20" - } - default_max_pods_per_node = 15 - - release_channel { - channel = "UNSPECIFIED" - } - - network = google_compute_network.vpc.name - subnetwork = google_compute_subnetwork.subnet.name - logging_service = "logging.googleapis.com/kubernetes" - monitoring_service = "monitoring.googleapis.com/kubernetes" - - master_authorized_networks_config { - gcp_public_cidrs_access_enabled = false - - dynamic "cidr_blocks" { - for_each = var.authorized_cidr_blocks - content { - cidr_block = cidr_blocks.value - display_name = "cidr-blocks-group-${cidr_blocks.key}" - } - } - } - - // Needs to be false when creating a PSC-based GKE cluster. - // After that, set as true to disable public endpoint of cluster master. - private_cluster_config { - enable_private_endpoint = false - } - - timeouts { - create = "120m" - update = "120m" - } -} - -resource "google_container_node_pool" "cpu_node_pool" { - provider = google-beta - project = var.project_id - name = "cpu-node-pool" - location = var.region - node_locations = var.cpu_node_pool.zone - cluster = google_container_cluster.tpu_cluster.name - initial_node_count = var.cpu_node_pool.initial_node_count_per_zone - autoscaling { - min_node_count = var.cpu_node_pool.min_node_count_per_zone - max_node_count = var.cpu_node_pool.max_node_count_per_zone - } - max_pods_per_node = 63 - node_config { - oauth_scopes = [ - "https://www.googleapis.com/auth/cloud-platform" - ] - machine_type = var.cpu_node_pool.machine_type - - metadata = { - disable-legacy-endpoints = "true" - } - gcfs_config { - enabled = true - } - } - - network_config { - enable_private_nodes = var.is_cpu_node_private - } -} diff --git a/tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/module/outputs.tf 
b/tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/module/outputs.tf deleted file mode 100644 index 3953819c..00000000 --- a/tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/module/outputs.tf +++ /dev/null @@ -1,24 +0,0 @@ -output "region" { - value = var.region - description = "GCloud Region" -} - -output "project_id" { - value = var.project_id - description = "GCloud Project ID" -} - -output "kubernetes_cluster_name" { - value = google_container_cluster.tpu_cluster.name - description = "GKE Cluster Name" -} - -output "authorized_cidr_blocks" { - value = var.authorized_cidr_blocks - description = "Cluster allowed cidr blocks " -} - -output "is_cpu_node_private" { - value = var.is_cpu_node_private - description = "whether we want to make CPU node private" -} diff --git a/tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/module/terraform.tfvars b/tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/module/terraform.tfvars deleted file mode 100644 index f3f4e7be..00000000 --- a/tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/module/terraform.tfvars +++ /dev/null @@ -1,11 +0,0 @@ -project_id = "project-id" -resource_name_prefix = "tpu-test" -region = "us-central2" -authorized_cidr_blocks = [] -cpu_node_pool = { - zone = ["us-central2-a", "us-central2-b", "us-central2-c"] - machine_type = "n2-standard-64", - initial_node_count_per_zone = 1, - min_node_count_per_zone = 1, - max_node_count_per_zone = 10 -} diff --git a/tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/module/variables.tf b/tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/module/variables.tf deleted file mode 100644 index df05e43d..00000000 --- a/tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/module/variables.tf +++ /dev/null @@ -1,56 +0,0 @@ -/** - * Copyright 2023 Google LLC - * - * Licensed under the Apache 
License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -variable "project_id" { - description = "project id" -} - -variable "region" { - description = "region" -} - -variable "resource_name_prefix" { - default = "" - description = "prefix for all the resouce naming" -} - -variable "authorized_cidr_blocks" { - description = "cluster allowed cidr blocks to access with kubectl CLI" - type = list(string) - default = [] -} - -variable "cpu_node_pool" { - description = "cpu nodepool config" - type = object({ - zone = list(string), - machine_type = string, - initial_node_count_per_zone = number, - min_node_count_per_zone = number, - max_node_count_per_zone = number - }) - validation { - condition = ( - (var.cpu_node_pool.min_node_count_per_zone >= 0 && var.cpu_node_pool.min_node_count_per_zone <= var.cpu_node_pool.max_node_count_per_zone) - ) - error_message = "cpu_node_pool.min_node_count_per_zone must be >= 0 and <= cpu_node_pool.max_node_count_per_zone." 
- } -} - -variable "is_cpu_node_private" { - description = "whether we want to make CPU node private" - default = false -} diff --git a/tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/examples/v4/main.tf b/tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/examples/v4/main.tf deleted file mode 100644 index c3b6990c..00000000 --- a/tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/examples/v4/main.tf +++ /dev/null @@ -1,15 +0,0 @@ -variable "project_id" {} -variable "resource_name_prefix" {} -variable "region" {} -variable "tpu_node_pools" {} -variable "maintenance_interval" {} - - -module "tpu-gke" { - source = "../../module" - project_id = var.project_id - resource_name_prefix = var.resource_name_prefix - region = var.region - tpu_node_pools = var.tpu_node_pools - maintenance_interval = var.maintenance_interval -} diff --git a/tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/examples/v4/outputs.tf b/tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/examples/v4/outputs.tf deleted file mode 100644 index 44d8350f..00000000 --- a/tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/examples/v4/outputs.tf +++ /dev/null @@ -1,19 +0,0 @@ -output "region" { - value = var.region - description = "GCloud Region" -} - -output "project_id" { - value = var.project_id - description = "GCloud Project ID" -} - -output "kubernetes_cluster_name" { - value = module.tpu-gke.kubernetes_cluster_name - description = "GKE Cluster Name" -} - -output "kubernetes_cluster_host" { - value = module.tpu-gke.kubernetes_cluster_host - description = "GKE Cluster Host" -} diff --git a/tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/examples/v4/terraform.tfvars b/tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/examples/v4/terraform.tfvars deleted file mode 100644 index 84f60850..00000000 --- 
a/tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/examples/v4/terraform.tfvars +++ /dev/null @@ -1,9 +0,0 @@ -project_id = "project-id" -resource_name_prefix = "tpu-test" -region = "us-central2" -tpu_node_pools = [{ - zone = "us-central2-b" - node_count = 2 - machine_type = "ct4p-hightpu-4t" - topology = "2x2x2" -}] diff --git a/tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/examples/v5e/main.tf b/tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/examples/v5e/main.tf deleted file mode 100644 index 61ac2331..00000000 --- a/tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/examples/v5e/main.tf +++ /dev/null @@ -1,19 +0,0 @@ -variable "project_id" {} -variable "resource_name_prefix" {} -variable "node_pool_prefix" {} -variable "region" {} -variable "tpu_node_pools" {} -variable "maintenance_interval" {} -variable "is_tpu_node_private" {} - - -module "tpu-gke" { - source = "../../module" - project_id = var.project_id - resource_name_prefix = var.resource_name_prefix - node_pool_prefix = var.node_pool_prefix - region = var.region - tpu_node_pools = var.tpu_node_pools - maintenance_interval = var.maintenance_interval - is_tpu_node_private = var.is_tpu_node_private -} diff --git a/tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/examples/v5e/outputs.tf b/tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/examples/v5e/outputs.tf deleted file mode 100644 index 846c656e..00000000 --- a/tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/examples/v5e/outputs.tf +++ /dev/null @@ -1,19 +0,0 @@ -output "region" { - value = var.region - description = "GCloud Region" -} - -output "project_id" { - value = var.project_id - description = "GCloud Project ID" -} - -output "kubernetes_cluster_name" { - value = module.tpu-gke.kubernetes_cluster_name - description = "GKE Cluster Name" -} - -output 
"is_tpu_node_private" { - value = var.is_tpu_node_private - description = "whether we want to make TPU node private" -} diff --git a/tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/examples/v5e/terraform.tfvars b/tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/examples/v5e/terraform.tfvars deleted file mode 100644 index e18f03ca..00000000 --- a/tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/examples/v5e/terraform.tfvars +++ /dev/null @@ -1,14 +0,0 @@ -project_id = "project-id" -resource_name_prefix = "tpu-v5e-test" -node_pool_prefix = "batch1" -region = "us-east5" -is_tpu_node_private = false -tpu_node_pools = [{ - zone = "us-east5-b" - node_count = 16 - machine_type = "ct5lp-hightpu-4t" - topology = "8x8" - disk_type = "pd-balanced" - disk_size_gb = 120 -}] -maintenance_interval = "PERIODIC" diff --git a/tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/module/main.tf b/tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/module/main.tf deleted file mode 100644 index 0ccdbdba..00000000 --- a/tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/module/main.tf +++ /dev/null @@ -1,80 +0,0 @@ -/** - * Copyright 2023 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -# GKE cluster -data "google_container_engine_versions" "gke_version" { - location = var.region - version_prefix = "1.27." 
-} - -provider "google" { - project = var.project_id - region = var.region -} - -# Separately Managed Node Pool -resource "google_container_node_pool" "multihost_tpu" { - count = length(var.tpu_node_pools) - name = "${var.resource_name_prefix}-gke-${var.node_pool_prefix}-${count.index}" - provider = google-beta - project = var.project_id - location = var.region - node_locations = [var.tpu_node_pools[count.index].zone] - cluster = "${var.resource_name_prefix}-gke-cluster" - - initial_node_count = var.tpu_node_pools[count.index].node_count - - management { - auto_upgrade = false - } - - node_config { - oauth_scopes = [ - "https://www.googleapis.com/auth/logging.write", - "https://www.googleapis.com/auth/monitoring", - "https://www.googleapis.com/auth/cloud-platform", - ] - host_maintenance_policy { - maintenance_interval = var.maintenance_interval - } - labels = { - env = var.project_id - } - gvnic { - enabled = true - } - gcfs_config { - enabled = true - } - - image_type = "COS_CONTAINERD" - machine_type = var.tpu_node_pools[count.index].machine_type - disk_type = var.tpu_node_pools[count.index].disk_type - disk_size_gb = var.tpu_node_pools[count.index].disk_size_gb - tags = ["gke-node"] - metadata = { - disable-legacy-endpoints = "true" - } - } - placement_policy { - type = "COMPACT" - tpu_topology = var.tpu_node_pools[count.index].topology - } - - network_config { - enable_private_nodes = var.is_tpu_node_private - } -} diff --git a/tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/module/outputs.tf b/tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/module/outputs.tf deleted file mode 100644 index 06972205..00000000 --- a/tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/module/outputs.tf +++ /dev/null @@ -1,19 +0,0 @@ -output "region" { - value = var.region - description = "GCloud Region" -} - -output "project_id" { - value = var.project_id - description = "GCloud Project ID" -} - 
-output "kubernetes_cluster_name" { - value = google_container_node_pool.multihost_tpu[0].cluster - description = "GKE Cluster Name" -} - -output "is_tpu_node_private" { - value = var.is_tpu_node_private - description = "whether we want to make TPU node private" -} diff --git a/tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/module/terraform.tfvars b/tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/module/terraform.tfvars deleted file mode 100644 index e44d8dec..00000000 --- a/tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/module/terraform.tfvars +++ /dev/null @@ -1,20 +0,0 @@ -project_id = "project-id" -resource_name_prefix = "tpu-test" -region = "us-central2" -tpu_node_pools = [{ - zone = "us-central2-b" - node_count = 4 - machine_type = "ct4p-hightpu-4t" - topology = "2x2x4" - }, { - zone = "us-central2-b" - node_count = 4 - machine_type = "ct4p-hightpu-4t" - topology = "2x2x4" - }, { - zone = "us-central2-b" - node_count = 2 - machine_type = "ct4p-hightpu-4t" - topology = "2x2x2" -}] -maintenance_interval = "AS_NEEDED" diff --git a/tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/module/variables.tf b/tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/module/variables.tf deleted file mode 100644 index c467e69a..00000000 --- a/tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/module/variables.tf +++ /dev/null @@ -1,55 +0,0 @@ -/** - * Copyright 2023 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -variable "project_id" { - description = "project id" -} - -variable "region" { - description = "region" -} - -variable "resource_name_prefix" { - default = "" - description = "prefix for all the resouce naming" -} - -variable "node_pool_prefix" { - default = "" - description = "prefix for all the resouce naming" -} - -variable "tpu_node_pools" { - description = "tpu podslice config" - type = list(object({ - zone = string, - node_count = number, - machine_type = string, - topology = string, - disk_type = optional(string), - disk_size_gb = optional(number), - })) -} - -variable "is_tpu_node_private" { - description = "whether we want to make TPU node private" - default = false -} - -variable "maintenance_interval" { - default = "AS_NEEDED" - description = "maintenance interval for TPU machines." 
-} diff --git a/tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/examples/v4/main.tf b/tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/examples/v4/main.tf deleted file mode 100644 index 304251dc..00000000 --- a/tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/examples/v4/main.tf +++ /dev/null @@ -1,17 +0,0 @@ -variable "project_id" {} -variable "resource_name_prefix" {} -variable "region" {} -variable "tpu_node_pools" {} -variable "cpu_node_pool" {} -variable "maintenance_interval" {} - - -module "tpu-gke" { - source = "../../module" - project_id = var.project_id - resource_name_prefix = var.resource_name_prefix - region = var.region - tpu_node_pools = var.tpu_node_pools - cpu_node_pool = var.cpu_node_pool - maintenance_interval = var.maintenance_interval -} diff --git a/tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/examples/v4/outputs.tf b/tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/examples/v4/outputs.tf deleted file mode 100644 index 44d8350f..00000000 --- a/tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/examples/v4/outputs.tf +++ /dev/null @@ -1,19 +0,0 @@ -output "region" { - value = var.region - description = "GCloud Region" -} - -output "project_id" { - value = var.project_id - description = "GCloud Project ID" -} - -output "kubernetes_cluster_name" { - value = module.tpu-gke.kubernetes_cluster_name - description = "GKE Cluster Name" -} - -output "kubernetes_cluster_host" { - value = module.tpu-gke.kubernetes_cluster_host - description = "GKE Cluster Host" -} diff --git a/tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/examples/v4/terraform.tfvars b/tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/examples/v4/terraform.tfvars deleted file mode 100644 index 1e6c096f..00000000 --- 
a/tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/examples/v4/terraform.tfvars +++ /dev/null @@ -1,16 +0,0 @@ -project_id = "project-id" -resource_name_prefix = "tpu-test" -region = "us-central2" -tpu_node_pools = [{ - zone = "us-central2-b" - node_count = 2 - machine_type = "ct4p-hightpu-4t" - topology = "2x2x2" - }] -cpu_node_pool = { - zone = ["us-central2-a", "us-central2-b", "us-central2-c"] - machine_type = "n2-standard-8", - initial_node_count_per_zone = 1, - min_node_count_per_zone = 1, - max_node_count_per_zone = 30, -} diff --git a/tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/examples/v5e/main.tf b/tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/examples/v5e/main.tf deleted file mode 100644 index 7cce20f2..00000000 --- a/tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/examples/v5e/main.tf +++ /dev/null @@ -1,17 +0,0 @@ -variable "project_id" {} -variable "resource_name_prefix" {} -variable "region" {} -variable "cpu_node_pool" {} -variable "authorized_cidr_blocks" {} -variable "is_cpu_node_private" {} - - -module "tpu-gke" { - source = "../../module" - project_id = var.project_id - resource_name_prefix = var.resource_name_prefix - region = var.region - cpu_node_pool = var.cpu_node_pool - is_cpu_node_private = var.is_cpu_node_private - authorized_cidr_blocks = var.authorized_cidr_blocks -} diff --git a/tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/examples/v5e/outputs.tf b/tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/examples/v5e/outputs.tf deleted file mode 100644 index a5514b1f..00000000 --- a/tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/examples/v5e/outputs.tf +++ /dev/null @@ -1,24 +0,0 @@ -output "region" { - value = var.region - description = "GCloud Region" -} - -output "project_id" { - value = var.project_id - description = "GCloud 
Project ID" -} - -output "kubernetes_cluster_name" { - value = module.tpu-gke.kubernetes_cluster_name - description = "GKE Cluster Name" -} - -output "authorized_cidr_blocks" { - value = var.authorized_cidr_blocks - description = "Cluster allowed cidr blocks " -} - -output "is_cpu_node_private" { - value = var.is_cpu_node_private - description = "whether we want to make CPU node private" -} diff --git a/tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/examples/v5e/terraform.tfvars b/tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/examples/v5e/terraform.tfvars deleted file mode 100644 index 7f9fcb9f..00000000 --- a/tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/examples/v5e/terraform.tfvars +++ /dev/null @@ -1,12 +0,0 @@ -project_id = "project-id" -resource_name_prefix = "tpu-v5e-test" -region = "us-east5" -authorized_cidr_blocks = [] -is_cpu_node_private = false -cpu_node_pool = { - zone = ["us-east5-a", "us-east5-b", "us-east5-c"] - machine_type = "n2-standard-8", - initial_node_count_per_zone = 1, - min_node_count_per_zone = 1, - max_node_count_per_zone = 30, -} diff --git a/tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/module/main.tf b/tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/module/main.tf deleted file mode 100644 index 6596e498..00000000 --- a/tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/module/main.tf +++ /dev/null @@ -1,121 +0,0 @@ -/** - * Copyright 2023 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -# GKE cluster -data "google_container_engine_versions" "gke_version" { - location = var.region - version_prefix = "1.27." -} - -provider "google" { - project = var.project_id - region = var.region -} - -# VPC -resource "google_compute_network" "vpc" { - name = "${var.resource_name_prefix}-vpc" - auto_create_subnetworks = "false" -} - -# Subnet -resource "google_compute_subnetwork" "subnet" { - name = "${var.resource_name_prefix}-subnet" - region = var.region - network = google_compute_network.vpc.name - ip_cidr_range = "10.10.0.0/19" -} - -resource "google_container_cluster" "tpu_cluster" { - name = "${var.resource_name_prefix}-gke-cluster" - location = var.region - - # We can't create a cluster with no node pool defined, but we want to only use - # separately managed node pools. So we create the smallest possible default - # node pool and immediately delete it. 
- remove_default_node_pool = true - initial_node_count = 1 - networking_mode = "VPC_NATIVE" - ip_allocation_policy { - cluster_ipv4_cidr_block = "/14" - services_ipv4_cidr_block = "/20" - } - default_max_pods_per_node = 15 - - release_channel { - channel = "UNSPECIFIED" - } - - network = google_compute_network.vpc.name - subnetwork = google_compute_subnetwork.subnet.name - logging_service = "logging.googleapis.com/kubernetes" - monitoring_service = "monitoring.googleapis.com/kubernetes" - - master_authorized_networks_config { - gcp_public_cidrs_access_enabled = false - - dynamic "cidr_blocks" { - for_each = var.authorized_cidr_blocks - content { - cidr_block = cidr_blocks.value - display_name = "cidr-blocks-group-${cidr_blocks.key}" - } - } - } - - // Needs to be false when creating a PSC-based GKE cluster. - // After that, set as true to disable public endpoint of cluster master. - private_cluster_config { - enable_private_endpoint = false - } - - timeouts { - create = "120m" - update = "120m" - } -} - -resource "google_container_node_pool" "cpu_node_pool" { - provider = google-beta - project = var.project_id - name = "cpu-node-pool" - location = var.region - node_locations = var.cpu_node_pool.zone - cluster = google_container_cluster.tpu_cluster.name - initial_node_count = var.cpu_node_pool.initial_node_count_per_zone - autoscaling { - min_node_count = var.cpu_node_pool.min_node_count_per_zone - max_node_count = var.cpu_node_pool.max_node_count_per_zone - } - max_pods_per_node = 63 - node_config { - oauth_scopes = [ - "https://www.googleapis.com/auth/cloud-platform" - ] - machine_type = var.cpu_node_pool.machine_type - - metadata = { - disable-legacy-endpoints = "true" - } - gcfs_config { - enabled = true - } - } - - network_config { - enable_private_nodes = var.is_cpu_node_private - } -} diff --git a/tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/module/outputs.tf 
b/tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/module/outputs.tf deleted file mode 100644 index 3953819c..00000000 --- a/tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/module/outputs.tf +++ /dev/null @@ -1,24 +0,0 @@ -output "region" { - value = var.region - description = "GCloud Region" -} - -output "project_id" { - value = var.project_id - description = "GCloud Project ID" -} - -output "kubernetes_cluster_name" { - value = google_container_cluster.tpu_cluster.name - description = "GKE Cluster Name" -} - -output "authorized_cidr_blocks" { - value = var.authorized_cidr_blocks - description = "Cluster allowed cidr blocks " -} - -output "is_cpu_node_private" { - value = var.is_cpu_node_private - description = "whether we want to make CPU node private" -} diff --git a/tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/module/terraform.tf b/tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/module/terraform.tf deleted file mode 100644 index f3f4e7be..00000000 --- a/tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/module/terraform.tf +++ /dev/null @@ -1,11 +0,0 @@ -project_id = "project-id" -resource_name_prefix = "tpu-test" -region = "us-central2" -authorized_cidr_blocks = [] -cpu_node_pool = { - zone = ["us-central2-a", "us-central2-b", "us-central2-c"] - machine_type = "n2-standard-64", - initial_node_count_per_zone = 1, - min_node_count_per_zone = 1, - max_node_count_per_zone = 10 -} diff --git a/tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/module/variables.tf b/tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/module/variables.tf deleted file mode 100644 index df05e43d..00000000 --- a/tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/module/variables.tf +++ /dev/null @@ -1,56 +0,0 @@ -/** - * Copyright 2023 Google LLC - * - * Licensed under 
the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -variable "project_id" { - description = "project id" -} - -variable "region" { - description = "region" -} - -variable "resource_name_prefix" { - default = "" - description = "prefix for all the resouce naming" -} - -variable "authorized_cidr_blocks" { - description = "cluster allowed cidr blocks to access with kubectl CLI" - type = list(string) - default = [] -} - -variable "cpu_node_pool" { - description = "cpu nodepool config" - type = object({ - zone = list(string), - machine_type = string, - initial_node_count_per_zone = number, - min_node_count_per_zone = number, - max_node_count_per_zone = number - }) - validation { - condition = ( - (var.cpu_node_pool.min_node_count_per_zone >= 0 && var.cpu_node_pool.min_node_count_per_zone <= var.cpu_node_pool.max_node_count_per_zone) - ) - error_message = "cpu_node_pool.min_node_count_per_zone must be >= 0 and <= cpu_node_pool.max_node_count_per_zone." 
- } -} - -variable "is_cpu_node_private" { - description = "whether we want to make CPU node private" - default = false -} diff --git a/tools/kubernetes/terraform/examples/v4/main.tf b/tools/kubernetes/terraform/examples/v4/main.tf deleted file mode 100644 index e3856948..00000000 --- a/tools/kubernetes/terraform/examples/v4/main.tf +++ /dev/null @@ -1,15 +0,0 @@ -variable "project_id" {} -variable "resource_name_prefix" {} -variable "region" {} -variable "tpu_node_pools" {} -variable "maintenance_interval" {} - - -module "tpu-gke" { - source = "../../module" - project_id = var.project_id - resource_name_prefix = var.resource_name_prefix - region = var.region - tpu_node_pools = var.tpu_node_pools - maintenance_interval = var.maintenance_interval -} \ No newline at end of file diff --git a/tools/kubernetes/terraform/examples/v4/outputs.tf b/tools/kubernetes/terraform/examples/v4/outputs.tf deleted file mode 100644 index eb36535c..00000000 --- a/tools/kubernetes/terraform/examples/v4/outputs.tf +++ /dev/null @@ -1,24 +0,0 @@ -output "region" { - value = var.region - description = "GCloud Region" -} - -output "project_id" { - value = var.project_id - description = "GCloud Project ID" -} - -output "kubernetes_cluster_name" { - value = module.tpu-gke.kubernetes_cluster_name - description = "GKE Cluster Name" -} - -output "kubernetes_cluster_host" { - value = module.tpu-gke.kubernetes_cluster_host - description = "GKE Cluster Host" -} - -output "nodepool_tpu_topology" { - value = module.tpu-gke.nodepool_tpu_topology - description = "GKE TPU topology" -} \ No newline at end of file diff --git a/tools/kubernetes/terraform/examples/v4/terraform.tfvars b/tools/kubernetes/terraform/examples/v4/terraform.tfvars deleted file mode 100644 index c382f146..00000000 --- a/tools/kubernetes/terraform/examples/v4/terraform.tfvars +++ /dev/null @@ -1,9 +0,0 @@ -project_id = "project-id" -resource_name_prefix = "tpu-test" -region = "us-central2" -tpu_node_pools = [{ - zone = 
"us-central2-b" - node_count = 2 - machine_type = "ct4p-hightpu-4t" - topology = "2x2x2" - }] \ No newline at end of file diff --git a/tools/kubernetes/terraform/examples/v5e/main.tf b/tools/kubernetes/terraform/examples/v5e/main.tf deleted file mode 100644 index c3b6990c..00000000 --- a/tools/kubernetes/terraform/examples/v5e/main.tf +++ /dev/null @@ -1,15 +0,0 @@ -variable "project_id" {} -variable "resource_name_prefix" {} -variable "region" {} -variable "tpu_node_pools" {} -variable "maintenance_interval" {} - - -module "tpu-gke" { - source = "../../module" - project_id = var.project_id - resource_name_prefix = var.resource_name_prefix - region = var.region - tpu_node_pools = var.tpu_node_pools - maintenance_interval = var.maintenance_interval -} diff --git a/tools/kubernetes/terraform/examples/v5e/outputs.tf b/tools/kubernetes/terraform/examples/v5e/outputs.tf deleted file mode 100644 index 757767c3..00000000 --- a/tools/kubernetes/terraform/examples/v5e/outputs.tf +++ /dev/null @@ -1,24 +0,0 @@ -output "region" { - value = var.region - description = "GCloud Region" -} - -output "project_id" { - value = var.project_id - description = "GCloud Project ID" -} - -output "kubernetes_cluster_name" { - value = module.tpu-gke.kubernetes_cluster_name - description = "GKE Cluster Name" -} - -output "kubernetes_cluster_host" { - value = module.tpu-gke.kubernetes_cluster_host - description = "GKE Cluster Host" -} - -output "placement_policy_names" { - value = module.tpu-gke.placement_policy_names - description = "GKE TPU Placement Policy Names" -} diff --git a/tools/kubernetes/terraform/examples/v5e/terraform.tfvars b/tools/kubernetes/terraform/examples/v5e/terraform.tfvars deleted file mode 100644 index 73f76abb..00000000 --- a/tools/kubernetes/terraform/examples/v5e/terraform.tfvars +++ /dev/null @@ -1,101 +0,0 @@ -project_id = "project_id" -resource_name_prefix = "tpu-v5e-test" -region = "us-east5" -tpu_node_pools = [{ - zone = "us-east5-b" - node_count = 64 - 
machine_type = "ct5lp-hightpu-4t" - topology = "16x16" - policy = "sb-compact-4a" - }, { - zone = "us-east5-b" - node_count = 64 - machine_type = "ct5lp-hightpu-4t" - topology = "16x16" - policy = "sb-compact-4a" - }, { - zone = "us-east5-b" - node_count = 64 - machine_type = "ct5lp-hightpu-4t" - topology = "16x16" - policy = "sb-compact-4a" - }, { - zone = "us-east5-b" - node_count = 64 - machine_type = "ct5lp-hightpu-4t" - topology = "16x16" - policy = "sb-compact-4a" - }, { - zone = "us-east5-b" - node_count = 64 - machine_type = "ct5lp-hightpu-4t" - topology = "16x16" - policy = "sb-compact-4b" - }, { - zone = "us-east5-b" - node_count = 64 - machine_type = "ct5lp-hightpu-4t" - topology = "16x16" - policy = "sb-compact-4b" - }, { - zone = "us-east5-b" - node_count = 64 - machine_type = "ct5lp-hightpu-4t" - topology = "16x16" - policy = "sb-compact-4b" - }, { - zone = "us-east5-b" - node_count = 64 - machine_type = "ct5lp-hightpu-4t" - topology = "16x16" - policy = "sb-compact-4b" - }, { - zone = "us-east5-b" - node_count = 64 - machine_type = "ct5lp-hightpu-4t" - topology = "16x16" - policy = "sb-compact-4c" - }, { - zone = "us-east5-b" - node_count = 64 - machine_type = "ct5lp-hightpu-4t" - topology = "16x16" - policy = "sb-compact-4c" - }, { - zone = "us-east5-b" - node_count = 64 - machine_type = "ct5lp-hightpu-4t" - topology = "16x16" - policy = "sb-compact-4c" - }, { - zone = "us-east5-b" - node_count = 64 - machine_type = "ct5lp-hightpu-4t" - topology = "16x16" - policy = "sb-compact-4c" - }, { - zone = "us-east5-b" - node_count = 64 - machine_type = "ct5lp-hightpu-4t" - topology = "16x16" - policy = "sb-compact-4d" - }, { - zone = "us-east5-b" - node_count = 64 - machine_type = "ct5lp-hightpu-4t" - topology = "16x16" - policy = "sb-compact-4d" - }, { - zone = "us-east5-b" - node_count = 64 - machine_type = "ct5lp-hightpu-4t" - topology = "16x16" - policy = "sb-compact-4d" - }, { - zone = "us-east5-b" - node_count = 64 - machine_type = "ct5lp-hightpu-4t" 
- topology = "16x16" - policy = "sb-compact-4d" -}] -maintenance_interval = "PERIODIC" diff --git a/tools/kubernetes/terraform/non_batching_with_compact_placement/module/main.tf b/tools/kubernetes/terraform/inference/non_batching_with_compact_placement/module/main.tf similarity index 100% rename from tools/kubernetes/terraform/non_batching_with_compact_placement/module/main.tf rename to tools/kubernetes/terraform/inference/non_batching_with_compact_placement/module/main.tf diff --git a/tools/kubernetes/terraform/non_batching_with_compact_placement/module/outputs.tf b/tools/kubernetes/terraform/inference/non_batching_with_compact_placement/module/outputs.tf similarity index 100% rename from tools/kubernetes/terraform/non_batching_with_compact_placement/module/outputs.tf rename to tools/kubernetes/terraform/inference/non_batching_with_compact_placement/module/outputs.tf diff --git a/tools/kubernetes/terraform/inference/non_batching_with_compact_placement/module/terraform.tfvars b/tools/kubernetes/terraform/inference/non_batching_with_compact_placement/module/terraform.tfvars new file mode 100644 index 00000000..75e43377 --- /dev/null +++ b/tools/kubernetes/terraform/inference/non_batching_with_compact_placement/module/terraform.tfvars @@ -0,0 +1,23 @@ +project_id = "project-id" +resource_name_prefix = "tpu-test" +region = "us-east5" +authorized_cidr_blocks = [] +cpu_node_pool = { + zone = ["us-east5-a", "us-east5-b", "us-east5-c"] + machine_type = "n2-standard-64", + initial_node_count_per_zone = 1, + min_node_count_per_zone = 1, + max_node_count_per_zone = 10 +} +tpu_node_pools = [{ + zone = "us-east5-b" + node_count = 64 + machine_type = "ct5lp-hightpu-4t" + topology = "16x16" + },{ + zone = "us-east5-b" + node_count = 64 + machine_type = "ct5lp-hightpu-4t" + topology = "16x16" +}] +maintenance_interval = "AS_NEEDED" diff --git a/tools/kubernetes/terraform/non_batching_with_compact_placement/module/variables.tf
b/tools/kubernetes/terraform/inference/non_batching_with_compact_placement/module/variables.tf similarity index 100% rename from tools/kubernetes/terraform/non_batching_with_compact_placement/module/variables.tf rename to tools/kubernetes/terraform/inference/non_batching_with_compact_placement/module/variables.tf diff --git a/tools/kubernetes/terraform/module/main.tf b/tools/kubernetes/terraform/module/main.tf deleted file mode 100644 index 6c551df3..00000000 --- a/tools/kubernetes/terraform/module/main.tf +++ /dev/null @@ -1,119 +0,0 @@ -/** - * Copyright 2023 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -# GKE cluster -data "google_container_engine_versions" "gke_version" { - location = var.region - version_prefix = "1.27." -} - -provider "google" { - project = var.project_id - region = var.region -} - -# VPC -resource "google_compute_network" "vpc" { - name = "${var.resource_name_prefix}-vpc" - auto_create_subnetworks = "false" -} - -# Subnet -resource "google_compute_subnetwork" "subnet" { - name = "${var.resource_name_prefix}-subnet" - region = var.region - network = google_compute_network.vpc.name - ip_cidr_range = "10.10.0.0/19" -} - -resource "google_container_cluster" "tpu_cluster" { - name = "${var.resource_name_prefix}-gke-cluster" - location = var.region - - # We can't create a cluster with no node pool defined, but we want to only use - # separately managed node pools. 
So we create the smallest possible default - # node pool and immediately delete it. - remove_default_node_pool = true - initial_node_count = 1 - networking_mode = "VPC_NATIVE" - ip_allocation_policy { - cluster_ipv4_cidr_block = "/14" - services_ipv4_cidr_block = "/20" - } - default_max_pods_per_node = 50 - - release_channel { - channel = "UNSPECIFIED" - } - - network = google_compute_network.vpc.name - subnetwork = google_compute_subnetwork.subnet.name - logging_service = "logging.googleapis.com/kubernetes" - monitoring_service = "monitoring.googleapis.com/kubernetes" - - timeouts { - create = "120m" - update = "120m" - } -} - -# Separately Managed Node Pool -resource "google_container_node_pool" "multihost_tpu" { - count = length(var.tpu_node_pools) - name = "${google_container_cluster.tpu_cluster.name}-${count.index}" - provider = google-beta - project = var.project_id - location = var.region - node_locations = [var.tpu_node_pools[count.index].zone] - cluster = google_container_cluster.tpu_cluster.name - - initial_node_count = var.tpu_node_pools[count.index].node_count - - management { - auto_upgrade = false - } - - node_config { - oauth_scopes = [ - "https://www.googleapis.com/auth/logging.write", - "https://www.googleapis.com/auth/monitoring", - "https://www.googleapis.com/auth/cloud-platform", - ] - host_maintenance_policy { - maintenance_interval = var.maintenance_interval - } - labels = { - env = var.project_id - } - gvnic { - enabled = true - } - gcfs_config { - enabled = true - } - - image_type = "COS_CONTAINERD" - machine_type = var.tpu_node_pools[count.index].machine_type - tags = ["gke-node"] - metadata = { - disable-legacy-endpoints = "true" - } - } - placement_policy { - type = "COMPACT" - policy_name = var.tpu_node_pools[count.index].policy - } -} diff --git a/tools/kubernetes/terraform/module/outputs.tf b/tools/kubernetes/terraform/module/outputs.tf deleted file mode 100644 index 24cb14dc..00000000 --- a/tools/kubernetes/terraform/module/outputs.tf 
+++ /dev/null @@ -1,26 +0,0 @@ -output "region" { - value = var.region - description = "GCloud Region" -} - -output "project_id" { - value = var.project_id - description = "GCloud Project ID" -} - -output "kubernetes_cluster_name" { - value = google_container_cluster.tpu_cluster.name - description = "GKE Cluster Name" -} - -output "kubernetes_cluster_host" { - value = google_container_cluster.tpu_cluster.endpoint - description = "GKE Cluster Host" -} - -output "placement_policy_names" { - value = flatten([ - google_container_node_pool.multihost_tpu[*].placement_policy[0].policy_name - ]) - description = "GKE TPU Placement Policy Names" -} diff --git a/tools/kubernetes/terraform/module/terraform.tfvars b/tools/kubernetes/terraform/module/terraform.tfvars deleted file mode 100644 index a24e1f9c..00000000 --- a/tools/kubernetes/terraform/module/terraform.tfvars +++ /dev/null @@ -1,21 +0,0 @@ -project_id = "project-id" -resource_name_prefix = "tpu-test" -region = "us-central2" -location = "us-central2-b" -tpu_node_pools = [{ - zone = "us-central2-b" - node_count = 4 - machine_type = "ct4p-hightpu-4t" - topology = "2x2x4" - }, { - zone = "us-central2-b" - node_count = 4 - machine_type = "ct4p-hightpu-4t" - topology = "2x2x4" - }, { - zone = "us-central2-b" - node_count = 2 - machine_type = "ct4p-hightpu-4t" - topology = "2x2x2" -}] -maintenance_interval = "AS_NEEDED" diff --git a/tools/kubernetes/terraform/module/variables.tf b/tools/kubernetes/terraform/module/variables.tf deleted file mode 100644 index 35f460aa..00000000 --- a/tools/kubernetes/terraform/module/variables.tf +++ /dev/null @@ -1,44 +0,0 @@ -/** - * Copyright 2023 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -variable "project_id" { - description = "project id" -} - -variable "region" { - description = "region" -} - -variable "resource_name_prefix" { - default = "" - description = "prefix for all the resouce naming" -} - -variable "tpu_node_pools" { - description = "tpu podslice config" - type = list(object({ - zone = string, - node_count = number, - machine_type = string, - topology = string, - policy = string, - })) -} - -variable "maintenance_interval" { - default = "AS_NEEDED" - description = "maintenance interval for TPU machines." -} diff --git a/tools/kubernetes/terraform/non_batching_with_compact_placement/examples/v4/main.tf b/tools/kubernetes/terraform/non_batching_with_compact_placement/examples/v4/main.tf deleted file mode 100644 index c3b6990c..00000000 --- a/tools/kubernetes/terraform/non_batching_with_compact_placement/examples/v4/main.tf +++ /dev/null @@ -1,15 +0,0 @@ -variable "project_id" {} -variable "resource_name_prefix" {} -variable "region" {} -variable "tpu_node_pools" {} -variable "maintenance_interval" {} - - -module "tpu-gke" { - source = "../../module" - project_id = var.project_id - resource_name_prefix = var.resource_name_prefix - region = var.region - tpu_node_pools = var.tpu_node_pools - maintenance_interval = var.maintenance_interval -} diff --git a/tools/kubernetes/terraform/non_batching_with_compact_placement/examples/v4/outputs.tf b/tools/kubernetes/terraform/non_batching_with_compact_placement/examples/v4/outputs.tf deleted file mode 100644 index 78a05be8..00000000 --- 
a/tools/kubernetes/terraform/non_batching_with_compact_placement/examples/v4/outputs.tf +++ /dev/null @@ -1,24 +0,0 @@ -output "region" { - value = var.region - description = "GCloud Region" -} - -output "project_id" { - value = var.project_id - description = "GCloud Project ID" -} - -output "kubernetes_cluster_name" { - value = module.tpu-gke.kubernetes_cluster_name - description = "GKE Cluster Name" -} - -output "kubernetes_cluster_host" { - value = module.tpu-gke.kubernetes_cluster_host - description = "GKE Cluster Host" -} - -output "nodepool_tpu_topology" { - value = module.tpu-gke.nodepool_tpu_topology - description = "GKE TPU topology" -} diff --git a/tools/kubernetes/terraform/non_batching_with_compact_placement/examples/v4/terraform.tfvars b/tools/kubernetes/terraform/non_batching_with_compact_placement/examples/v4/terraform.tfvars deleted file mode 100644 index 84f60850..00000000 --- a/tools/kubernetes/terraform/non_batching_with_compact_placement/examples/v4/terraform.tfvars +++ /dev/null @@ -1,9 +0,0 @@ -project_id = "project-id" -resource_name_prefix = "tpu-test" -region = "us-central2" -tpu_node_pools = [{ - zone = "us-central2-b" - node_count = 2 - machine_type = "ct4p-hightpu-4t" - topology = "2x2x2" -}] diff --git a/tools/kubernetes/terraform/non_batching_with_compact_placement/module/terraform.tfvars b/tools/kubernetes/terraform/non_batching_with_compact_placement/module/terraform.tfvars deleted file mode 100644 index 8f63265f..00000000 --- a/tools/kubernetes/terraform/non_batching_with_compact_placement/module/terraform.tfvars +++ /dev/null @@ -1,28 +0,0 @@ -project_id = "project-id" -resource_name_prefix = "tpu-test" -region = "us-central2" -authorized_cidr_blocks = [] -cpu_node_pool = { - zone = ["us-central2-a", "us-central2-b", "us-central2-c"] - machine_type = "n2-standard-64", - initial_node_count_per_zone = 1, - min_node_count_per_zone = 1, - max_node_count_per_zone = 10 -} -tpu_node_pools = [{ - zone = "us-central2-b" - node_count = 4 - 
machine_type = "ct4p-hightpu-4t" - topology = "2x2x4" - }, { - zone = "us-central2-b" - node_count = 4 - machine_type = "ct4p-hightpu-4t" - topology = "2x2x4" - }, { - zone = "us-central2-b" - node_count = 2 - machine_type = "ct4p-hightpu-4t" - topology = "2x2x2" -}] -maintenance_interval = "AS_NEEDED" From 567ff06707cf188117e041fb6fc041429f9b4f53 Mon Sep 17 00:00:00 2001 From: Kangmin Xie Date: Fri, 6 Oct 2023 16:11:43 +0000 Subject: [PATCH 5/8] Added the scripts for training directory --- .../examples/v5e | 1 + .../add_node_pool/examples/main.tf | 19 +++ .../add_node_pool/examples/outputs.tf | 24 ++++ .../add_node_pool/examples/terraform.tfvars | 39 ++++++ .../add_node_pool/module/main.tf | 81 +++++++++++ .../add_node_pool/module/outputs.tf | 26 ++++ .../add_node_pool/module/terraform.tfvars | 12 ++ .../add_node_pool/module/variables.tf | 56 ++++++++ .../create_cluster/examples/v5e/main.tf | 17 +++ .../create_cluster/examples/v5e/outputs.tf | 24 ++++ .../examples/v5e/terraform.tfvars | 13 ++ .../create_cluster/module/main.tf | 126 ++++++++++++++++++ .../create_cluster/module/outputs.tf | 24 ++++ .../create_cluster/module/terraform.tfvars | 11 ++ .../create_cluster/module/variables.tf | 56 ++++++++ .../add_node_pool/examples/main.tf | 19 +++ .../add_node_pool/examples/outputs.tf | 19 +++ .../add_node_pool/examples/terraform.tfvars | 21 +++ .../add_node_pool/module/main.tf | 81 +++++++++++ .../add_node_pool/module/outputs.tf | 19 +++ .../add_node_pool/module/terraform.tfvars | 11 ++ .../add_node_pool/module/variables.tf | 55 ++++++++ .../create_cluster/examples/v5e/main.tf | 17 +++ .../create_cluster/examples/v5e/outputs.tf | 24 ++++ .../examples/v5e/terraform.tfvars | 13 ++ .../create_cluster/module/main.tf | 126 ++++++++++++++++++ .../create_cluster/module/outputs.tf | 24 ++++ .../create_cluster/module/terraform.tfvars | 11 ++ .../create_cluster/module/variables.tf | 56 ++++++++ 29 files changed, 1025 insertions(+) create mode 160000 
tools/kubernetes/terraform/inference/non_batching_with_compact_placement/examples/v5e create mode 100644 tools/kubernetes/terraform/training/batching_with_compact_placement/add_node_pool/examples/main.tf create mode 100644 tools/kubernetes/terraform/training/batching_with_compact_placement/add_node_pool/examples/outputs.tf create mode 100644 tools/kubernetes/terraform/training/batching_with_compact_placement/add_node_pool/examples/terraform.tfvars create mode 100644 tools/kubernetes/terraform/training/batching_with_compact_placement/add_node_pool/module/main.tf create mode 100644 tools/kubernetes/terraform/training/batching_with_compact_placement/add_node_pool/module/outputs.tf create mode 100644 tools/kubernetes/terraform/training/batching_with_compact_placement/add_node_pool/module/terraform.tfvars create mode 100644 tools/kubernetes/terraform/training/batching_with_compact_placement/add_node_pool/module/variables.tf create mode 100644 tools/kubernetes/terraform/training/batching_with_compact_placement/create_cluster/examples/v5e/main.tf create mode 100644 tools/kubernetes/terraform/training/batching_with_compact_placement/create_cluster/examples/v5e/outputs.tf create mode 100644 tools/kubernetes/terraform/training/batching_with_compact_placement/create_cluster/examples/v5e/terraform.tfvars create mode 100644 tools/kubernetes/terraform/training/batching_with_compact_placement/create_cluster/module/main.tf create mode 100644 tools/kubernetes/terraform/training/batching_with_compact_placement/create_cluster/module/outputs.tf create mode 100644 tools/kubernetes/terraform/training/batching_with_compact_placement/create_cluster/module/terraform.tfvars create mode 100644 tools/kubernetes/terraform/training/batching_with_compact_placement/create_cluster/module/variables.tf create mode 100644 tools/kubernetes/terraform/training/batching_without_compact_placement/add_node_pool/examples/main.tf create mode 100644 
tools/kubernetes/terraform/training/batching_without_compact_placement/add_node_pool/examples/outputs.tf create mode 100644 tools/kubernetes/terraform/training/batching_without_compact_placement/add_node_pool/examples/terraform.tfvars create mode 100644 tools/kubernetes/terraform/training/batching_without_compact_placement/add_node_pool/module/main.tf create mode 100644 tools/kubernetes/terraform/training/batching_without_compact_placement/add_node_pool/module/outputs.tf create mode 100644 tools/kubernetes/terraform/training/batching_without_compact_placement/add_node_pool/module/terraform.tfvars create mode 100644 tools/kubernetes/terraform/training/batching_without_compact_placement/add_node_pool/module/variables.tf create mode 100644 tools/kubernetes/terraform/training/batching_without_compact_placement/create_cluster/examples/v5e/main.tf create mode 100644 tools/kubernetes/terraform/training/batching_without_compact_placement/create_cluster/examples/v5e/outputs.tf create mode 100644 tools/kubernetes/terraform/training/batching_without_compact_placement/create_cluster/examples/v5e/terraform.tfvars create mode 100644 tools/kubernetes/terraform/training/batching_without_compact_placement/create_cluster/module/main.tf create mode 100644 tools/kubernetes/terraform/training/batching_without_compact_placement/create_cluster/module/outputs.tf create mode 100644 tools/kubernetes/terraform/training/batching_without_compact_placement/create_cluster/module/terraform.tfvars create mode 100644 tools/kubernetes/terraform/training/batching_without_compact_placement/create_cluster/module/variables.tf diff --git a/tools/kubernetes/terraform/inference/non_batching_with_compact_placement/examples/v5e b/tools/kubernetes/terraform/inference/non_batching_with_compact_placement/examples/v5e new file mode 160000 index 00000000..c79d0301 --- /dev/null +++ b/tools/kubernetes/terraform/inference/non_batching_with_compact_placement/examples/v5e @@ -0,0 +1 @@ +Subproject commit 
c79d0301eda699b15e19158033b8b12fe01b0d69 diff --git a/tools/kubernetes/terraform/training/batching_with_compact_placement/add_node_pool/examples/main.tf b/tools/kubernetes/terraform/training/batching_with_compact_placement/add_node_pool/examples/main.tf new file mode 100644 index 00000000..61ac2331 --- /dev/null +++ b/tools/kubernetes/terraform/training/batching_with_compact_placement/add_node_pool/examples/main.tf @@ -0,0 +1,19 @@ +variable "project_id" {} +variable "resource_name_prefix" {} +variable "node_pool_prefix" {} +variable "region" {} +variable "tpu_node_pools" {} +variable "maintenance_interval" {} +variable "is_tpu_node_private" {} + + +module "tpu-gke" { + source = "../../module" + project_id = var.project_id + resource_name_prefix = var.resource_name_prefix + node_pool_prefix = var.node_pool_prefix + region = var.region + tpu_node_pools = var.tpu_node_pools + maintenance_interval = var.maintenance_interval + is_tpu_node_private = var.is_tpu_node_private +} diff --git a/tools/kubernetes/terraform/training/batching_with_compact_placement/add_node_pool/examples/outputs.tf b/tools/kubernetes/terraform/training/batching_with_compact_placement/add_node_pool/examples/outputs.tf new file mode 100644 index 00000000..ebb1782f --- /dev/null +++ b/tools/kubernetes/terraform/training/batching_with_compact_placement/add_node_pool/examples/outputs.tf @@ -0,0 +1,24 @@ +output "region" { + value = var.region + description = "GCloud Region" +} + +output "project_id" { + value = var.project_id + description = "GCloud Project ID" +} + +output "kubernetes_cluster_name" { + value = module.tpu-gke.kubernetes_cluster_name + description = "GKE Cluster Name" +} + +output "placement_policy_names" { + value = module.tpu-gke.placement_policy_names + description = "GKE TPU Placement Policy Names" +} + +output "is_tpu_node_private" { + value = var.is_tpu_node_private + description = "whether we want to make TPU node private" +} diff --git 
a/tools/kubernetes/terraform/training/batching_with_compact_placement/add_node_pool/examples/terraform.tfvars b/tools/kubernetes/terraform/training/batching_with_compact_placement/add_node_pool/examples/terraform.tfvars new file mode 100644 index 00000000..78852a67 --- /dev/null +++ b/tools/kubernetes/terraform/training/batching_with_compact_placement/add_node_pool/examples/terraform.tfvars @@ -0,0 +1,39 @@ +project_id = "project-id" +resource_name_prefix = "tpu-v5e-test" +node_pool_prefix = "rp1" +region = "us-east5" +is_tpu_node_private = false +tpu_node_pools = [{ + zone = "us-east5-b" + node_count = 64 + machine_type = "ct5lp-hightpu-4t" + topology = "16x16" + policy = "sb-compact-1" + disk_type = "pd-balanced" + disk_size_gb = 120 + }, { + zone = "us-east5-b" + node_count = 64 + machine_type = "ct5lp-hightpu-4t" + topology = "16x16" + policy = "sb-compact-1" + disk_type = "pd-balanced" + disk_size_gb = 120 + }, { + zone = "us-east5-b" + node_count = 64 + machine_type = "ct5lp-hightpu-4t" + topology = "16x16" + policy = "sb-compact-1" + disk_type = "pd-balanced" + disk_size_gb = 120 + }, { + zone = "us-east5-b" + node_count = 64 + machine_type = "ct5lp-hightpu-4t" + topology = "16x16" + policy = "sb-compact-1" + disk_type = "pd-balanced" + disk_size_gb = 120 +}] +maintenance_interval = "PERIODIC" diff --git a/tools/kubernetes/terraform/training/batching_with_compact_placement/add_node_pool/module/main.tf b/tools/kubernetes/terraform/training/batching_with_compact_placement/add_node_pool/module/main.tf new file mode 100644 index 00000000..498177bf --- /dev/null +++ b/tools/kubernetes/terraform/training/batching_with_compact_placement/add_node_pool/module/main.tf @@ -0,0 +1,81 @@ +/** + * Copyright 2023 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +# GKE cluster +data "google_container_engine_versions" "gke_version" { + location = var.region + version_prefix = "1.27." +} + +provider "google" { + project = var.project_id + region = var.region +} + +# Separately Managed Node Pool +resource "google_container_node_pool" "multihost_tpu" { + count = length(var.tpu_node_pools) + name = "${var.resource_name_prefix}-gke-${var.node_pool_prefix}-${count.index}" + provider = google-beta + project = var.project_id + location = var.region + node_locations = [var.tpu_node_pools[count.index].zone] + cluster = "${var.resource_name_prefix}-gke-cluster" + + initial_node_count = var.tpu_node_pools[count.index].node_count + + management { + // auto_upgrade must be true when release_channel = RAPID for cluster. 
+ auto_upgrade = true + } + + node_config { + oauth_scopes = [ + "https://www.googleapis.com/auth/logging.write", + "https://www.googleapis.com/auth/monitoring", + "https://www.googleapis.com/auth/cloud-platform", + ] + host_maintenance_policy { + maintenance_interval = var.maintenance_interval + } + labels = { + env = var.project_id + } + gvnic { + enabled = true + } + gcfs_config { + enabled = true + } + + image_type = "COS_CONTAINERD" + machine_type = var.tpu_node_pools[count.index].machine_type + disk_type = var.tpu_node_pools[count.index].disk_type + disk_size_gb = var.tpu_node_pools[count.index].disk_size_gb + tags = ["gke-node"] + metadata = { + disable-legacy-endpoints = "true" + } + } + placement_policy { + type = "COMPACT" + policy_name = var.tpu_node_pools[count.index].policy + } + + network_config { + enable_private_nodes = var.is_tpu_node_private + } +} diff --git a/tools/kubernetes/terraform/training/batching_with_compact_placement/add_node_pool/module/outputs.tf b/tools/kubernetes/terraform/training/batching_with_compact_placement/add_node_pool/module/outputs.tf new file mode 100644 index 00000000..68085ceb --- /dev/null +++ b/tools/kubernetes/terraform/training/batching_with_compact_placement/add_node_pool/module/outputs.tf @@ -0,0 +1,26 @@ +output "region" { + value = var.region + description = "GCloud Region" +} + +output "project_id" { + value = var.project_id + description = "GCloud Project ID" +} + +output "kubernetes_cluster_name" { + value = google_container_node_pool.multihost_tpu[0].cluster + description = "GKE Cluster Name" +} + +output "placement_policy_names" { + value = flatten([ + google_container_node_pool.multihost_tpu[*].placement_policy[0].policy_name + ]) + description = "GKE TPU Placement Policy Names" +} + +output "is_tpu_node_private" { + value = var.is_tpu_node_private + description = "whether we want to make TPU node private" +} diff --git 
a/tools/kubernetes/terraform/training/batching_with_compact_placement/add_node_pool/module/terraform.tfvars b/tools/kubernetes/terraform/training/batching_with_compact_placement/add_node_pool/module/terraform.tfvars new file mode 100644 index 00000000..171d95df --- /dev/null +++ b/tools/kubernetes/terraform/training/batching_with_compact_placement/add_node_pool/module/terraform.tfvars @@ -0,0 +1,12 @@ +project_id = "project-id" +resource_name_prefix = "tpu-test" +node_pool_prefix = "rp1" +region = "us-east5" +tpu_node_pools = [{ + zone = "us-east5-b" + node_count = 64 + machine_type = "ct5lp-hightpu-4t" + topology = "16x16" + policy = "sb-compact-1" +}] +maintenance_interval = "AS_NEEDED" diff --git a/tools/kubernetes/terraform/training/batching_with_compact_placement/add_node_pool/module/variables.tf b/tools/kubernetes/terraform/training/batching_with_compact_placement/add_node_pool/module/variables.tf new file mode 100644 index 00000000..fa5d507d --- /dev/null +++ b/tools/kubernetes/terraform/training/batching_with_compact_placement/add_node_pool/module/variables.tf @@ -0,0 +1,56 @@ +/** + * Copyright 2023 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +variable "project_id" { + description = "project id" +} + +variable "region" { + description = "region" +} + +variable "resource_name_prefix" { + default = "" + description = "prefix for all the resource naming" +} + +variable "node_pool_prefix" { + default = "" + description = "prefix for the node pool naming" +} + +variable "tpu_node_pools" { + description = "tpu podslice config" + type = list(object({ + zone = string, + node_count = number, + machine_type = string, + topology = string, + policy = string, + disk_type = optional(string), + disk_size_gb = optional(number), + })) +} + +variable "is_tpu_node_private" { + description = "whether we want to make TPU node private" + default = false +} + +variable "maintenance_interval" { + default = "AS_NEEDED" + description = "maintenance interval for TPU machines." +} diff --git a/tools/kubernetes/terraform/training/batching_with_compact_placement/create_cluster/examples/v5e/main.tf b/tools/kubernetes/terraform/training/batching_with_compact_placement/create_cluster/examples/v5e/main.tf new file mode 100644 index 00000000..7cce20f2 --- /dev/null +++ b/tools/kubernetes/terraform/training/batching_with_compact_placement/create_cluster/examples/v5e/main.tf @@ -0,0 +1,17 @@ +variable "project_id" {} +variable "resource_name_prefix" {} +variable "region" {} +variable "cpu_node_pool" {} +variable "authorized_cidr_blocks" {} +variable "is_cpu_node_private" {} + + +module "tpu-gke" { + source = "../../module" + project_id = var.project_id + resource_name_prefix = var.resource_name_prefix + region = var.region + cpu_node_pool = var.cpu_node_pool + is_cpu_node_private = var.is_cpu_node_private + authorized_cidr_blocks = var.authorized_cidr_blocks +} diff --git a/tools/kubernetes/terraform/training/batching_with_compact_placement/create_cluster/examples/v5e/outputs.tf b/tools/kubernetes/terraform/training/batching_with_compact_placement/create_cluster/examples/v5e/outputs.tf new file mode 100644 index 00000000..a5514b1f
--- /dev/null +++ b/tools/kubernetes/terraform/training/batching_with_compact_placement/create_cluster/examples/v5e/outputs.tf @@ -0,0 +1,24 @@ +output "region" { + value = var.region + description = "GCloud Region" +} + +output "project_id" { + value = var.project_id + description = "GCloud Project ID" +} + +output "kubernetes_cluster_name" { + value = module.tpu-gke.kubernetes_cluster_name + description = "GKE Cluster Name" +} + +output "authorized_cidr_blocks" { + value = var.authorized_cidr_blocks + description = "Cluster allowed cidr blocks " +} + +output "is_cpu_node_private" { + value = var.is_cpu_node_private + description = "whether we want to make CPU node private" +} diff --git a/tools/kubernetes/terraform/training/batching_with_compact_placement/create_cluster/examples/v5e/terraform.tfvars b/tools/kubernetes/terraform/training/batching_with_compact_placement/create_cluster/examples/v5e/terraform.tfvars new file mode 100644 index 00000000..86a001a2 --- /dev/null +++ b/tools/kubernetes/terraform/training/batching_with_compact_placement/create_cluster/examples/v5e/terraform.tfvars @@ -0,0 +1,13 @@ +project_id = "project-id" +resource_name_prefix = "tpu-v5e-test" +region = "us-east5" +authorized_cidr_blocks = [] +is_cpu_node_private = false +cpu_node_pool = { + location_policy = "BALANCED" + zone = ["us-east5-b"] + machine_type = "e2-standard-32", + initial_node_count_per_zone = 5, + min_node_count_per_zone = 5, + max_node_count_per_zone = 1000, +} diff --git a/tools/kubernetes/terraform/training/batching_with_compact_placement/create_cluster/module/main.tf b/tools/kubernetes/terraform/training/batching_with_compact_placement/create_cluster/module/main.tf new file mode 100644 index 00000000..f5b9cf30 --- /dev/null +++ b/tools/kubernetes/terraform/training/batching_with_compact_placement/create_cluster/module/main.tf @@ -0,0 +1,126 @@ +/** + * Copyright 2023 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not 
use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +# GKE cluster +data "google_container_engine_versions" "gke_version" { + location = var.region + version_prefix = "1.27." +} + +provider "google" { + project = var.project_id + region = var.region +} + +# VPC +resource "google_compute_network" "vpc" { + name = "${var.resource_name_prefix}-vpc" + auto_create_subnetworks = "false" +} + +# Subnet +resource "google_compute_subnetwork" "subnet" { + name = "${var.resource_name_prefix}-subnet" + region = var.region + network = google_compute_network.vpc.name + ip_cidr_range = "10.10.0.0/18" +} + +resource "google_container_cluster" "tpu_cluster" { + name = "${var.resource_name_prefix}-gke-cluster" + location = var.region + + # We can't create a cluster with no node pool defined, but we want to only use + # separately managed node pools. So we create the smallest possible default + # node pool and immediately delete it. + remove_default_node_pool = true + initial_node_count = 1 + networking_mode = "VPC_NATIVE" # Enables IP aliasing. 
+ + ip_allocation_policy { + cluster_ipv4_cidr_block = "/14" + services_ipv4_cidr_block = "/20" + } + default_max_pods_per_node = 15 + + release_channel { + channel = "RAPID" + } + + network = google_compute_network.vpc.name + subnetwork = google_compute_subnetwork.subnet.name + logging_service = "logging.googleapis.com/kubernetes" + monitoring_service = "monitoring.googleapis.com/kubernetes" + + master_authorized_networks_config { + gcp_public_cidrs_access_enabled = false + + dynamic "cidr_blocks" { + for_each = var.authorized_cidr_blocks + content { + cidr_block = cidr_blocks.value + display_name = "cidr-blocks-group-${cidr_blocks.key}" + } + } + } + + // Needs to be false when creating a PSC-based GKE cluster. + // After that, set as true to disable public endpoint of cluster master. + private_cluster_config { + enable_private_endpoint = false + } + + timeouts { + create = "120m" + update = "120m" + } +} + +resource "google_container_node_pool" "cpu_node_pool" { + provider = google-beta + project = var.project_id + name = "cpu-node-pool" + location = var.region + node_locations = var.cpu_node_pool.zone + cluster = google_container_cluster.tpu_cluster.name + initial_node_count = var.cpu_node_pool.initial_node_count_per_zone + autoscaling { + location_policy = "BALANCED" + min_node_count = var.cpu_node_pool.min_node_count_per_zone + max_node_count = var.cpu_node_pool.max_node_count_per_zone + } + max_pods_per_node = 63 + node_config { + oauth_scopes = [ + "https://www.googleapis.com/auth/cloud-platform" + ] + machine_type = var.cpu_node_pool.machine_type + + metadata = { + disable-legacy-endpoints = "true" + } + gvnic { + enabled = true + } + gcfs_config { + enabled = true + } + } + + network_config { + enable_private_nodes = var.is_cpu_node_private + } +} diff --git a/tools/kubernetes/terraform/training/batching_with_compact_placement/create_cluster/module/outputs.tf 
b/tools/kubernetes/terraform/training/batching_with_compact_placement/create_cluster/module/outputs.tf new file mode 100644 index 00000000..3953819c --- /dev/null +++ b/tools/kubernetes/terraform/training/batching_with_compact_placement/create_cluster/module/outputs.tf @@ -0,0 +1,24 @@ +output "region" { + value = var.region + description = "GCloud Region" +} + +output "project_id" { + value = var.project_id + description = "GCloud Project ID" +} + +output "kubernetes_cluster_name" { + value = google_container_cluster.tpu_cluster.name + description = "GKE Cluster Name" +} + +output "authorized_cidr_blocks" { + value = var.authorized_cidr_blocks + description = "Cluster allowed cidr blocks " +} + +output "is_cpu_node_private" { + value = var.is_cpu_node_private + description = "whether we want to make CPU node private" +} diff --git a/tools/kubernetes/terraform/training/batching_with_compact_placement/create_cluster/module/terraform.tfvars b/tools/kubernetes/terraform/training/batching_with_compact_placement/create_cluster/module/terraform.tfvars new file mode 100644 index 00000000..f3f4e7be --- /dev/null +++ b/tools/kubernetes/terraform/training/batching_with_compact_placement/create_cluster/module/terraform.tfvars @@ -0,0 +1,11 @@ +project_id = "project-id" +resource_name_prefix = "tpu-test" +region = "us-central2" +authorized_cidr_blocks = [] +cpu_node_pool = { + zone = ["us-central2-a", "us-central2-b", "us-central2-c"] + machine_type = "n2-standard-64", + initial_node_count_per_zone = 1, + min_node_count_per_zone = 1, + max_node_count_per_zone = 10 +} diff --git a/tools/kubernetes/terraform/training/batching_with_compact_placement/create_cluster/module/variables.tf b/tools/kubernetes/terraform/training/batching_with_compact_placement/create_cluster/module/variables.tf new file mode 100644 index 00000000..df05e43d --- /dev/null +++ b/tools/kubernetes/terraform/training/batching_with_compact_placement/create_cluster/module/variables.tf @@ -0,0 +1,56 @@ +/** + * 
Copyright 2023 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +variable "project_id" { + description = "project id" +} + +variable "region" { + description = "region" +} + +variable "resource_name_prefix" { + default = "" + description = "prefix for all the resource naming" +} + +variable "authorized_cidr_blocks" { + description = "cluster allowed cidr blocks to access with kubectl CLI" + type = list(string) + default = [] +} + +variable "cpu_node_pool" { + description = "cpu nodepool config" + type = object({ + zone = list(string), + machine_type = string, + initial_node_count_per_zone = number, + min_node_count_per_zone = number, + max_node_count_per_zone = number + }) + validation { + condition = ( + (var.cpu_node_pool.min_node_count_per_zone >= 0 && var.cpu_node_pool.min_node_count_per_zone <= var.cpu_node_pool.max_node_count_per_zone) + ) + error_message = "cpu_node_pool.min_node_count_per_zone must be >= 0 and <= cpu_node_pool.max_node_count_per_zone."
+ } +} + +variable "is_cpu_node_private" { + description = "whether we want to make CPU node private" + default = false +} diff --git a/tools/kubernetes/terraform/training/batching_without_compact_placement/add_node_pool/examples/main.tf b/tools/kubernetes/terraform/training/batching_without_compact_placement/add_node_pool/examples/main.tf new file mode 100644 index 00000000..61ac2331 --- /dev/null +++ b/tools/kubernetes/terraform/training/batching_without_compact_placement/add_node_pool/examples/main.tf @@ -0,0 +1,19 @@ +variable "project_id" {} +variable "resource_name_prefix" {} +variable "node_pool_prefix" {} +variable "region" {} +variable "tpu_node_pools" {} +variable "maintenance_interval" {} +variable "is_tpu_node_private" {} + + +module "tpu-gke" { + source = "../module" + project_id = var.project_id + resource_name_prefix = var.resource_name_prefix + node_pool_prefix = var.node_pool_prefix + region = var.region + tpu_node_pools = var.tpu_node_pools + maintenance_interval = var.maintenance_interval + is_tpu_node_private = var.is_tpu_node_private +} diff --git a/tools/kubernetes/terraform/training/batching_without_compact_placement/add_node_pool/examples/outputs.tf b/tools/kubernetes/terraform/training/batching_without_compact_placement/add_node_pool/examples/outputs.tf new file mode 100644 index 00000000..846c656e --- /dev/null +++ b/tools/kubernetes/terraform/training/batching_without_compact_placement/add_node_pool/examples/outputs.tf @@ -0,0 +1,19 @@ +output "region" { + value = var.region + description = "GCloud Region" +} + +output "project_id" { + value = var.project_id + description = "GCloud Project ID" +} + +output "kubernetes_cluster_name" { + value = module.tpu-gke.kubernetes_cluster_name + description = "GKE Cluster Name" +} + +output "is_tpu_node_private" { + value = var.is_tpu_node_private + description = "whether we want to make TPU node private" +} diff --git
a/tools/kubernetes/terraform/training/batching_without_compact_placement/add_node_pool/examples/terraform.tfvars b/tools/kubernetes/terraform/training/batching_without_compact_placement/add_node_pool/examples/terraform.tfvars new file mode 100644 index 00000000..ed10a48c --- /dev/null +++ b/tools/kubernetes/terraform/training/batching_without_compact_placement/add_node_pool/examples/terraform.tfvars @@ -0,0 +1,21 @@ +project_id = "project-id" +resource_name_prefix = "tpu-v5e-test" +node_pool_prefix = "batch1" +region = "us-east5" +is_tpu_node_private = false +tpu_node_pools = [{ + zone = "us-east5-b" + node_count = 64 + machine_type = "ct5lp-hightpu-4t" + topology = "16x16" + disk_type = "pd-balanced" + disk_size_gb = 120 + },{ + zone = "us-east5-b" + node_count = 64 + machine_type = "ct5lp-hightpu-4t" + topology = "16x16" + disk_type = "pd-balanced" + disk_size_gb = 120 +}] +maintenance_interval = "PERIODIC" diff --git a/tools/kubernetes/terraform/training/batching_without_compact_placement/add_node_pool/module/main.tf b/tools/kubernetes/terraform/training/batching_without_compact_placement/add_node_pool/module/main.tf new file mode 100644 index 00000000..7b4789fb --- /dev/null +++ b/tools/kubernetes/terraform/training/batching_without_compact_placement/add_node_pool/module/main.tf @@ -0,0 +1,81 @@ +/** + * Copyright 2023 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +# GKE cluster +data "google_container_engine_versions" "gke_version" { + location = var.region + version_prefix = "1.27." +} + +provider "google" { + project = var.project_id + region = var.region +} + +# Separately Managed Node Pool +resource "google_container_node_pool" "multihost_tpu" { + count = length(var.tpu_node_pools) + name = "${var.resource_name_prefix}-gke-${var.node_pool_prefix}-${count.index}" + provider = google-beta + project = var.project_id + location = var.region + node_locations = [var.tpu_node_pools[count.index].zone] + cluster = "${var.resource_name_prefix}-gke-cluster" + + initial_node_count = var.tpu_node_pools[count.index].node_count + + management { + // auto_upgrade must be true when release_channel = RAPID for cluster. + auto_upgrade = true + } + + node_config { + oauth_scopes = [ + "https://www.googleapis.com/auth/logging.write", + "https://www.googleapis.com/auth/monitoring", + "https://www.googleapis.com/auth/cloud-platform", + ] + host_maintenance_policy { + maintenance_interval = var.maintenance_interval + } + labels = { + env = var.project_id + } + gvnic { + enabled = true + } + gcfs_config { + enabled = true + } + + image_type = "COS_CONTAINERD" + machine_type = var.tpu_node_pools[count.index].machine_type + disk_type = var.tpu_node_pools[count.index].disk_type + disk_size_gb = var.tpu_node_pools[count.index].disk_size_gb + tags = ["gke-node"] + metadata = { + disable-legacy-endpoints = "true" + } + } + placement_policy { + type = "COMPACT" + tpu_topology = var.tpu_node_pools[count.index].topology + } + + network_config { + enable_private_nodes = var.is_tpu_node_private + } +} diff --git a/tools/kubernetes/terraform/training/batching_without_compact_placement/add_node_pool/module/outputs.tf b/tools/kubernetes/terraform/training/batching_without_compact_placement/add_node_pool/module/outputs.tf new file mode 100644 index 00000000..06972205 --- /dev/null +++ 
b/tools/kubernetes/terraform/training/batching_without_compact_placement/add_node_pool/module/outputs.tf @@ -0,0 +1,19 @@ +output "region" { + value = var.region + description = "GCloud Region" +} + +output "project_id" { + value = var.project_id + description = "GCloud Project ID" +} + +output "kubernetes_cluster_name" { + value = google_container_node_pool.multihost_tpu[0].cluster + description = "GKE Cluster Name" +} + +output "is_tpu_node_private" { + value = var.is_tpu_node_private + description = "whether we want to make TPU node private" +} diff --git a/tools/kubernetes/terraform/training/batching_without_compact_placement/add_node_pool/module/terraform.tfvars b/tools/kubernetes/terraform/training/batching_without_compact_placement/add_node_pool/module/terraform.tfvars new file mode 100644 index 00000000..48dbf54f --- /dev/null +++ b/tools/kubernetes/terraform/training/batching_without_compact_placement/add_node_pool/module/terraform.tfvars @@ -0,0 +1,11 @@ +project_id = "project-id" +resource_name_prefix = "tpu-v5e-test" +node_pool_prefix = "batch1" +region = "us-east5" +tpu_node_pools = [{ + zone = "us-east5-b" + node_count = 64 + machine_type = "ct5lp-hightpu-4t" + topology = "16x16" +}] +maintenance_interval = "PERIODIC" diff --git a/tools/kubernetes/terraform/training/batching_without_compact_placement/add_node_pool/module/variables.tf b/tools/kubernetes/terraform/training/batching_without_compact_placement/add_node_pool/module/variables.tf new file mode 100644 index 00000000..c467e69a --- /dev/null +++ b/tools/kubernetes/terraform/training/batching_without_compact_placement/add_node_pool/module/variables.tf @@ -0,0 +1,55 @@ +/** + * Copyright 2023 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +variable "project_id" { + description = "project id" +} + +variable "region" { + description = "region" +} + +variable "resource_name_prefix" { + default = "" + description = "prefix for all the resource naming" +} + +variable "node_pool_prefix" { + default = "" + description = "prefix for the node pool naming" +} + +variable "tpu_node_pools" { + description = "tpu podslice config" + type = list(object({ + zone = string, + node_count = number, + machine_type = string, + topology = string, + disk_type = optional(string), + disk_size_gb = optional(number), + })) +} + +variable "is_tpu_node_private" { + description = "whether we want to make TPU node private" + default = false +} + +variable "maintenance_interval" { + default = "AS_NEEDED" + description = "maintenance interval for TPU machines."
+} diff --git a/tools/kubernetes/terraform/training/batching_without_compact_placement/create_cluster/examples/v5e/main.tf b/tools/kubernetes/terraform/training/batching_without_compact_placement/create_cluster/examples/v5e/main.tf new file mode 100644 index 00000000..7cce20f2 --- /dev/null +++ b/tools/kubernetes/terraform/training/batching_without_compact_placement/create_cluster/examples/v5e/main.tf @@ -0,0 +1,17 @@ +variable "project_id" {} +variable "resource_name_prefix" {} +variable "region" {} +variable "cpu_node_pool" {} +variable "authorized_cidr_blocks" {} +variable "is_cpu_node_private" {} + + +module "tpu-gke" { + source = "../../module" + project_id = var.project_id + resource_name_prefix = var.resource_name_prefix + region = var.region + cpu_node_pool = var.cpu_node_pool + is_cpu_node_private = var.is_cpu_node_private + authorized_cidr_blocks = var.authorized_cidr_blocks +} diff --git a/tools/kubernetes/terraform/training/batching_without_compact_placement/create_cluster/examples/v5e/outputs.tf b/tools/kubernetes/terraform/training/batching_without_compact_placement/create_cluster/examples/v5e/outputs.tf new file mode 100644 index 00000000..a5514b1f --- /dev/null +++ b/tools/kubernetes/terraform/training/batching_without_compact_placement/create_cluster/examples/v5e/outputs.tf @@ -0,0 +1,24 @@ +output "region" { + value = var.region + description = "GCloud Region" +} + +output "project_id" { + value = var.project_id + description = "GCloud Project ID" +} + +output "kubernetes_cluster_name" { + value = module.tpu-gke.kubernetes_cluster_name + description = "GKE Cluster Name" +} + +output "authorized_cidr_blocks" { + value = var.authorized_cidr_blocks + description = "Cluster allowed cidr blocks " +} + +output "is_cpu_node_private" { + value = var.is_cpu_node_private + description = "whether we want to make CPU node private" +} diff --git 
a/tools/kubernetes/terraform/training/batching_without_compact_placement/create_cluster/examples/v5e/terraform.tfvars b/tools/kubernetes/terraform/training/batching_without_compact_placement/create_cluster/examples/v5e/terraform.tfvars new file mode 100644 index 00000000..86a001a2 --- /dev/null +++ b/tools/kubernetes/terraform/training/batching_without_compact_placement/create_cluster/examples/v5e/terraform.tfvars @@ -0,0 +1,13 @@ +project_id = "project-id" +resource_name_prefix = "tpu-v5e-test" +region = "us-east5" +authorized_cidr_blocks = [] +is_cpu_node_private = false +cpu_node_pool = { + location_policy = "BALANCED" + zone = ["us-east5-b"] + machine_type = "e2-standard-32", + initial_node_count_per_zone = 5, + min_node_count_per_zone = 5, + max_node_count_per_zone = 1000, +} diff --git a/tools/kubernetes/terraform/training/batching_without_compact_placement/create_cluster/module/main.tf b/tools/kubernetes/terraform/training/batching_without_compact_placement/create_cluster/module/main.tf new file mode 100644 index 00000000..f5b9cf30 --- /dev/null +++ b/tools/kubernetes/terraform/training/batching_without_compact_placement/create_cluster/module/main.tf @@ -0,0 +1,126 @@ +/** + * Copyright 2023 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +# GKE cluster +data "google_container_engine_versions" "gke_version" { + location = var.region + version_prefix = "1.27." 
+} + +provider "google" { + project = var.project_id + region = var.region +} + +# VPC +resource "google_compute_network" "vpc" { + name = "${var.resource_name_prefix}-vpc" + auto_create_subnetworks = "false" +} + +# Subnet +resource "google_compute_subnetwork" "subnet" { + name = "${var.resource_name_prefix}-subnet" + region = var.region + network = google_compute_network.vpc.name + ip_cidr_range = "10.10.0.0/18" +} + +resource "google_container_cluster" "tpu_cluster" { + name = "${var.resource_name_prefix}-gke-cluster" + location = var.region + + # We can't create a cluster with no node pool defined, but we want to only use + # separately managed node pools. So we create the smallest possible default + # node pool and immediately delete it. + remove_default_node_pool = true + initial_node_count = 1 + networking_mode = "VPC_NATIVE" # Enables IP aliasing. + + ip_allocation_policy { + cluster_ipv4_cidr_block = "/14" + services_ipv4_cidr_block = "/20" + } + default_max_pods_per_node = 15 + + release_channel { + channel = "RAPID" + } + + network = google_compute_network.vpc.name + subnetwork = google_compute_subnetwork.subnet.name + logging_service = "logging.googleapis.com/kubernetes" + monitoring_service = "monitoring.googleapis.com/kubernetes" + + master_authorized_networks_config { + gcp_public_cidrs_access_enabled = false + + dynamic "cidr_blocks" { + for_each = var.authorized_cidr_blocks + content { + cidr_block = cidr_blocks.value + display_name = "cidr-blocks-group-${cidr_blocks.key}" + } + } + } + + // Needs to be false when creating a PSC-based GKE cluster. + // After that, set as true to disable public endpoint of cluster master. 
+ private_cluster_config { + enable_private_endpoint = false + } + + timeouts { + create = "120m" + update = "120m" + } +} + +resource "google_container_node_pool" "cpu_node_pool" { + provider = google-beta + project = var.project_id + name = "cpu-node-pool" + location = var.region + node_locations = var.cpu_node_pool.zone + cluster = google_container_cluster.tpu_cluster.name + initial_node_count = var.cpu_node_pool.initial_node_count_per_zone + autoscaling { + location_policy = "BALANCED" + min_node_count = var.cpu_node_pool.min_node_count_per_zone + max_node_count = var.cpu_node_pool.max_node_count_per_zone + } + max_pods_per_node = 63 + node_config { + oauth_scopes = [ + "https://www.googleapis.com/auth/cloud-platform" + ] + machine_type = var.cpu_node_pool.machine_type + + metadata = { + disable-legacy-endpoints = "true" + } + gvnic { + enabled = true + } + gcfs_config { + enabled = true + } + } + + network_config { + enable_private_nodes = var.is_cpu_node_private + } +} diff --git a/tools/kubernetes/terraform/training/batching_without_compact_placement/create_cluster/module/outputs.tf b/tools/kubernetes/terraform/training/batching_without_compact_placement/create_cluster/module/outputs.tf new file mode 100644 index 00000000..3953819c --- /dev/null +++ b/tools/kubernetes/terraform/training/batching_without_compact_placement/create_cluster/module/outputs.tf @@ -0,0 +1,24 @@ +output "region" { + value = var.region + description = "GCloud Region" +} + +output "project_id" { + value = var.project_id + description = "GCloud Project ID" +} + +output "kubernetes_cluster_name" { + value = google_container_cluster.tpu_cluster.name + description = "GKE Cluster Name" +} + +output "authorized_cidr_blocks" { + value = var.authorized_cidr_blocks + description = "Cluster allowed cidr blocks " +} + +output "is_cpu_node_private" { + value = var.is_cpu_node_private + description = "whether we want to make CPU node private" +} diff --git 
a/tools/kubernetes/terraform/training/batching_without_compact_placement/create_cluster/module/terraform.tfvars b/tools/kubernetes/terraform/training/batching_without_compact_placement/create_cluster/module/terraform.tfvars new file mode 100644 index 00000000..f3f4e7be --- /dev/null +++ b/tools/kubernetes/terraform/training/batching_without_compact_placement/create_cluster/module/terraform.tfvars @@ -0,0 +1,11 @@ +project_id = "project-id" +resource_name_prefix = "tpu-test" +region = "us-central2" +authorized_cidr_blocks = [] +cpu_node_pool = { + zone = ["us-central2-a", "us-central2-b", "us-central2-c"] + machine_type = "n2-standard-64", + initial_node_count_per_zone = 1, + min_node_count_per_zone = 1, + max_node_count_per_zone = 10 +} diff --git a/tools/kubernetes/terraform/training/batching_without_compact_placement/create_cluster/module/variables.tf b/tools/kubernetes/terraform/training/batching_without_compact_placement/create_cluster/module/variables.tf new file mode 100644 index 00000000..df05e43d --- /dev/null +++ b/tools/kubernetes/terraform/training/batching_without_compact_placement/create_cluster/module/variables.tf @@ -0,0 +1,56 @@ +/** + * Copyright 2023 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +variable "project_id" { + description = "project id" +} + +variable "region" { + description = "region" +} + +variable "resource_name_prefix" { + default = "" + description = "prefix for all the resouce naming" +} + +variable "authorized_cidr_blocks" { + description = "cluster allowed cidr blocks to access with kubectl CLI" + type = list(string) + default = [] +} + +variable "cpu_node_pool" { + description = "cpu nodepool config" + type = object({ + zone = list(string), + machine_type = string, + initial_node_count_per_zone = number, + min_node_count_per_zone = number, + max_node_count_per_zone = number + }) + validation { + condition = ( + (var.cpu_node_pool.min_node_count_per_zone >= 0 && var.cpu_node_pool.min_node_count_per_zone <= var.cpu_node_pool.max_node_count_per_zone) + ) + error_message = "cpu_node_pool.min_node_count_per_zone must be >= 0 and <= cpu_node_pool.max_node_count_per_zone." + } +} + +variable "is_cpu_node_private" { + description = "whether we want to make CPU node private" + default = false +} From b880ee6af92bad633406e5a38f6d8b95553cffdd Mon Sep 17 00:00:00 2001 From: Kangmin Xie Date: Fri, 6 Oct 2023 16:20:13 +0000 Subject: [PATCH 6/8] Rename module to clean the commit tree --- .../examples/v5/main.tf | 25 ++++ .../examples/v5/outputs.tf | 39 ++++++ .../examples/v5/terraform.tfvars | 120 ++++++++++++++++++ .../examples/v5e | 1 - 4 files changed, 184 insertions(+), 1 deletion(-) create mode 100644 tools/kubernetes/terraform/inference/non_batching_with_compact_placement/examples/v5/main.tf create mode 100644 tools/kubernetes/terraform/inference/non_batching_with_compact_placement/examples/v5/outputs.tf create mode 100644 tools/kubernetes/terraform/inference/non_batching_with_compact_placement/examples/v5/terraform.tfvars delete mode 160000 tools/kubernetes/terraform/inference/non_batching_with_compact_placement/examples/v5e diff --git a/tools/kubernetes/terraform/inference/non_batching_with_compact_placement/examples/v5/main.tf 
b/tools/kubernetes/terraform/inference/non_batching_with_compact_placement/examples/v5/main.tf new file mode 100644 index 00000000..d6970a7d --- /dev/null +++ b/tools/kubernetes/terraform/inference/non_batching_with_compact_placement/examples/v5/main.tf @@ -0,0 +1,25 @@ +variable "project_id" {} +variable "resource_name_prefix" {} +variable "node_pool_prefix" {} +variable "region" {} +variable "cpu_node_pool" {} +variable "tpu_node_pools" {} +variable "maintenance_interval" {} +variable "authorized_cidr_blocks" {} +variable "is_cpu_node_private" {} +variable "is_tpu_node_private" {} + + +module "tpu-gke" { + source = "../../module" + project_id = var.project_id + resource_name_prefix = var.resource_name_prefix + node_pool_prefix = var.node_pool_prefix + region = var.region + cpu_node_pool = var.cpu_node_pool + tpu_node_pools = var.tpu_node_pools + maintenance_interval = var.maintenance_interval + authorized_cidr_blocks = var.authorized_cidr_blocks + is_cpu_node_private = var.is_cpu_node_private + is_tpu_node_private = var.is_tpu_node_private +} diff --git a/tools/kubernetes/terraform/inference/non_batching_with_compact_placement/examples/v5/outputs.tf b/tools/kubernetes/terraform/inference/non_batching_with_compact_placement/examples/v5/outputs.tf new file mode 100644 index 00000000..849d16c8 --- /dev/null +++ b/tools/kubernetes/terraform/inference/non_batching_with_compact_placement/examples/v5/outputs.tf @@ -0,0 +1,39 @@ +output "region" { + value = var.region + description = "GCloud Region" +} + +output "project_id" { + value = var.project_id + description = "GCloud Project ID" +} + +output "kubernetes_cluster_name" { + value = module.tpu-gke.kubernetes_cluster_name + description = "GKE Cluster Name" +} + +output "kubernetes_cluster_host" { + value = module.tpu-gke.kubernetes_cluster_host + description = "GKE Cluster Host" +} + +output "placement_policy_names" { + value = module.tpu-gke.placement_policy_names + description = "GKE TPU Placement Policy Names" +} + 
+output "authorized_cidr_blocks" { + value = var.authorized_cidr_blocks + description = "Cluster allowed cidr blocks " +} + +output "is_cpu_node_private" { + value = var.is_cpu_node_private + description = "whether we want to make CPU node private" +} + +output "is_tpu_node_private" { + value = var.is_tpu_node_private + description = "whether we want to make TPU node private" +} diff --git a/tools/kubernetes/terraform/inference/non_batching_with_compact_placement/examples/v5/terraform.tfvars b/tools/kubernetes/terraform/inference/non_batching_with_compact_placement/examples/v5/terraform.tfvars new file mode 100644 index 00000000..45a65aaf --- /dev/null +++ b/tools/kubernetes/terraform/inference/non_batching_with_compact_placement/examples/v5/terraform.tfvars @@ -0,0 +1,120 @@ +project_id = "project-id" +resource_name_prefix = "tpu-v5e-test" +region = "us-east5" +node_pool_prefix = "rp1" +authorized_cidr_blocks = [] +is_cpu_node_private = false +is_tpu_node_private = false +cpu_node_pool = { + zone = ["us-east5-a", "us-east5-b", "us-east5-c"] + machine_type = "n2-standard-8", + initial_node_count_per_zone = 1, + min_node_count_per_zone = 1, + max_node_count_per_zone = 30, +} +tpu_node_pools = [{ + zone = "us-east5-b" + node_count = 64 + machine_type = "ct5lp-hightpu-4t" + topology = "16x16" + policy = "sb-compact-rp1" + disk_type = "pd-balanced" + disk_size_gb = 120 + }, { + zone = "us-east5-b" + node_count = 64 + machine_type = "ct5lp-hightpu-4t" + topology = "16x16" + policy = "sb-compact-rp1" + disk_type = "pd-balanced" + disk_size_gb = 120 + }, { + zone = "us-east5-b" + node_count = 64 + machine_type = "ct5lp-hightpu-4t" + topology = "16x16" + policy = "sb-compact-rp1" + disk_type = "pd-balanced" + disk_size_gb = 120 + }, { + zone = "us-east5-b" + node_count = 64 + machine_type = "ct5lp-hightpu-4t" + topology = "16x16" + policy = "sb-compact-rp1" + disk_type = "pd-balanced" + disk_size_gb = 120 + }, { + zone = "us-east5-b" + node_count = 64 + machine_type = 
"ct5lp-hightpu-4t" + topology = "16x16" + policy = "sb-compact-rp1" + disk_type = "pd-balanced" + disk_size_gb = 120 + }, { + zone = "us-east5-b" + node_count = 64 + machine_type = "ct5lp-hightpu-4t" + topology = "16x16" + policy = "sb-compact-rp1" + disk_type = "pd-balanced" + disk_size_gb = 120 + }, { + zone = "us-east5-b" + node_count = 64 + machine_type = "ct5lp-hightpu-4t" + topology = "16x16" + policy = "sb-compact-rp1" + disk_type = "pd-balanced" + disk_size_gb = 120 + }, { + zone = "us-east5-b" + node_count = 64 + machine_type = "ct5lp-hightpu-4t" + topology = "16x16" + policy = "sb-compact-rp1" + disk_type = "pd-balanced" + disk_size_gb = 120 + }, { + zone = "us-east5-b" + node_count = 64 + machine_type = "ct5lp-hightpu-4t" + topology = "16x16" + policy = "sb-compact-rp1" + disk_type = "pd-balanced" + disk_size_gb = 120 + }, { + zone = "us-east5-b" + node_count = 64 + machine_type = "ct5lp-hightpu-4t" + topology = "16x16" + policy = "sb-compact-rp1" + disk_type = "pd-balanced" + disk_size_gb = 120 + }, { + zone = "us-east5-b" + node_count = 64 + machine_type = "ct5lp-hightpu-4t" + topology = "16x16" + policy = "sb-compact-rp1" + disk_type = "pd-balanced" + disk_size_gb = 120 + }, { + zone = "us-east5-b" + node_count = 64 + machine_type = "ct5lp-hightpu-4t" + topology = "16x16" + policy = "sb-compact-rp1" + disk_type = "pd-balanced" + disk_size_gb = 120 + }, { + zone = "us-east5-b" + node_count = 64 + machine_type = "ct5lp-hightpu-4t" + topology = "16x16" + policy = "sb-compact-rp1" + disk_type = "pd-balanced" + disk_size_gb = 120 +}] +maintenance_interval = "PERIODIC" diff --git a/tools/kubernetes/terraform/inference/non_batching_with_compact_placement/examples/v5e b/tools/kubernetes/terraform/inference/non_batching_with_compact_placement/examples/v5e deleted file mode 160000 index c79d0301..00000000 --- a/tools/kubernetes/terraform/inference/non_batching_with_compact_placement/examples/v5e +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 
c79d0301eda699b15e19158033b8b12fe01b0d69 From 1429b5e5798e4975fa6372127bb6475588d73fd2 Mon Sep 17 00:00:00 2001 From: Kangmin Xie Date: Fri, 6 Oct 2023 16:22:53 +0000 Subject: [PATCH 7/8] Rename module back. --- .../examples/{v5 => v5e}/main.tf | 0 .../examples/{v5 => v5e}/outputs.tf | 0 .../examples/{v5 => v5e}/terraform.tfvars | 0 3 files changed, 0 insertions(+), 0 deletions(-) rename tools/kubernetes/terraform/inference/non_batching_with_compact_placement/examples/{v5 => v5e}/main.tf (100%) rename tools/kubernetes/terraform/inference/non_batching_with_compact_placement/examples/{v5 => v5e}/outputs.tf (100%) rename tools/kubernetes/terraform/inference/non_batching_with_compact_placement/examples/{v5 => v5e}/terraform.tfvars (100%) diff --git a/tools/kubernetes/terraform/inference/non_batching_with_compact_placement/examples/v5/main.tf b/tools/kubernetes/terraform/inference/non_batching_with_compact_placement/examples/v5e/main.tf similarity index 100% rename from tools/kubernetes/terraform/inference/non_batching_with_compact_placement/examples/v5/main.tf rename to tools/kubernetes/terraform/inference/non_batching_with_compact_placement/examples/v5e/main.tf diff --git a/tools/kubernetes/terraform/inference/non_batching_with_compact_placement/examples/v5/outputs.tf b/tools/kubernetes/terraform/inference/non_batching_with_compact_placement/examples/v5e/outputs.tf similarity index 100% rename from tools/kubernetes/terraform/inference/non_batching_with_compact_placement/examples/v5/outputs.tf rename to tools/kubernetes/terraform/inference/non_batching_with_compact_placement/examples/v5e/outputs.tf diff --git a/tools/kubernetes/terraform/inference/non_batching_with_compact_placement/examples/v5/terraform.tfvars b/tools/kubernetes/terraform/inference/non_batching_with_compact_placement/examples/v5e/terraform.tfvars similarity index 100% rename from tools/kubernetes/terraform/inference/non_batching_with_compact_placement/examples/v5/terraform.tfvars rename to 
tools/kubernetes/terraform/inference/non_batching_with_compact_placement/examples/v5e/terraform.tfvars From 1327834d57caabea6870bbc50cf45d144df08c55 Mon Sep 17 00:00:00 2001 From: Kangmin Xie Date: Fri, 6 Oct 2023 17:21:37 +0000 Subject: [PATCH 8/8] Update the repo structure. --- .../add_node_pool/examples/v5e/main.tf | 0 .../add_node_pool/examples/v5e/outputs.tf | 0 .../examples/v5e/terraform.tfvars | 0 .../add_node_pool/module/main.tf | 0 .../add_node_pool/module/outputs.tf | 0 .../add_node_pool/module/terraform.tfvars | 0 .../add_node_pool/module/variables.tf | 0 .../create_cluster/examples/v5e/main.tf | 0 .../create_cluster/examples/v5e/outputs.tf | 0 .../examples/v5e/terraform.tfvars | 0 .../create_cluster/module/main.tf | 0 .../create_cluster/module/outputs.tf | 0 .../create_cluster/module/terraform.tfvars | 0 .../create_cluster/module/variables.tf | 0 .../add_node_pool/examples/v5e/main.tf | 0 .../add_node_pool/examples/v5e/outputs.tf | 0 .../examples/v5e/terraform.tfvars | 0 .../add_node_pool/module/main.tf | 0 .../add_node_pool/module/outputs.tf | 0 .../add_node_pool/module/terraform.tfvars | 0 .../add_node_pool/module/variables.tf | 0 .../create_cluster/examples/v5e/main.tf | 0 .../create_cluster/examples/v5e/outputs.tf | 0 .../examples/v5e/terraform.tfvars | 0 .../create_cluster/module/main.tf | 0 .../create_cluster/module/outputs.tf | 0 .../create_cluster/module/terraform.tfvars | 0 .../create_cluster/module/variables.tf | 0 .../examples/v5e/main.tf | 0 .../examples/v5e/outputs.tf | 0 .../examples/v5e/terraform.tfvars | 0 .../module/main.tf | 0 .../module/outputs.tf | 0 .../module/terraform.tfvars | 0 .../module/variables.tf | 0 .../add_node_pool/examples/main.tf | 19 --- .../add_node_pool/examples/outputs.tf | 24 ---- .../add_node_pool/examples/terraform.tfvars | 39 ------ .../add_node_pool/module/main.tf | 81 ----------- .../add_node_pool/module/outputs.tf | 26 ---- .../add_node_pool/module/terraform.tfvars | 12 -- 
.../add_node_pool/module/variables.tf | 56 -------- .../create_cluster/examples/v5e/main.tf | 17 --- .../create_cluster/examples/v5e/outputs.tf | 24 ---- .../examples/v5e/terraform.tfvars | 13 -- .../create_cluster/module/main.tf | 126 ------------------ .../create_cluster/module/outputs.tf | 24 ---- .../create_cluster/module/terraform.tfvars | 11 -- .../create_cluster/module/variables.tf | 56 -------- .../add_node_pool/examples/main.tf | 19 --- .../add_node_pool/examples/outputs.tf | 19 --- .../add_node_pool/examples/terraform.tfvars | 21 --- .../add_node_pool/module/main.tf | 81 ----------- .../add_node_pool/module/outputs.tf | 19 --- .../add_node_pool/module/terraform.tfvars | 11 -- .../add_node_pool/module/variables.tf | 55 -------- .../create_cluster/examples/v5e/main.tf | 17 --- .../create_cluster/examples/v5e/outputs.tf | 24 ---- .../examples/v5e/terraform.tfvars | 13 -- .../create_cluster/module/main.tf | 126 ------------------ .../create_cluster/module/outputs.tf | 24 ---- .../create_cluster/module/terraform.tfvars | 11 -- .../create_cluster/module/variables.tf | 56 -------- 63 files changed, 1024 deletions(-) rename tools/kubernetes/terraform/{inference => }/batching_with_compact_placement/add_node_pool/examples/v5e/main.tf (100%) rename tools/kubernetes/terraform/{inference => }/batching_with_compact_placement/add_node_pool/examples/v5e/outputs.tf (100%) rename tools/kubernetes/terraform/{inference => }/batching_with_compact_placement/add_node_pool/examples/v5e/terraform.tfvars (100%) rename tools/kubernetes/terraform/{inference => }/batching_with_compact_placement/add_node_pool/module/main.tf (100%) rename tools/kubernetes/terraform/{inference => }/batching_with_compact_placement/add_node_pool/module/outputs.tf (100%) rename tools/kubernetes/terraform/{inference => }/batching_with_compact_placement/add_node_pool/module/terraform.tfvars (100%) rename tools/kubernetes/terraform/{inference => 
}/batching_with_compact_placement/add_node_pool/module/variables.tf (100%) rename tools/kubernetes/terraform/{inference => }/batching_with_compact_placement/create_cluster/examples/v5e/main.tf (100%) rename tools/kubernetes/terraform/{inference => }/batching_with_compact_placement/create_cluster/examples/v5e/outputs.tf (100%) rename tools/kubernetes/terraform/{inference => }/batching_with_compact_placement/create_cluster/examples/v5e/terraform.tfvars (100%) rename tools/kubernetes/terraform/{inference => }/batching_with_compact_placement/create_cluster/module/main.tf (100%) rename tools/kubernetes/terraform/{inference => }/batching_with_compact_placement/create_cluster/module/outputs.tf (100%) rename tools/kubernetes/terraform/{inference => }/batching_with_compact_placement/create_cluster/module/terraform.tfvars (100%) rename tools/kubernetes/terraform/{inference => }/batching_with_compact_placement/create_cluster/module/variables.tf (100%) rename tools/kubernetes/terraform/{inference => }/batching_without_compact_placement/add_node_pool/examples/v5e/main.tf (100%) rename tools/kubernetes/terraform/{inference => }/batching_without_compact_placement/add_node_pool/examples/v5e/outputs.tf (100%) rename tools/kubernetes/terraform/{inference => }/batching_without_compact_placement/add_node_pool/examples/v5e/terraform.tfvars (100%) rename tools/kubernetes/terraform/{inference => }/batching_without_compact_placement/add_node_pool/module/main.tf (100%) rename tools/kubernetes/terraform/{inference => }/batching_without_compact_placement/add_node_pool/module/outputs.tf (100%) rename tools/kubernetes/terraform/{inference => }/batching_without_compact_placement/add_node_pool/module/terraform.tfvars (100%) rename tools/kubernetes/terraform/{inference => }/batching_without_compact_placement/add_node_pool/module/variables.tf (100%) rename tools/kubernetes/terraform/{inference => }/batching_without_compact_placement/create_cluster/examples/v5e/main.tf (100%) rename 
tools/kubernetes/terraform/{inference => }/batching_without_compact_placement/create_cluster/examples/v5e/outputs.tf (100%) rename tools/kubernetes/terraform/{inference => }/batching_without_compact_placement/create_cluster/examples/v5e/terraform.tfvars (100%) rename tools/kubernetes/terraform/{inference => }/batching_without_compact_placement/create_cluster/module/main.tf (100%) rename tools/kubernetes/terraform/{inference => }/batching_without_compact_placement/create_cluster/module/outputs.tf (100%) rename tools/kubernetes/terraform/{inference => }/batching_without_compact_placement/create_cluster/module/terraform.tfvars (100%) rename tools/kubernetes/terraform/{inference => }/batching_without_compact_placement/create_cluster/module/variables.tf (100%) rename tools/kubernetes/terraform/{inference => }/non_batching_with_compact_placement/examples/v5e/main.tf (100%) rename tools/kubernetes/terraform/{inference => }/non_batching_with_compact_placement/examples/v5e/outputs.tf (100%) rename tools/kubernetes/terraform/{inference => }/non_batching_with_compact_placement/examples/v5e/terraform.tfvars (100%) rename tools/kubernetes/terraform/{inference => }/non_batching_with_compact_placement/module/main.tf (100%) rename tools/kubernetes/terraform/{inference => }/non_batching_with_compact_placement/module/outputs.tf (100%) rename tools/kubernetes/terraform/{inference => }/non_batching_with_compact_placement/module/terraform.tfvars (100%) rename tools/kubernetes/terraform/{inference => }/non_batching_with_compact_placement/module/variables.tf (100%) delete mode 100644 tools/kubernetes/terraform/training/batching_with_compact_placement/add_node_pool/examples/main.tf delete mode 100644 tools/kubernetes/terraform/training/batching_with_compact_placement/add_node_pool/examples/outputs.tf delete mode 100644 tools/kubernetes/terraform/training/batching_with_compact_placement/add_node_pool/examples/terraform.tfvars delete mode 100644 
tools/kubernetes/terraform/training/batching_with_compact_placement/add_node_pool/module/main.tf delete mode 100644 tools/kubernetes/terraform/training/batching_with_compact_placement/add_node_pool/module/outputs.tf delete mode 100644 tools/kubernetes/terraform/training/batching_with_compact_placement/add_node_pool/module/terraform.tfvars delete mode 100644 tools/kubernetes/terraform/training/batching_with_compact_placement/add_node_pool/module/variables.tf delete mode 100644 tools/kubernetes/terraform/training/batching_with_compact_placement/create_cluster/examples/v5e/main.tf delete mode 100644 tools/kubernetes/terraform/training/batching_with_compact_placement/create_cluster/examples/v5e/outputs.tf delete mode 100644 tools/kubernetes/terraform/training/batching_with_compact_placement/create_cluster/examples/v5e/terraform.tfvars delete mode 100644 tools/kubernetes/terraform/training/batching_with_compact_placement/create_cluster/module/main.tf delete mode 100644 tools/kubernetes/terraform/training/batching_with_compact_placement/create_cluster/module/outputs.tf delete mode 100644 tools/kubernetes/terraform/training/batching_with_compact_placement/create_cluster/module/terraform.tfvars delete mode 100644 tools/kubernetes/terraform/training/batching_with_compact_placement/create_cluster/module/variables.tf delete mode 100644 tools/kubernetes/terraform/training/batching_without_compact_placement/add_node_pool/examples/main.tf delete mode 100644 tools/kubernetes/terraform/training/batching_without_compact_placement/add_node_pool/examples/outputs.tf delete mode 100644 tools/kubernetes/terraform/training/batching_without_compact_placement/add_node_pool/examples/terraform.tfvars delete mode 100644 tools/kubernetes/terraform/training/batching_without_compact_placement/add_node_pool/module/main.tf delete mode 100644 tools/kubernetes/terraform/training/batching_without_compact_placement/add_node_pool/module/outputs.tf delete mode 100644 
tools/kubernetes/terraform/training/batching_without_compact_placement/add_node_pool/module/terraform.tfvars delete mode 100644 tools/kubernetes/terraform/training/batching_without_compact_placement/add_node_pool/module/variables.tf delete mode 100644 tools/kubernetes/terraform/training/batching_without_compact_placement/create_cluster/examples/v5e/main.tf delete mode 100644 tools/kubernetes/terraform/training/batching_without_compact_placement/create_cluster/examples/v5e/outputs.tf delete mode 100644 tools/kubernetes/terraform/training/batching_without_compact_placement/create_cluster/examples/v5e/terraform.tfvars delete mode 100644 tools/kubernetes/terraform/training/batching_without_compact_placement/create_cluster/module/main.tf delete mode 100644 tools/kubernetes/terraform/training/batching_without_compact_placement/create_cluster/module/outputs.tf delete mode 100644 tools/kubernetes/terraform/training/batching_without_compact_placement/create_cluster/module/terraform.tfvars delete mode 100644 tools/kubernetes/terraform/training/batching_without_compact_placement/create_cluster/module/variables.tf diff --git a/tools/kubernetes/terraform/inference/batching_with_compact_placement/add_node_pool/examples/v5e/main.tf b/tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/examples/v5e/main.tf similarity index 100% rename from tools/kubernetes/terraform/inference/batching_with_compact_placement/add_node_pool/examples/v5e/main.tf rename to tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/examples/v5e/main.tf diff --git a/tools/kubernetes/terraform/inference/batching_with_compact_placement/add_node_pool/examples/v5e/outputs.tf b/tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/examples/v5e/outputs.tf similarity index 100% rename from tools/kubernetes/terraform/inference/batching_with_compact_placement/add_node_pool/examples/v5e/outputs.tf rename to 
tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/examples/v5e/outputs.tf diff --git a/tools/kubernetes/terraform/inference/batching_with_compact_placement/add_node_pool/examples/v5e/terraform.tfvars b/tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/examples/v5e/terraform.tfvars similarity index 100% rename from tools/kubernetes/terraform/inference/batching_with_compact_placement/add_node_pool/examples/v5e/terraform.tfvars rename to tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/examples/v5e/terraform.tfvars diff --git a/tools/kubernetes/terraform/inference/batching_with_compact_placement/add_node_pool/module/main.tf b/tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/module/main.tf similarity index 100% rename from tools/kubernetes/terraform/inference/batching_with_compact_placement/add_node_pool/module/main.tf rename to tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/module/main.tf diff --git a/tools/kubernetes/terraform/inference/batching_with_compact_placement/add_node_pool/module/outputs.tf b/tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/module/outputs.tf similarity index 100% rename from tools/kubernetes/terraform/inference/batching_with_compact_placement/add_node_pool/module/outputs.tf rename to tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/module/outputs.tf diff --git a/tools/kubernetes/terraform/inference/batching_with_compact_placement/add_node_pool/module/terraform.tfvars b/tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/module/terraform.tfvars similarity index 100% rename from tools/kubernetes/terraform/inference/batching_with_compact_placement/add_node_pool/module/terraform.tfvars rename to tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/module/terraform.tfvars diff --git 
a/tools/kubernetes/terraform/inference/batching_with_compact_placement/add_node_pool/module/variables.tf b/tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/module/variables.tf similarity index 100% rename from tools/kubernetes/terraform/inference/batching_with_compact_placement/add_node_pool/module/variables.tf rename to tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/module/variables.tf diff --git a/tools/kubernetes/terraform/inference/batching_with_compact_placement/create_cluster/examples/v5e/main.tf b/tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/examples/v5e/main.tf similarity index 100% rename from tools/kubernetes/terraform/inference/batching_with_compact_placement/create_cluster/examples/v5e/main.tf rename to tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/examples/v5e/main.tf diff --git a/tools/kubernetes/terraform/inference/batching_with_compact_placement/create_cluster/examples/v5e/outputs.tf b/tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/examples/v5e/outputs.tf similarity index 100% rename from tools/kubernetes/terraform/inference/batching_with_compact_placement/create_cluster/examples/v5e/outputs.tf rename to tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/examples/v5e/outputs.tf diff --git a/tools/kubernetes/terraform/inference/batching_with_compact_placement/create_cluster/examples/v5e/terraform.tfvars b/tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/examples/v5e/terraform.tfvars similarity index 100% rename from tools/kubernetes/terraform/inference/batching_with_compact_placement/create_cluster/examples/v5e/terraform.tfvars rename to tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/examples/v5e/terraform.tfvars diff --git a/tools/kubernetes/terraform/inference/batching_with_compact_placement/create_cluster/module/main.tf 
b/tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/module/main.tf similarity index 100% rename from tools/kubernetes/terraform/inference/batching_with_compact_placement/create_cluster/module/main.tf rename to tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/module/main.tf diff --git a/tools/kubernetes/terraform/inference/batching_with_compact_placement/create_cluster/module/outputs.tf b/tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/module/outputs.tf similarity index 100% rename from tools/kubernetes/terraform/inference/batching_with_compact_placement/create_cluster/module/outputs.tf rename to tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/module/outputs.tf diff --git a/tools/kubernetes/terraform/inference/batching_with_compact_placement/create_cluster/module/terraform.tfvars b/tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/module/terraform.tfvars similarity index 100% rename from tools/kubernetes/terraform/inference/batching_with_compact_placement/create_cluster/module/terraform.tfvars rename to tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/module/terraform.tfvars diff --git a/tools/kubernetes/terraform/inference/batching_with_compact_placement/create_cluster/module/variables.tf b/tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/module/variables.tf similarity index 100% rename from tools/kubernetes/terraform/inference/batching_with_compact_placement/create_cluster/module/variables.tf rename to tools/kubernetes/terraform/batching_with_compact_placement/create_cluster/module/variables.tf diff --git a/tools/kubernetes/terraform/inference/batching_without_compact_placement/add_node_pool/examples/v5e/main.tf b/tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/examples/v5e/main.tf similarity index 100% rename from 
tools/kubernetes/terraform/inference/batching_without_compact_placement/add_node_pool/examples/v5e/main.tf rename to tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/examples/v5e/main.tf diff --git a/tools/kubernetes/terraform/inference/batching_without_compact_placement/add_node_pool/examples/v5e/outputs.tf b/tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/examples/v5e/outputs.tf similarity index 100% rename from tools/kubernetes/terraform/inference/batching_without_compact_placement/add_node_pool/examples/v5e/outputs.tf rename to tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/examples/v5e/outputs.tf diff --git a/tools/kubernetes/terraform/inference/batching_without_compact_placement/add_node_pool/examples/v5e/terraform.tfvars b/tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/examples/v5e/terraform.tfvars similarity index 100% rename from tools/kubernetes/terraform/inference/batching_without_compact_placement/add_node_pool/examples/v5e/terraform.tfvars rename to tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/examples/v5e/terraform.tfvars diff --git a/tools/kubernetes/terraform/inference/batching_without_compact_placement/add_node_pool/module/main.tf b/tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/module/main.tf similarity index 100% rename from tools/kubernetes/terraform/inference/batching_without_compact_placement/add_node_pool/module/main.tf rename to tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/module/main.tf diff --git a/tools/kubernetes/terraform/inference/batching_without_compact_placement/add_node_pool/module/outputs.tf b/tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/module/outputs.tf similarity index 100% rename from tools/kubernetes/terraform/inference/batching_without_compact_placement/add_node_pool/module/outputs.tf rename to 
tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/module/outputs.tf diff --git a/tools/kubernetes/terraform/inference/batching_without_compact_placement/add_node_pool/module/terraform.tfvars b/tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/module/terraform.tfvars similarity index 100% rename from tools/kubernetes/terraform/inference/batching_without_compact_placement/add_node_pool/module/terraform.tfvars rename to tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/module/terraform.tfvars diff --git a/tools/kubernetes/terraform/inference/batching_without_compact_placement/add_node_pool/module/variables.tf b/tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/module/variables.tf similarity index 100% rename from tools/kubernetes/terraform/inference/batching_without_compact_placement/add_node_pool/module/variables.tf rename to tools/kubernetes/terraform/batching_without_compact_placement/add_node_pool/module/variables.tf diff --git a/tools/kubernetes/terraform/inference/batching_without_compact_placement/create_cluster/examples/v5e/main.tf b/tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/examples/v5e/main.tf similarity index 100% rename from tools/kubernetes/terraform/inference/batching_without_compact_placement/create_cluster/examples/v5e/main.tf rename to tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/examples/v5e/main.tf diff --git a/tools/kubernetes/terraform/inference/batching_without_compact_placement/create_cluster/examples/v5e/outputs.tf b/tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/examples/v5e/outputs.tf similarity index 100% rename from tools/kubernetes/terraform/inference/batching_without_compact_placement/create_cluster/examples/v5e/outputs.tf rename to tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/examples/v5e/outputs.tf diff 
--git a/tools/kubernetes/terraform/inference/batching_without_compact_placement/create_cluster/examples/v5e/terraform.tfvars b/tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/examples/v5e/terraform.tfvars similarity index 100% rename from tools/kubernetes/terraform/inference/batching_without_compact_placement/create_cluster/examples/v5e/terraform.tfvars rename to tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/examples/v5e/terraform.tfvars diff --git a/tools/kubernetes/terraform/inference/batching_without_compact_placement/create_cluster/module/main.tf b/tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/module/main.tf similarity index 100% rename from tools/kubernetes/terraform/inference/batching_without_compact_placement/create_cluster/module/main.tf rename to tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/module/main.tf diff --git a/tools/kubernetes/terraform/inference/batching_without_compact_placement/create_cluster/module/outputs.tf b/tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/module/outputs.tf similarity index 100% rename from tools/kubernetes/terraform/inference/batching_without_compact_placement/create_cluster/module/outputs.tf rename to tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/module/outputs.tf diff --git a/tools/kubernetes/terraform/inference/batching_without_compact_placement/create_cluster/module/terraform.tfvars b/tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/module/terraform.tfvars similarity index 100% rename from tools/kubernetes/terraform/inference/batching_without_compact_placement/create_cluster/module/terraform.tfvars rename to tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/module/terraform.tfvars diff --git 
a/tools/kubernetes/terraform/inference/batching_without_compact_placement/create_cluster/module/variables.tf b/tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/module/variables.tf similarity index 100% rename from tools/kubernetes/terraform/inference/batching_without_compact_placement/create_cluster/module/variables.tf rename to tools/kubernetes/terraform/batching_without_compact_placement/create_cluster/module/variables.tf diff --git a/tools/kubernetes/terraform/inference/non_batching_with_compact_placement/examples/v5e/main.tf b/tools/kubernetes/terraform/non_batching_with_compact_placement/examples/v5e/main.tf similarity index 100% rename from tools/kubernetes/terraform/inference/non_batching_with_compact_placement/examples/v5e/main.tf rename to tools/kubernetes/terraform/non_batching_with_compact_placement/examples/v5e/main.tf diff --git a/tools/kubernetes/terraform/inference/non_batching_with_compact_placement/examples/v5e/outputs.tf b/tools/kubernetes/terraform/non_batching_with_compact_placement/examples/v5e/outputs.tf similarity index 100% rename from tools/kubernetes/terraform/inference/non_batching_with_compact_placement/examples/v5e/outputs.tf rename to tools/kubernetes/terraform/non_batching_with_compact_placement/examples/v5e/outputs.tf diff --git a/tools/kubernetes/terraform/inference/non_batching_with_compact_placement/examples/v5e/terraform.tfvars b/tools/kubernetes/terraform/non_batching_with_compact_placement/examples/v5e/terraform.tfvars similarity index 100% rename from tools/kubernetes/terraform/inference/non_batching_with_compact_placement/examples/v5e/terraform.tfvars rename to tools/kubernetes/terraform/non_batching_with_compact_placement/examples/v5e/terraform.tfvars diff --git a/tools/kubernetes/terraform/inference/non_batching_with_compact_placement/module/main.tf b/tools/kubernetes/terraform/non_batching_with_compact_placement/module/main.tf similarity index 100% rename from 
tools/kubernetes/terraform/inference/non_batching_with_compact_placement/module/main.tf rename to tools/kubernetes/terraform/non_batching_with_compact_placement/module/main.tf diff --git a/tools/kubernetes/terraform/inference/non_batching_with_compact_placement/module/outputs.tf b/tools/kubernetes/terraform/non_batching_with_compact_placement/module/outputs.tf similarity index 100% rename from tools/kubernetes/terraform/inference/non_batching_with_compact_placement/module/outputs.tf rename to tools/kubernetes/terraform/non_batching_with_compact_placement/module/outputs.tf diff --git a/tools/kubernetes/terraform/inference/non_batching_with_compact_placement/module/terraform.tfvars b/tools/kubernetes/terraform/non_batching_with_compact_placement/module/terraform.tfvars similarity index 100% rename from tools/kubernetes/terraform/inference/non_batching_with_compact_placement/module/terraform.tfvars rename to tools/kubernetes/terraform/non_batching_with_compact_placement/module/terraform.tfvars diff --git a/tools/kubernetes/terraform/inference/non_batching_with_compact_placement/module/variables.tf b/tools/kubernetes/terraform/non_batching_with_compact_placement/module/variables.tf similarity index 100% rename from tools/kubernetes/terraform/inference/non_batching_with_compact_placement/module/variables.tf rename to tools/kubernetes/terraform/non_batching_with_compact_placement/module/variables.tf diff --git a/tools/kubernetes/terraform/training/batching_with_compact_placement/add_node_pool/examples/main.tf b/tools/kubernetes/terraform/training/batching_with_compact_placement/add_node_pool/examples/main.tf deleted file mode 100644 index 61ac2331..00000000 --- a/tools/kubernetes/terraform/training/batching_with_compact_placement/add_node_pool/examples/main.tf +++ /dev/null @@ -1,19 +0,0 @@ -variable "project_id" {} -variable "resource_name_prefix" {} -variable "node_pool_prefix" {} -variable "region" {} -variable "tpu_node_pools" {} -variable "maintenance_interval" {} 
-variable "is_tpu_node_private" {} - - -module "tpu-gke" { - source = "../../module" - project_id = var.project_id - resource_name_prefix = var.resource_name_prefix - node_pool_prefix = var.node_pool_prefix - region = var.region - tpu_node_pools = var.tpu_node_pools - maintenance_interval = var.maintenance_interval - is_tpu_node_private = var.is_tpu_node_private -} diff --git a/tools/kubernetes/terraform/training/batching_with_compact_placement/add_node_pool/examples/outputs.tf b/tools/kubernetes/terraform/training/batching_with_compact_placement/add_node_pool/examples/outputs.tf deleted file mode 100644 index ebb1782f..00000000 --- a/tools/kubernetes/terraform/training/batching_with_compact_placement/add_node_pool/examples/outputs.tf +++ /dev/null @@ -1,24 +0,0 @@ -output "region" { - value = var.region - description = "GCloud Region" -} - -output "project_id" { - value = var.project_id - description = "GCloud Project ID" -} - -output "kubernetes_cluster_name" { - value = module.tpu-gke.kubernetes_cluster_name - description = "GKE Cluster Name" -} - -output "placement_policy_names" { - value = module.tpu-gke.placement_policy_names - description = "GKE TPU Placement Policy Names" -} - -output "is_tpu_node_private" { - value = var.is_tpu_node_private - description = "whether we want to make TPU node private" -} diff --git a/tools/kubernetes/terraform/training/batching_with_compact_placement/add_node_pool/examples/terraform.tfvars b/tools/kubernetes/terraform/training/batching_with_compact_placement/add_node_pool/examples/terraform.tfvars deleted file mode 100644 index 78852a67..00000000 --- a/tools/kubernetes/terraform/training/batching_with_compact_placement/add_node_pool/examples/terraform.tfvars +++ /dev/null @@ -1,39 +0,0 @@ -project_id = "project-id" -resource_name_prefix = "tpu-v5e-test" -node_pool_prefix = "rp1" -region = "us-east5" -is_tpu_node_private = false -tpu_node_pools = [{ - zone = "us-east5-b" - node_count = 64 - machine_type = "ct5lp-hightpu-4t" - 
topology = "16x16" - policy = "sb-compact-1" - disk_type = "pd-balanced" - disk_size_gb = 120 - }, { - zone = "us-east5-b" - node_count = 64 - machine_type = "ct5lp-hightpu-4t" - topology = "16x16" - policy = "sb-compact-1" - disk_type = "pd-balanced" - disk_size_gb = 120 - }, { - zone = "us-east5-b" - node_count = 64 - machine_type = "ct5lp-hightpu-4t" - topology = "16x16" - policy = "sb-compact-1" - disk_type = "pd-balanced" - disk_size_gb = 120 - }, { - zone = "us-east5-b" - node_count = 64 - machine_type = "ct5lp-hightpu-4t" - topology = "16x16" - policy = "sb-compact-1" - disk_type = "pd-balanced" - disk_size_gb = 120 -}] -maintenance_interval = "PERIODIC" diff --git a/tools/kubernetes/terraform/training/batching_with_compact_placement/add_node_pool/module/main.tf b/tools/kubernetes/terraform/training/batching_with_compact_placement/add_node_pool/module/main.tf deleted file mode 100644 index 498177bf..00000000 --- a/tools/kubernetes/terraform/training/batching_with_compact_placement/add_node_pool/module/main.tf +++ /dev/null @@ -1,81 +0,0 @@ -/** - * Copyright 2023 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -# GKE cluster -data "google_container_engine_versions" "gke_version" { - location = var.region - version_prefix = "1.27." 
-} - -provider "google" { - project = var.project_id - region = var.region -} - -# Separately Managed Node Pool -resource "google_container_node_pool" "multihost_tpu" { - count = length(var.tpu_node_pools) - name = "${var.resource_name_prefix}-gke-${var.node_pool_prefix}-${count.index}" - provider = google-beta - project = var.project_id - location = var.region - node_locations = [var.tpu_node_pools[count.index].zone] - cluster = "${var.resource_name_prefix}-gke-cluster" - - initial_node_count = var.tpu_node_pools[count.index].node_count - - management { - // auto_upgrade must be true when release_channel = RAPID for cluster. - auto_upgrade = true - } - - node_config { - oauth_scopes = [ - "https://www.googleapis.com/auth/logging.write", - "https://www.googleapis.com/auth/monitoring", - "https://www.googleapis.com/auth/cloud-platform", - ] - host_maintenance_policy { - maintenance_interval = var.maintenance_interval - } - labels = { - env = var.project_id - } - gvnic { - enabled = true - } - gcfs_config { - enabled = true - } - - image_type = "COS_CONTAINERD" - machine_type = var.tpu_node_pools[count.index].machine_type - disk_type = var.tpu_node_pools[count.index].disk_type - disk_size_gb = var.tpu_node_pools[count.index].disk_size_gb - tags = ["gke-node"] - metadata = { - disable-legacy-endpoints = "true" - } - } - placement_policy { - type = "COMPACT" - policy_name = var.tpu_node_pools[count.index].policy - } - - network_config { - enable_private_nodes = var.is_tpu_node_private - } -} diff --git a/tools/kubernetes/terraform/training/batching_with_compact_placement/add_node_pool/module/outputs.tf b/tools/kubernetes/terraform/training/batching_with_compact_placement/add_node_pool/module/outputs.tf deleted file mode 100644 index 68085ceb..00000000 --- a/tools/kubernetes/terraform/training/batching_with_compact_placement/add_node_pool/module/outputs.tf +++ /dev/null @@ -1,26 +0,0 @@ -output "region" { - value = var.region - description = "GCloud Region" -} - -output 
"project_id" { - value = var.project_id - description = "GCloud Project ID" -} - -output "kubernetes_cluster_name" { - value = google_container_node_pool.multihost_tpu[0].cluster - description = "GKE Cluster Name" -} - -output "placement_policy_names" { - value = flatten([ - google_container_node_pool.multihost_tpu[*].placement_policy[0].policy_name - ]) - description = "GKE TPU Placement Policy Names" -} - -output "is_tpu_node_private" { - value = var.is_tpu_node_private - description = "whether we want to make TPU node private" -} diff --git a/tools/kubernetes/terraform/training/batching_with_compact_placement/add_node_pool/module/terraform.tfvars b/tools/kubernetes/terraform/training/batching_with_compact_placement/add_node_pool/module/terraform.tfvars deleted file mode 100644 index 171d95df..00000000 --- a/tools/kubernetes/terraform/training/batching_with_compact_placement/add_node_pool/module/terraform.tfvars +++ /dev/null @@ -1,12 +0,0 @@ -project_id = "project-id" -resource_name_prefix = "tpu-test" -node_pool_prefix = "rp1" -region = "us-east5" -tpu_node_pools = [{ - zone = "us-east5-b" - node_count = 64 - machine_type = "ct5lp-hightpu-4t" - topology = "16x16" - policy = "sb-compact-1" -}] -maintenance_interval = "AS_NEEDED" diff --git a/tools/kubernetes/terraform/training/batching_with_compact_placement/add_node_pool/module/variables.tf b/tools/kubernetes/terraform/training/batching_with_compact_placement/add_node_pool/module/variables.tf deleted file mode 100644 index fa5d507d..00000000 --- a/tools/kubernetes/terraform/training/batching_with_compact_placement/add_node_pool/module/variables.tf +++ /dev/null @@ -1,56 +0,0 @@ -/** - * Copyright 2023 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -variable "project_id" { - description = "project id" -} - -variable "region" { - description = "region" -} - -variable "resource_name_prefix" { - default = "" - description = "prefix for all the resouce naming" -} - -variable "node_pool_prefix" { - default = "" - description = "prefix for all the resouce naming" -} - -variable "tpu_node_pools" { - description = "tpu podslice config" - type = list(object({ - zone = string, - node_count = number, - machine_type = string, - topology = string, - policy = string, - disk_type = optional(string), - disk_size_gb = optional(number), - })) -} - -variable "is_tpu_node_private" { - description = "whether we want to make TPU node private" - default = false -} - -variable "maintenance_interval" { - default = "AS_NEEDED" - description = "maintenance interval for TPU machines." 
-} diff --git a/tools/kubernetes/terraform/training/batching_with_compact_placement/create_cluster/examples/v5e/main.tf b/tools/kubernetes/terraform/training/batching_with_compact_placement/create_cluster/examples/v5e/main.tf deleted file mode 100644 index 7cce20f2..00000000 --- a/tools/kubernetes/terraform/training/batching_with_compact_placement/create_cluster/examples/v5e/main.tf +++ /dev/null @@ -1,17 +0,0 @@ -variable "project_id" {} -variable "resource_name_prefix" {} -variable "region" {} -variable "cpu_node_pool" {} -variable "authorized_cidr_blocks" {} -variable "is_cpu_node_private" {} - - -module "tpu-gke" { - source = "../../module" - project_id = var.project_id - resource_name_prefix = var.resource_name_prefix - region = var.region - cpu_node_pool = var.cpu_node_pool - is_cpu_node_private = var.is_cpu_node_private - authorized_cidr_blocks = var.authorized_cidr_blocks -} diff --git a/tools/kubernetes/terraform/training/batching_with_compact_placement/create_cluster/examples/v5e/outputs.tf b/tools/kubernetes/terraform/training/batching_with_compact_placement/create_cluster/examples/v5e/outputs.tf deleted file mode 100644 index a5514b1f..00000000 --- a/tools/kubernetes/terraform/training/batching_with_compact_placement/create_cluster/examples/v5e/outputs.tf +++ /dev/null @@ -1,24 +0,0 @@ -output "region" { - value = var.region - description = "GCloud Region" -} - -output "project_id" { - value = var.project_id - description = "GCloud Project ID" -} - -output "kubernetes_cluster_name" { - value = module.tpu-gke.kubernetes_cluster_name - description = "GKE Cluster Name" -} - -output "authorized_cidr_blocks" { - value = var.authorized_cidr_blocks - description = "Cluster allowed cidr blocks " -} - -output "is_cpu_node_private" { - value = var.is_cpu_node_private - description = "whether we want to make CPU node private" -} diff --git a/tools/kubernetes/terraform/training/batching_with_compact_placement/create_cluster/examples/v5e/terraform.tfvars 
b/tools/kubernetes/terraform/training/batching_with_compact_placement/create_cluster/examples/v5e/terraform.tfvars deleted file mode 100644 index 86a001a2..00000000 --- a/tools/kubernetes/terraform/training/batching_with_compact_placement/create_cluster/examples/v5e/terraform.tfvars +++ /dev/null @@ -1,13 +0,0 @@ -project_id = "project-id" -resource_name_prefix = "tpu-v5e-test" -region = "us-east5" -authorized_cidr_blocks = [] -is_cpu_node_private = false -cpu_node_pool = { - location_policy = "BALANCED" - zone = ["us-east5-b"] - machine_type = "e2-standard-32", - initial_node_count_per_zone = 5, - min_node_count_per_zone = 5, - max_node_count_per_zone = 1000, -} diff --git a/tools/kubernetes/terraform/training/batching_with_compact_placement/create_cluster/module/main.tf b/tools/kubernetes/terraform/training/batching_with_compact_placement/create_cluster/module/main.tf deleted file mode 100644 index f5b9cf30..00000000 --- a/tools/kubernetes/terraform/training/batching_with_compact_placement/create_cluster/module/main.tf +++ /dev/null @@ -1,126 +0,0 @@ -/** - * Copyright 2023 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -# GKE cluster -data "google_container_engine_versions" "gke_version" { - location = var.region - version_prefix = "1.27." 
-} - -provider "google" { - project = var.project_id - region = var.region -} - -# VPC -resource "google_compute_network" "vpc" { - name = "${var.resource_name_prefix}-vpc" - auto_create_subnetworks = "false" -} - -# Subnet -resource "google_compute_subnetwork" "subnet" { - name = "${var.resource_name_prefix}-subnet" - region = var.region - network = google_compute_network.vpc.name - ip_cidr_range = "10.10.0.0/18" -} - -resource "google_container_cluster" "tpu_cluster" { - name = "${var.resource_name_prefix}-gke-cluster" - location = var.region - - # We can't create a cluster with no node pool defined, but we want to only use - # separately managed node pools. So we create the smallest possible default - # node pool and immediately delete it. - remove_default_node_pool = true - initial_node_count = 1 - networking_mode = "VPC_NATIVE" # Enables IP aliasing. - - ip_allocation_policy { - cluster_ipv4_cidr_block = "/14" - services_ipv4_cidr_block = "/20" - } - default_max_pods_per_node = 15 - - release_channel { - channel = "RAPID" - } - - network = google_compute_network.vpc.name - subnetwork = google_compute_subnetwork.subnet.name - logging_service = "logging.googleapis.com/kubernetes" - monitoring_service = "monitoring.googleapis.com/kubernetes" - - master_authorized_networks_config { - gcp_public_cidrs_access_enabled = false - - dynamic "cidr_blocks" { - for_each = var.authorized_cidr_blocks - content { - cidr_block = cidr_blocks.value - display_name = "cidr-blocks-group-${cidr_blocks.key}" - } - } - } - - // Needs to be false when creating a PSC-based GKE cluster. - // After that, set as true to disable public endpoint of cluster master. 
- private_cluster_config { - enable_private_endpoint = false - } - - timeouts { - create = "120m" - update = "120m" - } -} - -resource "google_container_node_pool" "cpu_node_pool" { - provider = google-beta - project = var.project_id - name = "cpu-node-pool" - location = var.region - node_locations = var.cpu_node_pool.zone - cluster = google_container_cluster.tpu_cluster.name - initial_node_count = var.cpu_node_pool.initial_node_count_per_zone - autoscaling { - location_policy = "BALANCED" - min_node_count = var.cpu_node_pool.min_node_count_per_zone - max_node_count = var.cpu_node_pool.max_node_count_per_zone - } - max_pods_per_node = 63 - node_config { - oauth_scopes = [ - "https://www.googleapis.com/auth/cloud-platform" - ] - machine_type = var.cpu_node_pool.machine_type - - metadata = { - disable-legacy-endpoints = "true" - } - gvnic { - enabled = true - } - gcfs_config { - enabled = true - } - } - - network_config { - enable_private_nodes = var.is_cpu_node_private - } -} diff --git a/tools/kubernetes/terraform/training/batching_with_compact_placement/create_cluster/module/outputs.tf b/tools/kubernetes/terraform/training/batching_with_compact_placement/create_cluster/module/outputs.tf deleted file mode 100644 index 3953819c..00000000 --- a/tools/kubernetes/terraform/training/batching_with_compact_placement/create_cluster/module/outputs.tf +++ /dev/null @@ -1,24 +0,0 @@ -output "region" { - value = var.region - description = "GCloud Region" -} - -output "project_id" { - value = var.project_id - description = "GCloud Project ID" -} - -output "kubernetes_cluster_name" { - value = google_container_cluster.tpu_cluster.name - description = "GKE Cluster Name" -} - -output "authorized_cidr_blocks" { - value = var.authorized_cidr_blocks - description = "Cluster allowed cidr blocks " -} - -output "is_cpu_node_private" { - value = var.is_cpu_node_private - description = "whether we want to make CPU node private" -} diff --git 
a/tools/kubernetes/terraform/training/batching_with_compact_placement/create_cluster/module/terraform.tfvars b/tools/kubernetes/terraform/training/batching_with_compact_placement/create_cluster/module/terraform.tfvars deleted file mode 100644 index f3f4e7be..00000000 --- a/tools/kubernetes/terraform/training/batching_with_compact_placement/create_cluster/module/terraform.tfvars +++ /dev/null @@ -1,11 +0,0 @@ -project_id = "project-id" -resource_name_prefix = "tpu-test" -region = "us-central2" -authorized_cidr_blocks = [] -cpu_node_pool = { - zone = ["us-central2-a", "us-central2-b", "us-central2-c"] - machine_type = "n2-standard-64", - initial_node_count_per_zone = 1, - min_node_count_per_zone = 1, - max_node_count_per_zone = 10 -} diff --git a/tools/kubernetes/terraform/training/batching_with_compact_placement/create_cluster/module/variables.tf b/tools/kubernetes/terraform/training/batching_with_compact_placement/create_cluster/module/variables.tf deleted file mode 100644 index df05e43d..00000000 --- a/tools/kubernetes/terraform/training/batching_with_compact_placement/create_cluster/module/variables.tf +++ /dev/null @@ -1,56 +0,0 @@ -/** - * Copyright 2023 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -variable "project_id" { - description = "project id" -} - -variable "region" { - description = "region" -} - -variable "resource_name_prefix" { - default = "" - description = "prefix for all the resouce naming" -} - -variable "authorized_cidr_blocks" { - description = "cluster allowed cidr blocks to access with kubectl CLI" - type = list(string) - default = [] -} - -variable "cpu_node_pool" { - description = "cpu nodepool config" - type = object({ - zone = list(string), - machine_type = string, - initial_node_count_per_zone = number, - min_node_count_per_zone = number, - max_node_count_per_zone = number - }) - validation { - condition = ( - (var.cpu_node_pool.min_node_count_per_zone >= 0 && var.cpu_node_pool.min_node_count_per_zone <= var.cpu_node_pool.max_node_count_per_zone) - ) - error_message = "cpu_node_pool.min_node_count_per_zone must be >= 0 and <= cpu_node_pool.max_node_count_per_zone." - } -} - -variable "is_cpu_node_private" { - description = "whether we want to make CPU node private" - default = false -} diff --git a/tools/kubernetes/terraform/training/batching_without_compact_placement/add_node_pool/examples/main.tf b/tools/kubernetes/terraform/training/batching_without_compact_placement/add_node_pool/examples/main.tf deleted file mode 100644 index 61ac2331..00000000 --- a/tools/kubernetes/terraform/training/batching_without_compact_placement/add_node_pool/examples/main.tf +++ /dev/null @@ -1,19 +0,0 @@ -variable "project_id" {} -variable "resource_name_prefix" {} -variable "node_pool_prefix" {} -variable "region" {} -variable "tpu_node_pools" {} -variable "maintenance_interval" {} -variable "is_tpu_node_private" {} - - -module "tpu-gke" { - source = "../../module" - project_id = var.project_id - resource_name_prefix = var.resource_name_prefix - node_pool_prefix = var.node_pool_prefix - region = var.region - tpu_node_pools = var.tpu_node_pools - maintenance_interval = var.maintenance_interval - is_tpu_node_private = var.is_tpu_node_private -} 
diff --git a/tools/kubernetes/terraform/training/batching_without_compact_placement/add_node_pool/examples/outputs.tf b/tools/kubernetes/terraform/training/batching_without_compact_placement/add_node_pool/examples/outputs.tf deleted file mode 100644 index 846c656e..00000000 --- a/tools/kubernetes/terraform/training/batching_without_compact_placement/add_node_pool/examples/outputs.tf +++ /dev/null @@ -1,19 +0,0 @@ -output "region" { - value = var.region - description = "GCloud Region" -} - -output "project_id" { - value = var.project_id - description = "GCloud Project ID" -} - -output "kubernetes_cluster_name" { - value = module.tpu-gke.kubernetes_cluster_name - description = "GKE Cluster Name" -} - -output "is_tpu_node_private" { - value = var.is_tpu_node_private - description = "whether we want to make TPU node private" -} diff --git a/tools/kubernetes/terraform/training/batching_without_compact_placement/add_node_pool/examples/terraform.tfvars b/tools/kubernetes/terraform/training/batching_without_compact_placement/add_node_pool/examples/terraform.tfvars deleted file mode 100644 index ed10a48c..00000000 --- a/tools/kubernetes/terraform/training/batching_without_compact_placement/add_node_pool/examples/terraform.tfvars +++ /dev/null @@ -1,21 +0,0 @@ -project_id = "project-id" -resource_name_prefix = "tpu-v5e-test" -node_pool_prefix = "batch1" -region = "us-east5" -is_tpu_node_private = false -tpu_node_pools = [{ - zone = "us-east5-b" - node_count = 64 - machine_type = "ct5lp-hightpu-4t" - topology = "16x16" - disk_type = "pd-balanced" - disk_size_gb = 120 - },{ - zone = "us-east5-b" - node_count = 64 - machine_type = "ct5lp-hightpu-4t" - topology = "16x16" - disk_type = "pd-balanced" - disk_size_gb = 120 -}] -maintenance_interval = "PERIODIC" diff --git a/tools/kubernetes/terraform/training/batching_without_compact_placement/add_node_pool/module/main.tf b/tools/kubernetes/terraform/training/batching_without_compact_placement/add_node_pool/module/main.tf deleted 
file mode 100644 index 7b4789fb..00000000 --- a/tools/kubernetes/terraform/training/batching_without_compact_placement/add_node_pool/module/main.tf +++ /dev/null @@ -1,81 +0,0 @@ -/** - * Copyright 2023 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -# GKE cluster -data "google_container_engine_versions" "gke_version" { - location = var.region - version_prefix = "1.27." -} - -provider "google" { - project = var.project_id - region = var.region -} - -# Separately Managed Node Pool -resource "google_container_node_pool" "multihost_tpu" { - count = length(var.tpu_node_pools) - name = "${var.resource_name_prefix}-gke-${var.node_pool_prefix}-${count.index}" - provider = google-beta - project = var.project_id - location = var.region - node_locations = [var.tpu_node_pools[count.index].zone] - cluster = "${var.resource_name_prefix}-gke-cluster" - - initial_node_count = var.tpu_node_pools[count.index].node_count - - management { - // auto_upgrade must be true when release_channel = RAPID for cluster. 
- auto_upgrade = true - } - - node_config { - oauth_scopes = [ - "https://www.googleapis.com/auth/logging.write", - "https://www.googleapis.com/auth/monitoring", - "https://www.googleapis.com/auth/cloud-platform", - ] - host_maintenance_policy { - maintenance_interval = var.maintenance_interval - } - labels = { - env = var.project_id - } - gvnic { - enabled = true - } - gcfs_config { - enabled = true - } - - image_type = "COS_CONTAINERD" - machine_type = var.tpu_node_pools[count.index].machine_type - disk_type = var.tpu_node_pools[count.index].disk_type - disk_size_gb = var.tpu_node_pools[count.index].disk_size_gb - tags = ["gke-node"] - metadata = { - disable-legacy-endpoints = "true" - } - } - placement_policy { - type = "COMPACT" - tpu_topology = var.tpu_node_pools[count.index].topology - } - - network_config { - enable_private_nodes = var.is_tpu_node_private - } -} diff --git a/tools/kubernetes/terraform/training/batching_without_compact_placement/add_node_pool/module/outputs.tf b/tools/kubernetes/terraform/training/batching_without_compact_placement/add_node_pool/module/outputs.tf deleted file mode 100644 index 06972205..00000000 --- a/tools/kubernetes/terraform/training/batching_without_compact_placement/add_node_pool/module/outputs.tf +++ /dev/null @@ -1,19 +0,0 @@ -output "region" { - value = var.region - description = "GCloud Region" -} - -output "project_id" { - value = var.project_id - description = "GCloud Project ID" -} - -output "kubernetes_cluster_name" { - value = google_container_node_pool.multihost_tpu[0].cluster - description = "GKE Cluster Name" -} - -output "is_tpu_node_private" { - value = var.is_tpu_node_private - description = "whether we want to make TPU node private" -} diff --git a/tools/kubernetes/terraform/training/batching_without_compact_placement/add_node_pool/module/terraform.tfvars b/tools/kubernetes/terraform/training/batching_without_compact_placement/add_node_pool/module/terraform.tfvars deleted file mode 100644 index 
48dbf54f..00000000 --- a/tools/kubernetes/terraform/training/batching_without_compact_placement/add_node_pool/module/terraform.tfvars +++ /dev/null @@ -1,11 +0,0 @@ -project_id = "project-id" -resource_name_prefix = "tpu-v5e-test" -node_pool_prefix = "batch1" -region = "us-east5" -tpu_node_pools = [{ - zone = "us-east5-b" - node_count = 64 - machine_type = "ct5lp-hightpu-4t" - topology = "16x16" -}] -maintenance_interval = "PERIODIC" diff --git a/tools/kubernetes/terraform/training/batching_without_compact_placement/add_node_pool/module/variables.tf b/tools/kubernetes/terraform/training/batching_without_compact_placement/add_node_pool/module/variables.tf deleted file mode 100644 index c467e69a..00000000 --- a/tools/kubernetes/terraform/training/batching_without_compact_placement/add_node_pool/module/variables.tf +++ /dev/null @@ -1,55 +0,0 @@ -/** - * Copyright 2023 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -variable "project_id" { - description = "project id" -} - -variable "region" { - description = "region" -} - -variable "resource_name_prefix" { - default = "" - description = "prefix for all the resouce naming" -} - -variable "node_pool_prefix" { - default = "" - description = "prefix for all the resouce naming" -} - -variable "tpu_node_pools" { - description = "tpu podslice config" - type = list(object({ - zone = string, - node_count = number, - machine_type = string, - topology = string, - disk_type = optional(string), - disk_size_gb = optional(number), - })) -} - -variable "is_tpu_node_private" { - description = "whether we want to make TPU node private" - default = false -} - -variable "maintenance_interval" { - default = "AS_NEEDED" - description = "maintenance interval for TPU machines." -} diff --git a/tools/kubernetes/terraform/training/batching_without_compact_placement/create_cluster/examples/v5e/main.tf b/tools/kubernetes/terraform/training/batching_without_compact_placement/create_cluster/examples/v5e/main.tf deleted file mode 100644 index 7cce20f2..00000000 --- a/tools/kubernetes/terraform/training/batching_without_compact_placement/create_cluster/examples/v5e/main.tf +++ /dev/null @@ -1,17 +0,0 @@ -variable "project_id" {} -variable "resource_name_prefix" {} -variable "region" {} -variable "cpu_node_pool" {} -variable "authorized_cidr_blocks" {} -variable "is_cpu_node_private" {} - - -module "tpu-gke" { - source = "../../module" - project_id = var.project_id - resource_name_prefix = var.resource_name_prefix - region = var.region - cpu_node_pool = var.cpu_node_pool - is_cpu_node_private = var.is_cpu_node_private - authorized_cidr_blocks = var.authorized_cidr_blocks -} diff --git a/tools/kubernetes/terraform/training/batching_without_compact_placement/create_cluster/examples/v5e/outputs.tf b/tools/kubernetes/terraform/training/batching_without_compact_placement/create_cluster/examples/v5e/outputs.tf deleted file mode 100644 index 
a5514b1f..00000000 --- a/tools/kubernetes/terraform/training/batching_without_compact_placement/create_cluster/examples/v5e/outputs.tf +++ /dev/null @@ -1,24 +0,0 @@ -output "region" { - value = var.region - description = "GCloud Region" -} - -output "project_id" { - value = var.project_id - description = "GCloud Project ID" -} - -output "kubernetes_cluster_name" { - value = module.tpu-gke.kubernetes_cluster_name - description = "GKE Cluster Name" -} - -output "authorized_cidr_blocks" { - value = var.authorized_cidr_blocks - description = "Cluster allowed cidr blocks " -} - -output "is_cpu_node_private" { - value = var.is_cpu_node_private - description = "whether we want to make CPU node private" -} diff --git a/tools/kubernetes/terraform/training/batching_without_compact_placement/create_cluster/examples/v5e/terraform.tfvars b/tools/kubernetes/terraform/training/batching_without_compact_placement/create_cluster/examples/v5e/terraform.tfvars deleted file mode 100644 index 86a001a2..00000000 --- a/tools/kubernetes/terraform/training/batching_without_compact_placement/create_cluster/examples/v5e/terraform.tfvars +++ /dev/null @@ -1,13 +0,0 @@ -project_id = "project-id" -resource_name_prefix = "tpu-v5e-test" -region = "us-east5" -authorized_cidr_blocks = [] -is_cpu_node_private = false -cpu_node_pool = { - location_policy = "BALANCED" - zone = ["us-east5-b"] - machine_type = "e2-standard-32", - initial_node_count_per_zone = 5, - min_node_count_per_zone = 5, - max_node_count_per_zone = 1000, -} diff --git a/tools/kubernetes/terraform/training/batching_without_compact_placement/create_cluster/module/main.tf b/tools/kubernetes/terraform/training/batching_without_compact_placement/create_cluster/module/main.tf deleted file mode 100644 index f5b9cf30..00000000 --- a/tools/kubernetes/terraform/training/batching_without_compact_placement/create_cluster/module/main.tf +++ /dev/null @@ -1,126 +0,0 @@ -/** - * Copyright 2023 Google LLC - * - * Licensed under the Apache License, 
Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -# GKE cluster -data "google_container_engine_versions" "gke_version" { - location = var.region - version_prefix = "1.27." -} - -provider "google" { - project = var.project_id - region = var.region -} - -# VPC -resource "google_compute_network" "vpc" { - name = "${var.resource_name_prefix}-vpc" - auto_create_subnetworks = "false" -} - -# Subnet -resource "google_compute_subnetwork" "subnet" { - name = "${var.resource_name_prefix}-subnet" - region = var.region - network = google_compute_network.vpc.name - ip_cidr_range = "10.10.0.0/18" -} - -resource "google_container_cluster" "tpu_cluster" { - name = "${var.resource_name_prefix}-gke-cluster" - location = var.region - - # We can't create a cluster with no node pool defined, but we want to only use - # separately managed node pools. So we create the smallest possible default - # node pool and immediately delete it. - remove_default_node_pool = true - initial_node_count = 1 - networking_mode = "VPC_NATIVE" # Enables IP aliasing. 
- - ip_allocation_policy { - cluster_ipv4_cidr_block = "/14" - services_ipv4_cidr_block = "/20" - } - default_max_pods_per_node = 15 - - release_channel { - channel = "RAPID" - } - - network = google_compute_network.vpc.name - subnetwork = google_compute_subnetwork.subnet.name - logging_service = "logging.googleapis.com/kubernetes" - monitoring_service = "monitoring.googleapis.com/kubernetes" - - master_authorized_networks_config { - gcp_public_cidrs_access_enabled = false - - dynamic "cidr_blocks" { - for_each = var.authorized_cidr_blocks - content { - cidr_block = cidr_blocks.value - display_name = "cidr-blocks-group-${cidr_blocks.key}" - } - } - } - - // Needs to be false when creating a PSC-based GKE cluster. - // After that, set as true to disable public endpoint of cluster master. - private_cluster_config { - enable_private_endpoint = false - } - - timeouts { - create = "120m" - update = "120m" - } -} - -resource "google_container_node_pool" "cpu_node_pool" { - provider = google-beta - project = var.project_id - name = "cpu-node-pool" - location = var.region - node_locations = var.cpu_node_pool.zone - cluster = google_container_cluster.tpu_cluster.name - initial_node_count = var.cpu_node_pool.initial_node_count_per_zone - autoscaling { - location_policy = "BALANCED" - min_node_count = var.cpu_node_pool.min_node_count_per_zone - max_node_count = var.cpu_node_pool.max_node_count_per_zone - } - max_pods_per_node = 63 - node_config { - oauth_scopes = [ - "https://www.googleapis.com/auth/cloud-platform" - ] - machine_type = var.cpu_node_pool.machine_type - - metadata = { - disable-legacy-endpoints = "true" - } - gvnic { - enabled = true - } - gcfs_config { - enabled = true - } - } - - network_config { - enable_private_nodes = var.is_cpu_node_private - } -} diff --git a/tools/kubernetes/terraform/training/batching_without_compact_placement/create_cluster/module/outputs.tf 
b/tools/kubernetes/terraform/training/batching_without_compact_placement/create_cluster/module/outputs.tf deleted file mode 100644 index 3953819c..00000000 --- a/tools/kubernetes/terraform/training/batching_without_compact_placement/create_cluster/module/outputs.tf +++ /dev/null @@ -1,24 +0,0 @@ -output "region" { - value = var.region - description = "GCloud Region" -} - -output "project_id" { - value = var.project_id - description = "GCloud Project ID" -} - -output "kubernetes_cluster_name" { - value = google_container_cluster.tpu_cluster.name - description = "GKE Cluster Name" -} - -output "authorized_cidr_blocks" { - value = var.authorized_cidr_blocks - description = "Cluster allowed cidr blocks " -} - -output "is_cpu_node_private" { - value = var.is_cpu_node_private - description = "whether we want to make CPU node private" -} diff --git a/tools/kubernetes/terraform/training/batching_without_compact_placement/create_cluster/module/terraform.tfvars b/tools/kubernetes/terraform/training/batching_without_compact_placement/create_cluster/module/terraform.tfvars deleted file mode 100644 index f3f4e7be..00000000 --- a/tools/kubernetes/terraform/training/batching_without_compact_placement/create_cluster/module/terraform.tfvars +++ /dev/null @@ -1,11 +0,0 @@ -project_id = "project-id" -resource_name_prefix = "tpu-test" -region = "us-central2" -authorized_cidr_blocks = [] -cpu_node_pool = { - zone = ["us-central2-a", "us-central2-b", "us-central2-c"] - machine_type = "n2-standard-64", - initial_node_count_per_zone = 1, - min_node_count_per_zone = 1, - max_node_count_per_zone = 10 -} diff --git a/tools/kubernetes/terraform/training/batching_without_compact_placement/create_cluster/module/variables.tf b/tools/kubernetes/terraform/training/batching_without_compact_placement/create_cluster/module/variables.tf deleted file mode 100644 index df05e43d..00000000 --- a/tools/kubernetes/terraform/training/batching_without_compact_placement/create_cluster/module/variables.tf +++ 
/dev/null @@ -1,56 +0,0 @@ -/** - * Copyright 2023 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -variable "project_id" { - description = "project id" -} - -variable "region" { - description = "region" -} - -variable "resource_name_prefix" { - default = "" - description = "prefix for all the resouce naming" -} - -variable "authorized_cidr_blocks" { - description = "cluster allowed cidr blocks to access with kubectl CLI" - type = list(string) - default = [] -} - -variable "cpu_node_pool" { - description = "cpu nodepool config" - type = object({ - zone = list(string), - machine_type = string, - initial_node_count_per_zone = number, - min_node_count_per_zone = number, - max_node_count_per_zone = number - }) - validation { - condition = ( - (var.cpu_node_pool.min_node_count_per_zone >= 0 && var.cpu_node_pool.min_node_count_per_zone <= var.cpu_node_pool.max_node_count_per_zone) - ) - error_message = "cpu_node_pool.min_node_count_per_zone must be >= 0 and <= cpu_node_pool.max_node_count_per_zone." - } -} - -variable "is_cpu_node_private" { - description = "whether we want to make CPU node private" - default = false -}