From b9889ec6fb7494df0c201516851e8202d4064ab2 Mon Sep 17 00:00:00 2001 From: Jacob Lerche Date: Wed, 22 May 2019 18:27:03 -0700 Subject: [PATCH] Gke terraform and guide (#493) * Initial progress * Adds infrastructure * Some changes to main.tf and adds manifests and charts * Final touches, everything works * Adds readme * Cleans up README a little * Adds firewall rule to allow ssh from bastion to GKE nodes * Fixes name of ssh from bastion firewall rule * Adds tweaks and formatting * Adds reclaim policy to delete disks * Updates readme to reflect disk reclaim policy * Adds on-destroy to change persistent volume claimpolicy to delete * Removes superfluous command changing reclaim policy * Adds monitor node count variable * Adds startup script daemonset to properly change open fd on pd and tikv nodes * Streamlines startup daemonset and adds Linux Guest Environment installation * Adds note about default set up exceeding GCP default cpu quota * Adds link to GCP quota documentation page * Fixes formatting * Adds clarifications to README, adds tidb and monitor port to outputs * Adds section on how to delete nodes that are automatically replicated across zones in a regional cluster --- deploy/gcp/.gitignore | 4 + deploy/gcp/README.md | 126 ++++++ deploy/gcp/charts/tidb-cluster | 1 + deploy/gcp/charts/tidb-operator | 1 + deploy/gcp/data.tf | 31 ++ deploy/gcp/main.tf | 340 +++++++++++++++ deploy/gcp/manifests/crd.yaml | 1 + deploy/gcp/manifests/gke-storage.yml | 1 + .../manifests/local-volume-provisioner.yaml | 128 ++++++ deploy/gcp/manifests/startup-script.yaml | 55 +++ deploy/gcp/manifests/tiller-rbac.yaml | 1 + deploy/gcp/outputs.tf | 43 ++ .../templates/tidb-cluster-values.yaml.tpl | 407 ++++++++++++++++++ deploy/gcp/variables.tf | 66 +++ 14 files changed, 1205 insertions(+) create mode 100644 deploy/gcp/.gitignore create mode 100644 deploy/gcp/README.md create mode 120000 deploy/gcp/charts/tidb-cluster create mode 120000 deploy/gcp/charts/tidb-operator create mode 100644 deploy/gcp/data.tf create mode 100644 deploy/gcp/main.tf create mode 120000 deploy/gcp/manifests/crd.yaml create mode 120000 deploy/gcp/manifests/gke-storage.yml create mode 100644 deploy/gcp/manifests/local-volume-provisioner.yaml create mode 100644 deploy/gcp/manifests/startup-script.yaml create mode 120000 deploy/gcp/manifests/tiller-rbac.yaml create mode 100644 deploy/gcp/outputs.tf create mode 100644 deploy/gcp/templates/tidb-cluster-values.yaml.tpl create mode 100644 deploy/gcp/variables.tf diff --git a/deploy/gcp/.gitignore b/deploy/gcp/.gitignore new file mode 100644 index 0000000000..955562c1e4 --- /dev/null +++ b/deploy/gcp/.gitignore @@ -0,0 +1,4 @@ +.terraform +*.tfstate* +credentials +rendered diff --git a/deploy/gcp/README.md b/deploy/gcp/README.md new file mode 100644 index 0000000000..db9575b59e --- /dev/null +++ b/deploy/gcp/README.md @@ -0,0 +1,126 @@ +# Deploy TiDB Operator and TiDB cluster on GCP GKE + +## Requirements: +* [gcloud](https://cloud.google.com/sdk/install) +* [terraform](https://www.terraform.io/downloads.html) +* [kubectl](https://kubernetes.io/docs/tasks/tools/install-kubectl/#install-kubectl) >= 1.11 +* [helm](https://github.com/helm/helm/blob/master/docs/install.md#installing-the-helm-client) >= 2.9.0 +* [jq](https://stedolan.github.io/jq/download/) + +## Configure gcloud + +https://cloud.google.com/sdk/docs/initializing + +## Setup + +The default setup will create a new VPC, two subnetworks, and an f1-micro instance as a bastion machine. 
The GKE cluster is created with the following instance types as worker nodes:
+
+* 3 n1-standard-4 instances for PD
+* 3 n1-highmem-8 instances for TiKV
+* 3 n1-standard-16 instances for TiDB
+* 3 n1-standard-2 instances for monitor
+
+> *NOTE*: The number of nodes created depends on how many availability zones there are in the chosen region. Most regions have 3 zones, but us-central1 has 4. See https://cloud.google.com/compute/docs/regions-zones/ for more information. Please refer to the `Customize` section for information on how to customize node pools in a regional cluster.
+
+> *NOTE*: The default setup, as listed above, exceeds the default CPU quota of a GCP project. To increase your project's quota, please follow the instructions [here](https://cloud.google.com/compute/quotas). The default setup requires at least 91 CPUs, more if you need to scale out.
+
+The Terraform script expects three environment variables. You can let Terraform prompt you for them, or `export` them ahead of time. If you choose to export them, they are:
+
+* `TF_VAR_GCP_CREDENTIALS_PATH`: Path to a valid GCP credentials file. It is recommended to create a dedicated service account for Terraform. See [this page](https://cloud.google.com/iam/docs/creating-managing-service-accounts) for more information on how to manage service accounts. See [this page](https://cloud.google.com/iam/docs/creating-managing-service-account-keys) for creating and managing service account keys; the downloaded key file is the credentials file you need.
+* `TF_VAR_GCP_REGION`: The region to create the resources in, for example: `us-west1`
+* `TF_VAR_GCP_PROJECT`: The name of the GCP project
+
+The service account should have sufficient permissions to create resources in the project. The `Project Editor` primitive role will accomplish this.
+
+If the GCP project is new, make sure the relevant APIs are enabled:
+
+```bash
+gcloud services enable cloudresourcemanager.googleapis.com && \
+gcloud services enable cloudbilling.googleapis.com && \
+gcloud services enable iam.googleapis.com && \
+gcloud services enable compute.googleapis.com && \
+gcloud services enable container.googleapis.com
+```
+
+Now we can launch the script:
+
+```bash
+git clone --depth=1 https://github.com/pingcap/tidb-operator
+cd tidb-operator/deploy/gcp
+terraform init
+terraform apply
+```
+
+After `terraform apply` succeeds, the TiDB cluster can be accessed by SSHing into the bastion machine and connecting via MySQL:
+```bash
+gcloud compute ssh bastion --zone <zone>
+mysql -h <tidb_ilb_ip> -P 4000 -u root
+```
+
+It is possible to interact with the cluster using `kubectl` and `helm` with the kubeconfig file `credentials/kubeconfig_<cluster_name>`. The default `cluster_name` is `my-cluster`; it can be changed in `variables.tf`.
+```bash
+# By specifying the --kubeconfig argument
+kubectl --kubeconfig credentials/kubeconfig_<cluster_name> get po -n tidb
+helm --kubeconfig credentials/kubeconfig_<cluster_name> ls
+
+# Or by setting the KUBECONFIG environment variable
+export KUBECONFIG=$PWD/credentials/kubeconfig_<cluster_name>
+kubectl get po -n tidb
+helm ls
+```
+
+When done, the infrastructure can be torn down by running `terraform destroy`.
+
+## Upgrade TiDB cluster
+
+To upgrade the TiDB cluster, set the `tidb_version` variable to a later version in `variables.tf` and run `terraform apply`.
+
+> *Note*: The upgrade does not finish immediately. You can watch its progress with `watch kubectl --kubeconfig credentials/kubeconfig_<cluster_name> get po -n tidb`.
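+
+For example, assuming a hypothetical target release such as `v2.1.9` (substitute whichever version you actually want), here is a minimal sketch of the two ways to set the variable:
+
+```bash
+# Option 1: edit variables.tf so that tidb_version defaults to the new release, then:
+terraform apply
+
+# Option 2: override the variable at apply time without editing the file
+# (v2.1.9 is only an illustrative version)
+terraform apply -var tidb_version=v2.1.9
+```
+
+If you use `-var`, remember to pass the same value on every subsequent `terraform apply`; otherwise the default in `variables.tf` takes effect again.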
+
+## Scale TiDB cluster
+
+To scale the TiDB cluster, modify `tikv_count`, `tikv_replica_count`, `tidb_count`, and `tidb_replica_count` to the desired values, and then run `terraform apply`.
+
+> *Note*: Currently, scaling in is not supported since we cannot determine which node to remove. Scaling out takes a few minutes to complete; you can watch its progress with `watch kubectl --kubeconfig credentials/kubeconfig_<cluster_name> get po -n tidb`.
+
+> *Note*: Incrementing the node count creates a new node in each availability zone.
+
+## Customize
+
+### Customize GCP resources
+
+GCP allows attaching a local SSD to any instance type that is `n1-standard-1` or larger, so the instance types for each node pool can be adjusted freely in `variables.tf`.
+
+### Customize TiDB parameters
+
+Currently, only a few parameters are exposed for customization. However, you can modify `templates/tidb-cluster-values.yaml.tpl` before deploying. If you modify it after the cluster is created and then run `terraform apply`, the change will not take effect unless the affected pods are deleted manually.
+
+### Customize node pools
+
+The cluster is created as a regional cluster, as opposed to a zonal one. This means that GKE replicates each node pool across the availability zones. This is desirable for high availability, but for the monitoring services, such as Grafana, it is potentially unnecessary. It is possible to remove nodes manually via `gcloud` if desired.
+
+> *NOTE*: GKE node pools are managed instance groups, so a node deleted with `gcloud compute instances delete` will be automatically recreated and added back to the cluster.
+
+Suppose we wish to delete a node from the monitor pool. We can run
+```bash
+$ gcloud compute instance-groups managed list | grep monitor
+```
+and the result will look something like this:
+```bash
+gke-my-cluster-monitor-pool-08578e18-grp us-west1-b zone gke-my-cluster-monitor-pool-08578e18 0 0 gke-my-cluster-monitor-pool-08578e18 no
+gke-my-cluster-monitor-pool-7e31100f-grp us-west1-c zone gke-my-cluster-monitor-pool-7e31100f 1 1 gke-my-cluster-monitor-pool-7e31100f no
+gke-my-cluster-monitor-pool-78a961e5-grp us-west1-a zone gke-my-cluster-monitor-pool-78a961e5 1 1 gke-my-cluster-monitor-pool-78a961e5 no
+```
+The first column is the name of the managed instance group, and the second column is the zone it was created in.
We will also need the name of the instance in that group, we can get it as follows +```bash +$ gcloud compute instance-groups managed list-instances gke-my-cluster-monitor-pool-08578e18-grp --zone us-west1-b +NAME ZONE STATUS ACTION INSTANCE_TEMPLATE VERSION_NAME LAST_ERROR +gke-my-cluster-monitor-pool-08578e18-c7vd us-west1-b RUNNING NONE gke-my-cluster-monitor-pool-08578e18 +``` +Now we can delete the instance +```bash +$ gcloud compute instance-groups managed delete-instances gke-my-cluster-monitor-pool-08578e18-grp --instances=gke-my-cluster-monitor-pool-08578e18-c7vd --zone us-west1-b +``` \ No newline at end of file diff --git a/deploy/gcp/charts/tidb-cluster b/deploy/gcp/charts/tidb-cluster new file mode 120000 index 0000000000..326d382104 --- /dev/null +++ b/deploy/gcp/charts/tidb-cluster @@ -0,0 +1 @@ +../../../charts/tidb-cluster \ No newline at end of file diff --git a/deploy/gcp/charts/tidb-operator b/deploy/gcp/charts/tidb-operator new file mode 120000 index 0000000000..a45f172da2 --- /dev/null +++ b/deploy/gcp/charts/tidb-operator @@ -0,0 +1 @@ +../../../charts/tidb-operator \ No newline at end of file diff --git a/deploy/gcp/data.tf b/deploy/gcp/data.tf new file mode 100644 index 0000000000..0595c5e681 --- /dev/null +++ b/deploy/gcp/data.tf @@ -0,0 +1,31 @@ +data "template_file" "tidb_cluster_values" { + template = "${file("${path.module}/templates/tidb-cluster-values.yaml.tpl")}" + + vars { + cluster_version = "${var.tidb_version}" + pd_replicas = "${var.pd_replica_count}" + tikv_replicas = "${var.tikv_replica_count}" + tidb_replicas = "${var.tidb_replica_count}" + operator_version = "${var.tidb_operator_version}" + } +} + +data "external" "tidb_ilb_ip" { + depends_on = ["null_resource.deploy-tidb-cluster"] + program = ["bash", "-c", "kubectl --kubeconfig ${local.kubeconfig} get svc -n tidb tidb-cluster-tidb -o json | jq '.status.loadBalancer.ingress[0]'"] +} + +data "external" "monitor_ilb_ip" { + depends_on = ["null_resource.deploy-tidb-cluster"] + program = ["bash", "-c", "kubectl --kubeconfig ${local.kubeconfig} get svc -n tidb tidb-cluster-grafana -o json | jq '.status.loadBalancer.ingress[0]'"] +} + +data "external" "tidb_port" { + depends_on = ["null_resource.deploy-tidb-cluster"] + program = ["bash", "-c", "kubectl --kubeconfig ${local.kubeconfig} get svc -n tidb tidb-cluster-tidb -o json | jq '.spec.ports | .[] | select( .name == \"mysql-client\") | {port: .port|tostring}'"] +} + +data "external" "monitor_port" { + depends_on = ["null_resource.deploy-tidb-cluster"] + program = ["bash", "-c", "kubectl --kubeconfig ${local.kubeconfig} get svc -n tidb tidb-cluster-grafana -o json | jq '.spec.ports | .[] | select( .name == \"grafana\") | {port: .port|tostring}'"] +} diff --git a/deploy/gcp/main.tf b/deploy/gcp/main.tf new file mode 100644 index 0000000000..a10aeeb5a9 --- /dev/null +++ b/deploy/gcp/main.tf @@ -0,0 +1,340 @@ +variable "GCP_CREDENTIALS_PATH" {} +variable "GCP_REGION" {} +variable "GCP_PROJECT" {} + +provider "google" { + credentials = "${file("${var.GCP_CREDENTIALS_PATH}")}" + region = "${var.GCP_REGION}" + project = "${var.GCP_PROJECT}" +} + +// required for taints on node pools +provider "google-beta" { + credentials = "${file("${var.GCP_CREDENTIALS_PATH}")}" + region = "${var.GCP_REGION}" + project = "${var.GCP_PROJECT}" +} + +locals { + credential_path = "${path.module}/credentials" + kubeconfig = "${local.credential_path}/kubeconfig_${var.cluster_name}" + tidb_cluster_values_path = "${path.module}/rendered/tidb-cluster-values.yaml" +} + +resource 
"null_resource" "prepare-dir" { + provisioner "local-exec" { + command = "mkdir -p ${local.credential_path}" + } +} + +resource "google_compute_network" "vpc_network" { + name = "vpc-network" + auto_create_subnetworks = false + project = "${var.GCP_PROJECT}" +} + +resource "google_compute_subnetwork" "private_subnet" { + ip_cidr_range = "172.31.252.0/22" + name = "private-subnet" + network = "${google_compute_network.vpc_network.name}" + project = "${var.GCP_PROJECT}" + + secondary_ip_range { + ip_cidr_range = "172.30.0.0/16" + range_name = "pods-${var.GCP_REGION}" + } + + secondary_ip_range { + ip_cidr_range = "172.31.224.0/20" + range_name = "services-${var.GCP_REGION}" + } + + lifecycle { + ignore_changes = ["secondary_ip_range"] + } +} + +resource "google_compute_subnetwork" "public_subnet" { + ip_cidr_range = "172.29.252.0/22" + name = "public-subnet" + network = "${google_compute_network.vpc_network.name}" + project = "${var.GCP_PROJECT}" +} + +resource "google_container_cluster" "cluster" { + name = "${var.cluster_name}" + network = "${google_compute_network.vpc_network.name}" + subnetwork = "${google_compute_subnetwork.private_subnet.name}" + location = "${var.GCP_REGION}" + project = "${var.GCP_PROJECT}" + + master_auth { + username = "" + password = "" + + // due to https://github.com/terraform-providers/terraform-provider-google/issues/3369 + client_certificate_config { + issue_client_certificate = false + } + } + + master_authorized_networks_config { + cidr_blocks { + cidr_block = "0.0.0.0/0" + } + } + + ip_allocation_policy { + use_ip_aliases = true + } + + remove_default_node_pool = true + initial_node_count = 1 + + min_master_version = "latest" + + lifecycle { + ignore_changes = ["master_auth"] // see above linked issue + } +} + +resource "google_container_node_pool" "pd_pool" { + provider = "google-beta" + project = "${var.GCP_PROJECT}" + cluster = "${google_container_cluster.cluster.name}" + location = "${google_container_cluster.cluster.location}" + name = "pd-pool" + initial_node_count = "${var.pd_count}" + + node_config { + machine_type = "${var.pd_instance_type}" + image_type = "UBUNTU" + local_ssd_count = 1 + + taint { + effect = "NO_SCHEDULE" + key = "dedicated" + value = "pd" + } + + labels { + dedicated = "pd" + } + + tags = ["pd"] + oauth_scopes = ["storage-ro", "logging-write", "monitoring"] + } +} + +resource "google_container_node_pool" "tikv_pool" { + provider = "google-beta" + project = "${var.GCP_PROJECT}" + cluster = "${google_container_cluster.cluster.name}" + location = "${google_container_cluster.cluster.location}" + name = "tikv-pool" + initial_node_count = "${var.tikv_count}" + + node_config { + machine_type = "${var.tikv_instance_type}" + image_type = "UBUNTU" + local_ssd_count = 1 + + taint { + effect = "NO_SCHEDULE" + key = "dedicated" + value = "tikv" + } + + labels { + dedicated = "tikv" + } + + tags = ["tikv"] + oauth_scopes = ["storage-ro", "logging-write", "monitoring"] + } +} + +resource "google_container_node_pool" "tidb_pool" { + provider = "google-beta" + project = "${var.GCP_PROJECT}" + cluster = "${google_container_cluster.cluster.name}" + location = "${google_container_cluster.cluster.location}" + name = "tidb-pool" + initial_node_count = "${var.tidb_count}" + + node_config { + machine_type = "${var.tidb_instance_type}" + + taint { + effect = "NO_SCHEDULE" + key = "dedicated" + value = "tidb" + } + + labels { + dedicated = "tidb" + } + + tags = ["tidb"] + oauth_scopes = ["storage-ro", "logging-write", "monitoring"] + } +} + +resource 
"google_container_node_pool" "monitor_pool" { + project = "${var.GCP_PROJECT}" + cluster = "${google_container_cluster.cluster.name}" + location = "${google_container_cluster.cluster.location}" + name = "monitor-pool" + initial_node_count = "${var.monitor_count}" + + node_config { + machine_type = "${var.monitor_instance_type}" + tags = ["monitor"] + oauth_scopes = ["storage-ro", "logging-write", "monitoring"] + } +} + +resource "google_compute_firewall" "allow_ssh_bastion" { + name = "allow-ssh-bastion" + network = "${google_compute_network.vpc_network.self_link}" + project = "${var.GCP_PROJECT}" + + allow { + protocol = "tcp" + ports = ["22"] + } + + source_ranges = ["0.0.0.0/0"] + target_tags = ["bastion"] +} + +resource "google_compute_firewall" "allow_mysql_from_bastion" { + name = "allow-mysql-from-bastion" + network = "${google_compute_network.vpc_network.self_link}" + project = "${var.GCP_PROJECT}" + + allow { + protocol = "tcp" + ports = ["4000"] + } + + source_tags = ["bastion"] + target_tags = ["tidb"] +} + +resource "google_compute_firewall" "allow_ssh_from_bastion" { + name = "allow-ssh-from-bastion" + network = "${google_compute_network.vpc_network.self_link}" + project = "${var.GCP_PROJECT}" + + allow { + protocol = "tcp" + ports = ["22"] + } + + source_tags = ["bastion"] + target_tags = ["tidb", "tikv", "pd", "monitor"] +} + +resource "google_compute_instance" "bastion" { + project = "${var.GCP_PROJECT}" + zone = "${var.GCP_REGION}-a" + machine_type = "${var.bastion_instance_type}" + name = "bastion" + + "boot_disk" { + initialize_params { + image = "ubuntu-os-cloud/ubuntu-1804-lts" + } + } + + "network_interface" { + subnetwork = "${google_compute_subnetwork.public_subnet.self_link}" + access_config = {} + } + + tags = ["bastion"] + + metadata_startup_script = "sudo apt-get install -y mysql-client && curl -s https://packagecloud.io/install/repositories/akopytov/sysbench/script.deb.sh | bash && sudo apt-get -y install sysbench" +} + +resource "null_resource" "get-credentials" { + provisioner "local-exec" { + command = "gcloud container clusters get-credentials ${google_container_cluster.cluster.name} --region ${var.GCP_REGION}" + + environment { + KUBECONFIG = "${local.kubeconfig}" + } + } + + provisioner "local-exec" { + when = "destroy" + + command = < /etc/security/limits.d/99-tidb.conf + root soft nofile 1000000 + root hard nofile 1000000 + root soft core unlimited + root soft stack 10240 + EOF + volumeMounts: + - mountPath: /mnt/disks + name: local-ssd + mountPropagation: Bidirectional + tolerations: + - effect: NoSchedule + operator: Exists + volumes: + - name: local-ssd + hostPath: + path: /mnt/disks \ No newline at end of file diff --git a/deploy/gcp/manifests/tiller-rbac.yaml b/deploy/gcp/manifests/tiller-rbac.yaml new file mode 120000 index 0000000000..cd2771d346 --- /dev/null +++ b/deploy/gcp/manifests/tiller-rbac.yaml @@ -0,0 +1 @@ +../../../manifests/tiller-rbac.yaml \ No newline at end of file diff --git a/deploy/gcp/outputs.tf b/deploy/gcp/outputs.tf new file mode 100644 index 0000000000..33f4ca2f83 --- /dev/null +++ b/deploy/gcp/outputs.tf @@ -0,0 +1,43 @@ +output "region" { + value = "${var.GCP_REGION}" +} + +output "cluster_id" { + value = "${google_container_cluster.cluster.id}" +} + +output "cluster_name" { + value = "${google_container_cluster.cluster.name}" +} + +output "kubeconfig_file" { + value = "${local.kubeconfig}" +} + +output "tidb_version" { + value = "${var.tidb_version}" +} + +output "tidb_ilb_ip" { + value = 
"${data.external.tidb_ilb_ip.result["ip"]}" +} + +output "tidb_port" { + value = "${data.external.tidb_port.result["port"]}" +} + +output "monitor_ilb_ip" { + value = "${data.external.monitor_ilb_ip.result["ip"]}" +} + +output "monitor_port" { + value = "${data.external.monitor_port.result["port"]}" +} + +output "how_to_ssh_to_bastion" { + value = "gcloud compute ssh bastion --zone ${var.GCP_REGION}-a" +} + +output "how_to_connect_to_mysql_from_bastion" { + value = "mysql -h ${data.external.tidb_ilb_ip.result["ip"]} -P ${data.external.tidb_port.result["port"]} -u root" +} diff --git a/deploy/gcp/templates/tidb-cluster-values.yaml.tpl b/deploy/gcp/templates/tidb-cluster-values.yaml.tpl new file mode 100644 index 0000000000..496c786f07 --- /dev/null +++ b/deploy/gcp/templates/tidb-cluster-values.yaml.tpl @@ -0,0 +1,407 @@ +# Default values for tidb-cluster. +# This is a YAML-formatted file. +# Declare variables to be passed into your templates. + +# Also see monitor.serviceAccount +# If you set rbac.create to false, you need to provide a value for monitor.serviceAccount +rbac: + create: true + +# clusterName is the TiDB cluster name, if not specified, the chart release name will be used +# clusterName: demo + +# Add additional TidbCluster labels +# ref: https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/ +extraLabels: {} + +# schedulerName must be same with charts/tidb-operator/values#scheduler.schedulerName +schedulerName: tidb-scheduler + +# timezone is the default system timzone for TiDB +timezone: UTC + +# default reclaim policy of a PV +pvReclaimPolicy: Retain + +# services is the service list to expose, default is ClusterIP +# can be ClusterIP | NodePort | LoadBalancer +services: + - name: pd + type: ClusterIP + +discovery: + image: pingcap/tidb-operator:${operator_version} + imagePullPolicy: IfNotPresent + resources: + limits: + cpu: 250m + memory: 150Mi + requests: + cpu: 80m + memory: 50Mi + +pd: + replicas: ${pd_replicas} + image: "pingcap/pd:${cluster_version}" + logLevel: info + # storageClassName is a StorageClass provides a way for administrators to describe the "classes" of storage they offer. + # different classes might map to quality-of-service levels, or to backup policies, + # or to arbitrary policies determined by the cluster administrators. + # refer to https://kubernetes.io/docs/concepts/storage/storage-classes + storageClassName: local-storage + + # Image pull policy. + imagePullPolicy: IfNotPresent + + # maxStoreDownTime is how long a store will be considered `down` when disconnected + # if a store is considered `down`, the regions will be migrated to other stores + maxStoreDownTime: 30m + # maxReplicas is the number of replicas for each region + maxReplicas: 3 + resources: + limits: {} + # cpu: 8000m + # memory: 8Gi + requests: + # cpu: 4000m + # memory: 4Gi + storage: 1Gi + # nodeSelector is used for scheduling pod, + # if nodeSelectorRequired is true, all the following labels must be matched + nodeSelector: + dedicated: pd + # kind: pd + # # zone is comma separated availability zone list + # zone: cn-bj1-01,cn-bj1-02 + # # region is comma separated region list + # region: cn-bj1 + # Tolerations are applied to pods, and allow pods to schedule onto nodes with matching taints. 
+ # refer to https://kubernetes.io/docs/concepts/configuration/taint-and-toleration + tolerations: + - key: dedicated + operator: Equal + value: pd + effect: "NoSchedule" + +tikv: + replicas: ${tikv_replicas} + image: "pingcap/tikv:${cluster_version}" + logLevel: info + # storageClassName is a StorageClass provides a way for administrators to describe the "classes" of storage they offer. + # different classes might map to quality-of-service levels, or to backup policies, + # or to arbitrary policies determined by the cluster administrators. + # refer to https://kubernetes.io/docs/concepts/storage/storage-classes + storageClassName: local-storage + + # Image pull policy. + imagePullPolicy: IfNotPresent + + # syncLog is a bool value to enable or disable syc-log for raftstore, default is true + # enable this can prevent data loss when power failure + syncLog: true + # size of thread pool for grpc server. + # grpcConcurrency: 4 + resources: + limits: {} + # cpu: 16000m + # memory: 32Gi + # storage: 300Gi + requests: + # cpu: 12000m + # memory: 24Gi + storage: 10Gi + nodeSelector: + dedicated: tikv + # kind: tikv + # zone: cn-bj1-01,cn-bj1-02 + # region: cn-bj1 + tolerations: + - key: dedicated + operator: Equal + value: tikv + effect: "NoSchedule" + + # block-cache used to cache uncompressed blocks, big block-cache can speed up read. + # in normal cases should tune to 30%-50% tikv.resources.limits.memory + # defaultcfBlockCacheSize: "1GB" + + # in normal cases should tune to 10%-30% tikv.resources.limits.memory + # writecfBlockCacheSize: "256MB" + + # size of thread pool for high-priority/normal-priority/low-priority operations + # readpoolStorageConcurrency: 4 + + # Notice: if tikv.resources.limits.cpu > 8, default thread pool size for coprocessors + # will be set to tikv.resources.limits.cpu * 0.8. + # readpoolCoprocessorConcurrency: 8 + + # scheduler's worker pool size, should increase it in heavy write cases, + # also should less than total cpu cores. + # storageSchedulerWorkerPoolSize: 4 + +tidb: + replicas: ${tidb_replicas} + # The secret name of root password, you can create secret with following command: + # kubectl create secret generic tidb-secret --from-literal=root= --namespace= + # If unset, the root password will be empty and you can set it after connecting + # passwordSecretName: tidb-secret + # initSql is the SQL statements executed after the TiDB cluster is bootstrapped. + # initSql: |- + # create database app; + image: "pingcap/tidb:${cluster_version}" + # Image pull policy. + imagePullPolicy: IfNotPresent + logLevel: info + preparedPlanCacheEnabled: false + preparedPlanCacheCapacity: 100 + # Enable local latches for transactions. Enable it when + # there are lots of conflicts between transactions. + txnLocalLatchesEnabled: false + txnLocalLatchesCapacity: "10240000" + # The limit of concurrent executed sessions. + tokenLimit: "1000" + # Set the memory quota for a query in bytes. Default: 32GB + memQuotaQuery: "34359738368" + # The limitation of the number for the entries in one transaction. + # If using TiKV as the storage, the entry represents a key/value pair. + # WARNING: Do not set the value too large, otherwise it will make a very large impact on the TiKV cluster. + # Please adjust this configuration carefully. + txnEntryCountLimit: "300000" + # The limitation of the size in byte for the entries in one transaction. + # If using TiKV as the storage, the entry represents a key/value pair. 
+ # WARNING: Do not set the value too large, otherwise it will make a very large impact on the TiKV cluster. + # Please adjust this configuration carefully. + txnTotalSizeLimit: "104857600" + # enableBatchDml enables batch commit for the DMLs + enableBatchDml: false + # check mb4 value in utf8 is used to control whether to check the mb4 characters when the charset is utf8. + checkMb4ValueInUtf8: true + # treat-old-version-utf8-as-utf8mb4 use for upgrade compatibility. Set to true will treat old version table/column UTF8 charset as UTF8MB4. + treatOldVersionUtf8AsUtf8mb4: true + # lease is schema lease duration, very dangerous to change only if you know what you do. + lease: 45s + # Max CPUs to use, 0 use number of CPUs in the machine. + maxProcs: 0 + resources: + limits: {} + # cpu: 16000m + # memory: 16Gi + requests: {} + # cpu: 12000m + # memory: 12Gi + nodeSelector: + dedicated: tidb + # kind: tidb + # zone: cn-bj1-01,cn-bj1-02 + # region: cn-bj1 + tolerations: + - key: dedicated + operator: Equal + value: tidb + effect: "NoSchedule" + maxFailoverCount: 3 + service: + type: LoadBalancer + exposeStatus: true + annotations: + cloud.google.com/load-balancer-type: "Internal" + # separateSlowLog: true + slowLogTailer: + image: busybox:1.26.2 + resources: + limits: + cpu: 100m + memory: 50Mi + requests: + cpu: 20m + memory: 5Mi + + # tidb plugin configuration + plugin: + # enable plugin or not + enable: false + # the start argument to specify the folder containing + directory: /plugins + # the start argument to specify the plugin id (name "-" version) that needs to be loaded, e.g. 'conn_limit-1'. + list: ["whitelist-1"] + +# mysqlClient is used to set password for TiDB +# it must has Python MySQL client installed +mysqlClient: + image: tnir/mysqlclient + imagePullPolicy: IfNotPresent + +monitor: + create: true + # Also see rbac.create + # If you set rbac.create to false, you need to provide a value here. + # If you set rbac.create to true, you should leave this empty. + # serviceAccount: + persistent: true + storageClassName: pd-ssd + storage: 500Gi + grafana: + create: true + image: grafana/grafana:6.0.1 + imagePullPolicy: IfNotPresent + logLevel: info + resources: + limits: {} + # cpu: 8000m + # memory: 8Gi + requests: {} + # cpu: 4000m + # memory: 4Gi + username: admin + password: admin + config: + # Configure Grafana using environment variables except GF_PATHS_DATA, GF_SECURITY_ADMIN_USER and GF_SECURITY_ADMIN_PASSWORD + # Ref https://grafana.com/docs/installation/configuration/#using-environment-variables + GF_AUTH_ANONYMOUS_ENABLED: "true" + GF_AUTH_ANONYMOUS_ORG_NAME: "Main Org." 
+ GF_AUTH_ANONYMOUS_ORG_ROLE: "Viewer" + # if grafana is running behind a reverse proxy with subpath http://foo.bar/grafana + # GF_SERVER_DOMAIN: foo.bar + # GF_SERVER_ROOT_URL: "%(protocol)s://%(domain)s/grafana/" + service: + type: LoadBalancer + prometheus: + image: prom/prometheus:v2.2.1 + imagePullPolicy: IfNotPresent + logLevel: info + resources: + limits: {} + # cpu: 8000m + # memory: 8Gi + requests: {} + # cpu: 4000m + # memory: 4Gi + service: + type: NodePort + reserveDays: 12 + # alertmanagerURL: "" + nodeSelector: {} + # kind: monitor + # zone: cn-bj1-01,cn-bj1-02 + # region: cn-bj1 + tolerations: [] + # - key: node-role + # operator: Equal + # value: tidb + # effect: "NoSchedule" + +binlog: + pump: + create: false + replicas: 1 + image: "pingcap/tidb-binlog:${cluster_version}" + imagePullPolicy: IfNotPresent + logLevel: info + # storageClassName is a StorageClass provides a way for administrators to describe the "classes" of storage they offer. + # different classes might map to quality-of-service levels, or to backup policies, + # or to arbitrary policies determined by the cluster administrators. + # refer to https://kubernetes.io/docs/concepts/storage/storage-classes + storageClassName: local-storage + storage: 10Gi + syncLog: true + # a integer value to control expiry date of the binlog data, indicates for how long (in days) the binlog data would be stored. + # must bigger than 0 + gc: 7 + # number of seconds between heartbeat ticks (in 2 seconds) + heartbeatInterval: 2 + + drainer: + create: false + image: "pingcap/tidb-binlog:${cluster_version}" + imagePullPolicy: IfNotPresent + logLevel: info + # storageClassName is a StorageClass provides a way for administrators to describe the "classes" of storage they offer. + # different classes might map to quality-of-service levels, or to backup policies, + # or to arbitrary policies determined by the cluster administrators. + # refer to https://kubernetes.io/docs/concepts/storage/storage-classes + storageClassName: local-storage + storage: 10Gi + # parallel worker count (default 16) + workerCount: 16 + # the interval time (in seconds) of detect pumps' status (default 10) + detectInterval: 10 + # disbale detect causality + disableDetect: false + # disable dispatching sqls that in one same binlog; if set true, work-count and txn-batch would be useless + disableDispatch: false + # # disable sync these schema + ignoreSchemas: "INFORMATION_SCHEMA,PERFORMANCE_SCHEMA,mysql,test" + # if drainer donesn't have checkpoint, use initial commitTS to initial checkpoint + initialCommitTs: 0 + # enable safe mode to make syncer reentrant + safeMode: false + # number of binlog events in a transaction batch (default 20) + txnBatch: 20 + # downstream storage, equal to --dest-db-type + # valid values are "mysql", "pb", "kafka" + destDBType: pb + mysql: {} + # host: "127.0.0.1" + # user: "root" + # password: "" + # port: 3306 + # # Time and size limits for flash batch write + # timeLimit: "30s" + # sizeLimit: "100000" + kafka: {} + # only need config one of zookeeper-addrs and kafka-addrs, will get kafka address if zookeeper-addrs is configed. 
+ # zookeeperAddrs: "127.0.0.1:2181" + # kafkaAddrs: "127.0.0.1:9092" + # kafkaVersion: "0.8.2.0" + +scheduledBackup: + create: false + binlogImage: "pingcap/tidb-binlog:${cluster_version}" + binlogImagePullPolicy: IfNotPresent + # https://github.com/tennix/tidb-cloud-backup + mydumperImage: pingcap/tidb-cloud-backup:latest + mydumperImagePullPolicy: IfNotPresent + # storageClassName is a StorageClass provides a way for administrators to describe the "classes" of storage they offer. + # different classes might map to quality-of-service levels, or to backup policies, + # or to arbitrary policies determined by the cluster administrators. + # refer to https://kubernetes.io/docs/concepts/storage/storage-classes + storageClassName: local-storage + storage: 100Gi + # https://kubernetes.io/docs/tasks/job/automated-tasks-with-cron-jobs/#schedule + schedule: "0 0 * * *" + # https://kubernetes.io/docs/tasks/job/automated-tasks-with-cron-jobs/#suspend + suspend: false + # https://kubernetes.io/docs/tasks/job/automated-tasks-with-cron-jobs/#jobs-history-limits + successfulJobsHistoryLimit: 3 + failedJobsHistoryLimit: 1 + # https://kubernetes.io/docs/tasks/job/automated-tasks-with-cron-jobs/#starting-deadline + startingDeadlineSeconds: 3600 + # https://github.com/maxbube/mydumper/blob/master/docs/mydumper_usage.rst#options + options: "--chunk-filesize=100" + # secretName is the name of the secret which stores user and password used for backup + # Note: you must give the user enough privilege to do the backup + # you can create the secret by: + # kubectl create secret generic backup-secret --from-literal=user=root --from-literal=password= + secretName: backup-secret + # backup to gcp + gcp: {} + # bucket: "" + # secretName is the name of the secret which stores the gcp service account credentials json file + # The service account must have read/write permission to the above bucket. 
+ # Read the following document to create the service account and download the credentials file as credentials.json: + # https://cloud.google.com/docs/authentication/production#obtaining_and_providing_service_account_credentials_manually + # And then create the secret by: kubectl create secret generic gcp-backup-secret --from-file=./credentials.json + # secretName: gcp-backup-secret + + # backup to ceph object storage + ceph: {} + # endpoint: "" + # bucket: "" + # secretName is the name of the secret which stores ceph object store access key and secret key + # You can create the secret by: + # kubectl create secret generic ceph-backup-secret --from-literal=access_key= --from-literal=secret_key= + # secretName: ceph-backup-secret + +metaInstance: "{{ $labels.instance }}" +metaType: "{{ $labels.type }}" +metaValue: "{{ $value }}" diff --git a/deploy/gcp/variables.tf b/deploy/gcp/variables.tf new file mode 100644 index 0000000000..ec01b8d632 --- /dev/null +++ b/deploy/gcp/variables.tf @@ -0,0 +1,66 @@ +variable "cluster_name" { + description = "TiDB clustername" + default = "my-cluster" +} + +variable "tidb_version" { + description = "TiDB version" + default = "v2.1.8" +} + +variable "tidb_operator_version" { + description = "TiDB operator version" + default = "v1.0.0-beta.2" +} + +variable "pd_replica_count" { + default = 3 +} + +variable "tikv_replica_count" { + default = 3 +} + +variable "tidb_replica_count" { + default = 3 +} + +variable "pd_count" { + description = "Number of PD nodes per availability zone" + default = 1 +} + +variable "tikv_count" { + description = "Number of TiKV nodes per availability zone" + default = 1 +} + +variable "tidb_count" { + description = "Number of TiDB nodes per availability zone" + default = 1 +} + +variable "monitor_count" { + description = "Number of monitor nodes per availability zone" + default = 1 +} + +variable "pd_instance_type" { + default = "n1-standard-4" +} + +variable "tikv_instance_type" { + default = "n1-highmem-8" +} + +variable "tidb_instance_type" { + default = "n1-standard-16" +} + +variable "monitor_instance_type" { + default = "n1-standard-2" +} + +variable "bastion_instance_type" { + default = "f1-micro" +}
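As a usage sketch for the variables defined in `variables.tf` above, they can also be supplied at apply time instead of editing the file; all values below are only examples:

```bash
# Required provider settings (Terraform prompts for them if not exported)
export TF_VAR_GCP_CREDENTIALS_PATH="/path/to/credentials.json"   # example path
export TF_VAR_GCP_REGION="us-west1"                              # example region
export TF_VAR_GCP_PROJECT="my-project"                           # example project name

# Optional overrides of the defaults in variables.tf (example values)
terraform apply \
  -var cluster_name=my-cluster \
  -var tikv_count=2 \
  -var monitor_count=1
```

Variables passed with `-var` only apply to that run, so pass the same values on every subsequent `terraform apply` to avoid reverting to the defaults.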