Skip to content

Commit

Permalink
feat: configure the federated learning use case (#74)
Browse files Browse the repository at this point in the history
- configure firewall for federated learning
- configure iam roles and service accounts
- configure dedicated node pools
- configure policy controller and policies
- configure dedicated Kubernetes namespaces
  • Loading branch information
ferrarimarco authored and arueth committed Jan 14, 2025
1 parent 75aa34d commit 9588c0b
Show file tree
Hide file tree
Showing 71 changed files with 1,547 additions and 99 deletions.
11 changes: 11 additions & 0 deletions platforms/gke/base/_shared_config/cluster_variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,17 @@ locals {

kubeconfig_directory = abspath("${path.module}/../kubeconfig")
kubeconfig_file = abspath("${local.kubeconfig_directory}/${var.cluster_project_id}-${local.unique_identifier_prefix}")

# Minimal roles for nodepool SA https://cloud.google.com/kubernetes-engine/docs/how-to/hardening-your-cluster#use_least_privilege_sa
cluster_sa_roles = [
"roles/artifactregistry.reader",
"roles/autoscaling.metricsWriter",
"roles/logging.logWriter",
"roles/monitoring.metricWriter",
"roles/monitoring.viewer",
"roles/serviceusage.serviceUsageConsumer",
"roles/stackdriver.resourceMetadata.writer",
]
}

variable "cluster_binary_authorization_evaluation_mode" {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@ locals {
git_creds_secret = var.configmanagement_git_credentials.secret_name == null ? "${var.platform_name}-git-creds" : var.configmanagement_git_credentials.secret_name

oci_repo_id = "${local.unique_identifier_prefix}-config-sync"
oci_repo_url = "${var.cluster_region}-docker.pkg.dev/${data.google_project.cluster.project_id}/${local.oci_repo_id}"
oci_repo_domain = "${var.cluster_region}-docker.pkg.dev"
oci_repo_url = "${local.oci_repo_domain}/${data.google_project.cluster.project_id}/${local.oci_repo_id}"
oci_root_sync_image = "${local.oci_root_sync_image_name}:${local.oci_root_sync_image_tag}"
oci_root_sync_image_name = "root-sync"
oci_root_sync_image_tag = "latest"
Expand Down
13 changes: 0 additions & 13 deletions platforms/gke/base/core/container_cluster/service_account.tf
Original file line number Diff line number Diff line change
Expand Up @@ -12,19 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.

locals {
# Minimal roles for nodepool SA https://cloud.google.com/kubernetes-engine/docs/how-to/hardening-your-cluster#use_least_privilege_sa
cluster_sa_roles = [
"roles/artifactregistry.reader",
"roles/autoscaling.metricsWriter",
"roles/logging.logWriter",
"roles/monitoring.metricWriter",
"roles/monitoring.viewer",
"roles/serviceusage.serviceUsageConsumer",
"roles/stackdriver.resourceMetadata.writer",
]
}

# Create dedicated service account for the cluster nodes
resource "google_service_account" "cluster" {
for_each = toset(var.cluster_node_pool_default_service_account_id == null ? ["created"] : [])
Expand Down

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ resource "google_gke_hub_feature_membership" "cluster_policycontroller" {

policycontroller {
policy_controller_hub_config {
audit_interval_seconds = 60
install_spec = "INSTALL_SPEC_ENABLED"
log_denies_enabled = true
mutation_enabled = true
Expand Down
73 changes: 40 additions & 33 deletions platforms/gke/base/core/teardown.sh
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ else
"container_node_pool"
"container_cluster"
"networking"
"initialize"
)
fi
echo "Core platform terraservices to destroy: ${terraservices[*]}"
Expand All @@ -54,42 +55,48 @@ cd "${ACP_PLATFORM_CORE_DIR}/initialize" &&
rm tfplan

for terraservice in "${terraservices[@]}"; do
cd "${ACP_PLATFORM_CORE_DIR}/${terraservice}" &&
echo "Current directory: $(pwd)" &&
terraform init &&
terraform destroy -auto-approve || exit 1
rm -rf .terraform/
done
if [[ "${terraservice}" != "initialize" ]]; then
cd "${ACP_PLATFORM_CORE_DIR}/${terraservice}" &&
echo "Current directory: $(pwd)" &&
terraform init &&
terraform destroy -auto-approve || exit 1
rm -rf .terraform/
# Destroy the backend only if we're destroying the initialize service,
# otherwise we wouldn't be able to support a tiered core platform provisioning
# and teardown
else
cd "${ACP_PLATFORM_CORE_DIR}/${terraservice}" &&
echo "Current directory: $(pwd)" &&
rm -rf backend.tf &&
terraform init -force-copy -lock=false -migrate-state || exit 1

cd "${ACP_PLATFORM_CORE_DIR}/initialize" &&
echo "Current directory: $(pwd)" &&
rm -rf backend.tf &&
terraform init -force-copy -lock=false -migrate-state || exit 1
# Quote the globbing expression because we don't want to expand it with the
# shell
gcloud storage rm -r "gs://${terraform_bucket_name}/*" &&
terraform destroy -auto-approve || exit 1
# Quote the globbing expression because we don't want to expand it with the
# shell
gcloud storage rm -r "gs://${terraform_bucket_name}/*" &&
terraform destroy -auto-approve || exit 1

rm -rf \
"${ACP_PLATFORM_BASE_DIR}/_shared_config/.terraform/" \
"${ACP_PLATFORM_BASE_DIR}/_shared_config"/terraform.tfstate* \
"${ACP_PLATFORM_CORE_DIR}/initialize/.terraform/" \
"${ACP_PLATFORM_CORE_DIR}/initialize"/terraform.tfstate* \
"${ACP_PLATFORM_CORE_DIR}/networking/.terraform/" \
"${ACP_PLATFORM_CORE_DIR}/container_cluster/.terraform/" \
"${ACP_PLATFORM_CORE_DIR}/container_node_pool/.terraform/" \
"${ACP_PLATFORM_CORE_DIR}/container_node_pool"/container_node_pool_*.tf \
"${ACP_PLATFORM_CORE_DIR}/gke_enterprise/configmanagement/git/.terraform/" \
"${ACP_PLATFORM_CORE_DIR}/gke_enterprise/configmanagement/oci/.terraform/" \
"${ACP_PLATFORM_CORE_DIR}/gke_enterprise/fleet_membership/.terraform/" \
"${ACP_PLATFORM_CORE_DIR}/gke_enterprise/servicemesh/.terraform/" \
"${ACP_PLATFORM_CORE_DIR}/workloads/kueue.terraform/" \
"${ACP_PLATFORM_CORE_DIR}/workloads/kubeconfig" \
"${ACP_PLATFORM_CORE_DIR}/workloads/manifests"
# Remove the local Terraform working directories (.terraform/), the local
# terraform.tfstate* files, and the other local files listed below.
# Every entry that targets a Terraform working directory uses the
# "<terraservice dir>/.terraform/" form; the kueue entry previously lacked the
# path separator ("workloads/kueue.terraform/"), so rm -rf silently skipped it
# and the kueue working directory was never cleaned up.
rm -rf \
"${ACP_PLATFORM_BASE_DIR}/_shared_config/.terraform/" \
"${ACP_PLATFORM_BASE_DIR}/_shared_config"/terraform.tfstate* \
"${ACP_PLATFORM_CORE_DIR}/initialize/.terraform/" \
"${ACP_PLATFORM_CORE_DIR}/initialize"/terraform.tfstate* \
"${ACP_PLATFORM_CORE_DIR}/networking/.terraform/" \
"${ACP_PLATFORM_CORE_DIR}/container_cluster/.terraform/" \
"${ACP_PLATFORM_CORE_DIR}/container_node_pool/.terraform/" \
"${ACP_PLATFORM_CORE_DIR}/container_node_pool"/container_node_pool_*.tf \
"${ACP_PLATFORM_CORE_DIR}/gke_enterprise/configmanagement/git/.terraform/" \
"${ACP_PLATFORM_CORE_DIR}/gke_enterprise/configmanagement/oci/.terraform/" \
"${ACP_PLATFORM_CORE_DIR}/gke_enterprise/fleet_membership/.terraform/" \
"${ACP_PLATFORM_CORE_DIR}/gke_enterprise/servicemesh/.terraform/" \
"${ACP_PLATFORM_CORE_DIR}/workloads/kueue/.terraform/" \
"${ACP_PLATFORM_CORE_DIR}/workloads/kubeconfig" \
"${ACP_PLATFORM_CORE_DIR}/workloads/manifests"

git restore \
"${ACP_PLATFORM_CORE_DIR}/initialize/backend.tf.bucket" \
"${ACP_PLATFORM_CORE_DIR}/container_node_pool"/container_node_pool_*.tf
git restore \
"${ACP_PLATFORM_CORE_DIR}/initialize/backend.tf.bucket" \
"${ACP_PLATFORM_CORE_DIR}/container_node_pool"/container_node_pool_*.tf
fi
done

end_timestamp=$(date +%s)
total_runtime_value=$((end_timestamp - start_timestamp))
Expand Down
24 changes: 22 additions & 2 deletions platforms/gke/base/use-cases/federated-learning/common.sh
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,26 @@ federated_learning_core_platform_terraservices=(

# shellcheck disable=SC2034 # Variable is used in other scripts
federated_learning_terraservices=(
"firewall"
"container_image_repository"
"private_google_access"
"workload_identity"
"container_node_pool"
"config_management"
)

# shellcheck disable=SC2034 # Variable is used in other scripts
core_platform_init_terraservices=(
"initialize"
"networking"
)

# shellcheck disable=SC2034 # Variable is used in other scripts
core_platform_terraservices=(
"container_cluster"
"gke_enterprise/fleet_membership"
"gke_enterprise/configmanagement/oci"
"gke_enterprise/policycontroller"
)

# shellcheck disable=SC2034 # Variable is used in other scripts
Expand All @@ -68,7 +86,7 @@ apply_or_destroy_terraservice() {

echo "Initializing ${terraservice} Terraform environment"
cd "${FEDERATED_LEARNING_USE_CASE_TERRAFORM_DIR}/${terraservice}" &&
terraform init
terraform init -input=false

echo "Current working directory: $(pwd)"

Expand All @@ -79,7 +97,9 @@ apply_or_destroy_terraservice() {
_terraform_result=$?
elif [[ "${operation_mode}" == "destroy" ]]; then
echo "Destroying ${terraservice}"
terraform destroy -auto-approve
terraform destroy \
-auto-approve \
-input=false
_terraform_result=$?
else
echo "Error: operation mode not supported: ${operation_mode}"
Expand Down
18 changes: 9 additions & 9 deletions platforms/gke/base/use-cases/federated-learning/deploy.sh
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,6 @@ source "${ACP_PLATFORM_BASE_DIR}/use-cases/federated-learning/common.sh"

start_timestamp_federated_learning=$(date +%s)

echo "Initializing the core platform"
# Don't provision any core platform terraservice because we just need
# to initialize the terraform environment and remote backend
# shellcheck disable=SC1091
CORE_TERRASERVICES_APPLY="initialize" \
"${ACP_PLATFORM_CORE_DIR}/deploy.sh"

echo "Preparing core platform configuration files"
for configuration_variable in "${TERRAFORM_CLUSTER_CONFIGURATION[@]}"; do
write_terraform_configuration_variable_to_file "${configuration_variable}" "${ACP_PLATFORM_SHARED_CONFIG_CLUSTER_AUTO_VARS_FILE}"
Expand All @@ -38,6 +31,13 @@ for configuration_variable in "${TERRAFORM_CORE_INITIALIZE_CONFIGURATION[@]}"; d
write_terraform_configuration_variable_to_file "${configuration_variable}" "${ACP_PLATFORM_SHARED_CONFIG_INITIALIZE_AUTO_VARS_FILE}"
done

echo "Initializing the core platform"
# Don't provision any core platform terraservice because we just need
# to initialize the terraform environment and remote backend
# shellcheck disable=SC1091,SC2154
CORE_TERRASERVICES_APPLY="${core_platform_init_terraservices[*]}" \
"${ACP_PLATFORM_CORE_DIR}/deploy.sh"

echo "Provision services that the core platform depends on"
# shellcheck disable=SC2154 # variable defined in common.sh
for terraservice in "${federated_learning_core_platform_terraservices[@]}"; do
Expand All @@ -50,8 +50,8 @@ fi
edit_terraform_configuration_variable_value_in_file "cluster_database_encryption_key_name_placeholder" "${cluster_database_encryption_key_id}" "${ACP_PLATFORM_SHARED_CONFIG_CLUSTER_AUTO_VARS_FILE}"

echo "Provisioning the core platform"
# shellcheck disable=SC1091,SC2034 # Variable is used in other scripts
CORE_TERRASERVICES_APPLY="networking container_cluster gke_enterprise/fleet_membership" \
# shellcheck disable=SC1091,SC2034,SC2154 # Variable is used in other scripts
CORE_TERRASERVICES_APPLY="${core_platform_terraservices[*]}" \
"${ACP_PLATFORM_CORE_DIR}/deploy.sh"

echo "Provisioning the use case resources"
Expand Down
32 changes: 29 additions & 3 deletions platforms/gke/base/use-cases/federated-learning/teardown.sh
Original file line number Diff line number Diff line change
Expand Up @@ -23,16 +23,42 @@ source "${ACP_PLATFORM_BASE_DIR}/use-cases/federated-learning/common.sh"

start_timestamp_federated_learning=$(date +%s)

# Iterate over the terraservices array so we destroy them in reverse order, keeping the
# initialize terraservice last.
echo "Destroy the use case terraservices"
# Iterate over the terraservices array so we destroy them in reverse order
# shellcheck disable=SC2154 # variable defined in common.sh
for ((i = ${#federated_learning_terraservices[@]} - 1; i >= 0; i--)); do
terraservice=${federated_learning_terraservices[i]}
destroy_terraservice "${terraservice}"
done

echo "Destroying the core platform"
"${ACP_PLATFORM_CORE_DIR}/teardown.sh"
CORE_TERRASERVICES_DESTROY=""
# shellcheck disable=SC2154 # variable defined in common.sh
for ((i = ${#core_platform_terraservices[@]} - 1; i >= 0; i--)); do
CORE_TERRASERVICES_DESTROY="${CORE_TERRASERVICES_DESTROY} ${core_platform_terraservices[i]}"
done
# Trim leading space
CORE_TERRASERVICES_DESTROY="${CORE_TERRASERVICES_DESTROY#"${CORE_TERRASERVICES_DESTROY%%[![:space:]]*}"}"
CORE_TERRASERVICES_DESTROY="${CORE_TERRASERVICES_DESTROY}" \
"${ACP_PLATFORM_CORE_DIR}/teardown.sh"

echo "Destroying the services that the core platform depends on"
# shellcheck disable=SC2154 # variable defined in common.sh
for ((i = ${#federated_learning_core_platform_terraservices[@]} - 1; i >= 0; i--)); do
terraservice=${federated_learning_core_platform_terraservices[i]}
destroy_terraservice "${terraservice}"
done

echo "Destroying the initialization core platform services"
CORE_TERRASERVICES_DESTROY=""
# shellcheck disable=SC2154 # variable defined in common.sh
for ((i = ${#core_platform_init_terraservices[@]} - 1; i >= 0; i--)); do
CORE_TERRASERVICES_DESTROY="${CORE_TERRASERVICES_DESTROY} ${core_platform_init_terraservices[i]}"
done
# Trim leading space
CORE_TERRASERVICES_DESTROY="${CORE_TERRASERVICES_DESTROY#"${CORE_TERRASERVICES_DESTROY%%[![:space:]]*}"}"
CORE_TERRASERVICES_DESTROY="${CORE_TERRASERVICES_DESTROY}" \
"${ACP_PLATFORM_CORE_DIR}/teardown.sh"

for configuration_variable in "${TERRAFORM_CLUSTER_CONFIGURATION[@]}"; do
remove_terraform_configuration_variable_from_file "${configuration_variable}" "${ACP_PLATFORM_SHARED_CONFIG_CLUSTER_AUTO_VARS_FILE}"
Expand Down
Loading

0 comments on commit 9588c0b

Please sign in to comment.