From 1e8a397558ef08bf55d792eaf2014c34b0345322 Mon Sep 17 00:00:00 2001 From: Isaac Yang Date: Thu, 11 Jan 2024 11:38:21 -0800 Subject: [PATCH] AWS server and client OK Azure template updated --- nvflare/dashboard/application/blob.py | 42 ++-- nvflare/lighter/impl/master_template.yml | 233 ++--------------------- nvflare/lighter/impl/static_file.py | 17 -- nvflare/lighter/tplt_utils.py | 12 ++ 4 files changed, 57 insertions(+), 247 deletions(-) diff --git a/nvflare/dashboard/application/blob.py b/nvflare/dashboard/application/blob.py index 389b5bb4c6..15caa0de3b 100644 --- a/nvflare/dashboard/application/blob.py +++ b/nvflare/dashboard/application/blob.py @@ -27,10 +27,6 @@ template = utils.load_yaml(os.path.join(lighter_folder, "impl", "master_template.yml")) -def get_csp_template(csp, participant, template): - return template[f"{csp}_start_{participant}_sh"] - - def get_csp_start_script_name(csp): return f"{csp}_start.sh" @@ -157,20 +153,37 @@ def gen_server(key, first_server=True): _write(os.path.join(dest_dir, "server.key"), cert_pair.ser_pri_key, "b", exe=False) _write(os.path.join(dest_dir, "rootCA.pem"), project.root_cert, "b", exe=False) if not project.ha_mode: + azure_start_svr_header_sh = tplt.get_azure_start_svr_header_sh() + azure_start_common_sh = tplt.get_azure_start_common_sh() + script = tplt.get_cloud_script_header() + azure_start_svr_header_sh + azure_start_common_sh _write( os.path.join(dest_dir, get_csp_start_script_name("azure")), utils.sh_replace( - tplt.get_cloud_script_header() + get_csp_template("azure", "svr", template), - {"server_name": entity.name, "ORG": ""}, + script, + { + "type": "server", + "docker_network": "--network host", + "uid_cln": "", + "server_name": entity.name, + "ORG": "", + }, ), "t", exe=True, ) + aws_start_svr_cln_sh = tplt.get_aws_start_sh() + script = tplt.get_cloud_script_header() + aws_start_svr_cln_sh _write( os.path.join(dest_dir, get_csp_start_script_name("aws")), utils.sh_replace( - tplt.get_cloud_script_header() + get_csp_template("aws", "svr", template), - {"server_name": entity.name, "ORG": ""}, + script, + { + "type": "server", + "inbound_rule": "aws ec2 authorize-security-group-ingress --group-id $sg_id --protocol tcp --port 8002-8003 --cidr 0.0.0.0/0 >> /tmp/sec_grp.log", + "cln_uid": "", + "server_name": entity.name, + "ORG": "", + }, ), "t", exe=True, @@ -282,20 +295,25 @@ def gen_client(key, id): _write(os.path.join(dest_dir, "client.crt"), cert_pair.ser_cert, "b", exe=False) _write(os.path.join(dest_dir, "client.key"), cert_pair.ser_pri_key, "b", exe=False) _write(os.path.join(dest_dir, "rootCA.pem"), project.root_cert, "b", exe=False) + azure_start_cln_header_sh = tplt.get_azure_start_cln_header_sh() + azure_start_common_sh = tplt.get_azure_start_common_sh() + script = tplt.get_cloud_script_header() + azure_start_cln_header_sh + azure_start_common_sh _write( os.path.join(dest_dir, get_csp_start_script_name("azure")), utils.sh_replace( - tplt.get_cloud_script_header() + get_csp_template("azure", "cln", template), - {"SITE": entity.name, "ORG": entity.org}, + script, + {"type": "client", "docker_network": "", "uid_cln": f"uid={entity.name}", "ORG": entity.org}, ), "t", exe=True, ) + aws_start_svr_cln_sh = tplt.get_aws_start_sh() + script = tplt.get_cloud_script_header() + aws_start_svr_cln_sh _write( os.path.join(dest_dir, get_csp_start_script_name("aws")), utils.sh_replace( - tplt.get_cloud_script_header() + get_csp_template("aws", "cln", template), - {"SITE": entity.name, "ORG": entity.org}, + script, + {"type": "client", "inbound_rule": "", "cln_uid": f"uid={entity.name}", "ORG": entity.org}, ), "t", exe=True, diff --git a/nvflare/lighter/impl/master_template.yml b/nvflare/lighter/impl/master_template.yml index 5d816858c6..0c84095162 100644 --- a/nvflare/lighter/impl/master_template.yml +++ b/nvflare/lighter/impl/master_template.yml @@ -910,7 +910,7 @@ cloud_script_header: | shift done -azure_start_svr_sh: | +azure_start_svr_header_sh: | RESOURCE_GROUP=nvflare_rg VM_NAME=nvflare_server VM_IMAGE=Canonical:0001-com-ubuntu-server-focal:20_04-lts-gen2:latest @@ -1087,74 +1087,7 @@ azure_start_svr_sh: | --destination-port-ranges $ADMIN_PORT report_status "$?" "creating network security group rule for Admin port" - az network nic update \ - --output none \ - --resource-group $RESOURCE_GROUP \ - --name $NIC_NAME \ - --network-security-group $NSG_NAME - report_status "$?" "updating network interface card" - - echo "Copying files to $VM_NAME" - DEST=$ADMIN_USERNAME@${IP_ADDRESS}:$DEST_FOLDER - echo "Destination folder is ${DEST}" - cd $DIR/.. && sshpass -p $PASSWORD scp -r -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null $PWD $DEST - report_status "$?" "copying startup kits to VM" - - if [ $container = true ] - then - echo "Installing and lauching container in $VM_NAME, may take a few minutes." - scripts=$(cat << 'EOF' - sudo apt-get update && \ - sudo DEBIAN_FRONTEND=noninteractive apt-get install -y ca-certificates curl gnupg lsb-release && \ - sudo mkdir -p /etc/apt/keyrings && \ - curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo gpg --dearmor -o /etc/apt/keyrings/docker.gpg && \ - echo \ - "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu \ - $(lsb_release -cs) stable" | sudo tee /etc/apt/sources.list.d/docker.list > /dev/null && \ - sudo apt-get update && \ - sudo DEBIAN_FRONTEND=noninteractive apt-get install -y docker-ce docker-ce-cli containerd.io - EOF - ) - az vm run-command invoke \ - --output json \ - --resource-group $RESOURCE_GROUP \ - --command-id RunShellScript \ - --name $VM_NAME \ - --scripts \ - "$scripts" > /tmp/docker_engine.json - az vm run-command invoke \ - --output json \ - --resource-group $RESOURCE_GROUP \ - --command-id RunShellScript \ - --name $VM_NAME \ - --scripts \ - "sudo usermod -aG docker $ADMIN_USERNAME" >> /tmp/docker_engine.json - report_status "$?" "installing docker engine" - az vm run-command invoke \ - --output json \ - --resource-group $RESOURCE_GROUP \ - --command-id RunShellScript \ - --name $VM_NAME \ - --scripts \ - "docker run -d -v ${DEST_FOLDER}:${DEST_FOLDER} --network host ${DOCKER_OPTION} ${image_name} /bin/bash -c \"python -u -m nvflare.private.fed.app.server.server_train -m ${DEST_FOLDER} -s fed_server.json --set secure_train=true config_folder=config org={~~ORG~~} \" " > /tmp/vm_create.json 2>&1 - report_status "$?" "launching container" - else - echo "Installing packages in $VM_NAME, may take a few minutes." - az vm run-command invoke \ - --output json \ - --resource-group $RESOURCE_GROUP \ - --command-id RunShellScript \ - --name $VM_NAME \ - --scripts \ - "echo ${DEST_FOLDER} && wget -q https://bootstrap.pypa.io/get-pip.py && python3 get-pip.py && python3 -m pip install --ignore-installed nvflare && touch ${DEST_FOLDER}/startup/requirements.txt && python3 -m pip install -r ${DEST_FOLDER}/startup/requirements.txt && ${DEST_FOLDER}/startup/start.sh && sleep 20 && cat ${DEST_FOLDER}/log.txt" > /tmp/vm_create.json - report_status "$?" "installing packages" - fi - echo "System was provisioned" - echo "To delete the resource group (also delete the VM), run the following command" - echo "az group delete -n ${RESOURCE_GROUP}" - echo "To login to the VM with SSH, use ${ADMIN_USERNAME} : ${PASSWORD}" > vm_credential.txt - -azure_start_cln_sh: | +azure_start_cln_header_sh: | RESOURCE_GROUP=nvflare_client_rg_${RANDOM}_${RANDOM} VM_NAME=nvflare_client VM_IMAGE=Canonical:0001-com-ubuntu-server-focal:20_04-lts-gen2:latest @@ -1263,6 +1196,7 @@ azure_start_cln_sh: | --destination-port-ranges 22 report_status "$?" "creating network security group rule for SSH" +azure_start_common_sh: | az network nic update \ --output none \ --resource-group $RESOURCE_GROUP \ @@ -1271,7 +1205,7 @@ azure_start_cln_sh: | report_status "$?" "updating network interface card" echo "Copying files to $VM_NAME" - DEST=$ADMIN_USERNAME@$IP_ADDRESS:$DEST_FOLDER + DEST=$ADMIN_USERNAME@${IP_ADDRESS}:$DEST_FOLDER echo "Destination folder is ${DEST}" cd $DIR/.. && sshpass -p $PASSWORD scp -r -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null $PWD $DEST report_status "$?" "copying startup kits to VM" @@ -1279,7 +1213,7 @@ azure_start_cln_sh: | if [ $container = true ] then echo "Installing and lauching container in $VM_NAME, may take a few minutes." - scripts=$(cat <<- 'EOF' + scripts=$(cat << 'EOF' sudo apt-get update && \ sudo DEBIAN_FRONTEND=noninteractive apt-get install -y ca-certificates curl gnupg lsb-release && \ sudo mkdir -p /etc/apt/keyrings && \ @@ -1306,13 +1240,14 @@ azure_start_cln_sh: | --name $VM_NAME \ --scripts \ "sudo usermod -aG docker $ADMIN_USERNAME" >> /tmp/docker_engine.json + report_status "$?" "Setting user group" az vm run-command invoke \ --output json \ --resource-group $RESOURCE_GROUP \ --command-id RunShellScript \ --name $VM_NAME \ --scripts \ - "docker run -d -v ${DEST_FOLDER}:${DEST_FOLDER} ${image_name} /bin/bash -c \"python -u -m nvflare.private.fed.app.client.client_train -m ${DEST_FOLDER} -s fed_client.json --set uid={~~SITE~~} secure_train=true config_folder=config org={~~ORG~~} \" " > /tmp/vm_create.json 2>&1 + "docker run -d -v ${DEST_FOLDER}:${DEST_FOLDER} {~~docker_network~~} ${image_name} /bin/bash -c \"python -u -m nvflare.private.fed.app.{~~type~~}.{~~type~~}_train -m ${DEST_FOLDER} -s fed_{~~type~~}.json --set {~~uid_cln~~} secure_train=true config_folder=config org={~~ORG~~} \" " > /tmp/vm_create.json 2>&1 report_status "$?" "launching container" else echo "Installing packages in $VM_NAME, may take a few minutes." @@ -1611,11 +1546,11 @@ adm_notebook: | "nbformat_minor": 5 } -aws_start_svr_sh: | - VM_NAME=nvflare_server - SECURITY_GROUP=nvflare_server_sg +aws_start_sh: | + VM_NAME=nvflare_{~~type~~} + SECURITY_GROUP=nvflare_{~~type~~}_sg_$RANDOM DEST_FOLDER=/var/tmp/cloud - KEY_PAIR=NVFlareServerKeyPair + KEY_PAIR=NVFlare{~~type~~}KeyPair KEY_FILE=${KEY_PAIR}.pem echo "This script requires aws (AWS CLI), sshpass, dig and jq. Now checking if they are installed." @@ -1670,144 +1605,7 @@ aws_start_svr_sh: | if [ $container = false ] then - echo "If the server requires additional dependencies, please copy the requirements.txt to ${DIR}." - prompt ans "Press ENTER when it's done or no additional dependencies. " - fi - - cd $DIR/.. - # Generate key pair - - echo "Generating key pair for VM" - - aws ec2 delete-key-pair --key-name $KEY_PAIR > /dev/null 2>&1 - rm -rf $KEY_FILE - aws ec2 create-key-pair --key-name $KEY_PAIR --query 'KeyMaterial' --output text > $KEY_FILE - report_status "$?" "creating key pair" - chmod 400 $KEY_FILE - - # Generate Security Group - - sg_result=$(aws ec2 create-security-group --group-name $SECURITY_GROUP --description "NVFlare security group") - report_status "$?" "Only one NVFL server VM and its security group is allowed. $SECURITY_GROUP exists and thus creating duplicate security group" - sg_id=$(echo $sg_result | jq -r .GroupId) - my_public_ip=$(dig +short myip.opendns.com @resolver1.opendns.com) - if [ "$?" -eq 0 ] && [[ "$my_public_ip" =~ ^(([1-9]?[0-9]|1[0-9][0-9]|2([0-4][0-9]|5[0-5]))\.){3}([1-9]?[0-9]|1[0-9][0-9]|2([0-4][0-9]|5[0-5]))$ ]] - then - aws ec2 authorize-security-group-ingress --group-id $sg_id --protocol tcp --port 22 --cidr ${my_public_ip}/32 > /tmp/sec_grp.log - else - echo "getting my public IP failed, please manually configure the inbound rule to limit SSH access" - aws ec2 authorize-security-group-ingress --group-id $sg_id --protocol tcp --port 22 --cidr 0.0.0.0/0 > /tmp/sec_grp.log - fi - aws ec2 authorize-security-group-ingress --group-id $sg_id --protocol tcp --port 8002-8003 --cidr 0.0.0.0/0 >> /tmp/sec_grp.log - report_status "$?" "creating security group rules" - - # Start provisioning - - echo "Creating VM at region $REGION, may take a few minutes." - - aws ec2 run-instances --region $REGION --image-id $AMI_IMAGE --count 1 --instance-type $EC2_TYPE --key-name $KEY_PAIR --security-group-ids $sg_id > vm_create.json - report_status "$?" "creating VM" - instance_id=$(jq -r .Instances[0].InstanceId vm_create.json) - - aws ec2 wait instance-status-ok --instance-ids $instance_id - aws ec2 describe-instances --instance-ids $instance_id > vm_result.json - - IP_ADDRESS=$(jq -r .Reservations[0].Instances[0].PublicIpAddress vm_result.json) - - echo "VM created with IP address: ${IP_ADDRESS}" - - echo "Copying files to $VM_NAME" - DEST_SITE=ubuntu@${IP_ADDRESS} - DEST=${DEST_SITE}:${DEST_FOLDER} - echo "Destination folder is ${DEST}" - scp -q -i $KEY_FILE -r -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null $PWD $DEST - report_status "$?" "copying startup kits to VM" - - if [ $container = true ] - then - echo "Launching container with docker option ${DOCKER_OPTION}." - ssh -f -i $KEY_FILE -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${DEST_SITE} \ - "docker run -d -v ${DEST_FOLDER}:${DEST_FOLDER} --network host ${DOCKER_OPTION} ${image_name} \ - /bin/bash -c \"python -u -m nvflare.private.fed.app.server.server_train -m ${DEST_FOLDER} \ - -s fed_server.json --set secure_train=true config_folder=config org={~~ORG~~} \" " > /tmp/nvflare.log 2>&1 - report_status "$?" "launching container" - else - ssh -f -i $KEY_FILE -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${DEST_SITE} \ - "pwd && wget -q https://bootstrap.pypa.io/get-pip.py && \ - python3 get-pip.py && python3 -m pip install nvflare && \ - touch ${DEST_FOLDER}/startup/requirements.txt && \ - python3 -m pip install -r ${DEST_FOLDER}/startup/requirements.txt && \ - nohup ${DEST_FOLDER}/startup/start.sh && sleep 20 && \ - exit" > /tmp/nvflare.log 2>&1 - report_status "$?" "installing packages" - fi - - echo "System was provisioned" - echo "To terminate the EC2 instance, run the following command." - echo "aws ec2 terminate-instances --instance-ids ${instance_id}" - echo "Other resources provisioned" - echo "security group: ${SECURITY_GROUP}" - echo "key pair: ${KEY_PAIR}" - -aws_start_cln_sh: | - VM_NAME=nvflare_client - SECURITY_GROUP=nvflare_client_sg_$RANDOM - DEST_FOLDER=/var/tmp/cloud - KEY_PAIR=NVFlareClientKeyPair - KEY_FILE=${KEY_PAIR}.pem - - echo "This script requires aws (AWS CLI), sshpass, dig and jq. Now checking if they are installed." - - check_binary aws "Please see https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html on how to install it on your system." - check_binary sshpass "Please install it first." - check_binary dig "Please install it first." - check_binary jq "Please install it first." - - if [ -z ${image_name+x} ] - then - container=false - else - container=true - fi - - if [ $container = true ] - then - AMI_IMAGE=ami-06b8d5099f3a8d79d - EC2_TYPE=t2.xlarge - REGION=us-west-2 - else - AMI_IMAGE=ami-04bad3c587fe60d89 - EC2_TYPE=t2.small - REGION=us-west-2 - fi - - if [ -z ${config_file+x} ] - then - useDefault=true - else - useDefault=false - . $config_file - report_status "$?" "Loading config file" - fi - - if [ $useDefault = true ] - then - while true - do - prompt AMI_IMAGE "Cloud AMI image, press ENTER to accept default ${AMI_IMAGE}: " - prompt EC2_TYPE "Cloud EC2 type, press ENTER to accept default ${EC2_TYPE}: " - prompt REGIION "Cloud EC2 region, press ENTER to accept default ${REGION}: " - prompt ans "region = ${REGION}, ami image = ${AMI_IMAGE}, EC2 type = ${EC2_TYPE}, OK? (Y/n) " - if [[ $ans = "" ]] || [[ $ans =~ ^(y|Y)$ ]] - then - break - fi - done - fi - - if [ $container = false ] - then - echo "If the client requires additional dependencies, please copy the requirements.txt to ${DIR}." + echo "If the {~~type~~} requires additional dependencies, please copy the requirements.txt to ${DIR}." prompt ans "Press ENTER when it's done or no additional dependencies. " fi @@ -1834,6 +1632,7 @@ aws_start_cln_sh: | echo "getting my public IP failed, please manually configure the inbound rule to limit SSH access" aws ec2 authorize-security-group-ingress --group-id $sg_id --protocol tcp --port 22 --cidr 0.0.0.0/0 > /tmp/sec_grp.log fi + {~~inbound_rule~~} report_status "$?" "creating security group rules" # Start provisioning @@ -1863,8 +1662,8 @@ aws_start_cln_sh: | echo "Launching container with docker option ${DOCKER_OPTION}." ssh -f -i $KEY_FILE -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${DEST_SITE} \ "docker run -d -v ${DEST_FOLDER}:${DEST_FOLDER} --network host ${DOCKER_OPTION} ${image_name} \ - /bin/bash -c \"python -u -m nvflare.private.fed.app.client.client_train -m ${DEST_FOLDER} \ - -s fed_client.json --set uid={~~SITE~~} secure_train=true config_folder=config org={~~ORG~~} \" " > /tmp/nvflare.log 2>&1 + /bin/bash -c \"python -u -m nvflare.private.fed.app.{~~type~~}.{~~type~~}_train -m ${DEST_FOLDER} \ + -s fed_{~~type~~}.json --set {~~cln_uid~~} secure_train=true config_folder=config org={~~ORG~~} \" " > /tmp/nvflare.log 2>&1 report_status "$?" "launching container" else echo "Installing packages in $VM_NAME, may take a few minutes." @@ -1875,7 +1674,6 @@ aws_start_cln_sh: | python3 -m pip install -r ${DEST_FOLDER}/startup/requirements.txt && \ nohup ${DEST_FOLDER}/startup/start.sh && sleep 20 && \ exit" > /tmp/nvflare.log 2>&1 - report_status "$?" "installing packages" fi @@ -1886,7 +1684,6 @@ aws_start_cln_sh: | echo "security group: ${SECURITY_GROUP}" echo "key pair: ${KEY_PAIR}" - aws_start_dsb_sh: | VM_NAME=nvflare_dashboard AMI_IMAGE=ami-04bad3c587fe60d89 diff --git a/nvflare/lighter/impl/static_file.py b/nvflare/lighter/impl/static_file.py index 21ef4c8f04..ddee5c596b 100644 --- a/nvflare/lighter/impl/static_file.py +++ b/nvflare/lighter/impl/static_file.py @@ -140,11 +140,6 @@ def _build_server(self, server, ctx): server_0["service"]["scheme"] = self.scheme server_0["admin_host"] = self.get_server_name(server) server_0["admin_port"] = admin_port - # if self.download_job_url: - # server_0["download_job_url"] = self.download_job_url - # config["enable_byoc"] = server.enable_byoc - # if self.app_validator: - # config["app_validator"] = {"path": self.app_validator} if self.overseer_agent: overseer_agent = copy.deepcopy(self.overseer_agent) if overseer_agent.get("overseer_exists", True): @@ -158,18 +153,6 @@ def _build_server(self, server, ctx): } overseer_agent.pop("overseer_exists", None) config["overseer_agent"] = overseer_agent - # if self.snapshot_persistor: - # config["snapshot_persistor"] = self.snapshot_persistor - # components = server.props.get("components", []) - # config["components"] = list() - # for comp in components: - # temp_dict = {"id": comp} - # temp_dict.update(components[comp]) - # config["components"].append(temp_dict) - # provisioned_client_list = list() - # for client in self.project.get_participants_by_type("client", first_only=False): - # provisioned_client_list.append(client.name) - # config["provisioned_client_list"] = provisioned_client_list self._write(os.path.join(dest_dir, "fed_server.json"), json.dumps(config, indent=2), "t") replacement_dict = { "admin_port": admin_port, diff --git a/nvflare/lighter/tplt_utils.py b/nvflare/lighter/tplt_utils.py index e0ebf5aec9..c64222dd39 100644 --- a/nvflare/lighter/tplt_utils.py +++ b/nvflare/lighter/tplt_utils.py @@ -19,3 +19,15 @@ def __init__(self, template): def get_cloud_script_header(self): return self.template.get("cloud_script_header") + + def get_aws_start_sh(self): + return self.template.get("aws_start_sh") + + def get_azure_start_svr_header_sh(self): + return self.template.get("azure_start_svr_header_sh") + + def get_azure_start_cln_header_sh(self): + return self.template.get("azure_start_cln_header_sh") + + def get_azure_start_common_sh(self): + return self.template.get("azure_start_common_sh")