diff --git a/nvflare/dashboard/application/blob.py b/nvflare/dashboard/application/blob.py index 389b5bb4c6..174e2135bb 100644 --- a/nvflare/dashboard/application/blob.py +++ b/nvflare/dashboard/application/blob.py @@ -25,24 +25,17 @@ lighter_folder = os.path.dirname(utils.__file__) template = utils.load_yaml(os.path.join(lighter_folder, "impl", "master_template.yml")) - - -def get_csp_template(csp, participant, template): - return template[f"{csp}_start_{participant}_sh"] +supported_csps = ["aws", "azure"] +for csp in supported_csps: + csp_template_file = os.path.join(lighter_folder, "impl", f"{csp}_template.yml") + if os.path.exists(csp_template_file): + template.update(utils.load_yaml(csp_template_file)) def get_csp_start_script_name(csp): return f"{csp}_start.sh" -def _write(file_full_path, content, mode, exe=False): - mode = mode + "w" - with open(file_full_path, mode) as f: - f.write(content) - if exe: - os.chmod(file_full_path, 0o755) - - def gen_overseer(key): project = Project.query.first() entity = Entity(project.overseer) @@ -54,21 +47,19 @@ def gen_overseer(key): dest_dir = os.path.join(overseer_dir, "startup") os.mkdir(overseer_dir) os.mkdir(dest_dir) - _write( + utils._write( os.path.join(dest_dir, "start.sh"), template["start_ovsr_sh"], "t", exe=True, ) - _write( + utils._write( os.path.join(dest_dir, "gunicorn.conf.py"), utils.sh_replace(template["gunicorn_conf_py"], {"port": "8443"}), "t", exe=False, ) - _write(os.path.join(dest_dir, "overseer.crt"), cert_pair.ser_cert, "b", exe=False) - _write(os.path.join(dest_dir, "overseer.key"), cert_pair.ser_pri_key, "b", exe=False) - _write(os.path.join(dest_dir, "rootCA.pem"), project.root_cert, "b", exe=False) + utils._write_pki(type="overseer", dest_dir=dest_dir, cert_pair=cert_pair, root_cert=project.root_cert) run_args = ["zip", "-rq", "-P", key, "tmp.zip", "."] subprocess.run(run_args, cwd=tmp_dir) fileobj = io.BytesIO() @@ -121,6 +112,8 @@ def gen_server(key, first_server=True): "ha_mode": "true" if 
project.ha_mode else "false", "docker_image": project.app_location.split(" ")[-1] if project.app_location else "nvflare/nvflare", "org_name": "", + "type": "server", + "cln_uid": "", } tplt = tplt_utils.Template(template) with tempfile.TemporaryDirectory() as tmp_dir: @@ -128,82 +121,33 @@ def gen_server(key, first_server=True): dest_dir = os.path.join(server_dir, "startup") os.mkdir(server_dir) os.mkdir(dest_dir) - _write(os.path.join(dest_dir, "fed_server.json"), json.dumps(config, indent=2), "t") - _write( - os.path.join(dest_dir, "docker.sh"), - utils.sh_replace(template["docker_svr_sh"], replacement_dict), - "t", - exe=True, - ) - _write( - os.path.join(dest_dir, "start.sh"), - utils.sh_replace(template["start_svr_sh"], replacement_dict), - "t", - exe=True, - ) - _write( - os.path.join(dest_dir, "sub_start.sh"), - utils.sh_replace(template["sub_start_svr_sh"], replacement_dict), - "t", - exe=True, - ) - _write( - os.path.join(dest_dir, "stop_fl.sh"), - template["stop_fl_sh"], - "t", - exe=True, + utils._write_common( + type="server", + dest_dir=dest_dir, + template=template, + tplt=tplt, + replacement_dict=replacement_dict, + config=config, ) - _write(os.path.join(dest_dir, "server.crt"), cert_pair.ser_cert, "b", exe=False) - _write(os.path.join(dest_dir, "server.key"), cert_pair.ser_pri_key, "b", exe=False) - _write(os.path.join(dest_dir, "rootCA.pem"), project.root_cert, "b", exe=False) + utils._write_pki(type="server", dest_dir=dest_dir, cert_pair=cert_pair, root_cert=project.root_cert) if not project.ha_mode: - _write( - os.path.join(dest_dir, get_csp_start_script_name("azure")), - utils.sh_replace( - tplt.get_cloud_script_header() + get_csp_template("azure", "svr", template), - {"server_name": entity.name, "ORG": ""}, - ), - "t", - exe=True, - ) - _write( - os.path.join(dest_dir, get_csp_start_script_name("aws")), - utils.sh_replace( - tplt.get_cloud_script_header() + get_csp_template("aws", "svr", template), - {"server_name": entity.name, "ORG": ""}, - 
), - "t", - exe=True, - ) + for csp in supported_csps: + utils._write( + os.path.join(dest_dir, get_csp_start_script_name(csp)), + tplt.get_start_sh(csp=csp, type="server", entity=entity), + "t", + exe=True, + ) signatures = utils.sign_all(dest_dir, deserialize_ca_key(project.root_key)) json.dump(signatures, open(os.path.join(dest_dir, "signature.json"), "wt")) # local folder creation dest_dir = os.path.join(server_dir, "local") os.mkdir(dest_dir) - _write( - os.path.join(dest_dir, "log.config.default"), - template["log_config"], - "t", - ) - _write( - os.path.join(dest_dir, "resources.json.default"), - template["local_server_resources"], - "t", - ) - _write( - os.path.join(dest_dir, "privacy.json.sample"), - template["sample_privacy"], - "t", - ) - _write( - os.path.join(dest_dir, "authorization.json.default"), - template["default_authz"], - "t", - ) + utils._write_local(type="server", dest_dir=dest_dir, template=template) # workspace folder file - _write( + utils._write( os.path.join(server_dir, "readme.txt"), template["readme_fs"], "t", @@ -233,6 +177,8 @@ def gen_client(key, id): "config_folder": "config", "docker_image": project.app_location.split(" ")[-1] if project.app_location else "nvflare/nvflare", "org_name": entity.org, + "type": "client", + "cln_uid": f"uid={entity.name}", } if project.ha_mode: overseer_agent = {"path": "nvflare.ha.overseer_agent.HttpOverseerAgent"} @@ -254,85 +200,34 @@ def gen_client(key, id): os.mkdir(client_dir) os.mkdir(dest_dir) - _write(os.path.join(dest_dir, "fed_client.json"), json.dumps(config, indent=2), "t") - _write( - os.path.join(dest_dir, "docker.sh"), - utils.sh_replace(template["docker_cln_sh"], replacement_dict), - "t", - exe=True, - ) - _write( - os.path.join(dest_dir, "start.sh"), - template["start_cln_sh"], - "t", - exe=True, - ) - _write( - os.path.join(dest_dir, "sub_start.sh"), - utils.sh_replace(template["sub_start_cln_sh"], replacement_dict), - "t", - exe=True, - ) - _write( - os.path.join(dest_dir, 
"stop_fl.sh"), - template["stop_fl_sh"], - "t", - exe=True, - ) - _write(os.path.join(dest_dir, "client.crt"), cert_pair.ser_cert, "b", exe=False) - _write(os.path.join(dest_dir, "client.key"), cert_pair.ser_pri_key, "b", exe=False) - _write(os.path.join(dest_dir, "rootCA.pem"), project.root_cert, "b", exe=False) - _write( - os.path.join(dest_dir, get_csp_start_script_name("azure")), - utils.sh_replace( - tplt.get_cloud_script_header() + get_csp_template("azure", "cln", template), - {"SITE": entity.name, "ORG": entity.org}, - ), - "t", - exe=True, - ) - _write( - os.path.join(dest_dir, get_csp_start_script_name("aws")), - utils.sh_replace( - tplt.get_cloud_script_header() + get_csp_template("aws", "cln", template), - {"SITE": entity.name, "ORG": entity.org}, - ), - "t", - exe=True, + utils._write_pki(type="client", dest_dir=dest_dir, cert_pair=cert_pair, root_cert=project.root_cert) + utils._write_common( + type="client", + dest_dir=dest_dir, + template=template, + tplt=tplt, + replacement_dict=replacement_dict, + config=config, ) + + for csp in supported_csps: + utils._write( + os.path.join(dest_dir, get_csp_start_script_name(csp)), + tplt.get_start_sh(csp=csp, type="client", entity=entity), + "t", + exe=True, + ) + signatures = utils.sign_all(dest_dir, deserialize_ca_key(project.root_key)) json.dump(signatures, open(os.path.join(dest_dir, "signature.json"), "wt")) # local folder creation dest_dir = os.path.join(client_dir, "local") os.mkdir(dest_dir) - _write( - os.path.join(dest_dir, "log.config.default"), - template["log_config"], - "t", - ) - resources = json.loads(template["local_client_resources"]) - for component in resources["components"]: - if "nvflare.app_common.resource_managers.gpu_resource_manager.GPUResourceManager" == component["path"]: - component["args"] = json.loads(client.capacity.capacity) - break - _write( - os.path.join(dest_dir, "resources.json.default"), - json.dumps(resources, indent=2), - "t", - ) - _write( - os.path.join(dest_dir, 
"privacy.json.sample"), - template["sample_privacy"], - "t", - ) - _write( - os.path.join(dest_dir, "authorization.json.default"), - template["default_authz"], - "t", - ) + utils._write_local(type="client", dest_dir=dest_dir, template=template, capacity=client.capacity.capacity) + # workspace folder file - _write( + utils._write( os.path.join(client_dir, "readme.txt"), template["readme_fc"], "t", @@ -378,16 +273,14 @@ def gen_user(key, id): os.mkdir(user_dir) os.mkdir(dest_dir) - _write(os.path.join(dest_dir, "fed_admin.json"), json.dumps(config, indent=2), "t") - _write( + utils._write(os.path.join(dest_dir, "fed_admin.json"), json.dumps(config, indent=2), "t") + utils._write( os.path.join(dest_dir, "fl_admin.sh"), utils.sh_replace(template["fl_admin_sh"], replacement_dict), "t", exe=True, ) - _write(os.path.join(dest_dir, "client.crt"), cert_pair.ser_cert, "b", exe=False) - _write(os.path.join(dest_dir, "client.key"), cert_pair.ser_pri_key, "b", exe=False) - _write(os.path.join(dest_dir, "rootCA.pem"), project.root_cert, "b", exe=False) + utils._write_pki(type="client", dest_dir=dest_dir, cert_pair=cert_pair, root_cert=project.root_cert) signatures = utils.sign_all(dest_dir, deserialize_ca_key(project.root_key)) json.dump(signatures, open(os.path.join(dest_dir, "signature.json"), "wt")) @@ -396,12 +289,12 @@ def gen_user(key, id): os.mkdir(dest_dir) # workspace folder file - _write( + utils._write( os.path.join(user_dir, "readme.txt"), template["readme_am"], "t", ) - _write( + utils._write( os.path.join(user_dir, "system_info.ipynb"), utils.sh_replace(template["adm_notebook"], replacement_dict), "t", diff --git a/nvflare/dashboard/cli.py b/nvflare/dashboard/cli.py index a31409b545..58e45dbe10 100644 --- a/nvflare/dashboard/cli.py +++ b/nvflare/dashboard/cli.py @@ -21,7 +21,6 @@ import docker import nvflare from nvflare.apis.utils.format_check import name_check -from nvflare.dashboard.application.blob import _write from nvflare.lighter import tplt_utils, utils 
supported_csp = ("azure", "aws") @@ -146,7 +145,7 @@ def cloud(args): dsb_start = template[f"{csp}_start_dsb_sh"] version = nvflare.__version__ replacement_dict = {"NVFLARE": f"nvflare=={version}", "START_OPT": f"-i {args.image}" if args.image else ""} - _write( + utils._write( dest, utils.sh_replace(tplt.get_cloud_script_header() + dsb_start, replacement_dict), "t", diff --git a/nvflare/lighter/dummy_project.yml b/nvflare/lighter/dummy_project.yml index 51d8cc6379..fb5a759b95 100644 --- a/nvflare/lighter/dummy_project.yml +++ b/nvflare/lighter/dummy_project.yml @@ -28,7 +28,10 @@ participants: builders: - path: nvflare.lighter.impl.workspace.WorkspaceBuilder args: - template_file: master_template.yml + template_file: + - master_template.yml + - aws_template.yml + - azure_template.yml - path: nvflare.lighter.impl.template.TemplateBuilder - path: nvflare.lighter.impl.static_file.StaticFileBuilder args: diff --git a/nvflare/lighter/ha_project.yml b/nvflare/lighter/ha_project.yml index 2a5fecad28..7216dcc762 100644 --- a/nvflare/lighter/ha_project.yml +++ b/nvflare/lighter/ha_project.yml @@ -40,7 +40,10 @@ participants: builders: - path: nvflare.lighter.impl.workspace.WorkspaceBuilder args: - template_file: master_template.yml + template_file: + - master_template.yml + - aws_template.yml + - azure_template.yml - path: nvflare.lighter.impl.template.TemplateBuilder - path: nvflare.lighter.impl.docker.DockerBuilder args: diff --git a/nvflare/lighter/impl/aws_template.yml b/nvflare/lighter/impl/aws_template.yml new file mode 100644 index 0000000000..8ba14d6f2d --- /dev/null +++ b/nvflare/lighter/impl/aws_template.yml @@ -0,0 +1,261 @@ +aws_start_sh: | + VM_NAME=nvflare_{~~type~~} + SECURITY_GROUP=nvflare_{~~type~~}_sg_$RANDOM + DEST_FOLDER=/var/tmp/cloud + KEY_PAIR=NVFlare{~~type~~}KeyPair + KEY_FILE=${KEY_PAIR}.pem + + echo "This script requires aws (AWS CLI), sshpass, dig and jq. Now checking if they are installed." 
+ + check_binary aws "Please see https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html on how to install it on your system." + check_binary sshpass "Please install it first." + check_binary dig "Please install it first." + check_binary jq "Please install it first." + + if [ -z ${image_name+x} ] + then + container=false + else + container=true + fi + + if [ $container = true ] + then + AMI_IMAGE=ami-06b8d5099f3a8d79d + EC2_TYPE=t2.xlarge + REGION=us-west-2 + else + AMI_IMAGE=ami-04bad3c587fe60d89 + EC2_TYPE=t2.small + REGION=us-west-2 + fi + + if [ -z ${config_file+x} ] + then + useDefault=true + else + useDefault=false + . $config_file + report_status "$?" "Loading config file" + fi + + + if [ $useDefault = true ] + then + while true + do + prompt AMI_IMAGE "Cloud AMI image, press ENTER to accept default ${AMI_IMAGE}: " + prompt EC2_TYPE "Cloud EC2 type, press ENTER to accept default ${EC2_TYPE}: " + prompt REGION "Cloud EC2 region, press ENTER to accept default ${REGION}: " + prompt ans "region = ${REGION}, ami image = ${AMI_IMAGE}, EC2 type = ${EC2_TYPE}, OK? (Y/n) " + if [[ $ans = "" ]] || [[ $ans =~ ^(y|Y)$ ]] + then + break + fi + done + fi + + if [ $container = false ] + then + echo "If the {~~type~~} requires additional dependencies, please copy the requirements.txt to ${DIR}." + prompt ans "Press ENTER when it's done or no additional dependencies. " + fi + + cd $DIR/.. + # Generate key pair + + echo "Generating key pair for VM" + + aws ec2 delete-key-pair --key-name $KEY_PAIR > /dev/null 2>&1 + rm -rf $KEY_FILE + aws ec2 create-key-pair --key-name $KEY_PAIR --query 'KeyMaterial' --output text > $KEY_FILE + report_status "$?" "creating key pair" + chmod 400 $KEY_FILE + + # Generate Security Group + # Try not reusing existing security group because we have to modify it for our own need. + sg_id=$(aws ec2 create-security-group --group-name $SECURITY_GROUP --description "NVFlare security group" | jq -r .GroupId) + report_status "$?"
"creating security group" + my_public_ip=$(dig +short myip.opendns.com @resolver1.opendns.com) + if [ "$?" -eq 0 ] && [[ "$my_public_ip" =~ ^(([1-9]?[0-9]|1[0-9][0-9]|2([0-4][0-9]|5[0-5]))\.){3}([1-9]?[0-9]|1[0-9][0-9]|2([0-4][0-9]|5[0-5]))$ ]] + then + aws ec2 authorize-security-group-ingress --group-id $sg_id --protocol tcp --port 22 --cidr ${my_public_ip}/32 > /tmp/sec_grp.log + else + echo "getting my public IP failed, please manually configure the inbound rule to limit SSH access" + aws ec2 authorize-security-group-ingress --group-id $sg_id --protocol tcp --port 22 --cidr 0.0.0.0/0 > /tmp/sec_grp.log + fi + {~~inbound_rule~~} + report_status "$?" "creating security group rules" + + # Start provisioning + + echo "Creating VM at region $REGION, may take a few minutes." + + aws ec2 run-instances --region $REGION --image-id $AMI_IMAGE --count 1 --instance-type $EC2_TYPE --key-name $KEY_PAIR --security-group-ids $sg_id > vm_create.json + report_status "$?" "creating VM" + instance_id=$(jq -r .Instances[0].InstanceId vm_create.json) + + aws ec2 wait instance-status-ok --instance-ids $instance_id + aws ec2 describe-instances --instance-ids $instance_id > vm_result.json + + IP_ADDRESS=$(jq -r .Reservations[0].Instances[0].PublicIpAddress vm_result.json) + + echo "VM created with IP address: ${IP_ADDRESS}" + + echo "Copying files to $VM_NAME" + DEST_SITE=ubuntu@${IP_ADDRESS} + DEST=${DEST_SITE}:${DEST_FOLDER} + echo "Destination folder is ${DEST}" + scp -q -i $KEY_FILE -r -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null $PWD $DEST + report_status "$?" "copying startup kits to VM" + + if [ $container = true ] + then + echo "Launching container with docker option ${DOCKER_OPTION}." 
+ ssh -f -i $KEY_FILE -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${DEST_SITE} \ + "docker run -d -v ${DEST_FOLDER}:${DEST_FOLDER} --network host ${DOCKER_OPTION} ${image_name} \ + /bin/bash -c \"python -u -m nvflare.private.fed.app.{~~type~~}.{~~type~~}_train -m ${DEST_FOLDER} \ + -s fed_{~~type~~}.json --set {~~cln_uid~~} secure_train=true config_folder=config org={~~ORG~~} \" " > /tmp/nvflare.log 2>&1 + report_status "$?" "launching container" + else + echo "Installing packages in $VM_NAME, may take a few minutes." + ssh -f -i $KEY_FILE -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${DEST_SITE} \ + "pwd && wget -q https://bootstrap.pypa.io/get-pip.py && \ + python3 get-pip.py && python3 -m pip install nvflare && \ + touch ${DEST_FOLDER}/startup/requirements.txt && \ + python3 -m pip install -r ${DEST_FOLDER}/startup/requirements.txt && \ + nohup ${DEST_FOLDER}/startup/start.sh && sleep 20 && \ + exit" > /tmp/nvflare.log 2>&1 + report_status "$?" "installing packages" + fi + + echo "System was provisioned" + echo "To terminate the EC2 instance, run the following command." + echo "aws ec2 terminate-instances --instance-ids ${instance_id}" + echo "Other resources provisioned" + echo "security group: ${SECURITY_GROUP}" + echo "key pair: ${KEY_PAIR}" + +aws_start_dsb_sh: | + VM_NAME=nvflare_dashboard + AMI_IMAGE=ami-04bad3c587fe60d89 + EC2_TYPE=t2.small + SECURITY_GROUP=nvflare_dashboard_sg_$RANDOM + REGION=us-west-2 + ADMIN_USERNAME=ubuntu + DEST_FOLDER=/home/${ADMIN_USERNAME} + KEY_PAIR=NVFlareDashboardKeyPair + KEY_FILE=${KEY_PAIR}.pem + + echo "This script requires aws (AWS CLI), sshpass, dig and jq. Now checking if they are installed." + + check_binary aws "Please see https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html on how to install it on your system." + check_binary sshpass "Please install it first." + check_binary dig "Please install it first." + check_binary jq "Please install it first." 
+ + echo "One initial user will be created when starting dashboard." + echo "Please enter the email address for this user." + read email + credential="${email}:$RANDOM" + + # Generate key pair + + echo "Generating key pair for VM" + + aws ec2 delete-key-pair --key-name $KEY_PAIR > /dev/null 2>&1 + rm -rf $KEY_FILE + aws ec2 create-key-pair --key-name $KEY_PAIR --query 'KeyMaterial' --output text > $KEY_FILE + report_status "$?" "creating key pair" + chmod 400 $KEY_FILE + + # Generate Security Group + + sg_id=$(aws ec2 create-security-group --group-name $SECURITY_GROUP --description "NVFlare security group" | jq -r .GroupId) + report_status "$?" "creating security group" + echo "Security group id: ${sg_id}" + my_public_ip=$(dig +short myip.opendns.com @resolver1.opendns.com) + if [ "$?" -eq 0 ] && [[ "$my_public_ip" =~ ^(([1-9]?[0-9]|1[0-9][0-9]|2([0-4][0-9]|5[0-5]))\.){3}([1-9]?[0-9]|1[0-9][0-9]|2([0-4][0-9]|5[0-5]))$ ]] + then + aws ec2 authorize-security-group-ingress --group-id $sg_id --protocol tcp --port 22 --cidr ${my_public_ip}/32 > /tmp/sec_grp.log + else + echo "getting my public IP failed, please manually configure the inbound rule to limit SSH access" + aws ec2 authorize-security-group-ingress --group-id $sg_id --protocol tcp --port 22 --cidr 0.0.0.0/0 > /tmp/sec_grp.log + fi + aws ec2 authorize-security-group-ingress --group-id $sg_id --protocol tcp --port 443 --cidr 0.0.0.0/0 >> /tmp/sec_grp.log + report_status "$?" "creating security group rules" + + # Start provisioning + + echo "Creating VM at region $REGION, may take a few minutes." + + aws ec2 run-instances --region $REGION --image-id $AMI_IMAGE --count 1 --instance-type $EC2_TYPE --key-name $KEY_PAIR --security-group-ids $sg_id > vm_create.json + report_status "$?" 
"creating VM" + instance_id=$(jq -r .Instances[0].InstanceId vm_create.json) + + aws ec2 wait instance-status-ok --instance-ids $instance_id + aws ec2 describe-instances --instance-ids $instance_id > vm_result.json + + IP_ADDRESS=$(jq -r .Reservations[0].Instances[0].PublicIpAddress vm_result.json) + + echo "VM created with IP address: ${IP_ADDRESS}" + + echo "Installing docker engine in $VM_NAME, may take a few minutes." + DEST_SITE=${ADMIN_USERNAME}@${IP_ADDRESS} + scripts=$(cat << 'EOF' + sudo apt-get update && \ + sudo DEBIAN_FRONTEND=noninteractive apt-get install -y ca-certificates curl gnupg lsb-release && \ + sudo mkdir -p /etc/apt/keyrings && \ + curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo gpg --dearmor -o /etc/apt/keyrings/docker.gpg && \ + echo \ + "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu \ + $(lsb_release -cs) stable" | sudo tee /etc/apt/sources.list.d/docker.list > /dev/null && \ + sudo apt-get update && \ + sudo DEBIAN_FRONTEND=noninteractive apt-get install -y docker-ce docker-ce-cli containerd.io + EOF + ) + ssh -t -i $KEY_FILE -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${DEST_SITE} "$scripts" > /tmp/docker_engine.log + report_status "$?" "installing docker engine" + ssh -t -i $KEY_FILE -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${DEST_SITE} "sudo usermod -aG docker $ADMIN_USERNAME && exit" >> /tmp/docker_engine.log + report_status "$?" "installing docker engine" + + echo "Installing nvflare in $VM_NAME, may take a few minutes." 
+ ssh -i $KEY_FILE -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${DEST_SITE} \ + "export PATH=/home/ubuntu/.local/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/snap/bin && \ + wget -q https://bootstrap.pypa.io/get-pip.py && python3 get-pip.py && \ + python3 -m pip install {~~NVFLARE~~} && \ + mkdir -p ./cert && \ + exit" > /tmp/nvflare.json + report_status "$?" "installing nvflare" + + echo "Checking if certificate (web.crt) and private key (web.key) are available" + if [[ -f "web.crt" && -f "web.key" ]]; then + CERT_FOLDER=${DEST_SITE}:${DEST_FOLDER}/cert + echo "Cert folder is ${CERT_FOLDER}" + scp -i $KEY_FILE -r -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null web.{crt,key} $CERT_FOLDER + report_status "$?" "copying cert/key to VM ${CERT_FOLDER} folder" + secure=true + else + echo "No web.crt and web.key found" + secure=false + fi + + echo "Starting dashboard" + ssh -i $KEY_FILE -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${DEST_SITE} \ + "export PATH=/home/ubuntu/.local/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/snap/bin && \ + python3 -m nvflare.dashboard.cli --start -f ${DEST_FOLDER} --cred ${credential} {~~START_OPT~~}" > /tmp/dashboard.json + + echo "Dashboard url is running at IP address ${IP_ADDRESS}, listening to port 443." + if [ "$secure" = true ] + then + echo "URL is https://${IP_ADDRESS}" + else + echo "URL is http://${IP_ADDRESS}:443" + fi + echo "Note: you may need to configure DNS server with your DNS hostname and the above IP address." + echo "Project admin credential (username:password) is ${credential} ." + echo "To terminate the EC2 instance, run the following command." 
+ echo "aws ec2 terminate-instances --instance-ids ${instance_id}" + echo "Other resources provisioned" + echo "security group: ${SECURITY_GROUP}" + echo "key pair: ${KEY_PAIR}" diff --git a/nvflare/lighter/impl/azure_template.yml b/nvflare/lighter/impl/azure_template.yml new file mode 100644 index 0000000000..9c42a10cf3 --- /dev/null +++ b/nvflare/lighter/impl/azure_template.yml @@ -0,0 +1,517 @@ +azure_start_svr_header_sh: | + RESOURCE_GROUP=nvflare_rg + VM_NAME=nvflare_server + VM_IMAGE=Canonical:0001-com-ubuntu-server-focal:20_04-lts-gen2:latest + VM_SIZE=Standard_B2ms + NSG_NAME=nvflare_nsgs + ADMIN_USERNAME=nvflare + PASSWORD="NVFl@r3_P@88"$RANDOM"w0rd" + DEST_FOLDER=/var/tmp/cloud + NIC_NAME=${VM_NAME}VMNic + SERVER_NAME={~~server_name~~} + FL_PORT=8002 + ADMIN_PORT=8003 + + echo "This script requires az (Azure CLI), sshpass and jq. Now checking if they are installed." + + check_binary az "Please see https://learn.microsoft.com/en-us/cli/azure/install-azure-cli on how to install it on your system." + check_binary sshpass "Please install it first." + check_binary jq "Please install it first." + + self_dns=true + if [[ "$SERVER_NAME" = *".cloudapp.azure.com"* ]] + then + DNS_TAG=$(echo $SERVER_NAME | cut -d "." -f 1) + DERIVED_LOCATION=$(echo $SERVER_NAME | cut -d "." -f 2) + LOCATION=$DERIVED_LOCATION + self_dns=false + else + echo "Warning: ${SERVER_NAME} does not end with .cloudapp.azure.com." + echo "The cloud launch process will not create the domain name for you." + echo "Please use your own DNS to set the information." + LOCATION=westus2 + fi + + if [ -z ${image_name+x} ] + then + container=false + else + container=true + fi + + if [ $container = true ] + then + VM_IMAGE=Canonical:0001-com-ubuntu-server-focal:20_04-lts-gen2:latest + VM_SIZE=Standard_D8s_v3 + else + VM_IMAGE=Canonical:0001-com-ubuntu-server-focal:20_04-lts-gen2:latest + VM_SIZE=Standard_B2ms + fi + + if [ -z ${config_file+x} ] + then + useDefault=true + else + useDefault=false + . 
$config_file + report_status "$?" "Loading config file" + if [ $self_dns = false ] && [ $DERIVED_LOCATION != $LOCATION ] + then + echo "Server name implies LOCATION=${DERIVED_LOCATION} but the config file specifies LOCATION=${LOCATION}. Unable to continue provisioning." + exit 1 + fi + fi + + if [ $useDefault = true ] + then + while true + do + prompt VM_IMAGE "Cloud VM image, press ENTER to accept default ${VM_IMAGE}: " + prompt VM_SIZE "Cloud VM size, press ENTER to accept default ${VM_SIZE}: " + if [ $self_dns = true ] + then + prompt LOCATION "Cloud location, press ENTER to accept default ${LOCATION}: " + prompt ans "VM image = ${VM_IMAGE}, VM size = ${VM_SIZE}, location = ${LOCATION}, OK? (Y/n) " + else + prompt ans "VM image = ${VM_IMAGE}, VM size = ${VM_SIZE}, OK? (Y/n) " + fi + if [[ $ans = "" ]] || [[ $ans =~ ^(y|Y)$ ]]; then break; fi + done + fi + + if [ $container = false ] + then + echo "If the server requires additional dependencies, please copy the requirements.txt to ${DIR}." + prompt ans "Press ENTER when it's done or no additional dependencies. " + fi + + az login --use-device-code -o none + report_status "$?" "login" + + # Start provisioning + + if [ $(az group exists -n $RESOURCE_GROUP) == 'false' ] + then + echo "Creating Resource Group $RESOURCE_GROUP at Location $LOCATION" + az group create --output none --name $RESOURCE_GROUP --location $LOCATION + report_status "$?" "creating resource group" + elif [ $useDefault = true ] + then + report_status "1" "Only one NVFL server VM and its resource group is allowed. $RESOURCE_GROUP exists and thus creating duplicate resource group" + else + echo "Users require to reuse Resource Group $RESOURCE_GROUP. This script will modify the group and may not work always."
+ fi + + echo "Creating Virtual Machine, will take a few minutes" + if [ $self_dns = true ] + then + az vm create \ + --output json \ + --resource-group $RESOURCE_GROUP \ + --location $LOCATION \ + --name $VM_NAME \ + --image $VM_IMAGE \ + --size $VM_SIZE \ + --admin-username $ADMIN_USERNAME \ + --admin-password $PASSWORD \ + --authentication-type password \ + --public-ip-address nvflare_server_ip \ + --public-ip-address-allocation static \ + --public-ip-sku Standard > /tmp/vm.json + else + az vm create \ + --output json \ + --resource-group $RESOURCE_GROUP \ + --location $LOCATION \ + --name $VM_NAME \ + --image $VM_IMAGE \ + --size $VM_SIZE \ + --admin-username $ADMIN_USERNAME \ + --admin-password $PASSWORD \ + --authentication-type password \ + --public-ip-address nvflare_server_ip \ + --public-ip-address-allocation static \ + --public-ip-sku Standard \ + --public-ip-address-dns-name $DNS_TAG > /tmp/vm.json + fi + report_status "$?" "creating virtual machine" + + IP_ADDRESS=$(jq -r .publicIpAddress /tmp/vm.json) + echo "Setting up network related configuration" + az network nsg create \ + --output none \ + --resource-group $RESOURCE_GROUP \ + --location $LOCATION \ + --name $NSG_NAME + report_status "$?" "creating network security group" + + az network nsg rule create \ + --output none \ + --resource-group $RESOURCE_GROUP \ + --name SSH \ + --nsg-name $NSG_NAME \ + --priority 1000 \ + --protocol Tcp \ + --destination-port-ranges 22 + report_status "$?" "creating network security group rule for SSH" + + az network nsg rule create \ + --output none \ + --resource-group $RESOURCE_GROUP \ + --name FL_PORT \ + --nsg-name $NSG_NAME \ + --priority 1001 \ + --protocol Tcp \ + --destination-port-ranges $FL_PORT + report_status "$?" 
"creating network security group rule for FL port" + + az network nsg rule create \ + --output none \ + --resource-group $RESOURCE_GROUP \ + --name ADMIN_PORT \ + --nsg-name $NSG_NAME \ + --priority 1002 \ + --protocol Tcp \ + --destination-port-ranges $ADMIN_PORT + report_status "$?" "creating network security group rule for Admin port" + +azure_start_cln_header_sh: | + RESOURCE_GROUP=nvflare_client_rg_${RANDOM}_${RANDOM} + VM_NAME=nvflare_client + VM_IMAGE=Canonical:0001-com-ubuntu-server-focal:20_04-lts-gen2:latest + VM_SIZE=Standard_B2ms + NSG_NAME=nvflare_nsgc + ADMIN_USERNAME=nvflare + PASSWORD="NVFl@r3_P@88"$RANDOM"w0rd" + DEST_FOLDER=/var/tmp/cloud + LOCATION=westus2 + NIC_NAME=${VM_NAME}VMNic + echo "This script requires az (Azure CLI), sshpass and jq. Now checking if they are installed." + + check_binary az "Please see https://learn.microsoft.com/en-us/cli/azure/install-azure-cli on how to install it on your system." + check_binary sshpass "Please install it first." + check_binary jq "Please install it first." + + + if [ -z ${image_name+x} ] + then + container=false + else + container=true + fi + + if [ $container = true ] + then + VM_IMAGE=Canonical:0001-com-ubuntu-server-focal:20_04-lts-gen2:latest + VM_SIZE=Standard_D8s_v3 + else + VM_IMAGE=Canonical:0001-com-ubuntu-server-focal:20_04-lts-gen2:latest + VM_SIZE=Standard_B2ms + fi + if [ -z ${config_file+x} ] + then + useDefault=true + else + useDefault=false + . $config_file + report_status "$?" "Loading config file" + fi + + if [ $useDefault = true ] + then + while true + do + prompt LOCATION "Cloud location, press ENTER to accept default ${LOCATION}: " + prompt VM_IMAGE "Cloud VM image, press ENTER to accept default ${VM_IMAGE}: " + prompt VM_SIZE "Cloud VM size, press ENTER to accept default ${VM_SIZE}: " + prompt ans "location = ${LOCATION}, VM image = ${VM_IMAGE}, VM size = ${VM_SIZE}, OK? 
(Y/n) " + if [[ $ans = "" ]] || [[ $ans =~ ^(y|Y)$ ]]; then break; fi + done + fi + + if [ $container = false ] + then + echo "If the client requires additional dependencies, please copy the requirements.txt to ${DIR}." + prompt ans "Press ENTER when it's done or no additional dependencies. " + fi + + az login --use-device-code -o none + report_status "$?" "login" + + # Start provisioning + + if [ $(az group exists -n $RESOURCE_GROUP) == 'false' ] + then + echo "Creating Resource Group $RESOURCE_GROUP at Location $LOCATION" + az group create --output none --name $RESOURCE_GROUP --location $LOCATION + report_status "$?" "creating resource group" + else + echo "Resource Group $RESOURCE_GROUP exists, will reuse it." + fi + + echo "Creating Virtual Machine, will take a few minutes" + az vm create \ + --output json \ + --resource-group $RESOURCE_GROUP \ + --location $LOCATION \ + --name $VM_NAME \ + --image $VM_IMAGE \ + --size $VM_SIZE \ + --admin-username $ADMIN_USERNAME \ + --admin-password $PASSWORD \ + --authentication-type password \ + --public-ip-sku Standard > /tmp/vm.json + report_status "$?" "creating virtual machine" + + IP_ADDRESS=$(jq -r .publicIpAddress /tmp/vm.json) + + echo "Setting up network related configuration" + + az network nsg create \ + --output none \ + --resource-group $RESOURCE_GROUP \ + --location $LOCATION \ + --name $NSG_NAME + report_status "$?" "creating network security group" + + az network nsg rule create \ + --output none \ + --resource-group $RESOURCE_GROUP \ + --name SSH \ + --nsg-name $NSG_NAME \ + --priority 1000 \ + --protocol Tcp \ + --destination-port-ranges 22 + report_status "$?" "creating network security group rule for SSH" + +azure_start_common_sh: | + az network nic update \ + --output none \ + --resource-group $RESOURCE_GROUP \ + --name $NIC_NAME \ + --network-security-group $NSG_NAME + report_status "$?" 
"updating network interface card" + + echo "Copying files to $VM_NAME" + DEST=$ADMIN_USERNAME@${IP_ADDRESS}:$DEST_FOLDER + echo "Destination folder is ${DEST}" + cd $DIR/.. && sshpass -p $PASSWORD scp -r -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null $PWD $DEST + report_status "$?" "copying startup kits to VM" + + if [ $container = true ] + then + echo "Installing and launching container in $VM_NAME, may take a few minutes." + scripts=$(cat << 'EOF' + sudo apt-get update && \ + sudo DEBIAN_FRONTEND=noninteractive apt-get install -y ca-certificates curl gnupg lsb-release && \ + sudo mkdir -p /etc/apt/keyrings && \ + curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo gpg --dearmor -o /etc/apt/keyrings/docker.gpg && \ + echo \ + "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu \ + $(lsb_release -cs) stable" | sudo tee /etc/apt/sources.list.d/docker.list > /dev/null && \ + sudo apt-get update && \ + sudo DEBIAN_FRONTEND=noninteractive apt-get install -y docker-ce docker-ce-cli containerd.io + EOF + ) + az vm run-command invoke \ + --output json \ + --resource-group $RESOURCE_GROUP \ + --command-id RunShellScript \ + --name $VM_NAME \ + --scripts \ + "$scripts" > /tmp/docker_engine.json + report_status "$?" "installing docker engine" + az vm run-command invoke \ + --output json \ + --resource-group $RESOURCE_GROUP \ + --command-id RunShellScript \ + --name $VM_NAME \ + --scripts \ + "sudo usermod -aG docker $ADMIN_USERNAME" >> /tmp/docker_engine.json + report_status "$?"
"Setting user group" + az vm run-command invoke \ + --output json \ + --resource-group $RESOURCE_GROUP \ + --command-id RunShellScript \ + --name $VM_NAME \ + --scripts \ + "docker run -d -v ${DEST_FOLDER}:${DEST_FOLDER} {~~docker_network~~} ${image_name} /bin/bash -c \"python -u -m nvflare.private.fed.app.{~~type~~}.{~~type~~}_train -m ${DEST_FOLDER} -s fed_{~~type~~}.json --set {~~cln_uid~~} secure_train=true config_folder=config org={~~ORG~~} \" " > /tmp/vm_create.json 2>&1 + report_status "$?" "launching container" + else + echo "Installing packages in $VM_NAME, may take a few minutes." + az vm run-command invoke \ + --output json \ + --resource-group $RESOURCE_GROUP \ + --command-id RunShellScript \ + --name $VM_NAME \ + --scripts \ + "echo ${DEST_FOLDER} && wget -q https://bootstrap.pypa.io/get-pip.py && python3 get-pip.py && python3 -m pip install --ignore-installed nvflare && touch ${DEST_FOLDER}/startup/requirements.txt && python3 -m pip install -r ${DEST_FOLDER}/startup/requirements.txt && ${DEST_FOLDER}/startup/start.sh && sleep 20 && cat ${DEST_FOLDER}/log.txt" > /tmp/vm_create.json + report_status "$?" "installing packages" + fi + echo "System was provisioned" + echo "To delete the resource group (also delete the VM), run the following command" + echo "az group delete -n ${RESOURCE_GROUP}" + echo "To login to the VM with SSH, use ${ADMIN_USERNAME} : ${PASSWORD}" > vm_credential.txt + +azure_start_dsb_sh: | + RESOURCE_GROUP=nvflare_dashboard_rg_${RANDOM}_${RANDOM} + VM_NAME=nvflare_dashboard + VM_IMAGE=Canonical:0001-com-ubuntu-server-focal:20_04-lts-gen2:latest + VM_SIZE=Standard_B2ms + NSG_NAME=nvflare_nsgc + ADMIN_USERNAME=nvflare + PASSWORD="NVFl@r3_P@88"$RANDOM"w0rd" + DEST_FOLDER=/var/tmp/cloud + LOCATION=westus2 + NIC_NAME=${VM_NAME}VMNic + + echo "This script requires az (Azure CLI), sshpass and jq. Now checking if they are installed." 
+ + check_binary az "Please see https://learn.microsoft.com/en-us/cli/azure/install-azure-cli on how to install it on your system." + check_binary sshpass "Please install it first." + check_binary jq "Please install it first." + + echo "One initial user will be created when starting dashboard." + echo "Please enter the email address for this user." + read email + credential="${email}:$RANDOM" + + az login --use-device-code -o none + report_status "$?" "login" + + # Start provisioning + if [ $(az group exists -n $RESOURCE_GROUP) == 'false' ] + then + echo "Creating Resource Group $RESOURCE_GROUP at Location $LOCATION" + az group create --output none --name $RESOURCE_GROUP --location $LOCATION + report_status "$?" "creating resource group" + else + echo "Resource Group $RESOURCE_GROUP exists, will reuse it." + fi + + echo "Creating Virtual Machine, will take a few minutes" + az vm create \ + --output json \ + --resource-group $RESOURCE_GROUP \ + --location $LOCATION \ + --name $VM_NAME \ + --image $VM_IMAGE \ + --size $VM_SIZE \ + --admin-username $ADMIN_USERNAME \ + --admin-password $PASSWORD \ + --authentication-type password \ + --public-ip-sku Standard > /tmp/vm.json + report_status "$?" "creating virtual machine" + + IP_ADDRESS=$(jq -r .publicIpAddress /tmp/vm.json) + report_status "$?" "extracting ip address" + + echo "Setting up network related configuration" + az network nsg create \ + --output none \ + --resource-group $RESOURCE_GROUP \ + --location $LOCATION \ + --name $NSG_NAME + report_status "$?" "creating network security group" + + az network nsg rule create \ + --output none \ + --resource-group $RESOURCE_GROUP \ + --name SSH \ + --nsg-name $NSG_NAME \ + --priority 1000 \ + --protocol Tcp \ + --destination-port-ranges 22 + report_status "$?" 
"creating network security group rule for SSH" + + az network nsg rule create \ + --output none \ + --resource-group $RESOURCE_GROUP \ + --name HTTPS \ + --nsg-name $NSG_NAME \ + --priority 1001 \ + --protocol Tcp \ + --destination-port-ranges 443 + report_status "$?" "creating network security group rule for HTTPS" + + az network nic update \ + --output none \ + --resource-group $RESOURCE_GROUP \ + --name $NIC_NAME \ + --network-security-group $NSG_NAME + report_status "$?" "updating network interface card" + + echo "Installing docker engine in $VM_NAME, may take a few minutes." + scripts=$(cat << 'EOF' + sudo apt-get update && \ + sudo DEBIAN_FRONTEND=noninteractive apt-get install -y ca-certificates curl gnupg lsb-release && \ + sudo mkdir -p /etc/apt/keyrings && \ + curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo gpg --dearmor -o /etc/apt/keyrings/docker.gpg && \ + echo \ + "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu \ + $(lsb_release -cs) stable" | sudo tee /etc/apt/sources.list.d/docker.list > /dev/null && \ + sudo apt-get update && \ + sudo DEBIAN_FRONTEND=noninteractive apt-get install -y docker-ce docker-ce-cli containerd.io + EOF + ) + az vm run-command invoke \ + --output json \ + --resource-group $RESOURCE_GROUP \ + --command-id RunShellScript \ + --name $VM_NAME \ + --scripts \ + "$scripts" > /tmp/docker_engine.json + report_status "$?" "installing docker engine" + az vm run-command invoke \ + --output json \ + --resource-group $RESOURCE_GROUP \ + --command-id RunShellScript \ + --name $VM_NAME \ + --scripts \ + "sudo usermod -aG docker $ADMIN_USERNAME" >> /tmp/docker_engine.json + report_status "$?" "Setting user group" + + DEST_FOLDER=/home/${ADMIN_USERNAME} + echo "Installing nvflare in $VM_NAME, may take a few minutes." 
+ az vm run-command invoke \ + --output json \ + --resource-group $RESOURCE_GROUP \ + --command-id RunShellScript \ + --name $VM_NAME \ + --scripts \ + "echo ${DEST_FOLDER} && wget -q https://bootstrap.pypa.io/get-pip.py && python3 get-pip.py && python3 -m pip install --ignore-installed {~~NVFLARE~~} && mkdir -p ${DEST_FOLDER}/cert && chown -R ${ADMIN_USERNAME} ${DEST_FOLDER}" > /tmp/nvflare.json + report_status "$?" "installing nvflare" + + echo "Checking if certificate (web.crt) and private key (web.key) are available" + if [[ -f "web.crt" && -f "web.key" ]]; then + DEST=$ADMIN_USERNAME@$IP_ADDRESS:${DEST_FOLDER}/cert + echo "Destination folder is ${DEST}" + sshpass -p $PASSWORD scp -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null web.{crt,key} $DEST + report_status "$?" "copying cert/key to VM ${DEST} folder" + secure=true + else + echo "No web.crt and web.key found" + secure=false + fi + + echo "Starting dashboard" + az vm run-command invoke \ + --output json \ + --resource-group $RESOURCE_GROUP \ + --command-id RunShellScript \ + --name $VM_NAME \ + --scripts \ + "cd ${DEST_FOLDER} && python3 -m nvflare.dashboard.cli --start -f ${DEST_FOLDER} --cred ${credential} {~~START_OPT~~}" > /tmp/dashboard.json + + # credential=$(jq -r .value[0].message /tmp/dashboard.json | grep "Project admin") + # echo "The VM was created with user: ${ADMIN_USERNAME} and password: ${PASSWORD}" + if [ "$secure" = true ] + then + echo "URL is https://${IP_ADDRESS}" + else + echo "URL is http://${IP_ADDRESS}:443" + fi + echo "Note: you may need to configure DNS server with your DNS hostname and the above IP address." + echo "Project admin credential (username:password) is ${credential} ." 
+ echo "To stop the dashboard, run az group delete -n ${RESOURCE_GROUP}" + echo "To login to the VM with SSH, use ${ADMIN_USERNAME} : ${PASSWORD}" > vm_credential.txt diff --git a/nvflare/lighter/impl/master_template.yml b/nvflare/lighter/impl/master_template.yml index 5d816858c6..24342030ba 100644 --- a/nvflare/lighter/impl/master_template.yml +++ b/nvflare/lighter/impl/master_template.yml @@ -417,7 +417,7 @@ stop_fl_sh: | ;; esac -sub_start_cln_sh: | +sub_start_sh: | #!/usr/bin/env bash DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" echo "WORKSPACE set to $DIR/.." @@ -440,7 +440,7 @@ sub_start_cln_sh: | exit fi lst=$SECONDS - ((python3 -u -m nvflare.private.fed.app.client.client_train -m $DIR/.. -s fed_client.json --set secure_train=true uid={~~client_name~~} org={~~org_name~~} config_folder={~~config_folder~~} 2>&1 & echo $! >&3 ) 3>$DIR/../pid.fl ) + ((python3 -u -m nvflare.private.fed.app.{~~type~~}.{~~type~~}_train -m $DIR/.. -s fed_{~~type~~}.json --set secure_train=true {~~cln_uid~~} org={~~org_name~~} config_folder={~~config_folder~~} 2>&1 & echo $! >&3 ) 3>$DIR/../pid.fl ) pid=`cat $DIR/../pid.fl` echo "new pid ${pid}" } @@ -506,93 +506,6 @@ sub_start_cln_sh: | rm -f $DIR/../pid.fl $DIR/../shutdown.fl $DIR/../restart.fl $DIR/../daemon_pid.fl -sub_start_svr_sh: | - #!/usr/bin/env bash - DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" - echo "WORKSPACE set to $DIR/.." - mkdir -p $DIR/../transfer - - SECONDS=0 - lst=-400 - restart_count=0 - start_fl() { - if [[ $(( $SECONDS - $lst )) -lt 300 ]]; then - ((restart_count++)) - else - restart_count=0 - fi - if [[ $(($SECONDS - $lst )) -lt 300 && $restart_count -ge 5 ]]; then - echo "System is in trouble and unable to start the task!!!!!" - rm -f $DIR/../pid.fl $DIR/../shutdown.fl $DIR/../restart.fl $DIR/../daemon_pid.fl - exit - fi - lst=$SECONDS - ((python3 -u -m nvflare.private.fed.app.server.server_train -m $DIR/.. 
-s fed_server.json --set secure_train=true org={~~org_name~~} config_folder={~~config_folder~~} 2>&1 & echo $! >&3 ) 3>$DIR/../pid.fl ) - pid=`cat $DIR/../pid.fl` - echo "new pid ${pid}" - } - - stop_fl() { - if [[ ! -f "$DIR/../pid.fl" ]]; then - echo "No pid.fl. No need to kill process." - return - fi - pid=`cat $DIR/../pid.fl` - sleep 5 - kill -0 ${pid} 2> /dev/null 1>&2 - if [[ $? -ne 0 ]]; then - echo "Process already terminated" - return - fi - kill -9 $pid - rm -f $DIR/../pid.fl $DIR/../shutdown.fl $DIR/../restart.fl - } - - if [[ -f "$DIR/../daemon_pid.fl" ]]; then - dpid=`cat $DIR/../daemon_pid.fl` - kill -0 ${dpid} 2> /dev/null 1>&2 - if [[ $? -eq 0 ]]; then - echo "There seems to be one instance, pid=$dpid, running." - echo "If you are sure it's not the case, please kill process $dpid and then remove daemon_pid.fl in $DIR/.." - exit - fi - rm -f $DIR/../daemon_pid.fl - fi - - echo $BASHPID > $DIR/../daemon_pid.fl - - while true - do - sleep 5 - if [[ ! -f "$DIR/../pid.fl" ]]; then - echo "start fl because of no pid.fl" - start_fl - continue - fi - pid=`cat $DIR/../pid.fl` - kill -0 ${pid} 2> /dev/null 1>&2 - if [[ $? -ne 0 ]]; then - if [[ -f "$DIR/../shutdown.fl" ]]; then - echo "Gracefully shutdown." - break - fi - echo "start fl because process of ${pid} does not exist" - start_fl - continue - fi - if [[ -f "$DIR/../shutdown.fl" ]]; then - echo "About to shutdown." - stop_fl - break - fi - if [[ -f "$DIR/../restart.fl" ]]; then - echo "About to restart." 
- stop_fl - fi - done - - rm -f $DIR/../pid.fl $DIR/../shutdown.fl $DIR/../restart.fl $DIR/../daemon_pid.fl - docker_cln_sh: | #!/usr/bin/env bash DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" @@ -910,589 +823,6 @@ cloud_script_header: | shift done -azure_start_svr_sh: | - RESOURCE_GROUP=nvflare_rg - VM_NAME=nvflare_server - VM_IMAGE=Canonical:0001-com-ubuntu-server-focal:20_04-lts-gen2:latest - VM_SIZE=Standard_B2ms - NSG_NAME=nvflare_nsgs - ADMIN_USERNAME=nvflare - PASSWORD="NVFl@r3_P@88"$RANDOM"w0rd" - DEST_FOLDER=/var/tmp/cloud - NIC_NAME=${VM_NAME}VMNic - SERVER_NAME={~~server_name~~} - FL_PORT=8002 - ADMIN_PORT=8003 - - echo "This script requires az (Azure CLI), sshpass and jq. Now checking if they are installed." - - check_binary az "Please see https://learn.microsoft.com/en-us/cli/azure/install-azure-cli on how to install it on your system." - check_binary sshpass "Please install it first." - check_binary jq "Please install it first." - - self_dns=true - if [[ "$SERVER_NAME" = *".cloudapp.azure.com"* ]] - then - DNS_TAG=$(echo $SERVER_NAME | cut -d "." -f 1) - DERIVED_LOCATION=$(echo $SERVER_NAME | cut -d "." -f 2) - LOCATION=$DERIVED_LOCATION - self_dns=false - else - echo "Warning: ${SERVER_NAME} does not end with .cloudapp.azure.com." - echo "The cloud launch process will not create the domain name for you." - echo "Please use your own DNS to set the information." - LOCATION=westus2 - fi - - if [ -z ${image_name+x} ] - then - container=false - else - container=true - fi - - if [ $container = true ] - then - VM_IMAGE=Canonical:0001-com-ubuntu-server-focal:20_04-lts-gen2:latest - VM_SIZE=Standard_D8s_v3 - else - VM_IMAGE=Canonical:0001-com-ubuntu-server-focal:20_04-lts-gen2:latest - VM_SIZE=Standard_B2ms - fi - - if [ -z ${config_file+x} ] - then - useDefault=true - else - useDefault=false - . $config_file - report_status "$?" 
"Loading config file" - if [ $self_dns = false ] && [ $DERIVED_LOCATION != $LOCATION ] - then - echo "Server name implies LOCATION=${DERIVED_LOCATION} but the config file specifies LOCATION=${LOCATION}. Unable to continue provisioning." - exit 1 - fi - fi - - if [ $useDefault = true ] - then - while true - do - prompt VM_IMAGE "Cloud VM image, press ENTER to accept default ${VM_IMAGE}: " - prompt VM_SIZE "Cloud VM size, press ENTER to accept default ${VM_SIZE}: " - if [ $self_dns = true ] - then - prompt LOCATION "Cloud location, press ENTER to accept default ${LOCATION}: " - prompt ans "VM image = ${VM_IMAGE}, VM size = ${VM_SIZE}, location = ${LOCATION}, OK? (Y/n) " - else - prompt ans "VM image = ${VM_IMAGE}, VM size = ${VM_SIZE}, OK? (Y/n) " - fi - if [[ $ans = "" ]] || [[ $ans =~ ^(y|Y)$ ]]; then break; fi - done - fi - - if [ $container = false ] - then - echo "If the client requires additional dependencies, please copy the requirements.txt to ${DIR}." - prompt ans "Press ENTER when it's done or no additional dependencies. " - fi - - az login --use-device-code -o none - report_status "$?" "login" - - # Start provisioning - - if [ $(az group exists -n $RESOURCE_GROUP) == 'false' ] - then - echo "Creating Resource Group $RESOURCE_GROUP at Location $LOCATION" - az group create --output none --name $RESOURCE_GROUP --location $LOCATION - report_status "$?" "creating resource group" - elif [ $useDefault = true ] - then - report_status "1" "Only one NVFL server VM and its resource group is allowed. $RESOURCE_GROUP exists and thus creating duplicate resource group" - else - echo "Users require to reuse Resource Group $RESOURCE_GROUP. This script will modify the group and may not work always." 
- fi - - echo "Creating Virtual Machine, will take a few minutes" - if [ $self_dns = true ] - then - az vm create \ - --output json \ - --resource-group $RESOURCE_GROUP \ - --location $LOCATION \ - --name $VM_NAME \ - --image $VM_IMAGE \ - --size $VM_SIZE \ - --admin-username $ADMIN_USERNAME \ - --admin-password $PASSWORD \ - --authentication-type password \ - --public-ip-address nvflare_server_ip \ - --public-ip-address-allocation static \ - --public-ip-sku Standard > /tmp/vm.json - else - az vm create \ - --output json \ - --resource-group $RESOURCE_GROUP \ - --location $LOCATION \ - --name $VM_NAME \ - --image $VM_IMAGE \ - --size $VM_SIZE \ - --admin-username $ADMIN_USERNAME \ - --admin-password $PASSWORD \ - --authentication-type password \ - --public-ip-address nvflare_server_ip \ - --public-ip-address-allocation static \ - --public-ip-sku Standard \ - --public-ip-address-dns-name $DNS_TAG > /tmp/vm.json - fi - report_status "$?" "creating virtual machine" - - IP_ADDRESS=$(jq -r .publicIpAddress /tmp/vm.json) - echo "Setting up network related configuration" - az network nsg create \ - --output none \ - --resource-group $RESOURCE_GROUP \ - --location $LOCATION \ - --name $NSG_NAME - report_status "$?" "creating network security group" - - az network nsg rule create \ - --output none \ - --resource-group $RESOURCE_GROUP \ - --name SSH \ - --nsg-name $NSG_NAME \ - --priority 1000 \ - --protocol Tcp \ - --destination-port-ranges 22 - report_status "$?" "creating network security group rule for SSH" - - az network nsg rule create \ - --output none \ - --resource-group $RESOURCE_GROUP \ - --name FL_PORT \ - --nsg-name $NSG_NAME \ - --priority 1001 \ - --protocol Tcp \ - --destination-port-ranges $FL_PORT - report_status "$?" 
"creating network security group rule for FL port" - - az network nsg rule create \ - --output none \ - --resource-group $RESOURCE_GROUP \ - --name ADMIN_PORT \ - --nsg-name $NSG_NAME \ - --priority 1002 \ - --protocol Tcp \ - --destination-port-ranges $ADMIN_PORT - report_status "$?" "creating network security group rule for Admin port" - - az network nic update \ - --output none \ - --resource-group $RESOURCE_GROUP \ - --name $NIC_NAME \ - --network-security-group $NSG_NAME - report_status "$?" "updating network interface card" - - echo "Copying files to $VM_NAME" - DEST=$ADMIN_USERNAME@${IP_ADDRESS}:$DEST_FOLDER - echo "Destination folder is ${DEST}" - cd $DIR/.. && sshpass -p $PASSWORD scp -r -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null $PWD $DEST - report_status "$?" "copying startup kits to VM" - - if [ $container = true ] - then - echo "Installing and lauching container in $VM_NAME, may take a few minutes." - scripts=$(cat << 'EOF' - sudo apt-get update && \ - sudo DEBIAN_FRONTEND=noninteractive apt-get install -y ca-certificates curl gnupg lsb-release && \ - sudo mkdir -p /etc/apt/keyrings && \ - curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo gpg --dearmor -o /etc/apt/keyrings/docker.gpg && \ - echo \ - "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu \ - $(lsb_release -cs) stable" | sudo tee /etc/apt/sources.list.d/docker.list > /dev/null && \ - sudo apt-get update && \ - sudo DEBIAN_FRONTEND=noninteractive apt-get install -y docker-ce docker-ce-cli containerd.io - EOF - ) - az vm run-command invoke \ - --output json \ - --resource-group $RESOURCE_GROUP \ - --command-id RunShellScript \ - --name $VM_NAME \ - --scripts \ - "$scripts" > /tmp/docker_engine.json - az vm run-command invoke \ - --output json \ - --resource-group $RESOURCE_GROUP \ - --command-id RunShellScript \ - --name $VM_NAME \ - --scripts \ - "sudo usermod -aG docker $ADMIN_USERNAME" >> 
/tmp/docker_engine.json - report_status "$?" "installing docker engine" - az vm run-command invoke \ - --output json \ - --resource-group $RESOURCE_GROUP \ - --command-id RunShellScript \ - --name $VM_NAME \ - --scripts \ - "docker run -d -v ${DEST_FOLDER}:${DEST_FOLDER} --network host ${DOCKER_OPTION} ${image_name} /bin/bash -c \"python -u -m nvflare.private.fed.app.server.server_train -m ${DEST_FOLDER} -s fed_server.json --set secure_train=true config_folder=config org={~~ORG~~} \" " > /tmp/vm_create.json 2>&1 - report_status "$?" "launching container" - else - echo "Installing packages in $VM_NAME, may take a few minutes." - az vm run-command invoke \ - --output json \ - --resource-group $RESOURCE_GROUP \ - --command-id RunShellScript \ - --name $VM_NAME \ - --scripts \ - "echo ${DEST_FOLDER} && wget -q https://bootstrap.pypa.io/get-pip.py && python3 get-pip.py && python3 -m pip install --ignore-installed nvflare && touch ${DEST_FOLDER}/startup/requirements.txt && python3 -m pip install -r ${DEST_FOLDER}/startup/requirements.txt && ${DEST_FOLDER}/startup/start.sh && sleep 20 && cat ${DEST_FOLDER}/log.txt" > /tmp/vm_create.json - report_status "$?" "installing packages" - fi - echo "System was provisioned" - echo "To delete the resource group (also delete the VM), run the following command" - echo "az group delete -n ${RESOURCE_GROUP}" - echo "To login to the VM with SSH, use ${ADMIN_USERNAME} : ${PASSWORD}" > vm_credential.txt - -azure_start_cln_sh: | - RESOURCE_GROUP=nvflare_client_rg_${RANDOM}_${RANDOM} - VM_NAME=nvflare_client - VM_IMAGE=Canonical:0001-com-ubuntu-server-focal:20_04-lts-gen2:latest - VM_SIZE=Standard_B2ms - NSG_NAME=nvflare_nsgc - ADMIN_USERNAME=nvflare - PASSWORD="NVFl@r3_P@88"$RANDOM"w0rd" - DEST_FOLDER=/var/tmp/cloud - LOCATION=westus2 - NIC_NAME=${VM_NAME}VMNic - echo "This script requires az (Azure CLI), sshpass and jq. Now checking if they are installed." 
- - check_binary az "Please see https://learn.microsoft.com/en-us/cli/azure/install-azure-cli on how to install it on your system." - check_binary sshpass "Please install it first." - check_binary jq "Please install it first." - - - if [ -z ${image_name+x} ] - then - container=false - else - container=true - fi - - if [ $container = true ] - then - VM_IMAGE=Canonical:0001-com-ubuntu-server-focal:20_04-lts-gen2:latest - VM_SIZE=Standard_D8s_v3 - else - VM_IMAGE=Canonical:0001-com-ubuntu-server-focal:20_04-lts-gen2:latest - VM_SIZE=Standard_B2ms - fi - if [ -z ${config_file+x} ] - then - useDefault=true - else - useDefault=false - . $config_file - report_status "$?" "Loading config file" - fi - - if [ $useDefault = true ] - then - while true - do - prompt LOCATION "Cloud location, press ENTER to accept default ${LOCATION}: " - prompt VM_IMAGE "Cloud VM image, press ENTER to accept default ${VM_IMAGE}: " - prompt VM_SIZE "Cloud VM size, press ENTER to accept default ${VM_SIZE}: " - prompt ans "location = ${LOCATION}, VM image = ${VM_IMAGE}, VM size = ${VM_SIZE}, OK? (Y/n) " - if [[ $ans = "" ]] || [[ $ans =~ ^(y|Y)$ ]]; then break; fi - done - fi - - if [ $container = false ] - then - echo "If the client requires additional dependencies, please copy the requirements.txt to ${DIR}." - prompt ans "Press ENTER when it's done or no additional dependencies. " - fi - - az login --use-device-code -o none - report_status "$?" "login" - - # Start provisioning - - if [ $(az group exists -n $RESOURCE_GROUP) == 'false' ] - then - echo "Creating Resource Group $RESOURCE_GROUP at Location $LOCATION" - az group create --output none --name $RESOURCE_GROUP --location $LOCATION - report_status "$?" "creating resource group" - else - echo "Resource Group $RESOURCE_GROUP exists, will reuse it." 
- fi - - echo "Creating Virtual Machine, will take a few minutes" - az vm create \ - --output json \ - --resource-group $RESOURCE_GROUP \ - --location $LOCATION \ - --name $VM_NAME \ - --image $VM_IMAGE \ - --size $VM_SIZE \ - --admin-username $ADMIN_USERNAME \ - --admin-password $PASSWORD \ - --authentication-type password \ - --public-ip-sku Standard > /tmp/vm.json - report_status "$?" "creating virtual machine" - - IP_ADDRESS=$(jq -r .publicIpAddress /tmp/vm.json) - - echo "Setting up network related configuration" - - az network nsg create \ - --output none \ - --resource-group $RESOURCE_GROUP \ - --location $LOCATION \ - --name $NSG_NAME - report_status "$?" "creating network security group" - - az network nsg rule create \ - --output none \ - --resource-group $RESOURCE_GROUP \ - --name SSH \ - --nsg-name $NSG_NAME \ - --priority 1000 \ - --protocol Tcp \ - --destination-port-ranges 22 - report_status "$?" "creating network security group rule for SSH" - - az network nic update \ - --output none \ - --resource-group $RESOURCE_GROUP \ - --name $NIC_NAME \ - --network-security-group $NSG_NAME - report_status "$?" "updating network interface card" - - echo "Copying files to $VM_NAME" - DEST=$ADMIN_USERNAME@$IP_ADDRESS:$DEST_FOLDER - echo "Destination folder is ${DEST}" - cd $DIR/.. && sshpass -p $PASSWORD scp -r -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null $PWD $DEST - report_status "$?" "copying startup kits to VM" - - if [ $container = true ] - then - echo "Installing and lauching container in $VM_NAME, may take a few minutes." 
- scripts=$(cat <<- 'EOF' - sudo apt-get update && \ - sudo DEBIAN_FRONTEND=noninteractive apt-get install -y ca-certificates curl gnupg lsb-release && \ - sudo mkdir -p /etc/apt/keyrings && \ - curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo gpg --dearmor -o /etc/apt/keyrings/docker.gpg && \ - echo \ - "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu \ - $(lsb_release -cs) stable" | sudo tee /etc/apt/sources.list.d/docker.list > /dev/null && \ - sudo apt-get update && \ - sudo DEBIAN_FRONTEND=noninteractive apt-get install -y docker-ce docker-ce-cli containerd.io - EOF - ) - az vm run-command invoke \ - --output json \ - --resource-group $RESOURCE_GROUP \ - --command-id RunShellScript \ - --name $VM_NAME \ - --scripts \ - "$scripts" > /tmp/docker_engine.json - report_status "$?" "installing docker engine" - az vm run-command invoke \ - --output json \ - --resource-group $RESOURCE_GROUP \ - --command-id RunShellScript \ - --name $VM_NAME \ - --scripts \ - "sudo usermod -aG docker $ADMIN_USERNAME" >> /tmp/docker_engine.json - az vm run-command invoke \ - --output json \ - --resource-group $RESOURCE_GROUP \ - --command-id RunShellScript \ - --name $VM_NAME \ - --scripts \ - "docker run -d -v ${DEST_FOLDER}:${DEST_FOLDER} ${image_name} /bin/bash -c \"python -u -m nvflare.private.fed.app.client.client_train -m ${DEST_FOLDER} -s fed_client.json --set uid={~~SITE~~} secure_train=true config_folder=config org={~~ORG~~} \" " > /tmp/vm_create.json 2>&1 - report_status "$?" "launching container" - else - echo "Installing packages in $VM_NAME, may take a few minutes." 
- az vm run-command invoke \ - --output json \ - --resource-group $RESOURCE_GROUP \ - --command-id RunShellScript \ - --name $VM_NAME \ - --scripts \ - "echo ${DEST_FOLDER} && wget -q https://bootstrap.pypa.io/get-pip.py && python3 get-pip.py && python3 -m pip install --ignore-installed nvflare && touch ${DEST_FOLDER}/startup/requirements.txt && python3 -m pip install -r ${DEST_FOLDER}/startup/requirements.txt && ${DEST_FOLDER}/startup/start.sh && sleep 20 && cat ${DEST_FOLDER}/log.txt" > /tmp/vm_create.json - report_status "$?" "installing packages" - fi - echo "System was provisioned" - echo "To delete the resource group (also delete the VM), run the following command" - echo "az group delete -n ${RESOURCE_GROUP}" - echo "To login to the VM with SSH, use ${ADMIN_USERNAME} : ${PASSWORD}" > vm_credential.txt - -azure_start_dsb_sh: | - RESOURCE_GROUP=nvflare_dashboard_rg_${RANDOM}_${RANDOM} - VM_NAME=nvflare_dashboard - VM_IMAGE=Canonical:0001-com-ubuntu-server-focal:20_04-lts-gen2:latest - VM_SIZE=Standard_B2ms - NSG_NAME=nvflare_nsgc - ADMIN_USERNAME=nvflare - PASSWORD="NVFl@r3_P@88"$RANDOM"w0rd" - DEST_FOLDER=/var/tmp/cloud - LOCATION=westus2 - NIC_NAME=${VM_NAME}VMNic - - echo "This script requires az (Azure CLI), sshpass and jq. Now checking if they are installed." - - check_binary az "Please see https://learn.microsoft.com/en-us/cli/azure/install-azure-cli on how to install it on your system." - check_binary sshpass "Please install it first." - check_binary jq "Please install it first." - - echo "One initial user will be created when starting dashboard." - echo "Please enter the email address for this user." - read email - credential="${email}:$RANDOM" - - az login --use-device-code -o none - report_status "$?" 
"login" - - # Start provisioning - if [ $(az group exists -n $RESOURCE_GROUP) == 'false' ] - then - echo "Creating Resource Group $RESOURCE_GROUP at Location $LOCATION" - az group create --output none --name $RESOURCE_GROUP --location $LOCATION - report_status "$?" "creating resource group" - else - echo "Resource Group $RESOURCE_GROUP exists, will reuse it." - fi - - echo "Creating Virtual Machine, will take a few minutes" - az vm create \ - --output json \ - --resource-group $RESOURCE_GROUP \ - --location $LOCATION \ - --name $VM_NAME \ - --image $VM_IMAGE \ - --size $VM_SIZE \ - --admin-username $ADMIN_USERNAME \ - --admin-password $PASSWORD \ - --authentication-type password \ - --public-ip-sku Standard > /tmp/vm.json - report_status "$?" "creating virtual machine" - - IP_ADDRESS=$(jq -r .publicIpAddress /tmp/vm.json) - report_status "$?" "extracting ip address" - - echo "Setting up network related configuration" - az network nsg create \ - --output none \ - --resource-group $RESOURCE_GROUP \ - --location $LOCATION \ - --name $NSG_NAME - report_status "$?" "creating network security group" - - az network nsg rule create \ - --output none \ - --resource-group $RESOURCE_GROUP \ - --name SSH \ - --nsg-name $NSG_NAME \ - --priority 1000 \ - --protocol Tcp \ - --destination-port-ranges 22 - report_status "$?" "creating network security group rule for SSH" - - az network nsg rule create \ - --output none \ - --resource-group $RESOURCE_GROUP \ - --name HTTPS \ - --nsg-name $NSG_NAME \ - --priority 1001 \ - --protocol Tcp \ - --destination-port-ranges 443 - report_status "$?" "creating network security group rule for HTTPS" - - az network nic update \ - --output none \ - --resource-group $RESOURCE_GROUP \ - --name $NIC_NAME \ - --network-security-group $NSG_NAME - report_status "$?" "updating network interface card" - - echo "Installing docker engine in $VM_NAME, may take a few minutes." 
- scripts=$(cat << 'EOF' - sudo apt-get update && \ - sudo DEBIAN_FRONTEND=noninteractive apt-get install -y ca-certificates curl gnupg lsb-release && \ - sudo mkdir -p /etc/apt/keyrings && \ - curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo gpg --dearmor -o /etc/apt/keyrings/docker.gpg && \ - echo \ - "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu \ - $(lsb_release -cs) stable" | sudo tee /etc/apt/sources.list.d/docker.list > /dev/null && \ - sudo apt-get update && \ - sudo DEBIAN_FRONTEND=noninteractive apt-get install -y docker-ce docker-ce-cli containerd.io - EOF - ) - az vm run-command invoke \ - --output json \ - --resource-group $RESOURCE_GROUP \ - --command-id RunShellScript \ - --name $VM_NAME \ - --scripts \ - "$scripts" > /tmp/docker_engine.json - report_status "$?" "installing docker engine" - az vm run-command invoke \ - --output json \ - --resource-group $RESOURCE_GROUP \ - --command-id RunShellScript \ - --name $VM_NAME \ - --scripts \ - "sudo usermod -aG docker $ADMIN_USERNAME" >> /tmp/docker_engine.json - report_status "$?" "installing docker engine" - - DEST_FOLDER=/home/${ADMIN_USERNAME} - echo "Installing nvflare in $VM_NAME, may take a few minutes." - az vm run-command invoke \ - --output json \ - --resource-group $RESOURCE_GROUP \ - --command-id RunShellScript \ - --name $VM_NAME \ - --scripts \ - "echo ${DEST_FOLDER} && wget -q https://bootstrap.pypa.io/get-pip.py && python3 get-pip.py && python3 -m pip install --ignore-installed {~~NVFLARE~~} && mkdir -p ${DEST_FOLDER}/cert && chown -R ${ADMIN_USERNAME} ${DEST_FOLDER}" > /tmp/nvflare.json - report_status "$?" 
"installing nvflare" - - echo "Checking if certificate (web.crt) and private key (web.key) are available" - if [[ -f "web.crt" && -f "web.key" ]]; then - DEST=$ADMIN_USERNAME@$IP_ADDRESS:${DEST_FOLDER}/cert - echo "Destination folder is ${DEST}" - sshpass -p $PASSWORD scp -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null web.{crt,key} $DEST - report_status "$?" "copying cert/key to VM ${DEST} folder" - secure=true - else - echo "No web.crt and web.key found" - secure=false - fi - - echo "Starting dashboard" - az vm run-command invoke \ - --output json \ - --resource-group $RESOURCE_GROUP \ - --command-id RunShellScript \ - --name $VM_NAME \ - --scripts \ - "cd ${DEST_FOLDER} && python3 -m nvflare.dashboard.cli --start -f ${DEST_FOLDER} --cred ${credential} {~~START_OPT~~}" > /tmp/dashboard.json - - # credential=$(jq -r .value[0].message /tmp/dashboard.json | grep "Project admin") - # echo "The VM was created with user: ${ADMIN_USERNAME} and password: ${PASSWORD}" - if [ "$secure" = true ] - then - echo "URL is https://${IP_ADDRESS}" - else - echo "URL is http://${IP_ADDRESS}:443" - fi - echo "Note: you may need to configure DNS server with your DNS hostname and the above IP address." - echo "Project admin credential (username:password) is ${credential} ." - echo "To stop the dashboard, run az group delete -n ${RESOURCE_GROUP}" - echo "To login to the VM with SSH, use ${ADMIN_USERNAME} : ${PASSWORD}" > vm_credential.txt - adm_notebook: | { "cells": [ @@ -1611,402 +941,3 @@ adm_notebook: | "nbformat_minor": 5 } -aws_start_svr_sh: | - VM_NAME=nvflare_server - SECURITY_GROUP=nvflare_server_sg - DEST_FOLDER=/var/tmp/cloud - KEY_PAIR=NVFlareServerKeyPair - KEY_FILE=${KEY_PAIR}.pem - - echo "This script requires aws (AWS CLI), sshpass, dig and jq. Now checking if they are installed." - - check_binary aws "Please see https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html on how to install it on your system." 
- check_binary sshpass "Please install it first." - check_binary dig "Please install it first." - check_binary jq "Please install it first." - - if [ -z ${image_name+x} ] - then - container=false - else - container=true - fi - - if [ $container = true ] - then - AMI_IMAGE=ami-06b8d5099f3a8d79d - EC2_TYPE=t2.xlarge - REGION=us-west-2 - else - AMI_IMAGE=ami-04bad3c587fe60d89 - EC2_TYPE=t2.small - REGION=us-west-2 - fi - - if [ -z ${config_file+x} ] - then - useDefault=true - else - useDefault=false - . $config_file - report_status "$?" "Loading config file" - fi - - - if [ $useDefault = true ] - then - while true - do - prompt AMI_IMAGE "Cloud AMI image, press ENTER to accept default ${AMI_IMAGE}: " - prompt EC2_TYPE "Cloud EC2 type, press ENTER to accept default ${EC2_TYPE}: " - prompt REGIION "Cloud EC2 region, press ENTER to accept default ${REGION}: " - prompt ans "region = ${REGION}, ami image = ${AMI_IMAGE}, EC2 type = ${EC2_TYPE}, OK? (Y/n) " - if [[ $ans = "" ]] || [[ $ans =~ ^(y|Y)$ ]] - then - break - fi - done - fi - - if [ $container = false ] - then - echo "If the server requires additional dependencies, please copy the requirements.txt to ${DIR}." - prompt ans "Press ENTER when it's done or no additional dependencies. " - fi - - cd $DIR/.. - # Generate key pair - - echo "Generating key pair for VM" - - aws ec2 delete-key-pair --key-name $KEY_PAIR > /dev/null 2>&1 - rm -rf $KEY_FILE - aws ec2 create-key-pair --key-name $KEY_PAIR --query 'KeyMaterial' --output text > $KEY_FILE - report_status "$?" "creating key pair" - chmod 400 $KEY_FILE - - # Generate Security Group - - sg_result=$(aws ec2 create-security-group --group-name $SECURITY_GROUP --description "NVFlare security group") - report_status "$?" "Only one NVFL server VM and its security group is allowed. $SECURITY_GROUP exists and thus creating duplicate security group" - sg_id=$(echo $sg_result | jq -r .GroupId) - my_public_ip=$(dig +short myip.opendns.com @resolver1.opendns.com) - if [ "$?" 
-eq 0 ] && [[ "$my_public_ip" =~ ^(([1-9]?[0-9]|1[0-9][0-9]|2([0-4][0-9]|5[0-5]))\.){3}([1-9]?[0-9]|1[0-9][0-9]|2([0-4][0-9]|5[0-5]))$ ]] - then - aws ec2 authorize-security-group-ingress --group-id $sg_id --protocol tcp --port 22 --cidr ${my_public_ip}/32 > /tmp/sec_grp.log - else - echo "getting my public IP failed, please manually configure the inbound rule to limit SSH access" - aws ec2 authorize-security-group-ingress --group-id $sg_id --protocol tcp --port 22 --cidr 0.0.0.0/0 > /tmp/sec_grp.log - fi - aws ec2 authorize-security-group-ingress --group-id $sg_id --protocol tcp --port 8002-8003 --cidr 0.0.0.0/0 >> /tmp/sec_grp.log - report_status "$?" "creating security group rules" - - # Start provisioning - - echo "Creating VM at region $REGION, may take a few minutes." - - aws ec2 run-instances --region $REGION --image-id $AMI_IMAGE --count 1 --instance-type $EC2_TYPE --key-name $KEY_PAIR --security-group-ids $sg_id > vm_create.json - report_status "$?" "creating VM" - instance_id=$(jq -r .Instances[0].InstanceId vm_create.json) - - aws ec2 wait instance-status-ok --instance-ids $instance_id - aws ec2 describe-instances --instance-ids $instance_id > vm_result.json - - IP_ADDRESS=$(jq -r .Reservations[0].Instances[0].PublicIpAddress vm_result.json) - - echo "VM created with IP address: ${IP_ADDRESS}" - - echo "Copying files to $VM_NAME" - DEST_SITE=ubuntu@${IP_ADDRESS} - DEST=${DEST_SITE}:${DEST_FOLDER} - echo "Destination folder is ${DEST}" - scp -q -i $KEY_FILE -r -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null $PWD $DEST - report_status "$?" "copying startup kits to VM" - - if [ $container = true ] - then - echo "Launching container with docker option ${DOCKER_OPTION}." 
- ssh -f -i $KEY_FILE -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${DEST_SITE} \ - "docker run -d -v ${DEST_FOLDER}:${DEST_FOLDER} --network host ${DOCKER_OPTION} ${image_name} \ - /bin/bash -c \"python -u -m nvflare.private.fed.app.server.server_train -m ${DEST_FOLDER} \ - -s fed_server.json --set secure_train=true config_folder=config org={~~ORG~~} \" " > /tmp/nvflare.log 2>&1 - report_status "$?" "launching container" - else - ssh -f -i $KEY_FILE -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${DEST_SITE} \ - "pwd && wget -q https://bootstrap.pypa.io/get-pip.py && \ - python3 get-pip.py && python3 -m pip install nvflare && \ - touch ${DEST_FOLDER}/startup/requirements.txt && \ - python3 -m pip install -r ${DEST_FOLDER}/startup/requirements.txt && \ - nohup ${DEST_FOLDER}/startup/start.sh && sleep 20 && \ - exit" > /tmp/nvflare.log 2>&1 - report_status "$?" "installing packages" - fi - - echo "System was provisioned" - echo "To terminate the EC2 instance, run the following command." - echo "aws ec2 terminate-instances --instance-ids ${instance_id}" - echo "Other resources provisioned" - echo "security group: ${SECURITY_GROUP}" - echo "key pair: ${KEY_PAIR}" - -aws_start_cln_sh: | - VM_NAME=nvflare_client - SECURITY_GROUP=nvflare_client_sg_$RANDOM - DEST_FOLDER=/var/tmp/cloud - KEY_PAIR=NVFlareClientKeyPair - KEY_FILE=${KEY_PAIR}.pem - - echo "This script requires aws (AWS CLI), sshpass, dig and jq. Now checking if they are installed." - - check_binary aws "Please see https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html on how to install it on your system." - check_binary sshpass "Please install it first." - check_binary dig "Please install it first." - check_binary jq "Please install it first." 
- - if [ -z ${image_name+x} ] - then - container=false - else - container=true - fi - - if [ $container = true ] - then - AMI_IMAGE=ami-06b8d5099f3a8d79d - EC2_TYPE=t2.xlarge - REGION=us-west-2 - else - AMI_IMAGE=ami-04bad3c587fe60d89 - EC2_TYPE=t2.small - REGION=us-west-2 - fi - - if [ -z ${config_file+x} ] - then - useDefault=true - else - useDefault=false - . $config_file - report_status "$?" "Loading config file" - fi - - if [ $useDefault = true ] - then - while true - do - prompt AMI_IMAGE "Cloud AMI image, press ENTER to accept default ${AMI_IMAGE}: " - prompt EC2_TYPE "Cloud EC2 type, press ENTER to accept default ${EC2_TYPE}: " - prompt REGIION "Cloud EC2 region, press ENTER to accept default ${REGION}: " - prompt ans "region = ${REGION}, ami image = ${AMI_IMAGE}, EC2 type = ${EC2_TYPE}, OK? (Y/n) " - if [[ $ans = "" ]] || [[ $ans =~ ^(y|Y)$ ]] - then - break - fi - done - fi - - if [ $container = false ] - then - echo "If the client requires additional dependencies, please copy the requirements.txt to ${DIR}." - prompt ans "Press ENTER when it's done or no additional dependencies. " - fi - - cd $DIR/.. - # Generate key pair - - echo "Generating key pair for VM" - - aws ec2 delete-key-pair --key-name $KEY_PAIR > /dev/null 2>&1 - rm -rf $KEY_FILE - aws ec2 create-key-pair --key-name $KEY_PAIR --query 'KeyMaterial' --output text > $KEY_FILE - report_status "$?" "creating key pair" - chmod 400 $KEY_FILE - - # Generate Security Group - # Try not reusing existing security group because we have to modify it for our own need. - sg_id=$(aws ec2 create-security-group --group-name $SECURITY_GROUP --description "NVFlare security group" | jq -r .GroupId) - report_status "$?" "creating security group" - my_public_ip=$(dig +short myip.opendns.com @resolver1.opendns.com) - if [ "$?" 
-eq 0 ] && [[ "$my_public_ip" =~ ^(([1-9]?[0-9]|1[0-9][0-9]|2([0-4][0-9]|5[0-5]))\.){3}([1-9]?[0-9]|1[0-9][0-9]|2([0-4][0-9]|5[0-5]))$ ]] - then - aws ec2 authorize-security-group-ingress --group-id $sg_id --protocol tcp --port 22 --cidr ${my_public_ip}/32 > /tmp/sec_grp.log - else - echo "getting my public IP failed, please manually configure the inbound rule to limit SSH access" - aws ec2 authorize-security-group-ingress --group-id $sg_id --protocol tcp --port 22 --cidr 0.0.0.0/0 > /tmp/sec_grp.log - fi - report_status "$?" "creating security group rules" - - # Start provisioning - - echo "Creating VM at region $REGION, may take a few minutes." - - aws ec2 run-instances --region $REGION --image-id $AMI_IMAGE --count 1 --instance-type $EC2_TYPE --key-name $KEY_PAIR --security-group-ids $sg_id > vm_create.json - report_status "$?" "creating VM" - instance_id=$(jq -r .Instances[0].InstanceId vm_create.json) - - aws ec2 wait instance-status-ok --instance-ids $instance_id - aws ec2 describe-instances --instance-ids $instance_id > vm_result.json - - IP_ADDRESS=$(jq -r .Reservations[0].Instances[0].PublicIpAddress vm_result.json) - - echo "VM created with IP address: ${IP_ADDRESS}" - - echo "Copying files to $VM_NAME" - DEST_SITE=ubuntu@${IP_ADDRESS} - DEST=${DEST_SITE}:${DEST_FOLDER} - echo "Destination folder is ${DEST}" - scp -q -i $KEY_FILE -r -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null $PWD $DEST - report_status "$?" "copying startup kits to VM" - - if [ $container = true ] - then - echo "Launching container with docker option ${DOCKER_OPTION}." 
- ssh -f -i $KEY_FILE -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${DEST_SITE} \ - "docker run -d -v ${DEST_FOLDER}:${DEST_FOLDER} --network host ${DOCKER_OPTION} ${image_name} \ - /bin/bash -c \"python -u -m nvflare.private.fed.app.client.client_train -m ${DEST_FOLDER} \ - -s fed_client.json --set uid={~~SITE~~} secure_train=true config_folder=config org={~~ORG~~} \" " > /tmp/nvflare.log 2>&1 - report_status "$?" "launching container" - else - echo "Installing packages in $VM_NAME, may take a few minutes." - ssh -f -i $KEY_FILE -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${DEST_SITE} \ - "pwd && wget -q https://bootstrap.pypa.io/get-pip.py && \ - python3 get-pip.py && python3 -m pip install nvflare && \ - touch ${DEST_FOLDER}/startup/requirements.txt && \ - python3 -m pip install -r ${DEST_FOLDER}/startup/requirements.txt && \ - nohup ${DEST_FOLDER}/startup/start.sh && sleep 20 && \ - exit" > /tmp/nvflare.log 2>&1 - - report_status "$?" "installing packages" - fi - - echo "System was provisioned" - echo "To terminate the EC2 instance, run the following command." - echo "aws ec2 terminate-instances --instance-ids ${instance_id}" - echo "Other resources provisioned" - echo "security group: ${SECURITY_GROUP}" - echo "key pair: ${KEY_PAIR}" - - -aws_start_dsb_sh: | - VM_NAME=nvflare_dashboard - AMI_IMAGE=ami-04bad3c587fe60d89 - EC2_TYPE=t2.small - SECURITY_GROUP=nvflare_dashboard_sg_$RANDOM - REGION=us-west-2 - ADMIN_USERNAME=ubuntu - DEST_FOLDER=/home/${ADMIN_USERNAME} - KEY_PAIR=NVFlareDashboardKeyPair - KEY_FILE=${KEY_PAIR}.pem - - echo "This script requires aws (AWS CLI), sshpass, dig and jq. Now checking if they are installed." - - check_binary aws "Please see https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html on how to install it on your system." - check_binary sshpass "Please install it first." - check_binary dig "Please install it first." - check_binary jq "Please install it first." 
- - echo "One initial user will be created when starting dashboard." - echo "Please enter the email address for this user." - read email - credential="${email}:$RANDOM" - - # Generate key pair - - echo "Generating key pair for VM" - - aws ec2 delete-key-pair --key-name $KEY_PAIR > /dev/null 2>&1 - rm -rf $KEY_FILE - aws ec2 create-key-pair --key-name $KEY_PAIR --query 'KeyMaterial' --output text > $KEY_FILE - report_status "$?" "creating key pair" - chmod 400 $KEY_FILE - - # Generate Security Group - - sg_id=$(aws ec2 create-security-group --group-name $SECURITY_GROUP --description "NVFlare security group" | jq -r .GroupId) - report_status "$?" "creating security group" - echo "Security group id: ${sg_id}" - my_public_ip=$(dig +short myip.opendns.com @resolver1.opendns.com) - if [ "$?" -eq 0 ] && [[ "$my_public_ip" =~ ^(([1-9]?[0-9]|1[0-9][0-9]|2([0-4][0-9]|5[0-5]))\.){3}([1-9]?[0-9]|1[0-9][0-9]|2([0-4][0-9]|5[0-5]))$ ]] - then - aws ec2 authorize-security-group-ingress --group-id $sg_id --protocol tcp --port 22 --cidr ${my_public_ip}/32 > /tmp/sec_grp.log - else - echo "getting my public IP failed, please manually configure the inbound rule to limit SSH access" - aws ec2 authorize-security-group-ingress --group-id $sg_id --protocol tcp --port 22 --cidr 0.0.0.0/0 > /tmp/sec_grp.log - fi - aws ec2 authorize-security-group-ingress --group-id $sg_id --protocol tcp --port 443 --cidr 0.0.0.0/0 >> /tmp/sec_grp.log - report_status "$?" "creating security group rules" - - # Start provisioning - - echo "Creating VM at region $REGION, may take a few minutes." - - aws ec2 run-instances --region $REGION --image-id $AMI_IMAGE --count 1 --instance-type $EC2_TYPE --key-name $KEY_PAIR --security-group-ids $sg_id > vm_create.json - report_status "$?" 
"creating VM" - instance_id=$(jq -r .Instances[0].InstanceId vm_create.json) - - aws ec2 wait instance-status-ok --instance-ids $instance_id - aws ec2 describe-instances --instance-ids $instance_id > vm_result.json - - IP_ADDRESS=$(jq -r .Reservations[0].Instances[0].PublicIpAddress vm_result.json) - - echo "VM created with IP address: ${IP_ADDRESS}" - - echo "Installing docker engine in $VM_NAME, may take a few minutes." - DEST_SITE=${ADMIN_USERNAME}@${IP_ADDRESS} - scripts=$(cat << 'EOF' - sudo apt-get update && \ - sudo DEBIAN_FRONTEND=noninteractive apt-get install -y ca-certificates curl gnupg lsb-release && \ - sudo mkdir -p /etc/apt/keyrings && \ - curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo gpg --dearmor -o /etc/apt/keyrings/docker.gpg && \ - echo \ - "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu \ - $(lsb_release -cs) stable" | sudo tee /etc/apt/sources.list.d/docker.list > /dev/null && \ - sudo apt-get update && \ - sudo DEBIAN_FRONTEND=noninteractive apt-get install -y docker-ce docker-ce-cli containerd.io - EOF - ) - ssh -t -i $KEY_FILE -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${DEST_SITE} "$scripts" > /tmp/docker_engine.log - report_status "$?" "installing docker engine" - ssh -t -i $KEY_FILE -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${DEST_SITE} "sudo usermod -aG docker $ADMIN_USERNAME && exit" >> /tmp/docker_engine.log - report_status "$?" "installing docker engine" - - echo "Installing nvflare in $VM_NAME, may take a few minutes." 
- ssh -i $KEY_FILE -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${DEST_SITE} \ - "export PATH=/home/ubuntu/.local/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/snap/bin && \ - wget -q https://bootstrap.pypa.io/get-pip.py && python3 get-pip.py && \ - python3 -m pip install {~~NVFLARE~~} && \ - mkdir -p ./cert && \ - exit" > /tmp/nvflare.json - report_status "$?" "installing nvflare" - - echo "Checking if certificate (web.crt) and private key (web.key) are available" - if [[ -f "web.crt" && -f "web.key" ]]; then - CERT_FOLDER=${DEST_SITE}:${DEST_FOLDER}/cert - echo "Cert folder is ${CERT_FOLDER}" - scp -i $KEY_FILE -r -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null web.{crt,key} $CERT_FOLDER - report_status "$?" "copying cert/key to VM ${CERT_FOLDER} folder" - secure=true - else - echo "No web.crt and web.key found" - secure=false - fi - - echo "Starting dashboard" - ssh -i $KEY_FILE -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${DEST_SITE} \ - "export PATH=/home/ubuntu/.local/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/snap/bin && \ - python3 -m nvflare.dashboard.cli --start -f ${DEST_FOLDER} --cred ${credential} {~~START_OPT~~}" > /tmp/dashboard.json - - echo "Dashboard url is running at IP address ${IP_ADDRESS}, listening to port 443." - if [ "$secure" = true ] - then - echo "URL is https://${IP_ADDRESS}" - else - echo "URL is http://${IP_ADDRESS}:443" - fi - echo "Note: you may need to configure DNS server with your DNS hostname and the above IP address." - echo "Project admin credential (username:password) is ${credential} ." - echo "To terminate the EC2 instance, run the following command." 
- echo "aws ec2 terminate-instances --instance-ids ${instance_id}" - echo "Other resources provisioned" - echo "security group: ${SECURITY_GROUP}" - echo "key pair: ${KEY_PAIR}" diff --git a/nvflare/lighter/impl/static_file.py b/nvflare/lighter/impl/static_file.py index 21ef4c8f04..a024c84a43 100644 --- a/nvflare/lighter/impl/static_file.py +++ b/nvflare/lighter/impl/static_file.py @@ -18,8 +18,8 @@ import yaml +from nvflare.lighter import utils from nvflare.lighter.spec import Builder -from nvflare.lighter.utils import sh_replace class StaticFileBuilder(Builder): @@ -61,13 +61,6 @@ def __init__( self.snapshot_persistor = snapshot_persistor self.components = components - def _write(self, file_full_path, content, mode, exe=False): - mode = mode + "w" - with open(file_full_path, mode) as f: - f.write(content) - if exe: - os.chmod(file_full_path, 0o755) - def get_server_name(self, server): return server.name @@ -76,7 +69,7 @@ def get_overseer_name(self, overseer): def _build_overseer(self, overseer, ctx): dest_dir = self.get_kit_dir(overseer, ctx) - self._write( + utils._write( os.path.join(dest_dir, "start.sh"), self.template["start_svr_sh"], "t", @@ -95,7 +88,7 @@ def _build_overseer(self, overseer, ctx): privilege_dict[role].append(admin.subject) else: privilege_dict[role] = [admin.subject] - self._write( + utils._write( os.path.join(dest_dir, "privilege.yml"), yaml.dump(privilege_dict, Dumper=yaml.Dumper), "t", @@ -103,19 +96,19 @@ def _build_overseer(self, overseer, ctx): ) if self.docker_image: - self._write( + utils._write( os.path.join(dest_dir, "docker.sh"), - sh_replace(self.template["docker_svr_sh"], replacement_dict), + utils.sh_replace(self.template["docker_svr_sh"], replacement_dict), "t", exe=True, ) - self._write( + utils._write( os.path.join(dest_dir, "gunicorn.conf.py"), - sh_replace(self.template["gunicorn_conf_py"], replacement_dict), + utils.sh_replace(self.template["gunicorn_conf_py"], replacement_dict), "t", exe=False, ) - self._write( + 
utils._write( os.path.join(dest_dir, "start.sh"), self.template["start_ovsr_sh"], "t", @@ -140,11 +133,6 @@ def _build_server(self, server, ctx): server_0["service"]["scheme"] = self.scheme server_0["admin_host"] = self.get_server_name(server) server_0["admin_port"] = admin_port - # if self.download_job_url: - # server_0["download_job_url"] = self.download_job_url - # config["enable_byoc"] = server.enable_byoc - # if self.app_validator: - # config["app_validator"] = {"path": self.app_validator} if self.overseer_agent: overseer_agent = copy.deepcopy(self.overseer_agent) if overseer_agent.get("overseer_exists", True): @@ -158,46 +146,36 @@ def _build_server(self, server, ctx): } overseer_agent.pop("overseer_exists", None) config["overseer_agent"] = overseer_agent - # if self.snapshot_persistor: - # config["snapshot_persistor"] = self.snapshot_persistor - # components = server.props.get("components", []) - # config["components"] = list() - # for comp in components: - # temp_dict = {"id": comp} - # temp_dict.update(components[comp]) - # config["components"].append(temp_dict) - # provisioned_client_list = list() - # for client in self.project.get_participants_by_type("client", first_only=False): - # provisioned_client_list.append(client.name) - # config["provisioned_client_list"] = provisioned_client_list - self._write(os.path.join(dest_dir, "fed_server.json"), json.dumps(config, indent=2), "t") + utils._write(os.path.join(dest_dir, "fed_server.json"), json.dumps(config, indent=2), "t") replacement_dict = { "admin_port": admin_port, "fed_learn_port": fed_learn_port, "config_folder": self.config_folder, "docker_image": self.docker_image, "org_name": server.org, + "type": "server", + "cln_uid": "", } if self.docker_image: - self._write( + utils._write( os.path.join(dest_dir, "docker.sh"), - sh_replace(self.template["docker_svr_sh"], replacement_dict), + utils.sh_replace(self.template["docker_svr_sh"], replacement_dict), "t", exe=True, ) - self._write( + utils._write( 
os.path.join(dest_dir, "start.sh"), self.template["start_svr_sh"], "t", exe=True, ) - self._write( + utils._write( os.path.join(dest_dir, "sub_start.sh"), - sh_replace(self.template["sub_start_svr_sh"], replacement_dict), + utils.sh_replace(self.template["sub_start_sh"], replacement_dict), "t", exe=True, ) - self._write( + utils._write( os.path.join(dest_dir, "stop_fl.sh"), self.template["stop_fl_sh"], "t", @@ -205,29 +183,29 @@ def _build_server(self, server, ctx): ) # local folder creation dest_dir = self.get_local_dir(server, ctx) - self._write( + utils._write( os.path.join(dest_dir, "log.config.default"), self.template["log_config"], "t", ) - self._write( + utils._write( os.path.join(dest_dir, "resources.json.default"), self.template["local_server_resources"], "t", ) - self._write( + utils._write( os.path.join(dest_dir, "privacy.json.sample"), self.template["sample_privacy"], "t", ) - self._write( + utils._write( os.path.join(dest_dir, "authorization.json.default"), self.template["default_authz"], "t", ) # workspace folder file - self._write( + utils._write( os.path.join(self.get_ws_dir(server, ctx), "readme.txt"), self.template["readme_fs"], "t", @@ -247,6 +225,8 @@ def _build_client(self, client, ctx): "config_folder": self.config_folder, "docker_image": self.docker_image, "org_name": client.org, + "type": "client", + "cln_uid": f"uid={client.subject}", } if self.overseer_agent: overseer_agent = copy.deepcopy(self.overseer_agent) @@ -266,27 +246,27 @@ def _build_client(self, client, ctx): # temp_dict.update(components[comp]) # config["components"].append(temp_dict) - self._write(os.path.join(dest_dir, "fed_client.json"), json.dumps(config, indent=2), "t") + utils._write(os.path.join(dest_dir, "fed_client.json"), json.dumps(config, indent=2), "t") if self.docker_image: - self._write( + utils._write( os.path.join(dest_dir, "docker.sh"), - sh_replace(self.template["docker_cln_sh"], replacement_dict), + utils.sh_replace(self.template["docker_cln_sh"], 
replacement_dict), "t", exe=True, ) - self._write( + utils._write( os.path.join(dest_dir, "start.sh"), self.template["start_cln_sh"], "t", exe=True, ) - self._write( + utils._write( os.path.join(dest_dir, "sub_start.sh"), - sh_replace(self.template["sub_start_cln_sh"], replacement_dict), + utils.sh_replace(self.template["sub_start_sh"], replacement_dict), "t", exe=True, ) - self._write( + utils._write( os.path.join(dest_dir, "stop_fl.sh"), self.template["stop_fl_sh"], "t", @@ -294,29 +274,29 @@ def _build_client(self, client, ctx): ) # local folder creation dest_dir = self.get_local_dir(client, ctx) - self._write( + utils._write( os.path.join(dest_dir, "log.config.default"), self.template["log_config"], "t", ) - self._write( + utils._write( os.path.join(dest_dir, "resources.json.default"), self.template["local_client_resources"], "t", ) - self._write( + utils._write( os.path.join(dest_dir, "privacy.json.sample"), self.template["sample_privacy"], "t", ) - self._write( + utils._write( os.path.join(dest_dir, "authorization.json.default"), self.template["default_authz"], "t", ) # workspace folder file - self._write( + utils._write( os.path.join(self.get_ws_dir(client, ctx), "readme.txt"), self.template["readme_fc"], "t", @@ -335,21 +315,21 @@ def _build_admin(self, admin, ctx): config = self.prepare_admin_config(admin, ctx) - self._write(os.path.join(dest_dir, "fed_admin.json"), json.dumps(config, indent=2), "t") + utils._write(os.path.join(dest_dir, "fed_admin.json"), json.dumps(config, indent=2), "t") if self.docker_image: - self._write( + utils._write( os.path.join(dest_dir, "docker.sh"), - sh_replace(self.template["docker_adm_sh"], replacement_dict), + utils.sh_replace(self.template["docker_adm_sh"], replacement_dict), "t", exe=True, ) - self._write( + utils._write( os.path.join(dest_dir, "fl_admin.sh"), - sh_replace(self.template["fl_admin_sh"], replacement_dict), + utils.sh_replace(self.template["fl_admin_sh"], replacement_dict), "t", exe=True, ) - self._write( + 
utils._write( os.path.join(dest_dir, "readme.txt"), self.template["readme_am"], "t", diff --git a/nvflare/lighter/impl/template.py b/nvflare/lighter/impl/template.py index a85277ad11..e3a19e8261 100644 --- a/nvflare/lighter/impl/template.py +++ b/nvflare/lighter/impl/template.py @@ -26,6 +26,8 @@ class TemplateBuilder(Builder): def initialize(self, ctx): resource_dir = self.get_resources_dir(ctx) - template_file = ctx.get("template_file") - template = load_yaml(os.path.join(resource_dir, template_file)) + template_files = ctx.get("template_files") + template = dict() + for tplt_file in template_files: + template.update(load_yaml(os.path.join(resource_dir, tplt_file))) ctx["template"] = template diff --git a/nvflare/lighter/impl/workspace.py b/nvflare/lighter/impl/workspace.py index 4926b20629..6b203227df 100644 --- a/nvflare/lighter/impl/workspace.py +++ b/nvflare/lighter/impl/workspace.py @@ -43,9 +43,9 @@ def __init__(self, template_file): wip/ <--- this is only used during runtime, and will be removed when the provision command exits Args: - template_file: name of template file containing scripts and configs to put into startup folders + template_file: name(s) of template file(s) containing scripts and configs to put into startup folders """ - self.template_file = template_file + self.template_files = template_file def _make_dir(self, dirs): for dir in dirs: @@ -61,10 +61,15 @@ def initialize(self, ctx): if stage > last: last = stage ctx["last_prod_stage"] = last - template_file_full_path = os.path.join(self.get_resources_dir(ctx), self.template_file) - file_path = pathlib.Path(__file__).parent.absolute() - shutil.copyfile(os.path.join(file_path, self.template_file), template_file_full_path) - ctx["template_file"] = self.template_file + if not isinstance(self.template_files, list): + self.template_files = [self.template_files] + tplt_file_list = [] + for tplt_file in self.template_files: + tplt_file_full_path = os.path.join(self.get_resources_dir(ctx), 
tplt_file) + file_path = pathlib.Path(__file__).parent.absolute() + shutil.copyfile(os.path.join(file_path, tplt_file), tplt_file_full_path) + tplt_file_list.append(tplt_file) + ctx["template_files"] = tplt_file_list def build(self, project: Project, ctx: dict): dirs = [self.get_kit_dir(p, ctx) for p in project.participants] diff --git a/nvflare/lighter/tplt_utils.py b/nvflare/lighter/tplt_utils.py index e0ebf5aec9..590052a5cd 100644 --- a/nvflare/lighter/tplt_utils.py +++ b/nvflare/lighter/tplt_utils.py @@ -13,9 +13,81 @@ # limitations under the License. +from . import utils + + class Template: def __init__(self, template): self.template = template + self.supported_csps = ("azure", "aws") def get_cloud_script_header(self): return self.template.get("cloud_script_header") + + def get_azure_server_start_sh(self, entity): + tmp = self.get_cloud_script_header() + self.get_azure_start_svr_header_sh() + self.get_azure_start_common_sh() + script = utils.sh_replace( + tmp, + { + "type": "server", + "docker_network": "--network host", + "cln_uid": "", + "server_name": entity.name, + "ORG": "", + }, + ) + return script + + def get_aws_server_start_sh(self, entity): + tmp = self.get_cloud_script_header() + self.template.get("aws_start_sh") + script = utils.sh_replace( + tmp, + { + "type": "server", + "inbound_rule": "aws ec2 authorize-security-group-ingress --group-id $sg_id --protocol tcp --port 8002-8003 --cidr 0.0.0.0/0 >> /tmp/sec_grp.log", + "cln_uid": "", + "server_name": entity.name, + "ORG": "", + }, + ) + return script + + def get_azure_client_start_sh(self, entity): + tmp = self.get_cloud_script_header() + self.get_azure_start_cln_header_sh() + self.get_azure_start_common_sh() + script = utils.sh_replace( + tmp, + {"type": "client", "docker_network": "", "cln_uid": f"uid={entity.name}", "ORG": entity.org}, + ) + return script + + def get_aws_client_start_sh(self, entity): + tmp = self.get_cloud_script_header() + self.template.get("aws_start_sh") + script = 
utils.sh_replace( + tmp, {"type": "client", "inbound_rule": "", "cln_uid": f"uid={entity.name}", "ORG": entity.org} + ) + return script + + def get_azure_start_svr_header_sh(self): + return self.template.get("azure_start_svr_header_sh") + + def get_azure_start_cln_header_sh(self): + return self.template.get("azure_start_cln_header_sh") + + def get_azure_start_common_sh(self): + return self.template.get("azure_start_common_sh") + + def get_sub_start_sh(self): + return self.template.get("sub_start_sh") + + def get_azure_svr_sh(self): + return self.get_cloud_script_header() + self.get_azure_start_svr_header_sh() + self.get_azure_start_common_sh() + + def get_azure_cln_sh(self): + return self.get_cloud_script_header() + self.get_azure_start_cln_header_sh() + self.get_azure_start_common_sh() + + def get_start_sh(self, csp, type, entity): + try: + func = getattr(self, f"get_{csp}_{type}_start_sh") + return func(entity) + except AttributeError: + return "" diff --git a/nvflare/lighter/utils.py b/nvflare/lighter/utils.py index fa202b480a..e7836537d8 100644 --- a/nvflare/lighter/utils.py +++ b/nvflare/lighter/utils.py @@ -224,3 +224,75 @@ def update_storage_locations( json_object = json.dumps(resources, indent=4) with open(target_resource, "w") as outfile: outfile.write(json_object) + + +def _write(file_full_path, content, mode, exe=False): + mode = mode + "w" + with open(file_full_path, mode) as f: + f.write(content) + if exe: + os.chmod(file_full_path, 0o755) + + +def _write_common(type, dest_dir, template, tplt, replacement_dict, config): + mapping = {"server": "svr", "client": "cln"} + _write(os.path.join(dest_dir, f"fed_{type}.json"), json.dumps(config, indent=2), "t") + _write( + os.path.join(dest_dir, "docker.sh"), + sh_replace(template[f"docker_{mapping[type]}_sh"], replacement_dict), + "t", + exe=True, + ) + _write( + os.path.join(dest_dir, "start.sh"), + sh_replace(template[f"start_{mapping[type]}_sh"], replacement_dict), + "t", + exe=True, + ) + _write( + 
os.path.join(dest_dir, "sub_start.sh"), + sh_replace(tplt.get_sub_start_sh(), replacement_dict), + "t", + exe=True, + ) + _write( + os.path.join(dest_dir, "stop_fl.sh"), + template["stop_fl_sh"], + "t", + exe=True, + ) + + +def _write_local(type, dest_dir, template, capacity=""): + _write( + os.path.join(dest_dir, "log.config.default"), + template["log_config"], + "t", + ) + _write( + os.path.join(dest_dir, "privacy.json.sample"), + template["sample_privacy"], + "t", + ) + _write( + os.path.join(dest_dir, "authorization.json.default"), + template["default_authz"], + "t", + ) + resources = json.loads(template["local_client_resources"]) + if type == "client": + for component in resources["components"]: + if "nvflare.app_common.resource_managers.gpu_resource_manager.GPUResourceManager" == component["path"]: + component["args"] = json.loads(capacity) + break + _write( + os.path.join(dest_dir, "resources.json.default"), + json.dumps(resources, indent=2), + "t", + ) + + +def _write_pki(type, dest_dir, cert_pair, root_cert): + _write(os.path.join(dest_dir, f"{type}.crt"), cert_pair.ser_cert, "b", exe=False) + _write(os.path.join(dest_dir, f"{type}.key"), cert_pair.ser_pri_key, "b", exe=False) + _write(os.path.join(dest_dir, "rootCA.pem"), root_cert, "b", exe=False)