diff --git a/python/examples/launch/README.md b/python/examples/launch/README.md
index 02375d50f9..416f069d2d 100644
--- a/python/examples/launch/README.md
+++ b/python/examples/launch/README.md
@@ -48,9 +48,8 @@ computing:
   device_type: GPU              # options: GPU, CPU, hybrid
   resource_type: A100-80G       # e.g., A100-80G, please check the resource type list by "fedml show-resource-type" or visiting URL: https://open.fedml.ai/accelerator_resource_type
 
-framework_type: fedml # options: fedml, deepspeed, pytorch, general
-
 job_type: train # options: train, deploy, federate
+framework_type: fedml # options: fedml, deepspeed, pytorch, general
 
 # train subtype: general_training, single_machine_training, cluster_distributed_training, cross_cloud_training
 # federate subtype: cross_silo, simulation, web, smart_phone
@@ -63,6 +62,39 @@ server_job: |
   echo "Hello, Here is the server job."
   echo "Current directory is as follows."
   pwd
+
+# If you want to use the job created by the MLOps platform,
+# just uncomment the following three lines, then set job_id and config_id to your desired job id and related config.
+#job_args:
+#  job_id: 2070
+#  config_id: 111
+
+# If you want to create the job with a specific name, just uncomment the following line and set job_name to your desired job name.
+#job_name: cv_job
+
+# If you want to pass your API key to your job for calling FEDML APIs, you may uncomment the following line and set your API key here.
+# You may use the environment variable FEDML_RUN_API_KEY to get your API key in your job commands or scripts.
+#run_api_key: my_api_key
+
+# If you want to use the model created by the MLOps platform or create your own model card with a specified name,
+# just uncomment the following four lines, then set model_name to your desired model name and/or set your desired endpoint name.
+#serving_args:
+#  model_name: "fedml-launch-sample-model" # Model card from MLOps platform or create your own model card with a specified name
+#  model_version: "" # Model version from MLOps platform, or an empty string "" to use the latest version.
+#  endpoint_name: "fedml-launch-endpoint" # Set the endpoint name to be deployed; it can be an empty string "", in which case it will be auto-generated.
+
+# Dataset related arguments
+fedml_data_args:
+  dataset_name: mnist
+  dataset_path: ./dataset
+  dataset_type: csv
+
+# Model related arguments
+fedml_model_args:
+  input_dim: '784'
+  model_cache_path: /Users/alexliang/fedml_models
+  model_name: lr
+  output_dim: '10'
 ```
 
 You just need to customize the following config items.
@@ -111,10 +143,20 @@ For querying the realtime status of your job, please run the following command.
 ```
 fedml job logs -jid 1696947481910317056
 ```
+
+## Supported Environment Variables
+You may use the following environment variables in your job commands or scripts.
+```
+$FEDML_CURRENT_JOB_ID, current run id for your job
+$FEDML_CURRENT_EDGE_ID, current edge device id for your job
+$FEDML_CLIENT_RANK, current device index for your job
+$FEDML_CURRENT_VERSION, current fedml config version, options: dev, test or release
+$FEDML_RUN_API_KEY, current API key from your job.yaml with the config item run_api_key
+```
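+
+A minimal sketch of consuming these variables from a Python job script (the script and its prints are hypothetical, not part of the FedML API):
+```
+import os
+
+# These variables are exported by the launch runner before your job commands start.
+job_id = os.getenv("FEDML_CURRENT_JOB_ID", "")    # current run id
+edge_id = os.getenv("FEDML_CURRENT_EDGE_ID", "")  # current edge device id
+rank = int(os.getenv("FEDML_CLIENT_RANK", "0"))   # current device index
+api_key = os.getenv("FEDML_RUN_API_KEY", "")      # set only when run_api_key is given in your job yaml
+print(f"job {job_id} on edge {edge_id} (rank {rank}); API key provided: {bool(api_key)}")
+```
+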
 ## Login as the GPU supplier
 If you want to log in as a GPU supplier and join the FedML launch payment system, you just need to run the following command.
 ```
-fedml login $YourUserId -k $YourApiKey -r gpu_supplier
+fedml login $YourApiKey -r gpu_supplier
 ```
 
 Then you may find your GPU device in the FedML launch platform https://open.fedml.ai/gpu-supplier/gpus/index
diff --git a/python/examples/launch/federate_build_package/README.md b/python/examples/launch/federate_build_package/README.md
new file mode 100644
index 0000000000..4eb0f10548
--- /dev/null
+++ b/python/examples/launch/federate_build_package/README.md
@@ -0,0 +1,112 @@
+
+## Build the package for FEDML Federate
+```
+Usage: fedml federate build [OPTIONS]
+
+  Build federate packages for the FedML® Launch platform (open.fedml.ai).
+
+Options:
+  -h, --help                    Show this message and exit.
+  -s, --server                  build the server package, default is building
+                                client package.
+  -sf, --source_folder TEXT     the source code folder path
+  -ep, --entry_point TEXT       the entry point of the source code
+  -ea, --entry_args TEXT        entry arguments of the entry point program
+  -cf, --config_folder TEXT     the config folder path
+  -df, --dest_folder TEXT       the destination package folder path
+  -ig, --ignore TEXT            the ignore list for copying files, the format
+                                is as follows: *.model,__pycache__,*.data*,
+  -m, --model_name TEXT         model name for training.
+  -mc, --model_cache_path TEXT  model cache path for training.
+  -mi, --input_dim TEXT         input dimensions for training.
+  -mo, --output_dim TEXT        output dimensions for training.
+  -dn, --dataset_name TEXT      dataset name for training.
+  -dt, --dataset_type TEXT      dataset type for training.
+  -dp, --dataset_path TEXT      dataset path for training.
+```
+
+First, you need to define your package properties as follows.
+If you want to ignore some folders or files, you may specify the ignore argument
+or add them to the .gitignore file in the source code folder.
+
+### Required arguments:
+source code folder, entry file, entry arguments,
+config folder, built destination folder
+
+### Optional arguments:
+You may define the model and data arguments using the command arguments as follows.
+```
+model name, model cache path, model input dimension, model output dimension,
+dataset name, dataset type, dataset path.
+```
+
+Also, you may define the model and data arguments using the file named fedml_config.yaml as follows.
+```
+fedml_data_args:
+  dataset_name: mnist
+  dataset_path: ./dataset
+  dataset_type: csv
+
+fedml_model_args:
+  input_dim: '784'
+  model_cache_path: /Users/alexliang/fedml_models
+  model_name: lr
+  output_dim: '10'
+```
+
+The above model and data arguments will be mapped to the equivalent environment variables as follows.
+```
+dataset_name = $FEDML_DATASET_NAME
+dataset_path = $FEDML_DATASET_PATH
+dataset_type = $FEDML_DATASET_TYPE
+model_name = $FEDML_MODEL_NAME
+model_cache_path = $FEDML_MODEL_CACHE_PATH
+input_dim = $FEDML_MODEL_INPUT_DIM
+output_dim = $FEDML_MODEL_OUTPUT_DIM
+```
+
+You may pass these environment variables as your entry arguments, e.g.,
+```
+ENTRY_ARGS_MODEL_DATA='-m $FEDML_MODEL_NAME -mc $FEDML_MODEL_CACHE_PATH -mi $FEDML_MODEL_INPUT_DIM -mo $FEDML_MODEL_OUTPUT_DIM -dn $FEDML_DATASET_NAME -dt $FEDML_DATASET_TYPE -dp $FEDML_DATASET_PATH'
+```
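+
+For instance, here is a minimal sketch of how an entry file (a hypothetical torch_client.py) could consume these entry arguments; the flags mirror ENTRY_ARGS_MODEL_DATA above:
+```
+import argparse
+
+parser = argparse.ArgumentParser()
+parser.add_argument("-m", "--model_name", type=str, default="lr")
+parser.add_argument("-mc", "--model_cache_path", type=str, default="")
+parser.add_argument("-mi", "--input_dim", type=int, default=784)
+parser.add_argument("-mo", "--output_dim", type=int, default=10)
+parser.add_argument("-dn", "--dataset_name", type=str, default="mnist")
+parser.add_argument("-dt", "--dataset_type", type=str, default="csv")
+parser.add_argument("-dp", "--dataset_path", type=str, default="")
+args, _ = parser.parse_known_args()  # tolerate extra arguments injected by the runner
+print(f"model={args.model_name}, dataset={args.dataset_name} ({args.dataset_type}) at {args.dataset_path}")
+```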
+
+### Examples
+```
+# Define the federated package properties
+SOURCE_FOLDER=.
+ENTRY_FILE=torch_client.py
+ENTRY_ARGS='--epochs 1'
+ENTRY_ARGS_MODEL_DATA='-m $FEDML_MODEL_NAME -mc $FEDML_MODEL_CACHE_PATH -mi $FEDML_MODEL_INPUT_DIM -mo $FEDML_MODEL_OUTPUT_DIM -dn $FEDML_DATASET_NAME -dt $FEDML_DATASET_TYPE -dp $FEDML_DATASET_PATH'
+CONFIG_FOLDER=config
+DEST_FOLDER=./mlops
+MODEL_NAME=lr
+MODEL_CACHE=~/fedml_models
+MODEL_INPUT_DIM=784
+MODEL_OUTPUT_DIM=10
+DATASET_NAME=mnist
+DATASET_TYPE=csv
+DATASET_PATH=./dataset
+
+# Build the federated client package with the model and data arguments
+fedml federate build -sf $SOURCE_FOLDER -ep $ENTRY_FILE -ea "$ENTRY_ARGS" \
+  -cf $CONFIG_FOLDER -df $DEST_FOLDER \
+  -m $MODEL_NAME -mc $MODEL_CACHE -mi $MODEL_INPUT_DIM -mo $MODEL_OUTPUT_DIM \
+  -dn $DATASET_NAME -dt $DATASET_TYPE -dp $DATASET_PATH
+
+# Build the federated client package without the model and data arguments
+# fedml federate build -sf $SOURCE_FOLDER -ep $ENTRY_FILE -ea "$ENTRY_ARGS" \
+#   -cf $CONFIG_FOLDER -df $DEST_FOLDER
+
+# Define the federated server package properties
+ENTRY_FILE=torch_server.py
+
+# Build the federated server package with the model and data arguments
+fedml federate build -s -sf $SOURCE_FOLDER -ep $ENTRY_FILE -ea "$ENTRY_ARGS" \
+  -cf $CONFIG_FOLDER -df $DEST_FOLDER \
+  -m $MODEL_NAME -mc $MODEL_CACHE -mi $MODEL_INPUT_DIM -mo $MODEL_OUTPUT_DIM \
+  -dn $DATASET_NAME -dt $DATASET_TYPE -dp $DATASET_PATH
+
+# Build the federated server package without the model and data arguments
+# fedml federate build -s -sf $SOURCE_FOLDER -ep $ENTRY_FILE -ea "$ENTRY_ARGS" \
+#   -cf $CONFIG_FOLDER -df $DEST_FOLDER
+```
diff --git a/python/examples/launch/federate_build_package/build_federate_package.sh b/python/examples/launch/federate_build_package/build_federate_package.sh
index d2c0effc09..580fa60747 100755
--- a/python/examples/launch/federate_build_package/build_federate_package.sh
+++ b/python/examples/launch/federate_build_package/build_federate_package.sh
@@ -1,9 +1,9 @@
-# Build federated client package
-SOURCE=.
-ENTRY=torch_client.py
-ENTRY_ARGS='-m $FEDML_MODEL_NAME -mc $FEDML_MODEL_CACHE_PATH -mi $FEDML_MODEL_INPUT_DIM -mo $MODEL_OUTPUT_DIM -dn $FEDML_DATASET_NAME -dt $FEDML_DATASET_TYPE -dp $FEDML_DATASET_PATH'
-CONFIG=config
-DEST=./mlops
+# Define the federated package properties
+SOURCE_FOLDER=.
+ENTRY_FILE=torch_client.py
+ENTRY_ARGS='-m $FEDML_MODEL_NAME -mc $FEDML_MODEL_CACHE_PATH -mi $FEDML_MODEL_INPUT_DIM -mo $FEDML_MODEL_OUTPUT_DIM -dn $FEDML_DATASET_NAME -dt $FEDML_DATASET_TYPE -dp $FEDML_DATASET_PATH'
+CONFIG_FOLDER=config
+DEST_FOLDER=./mlops
 MODEL_NAME=lr
 MODEL_CACHE=~/fedml_models
 MODEL_INPUT_DIM=784
@@ -11,23 +11,26 @@ MODEL_OUTPUT_DIM=10
 DATASET_NAME=mnist
 DATASET_TYPE=csv
 DATASET_PATH=~/fedml_data
-fedml federate build -sf $SOURCE -ep $ENTRY -ea "$ENTRY_ARGS" \
-    -cf $CONFIG -df $DEST -m $MODEL_NAME -mc $MODEL_CACHE \
-    -mi $MODEL_INPUT_DIM -mo $MODEL_OUTPUT_DIM -dn $DATASET_NAME -dt $DATASET_TYPE -dp $DATASET_PATH
-# Build federated server package
-SOURCE=.
-ENTRY=torch_server.py -ENTRY_ARGS=='-m $FEDML_MODEL_NAME -mc $FEDML_MODEL_CACHE_PATH -mi $FEDML_MODEL_INPUT_DIM -mo $MODEL_OUTPUT_DIM -dn $FEDML_DATASET_NAME -dt $FEDML_DATASET_TYPE -dp $FEDML_DATASET_PATH' -CONFIG=config -DEST=./mlops -MODEL_NAME=lr -MODEL_CACHE=~/fedml_models -MODEL_INPUT_DIM=784 -MODEL_OUTPUT_DIM=10 -DATASET_NAME=mnist -DATASET_TYPE=csv -DATASET_PATH=~/fedml_data -fedml federate build -s -sf $SOURCE -ep $ENTRY -ea "$ENTRY_ARGS" \ - -cf $CONFIG -df $DEST -m $MODEL_NAME -mc $MODEL_CACHE \ - -mi $MODEL_INPUT_DIM -mo $MODEL_OUTPUT_DIM -dn $DATASET_NAME -dt $DATASET_TYPE -dp $DATASET_PATH +# Build the federated client package with the model and data arguments +fedml federate build -sf $SOURCE_FOLDER -ep $ENTRY_FILE -ea "$ENTRY_ARGS" \ + -cf $CONFIG_FOLDER -df $DEST_FOLDER \ + -m $MODEL_NAME -mc $MODEL_CACHE -mi $MODEL_INPUT_DIM -mo $MODEL_OUTPUT_DIM \ + -dn $DATASET_NAME -dt $DATASET_TYPE -dp $DATASET_PATH + +# Build the federated client package without the model and data arguments +#fedml federate build -sf $SOURCE_FOLDER -ep $ENTRY_FILE -ea "$ENTRY_ARGS" \ +# -cf $CONFIG_FOLDER -df $DEST_FOLDER + +# Define the federated server package properties +ENTRY_FILE=torch_server.py + +# Build the federated server package with the model and data arguments +fedml federate build -s -sf $SOURCE_FOLDER -ep $ENTRY_FILE -ea "$ENTRY_ARGS" \ + -cf $CONFIG_FOLDER -df $DEST_FOLDER \ + -m $MODEL_NAME -mc $MODEL_CACHE -mi $MODEL_INPUT_DIM -mo $MODEL_OUTPUT_DIM \ + -dn $DATASET_NAME -dt $DATASET_TYPE -dp $DATASET_PATH + +# Build the federated server package without the model and data arguments +# fedml federate build -s -sf $SOURCE_FOLDER -ep $ENTRY_FILE -ea "$ENTRY_ARGS" \ +# -cf $CONFIG_FOLDER -df $DEST_FOLDER \ No newline at end of file diff --git a/python/examples/launch/federate_build_package/config/fedml_config.yaml b/python/examples/launch/federate_build_package/config/fedml_config.yaml new file mode 100644 index 0000000000..bc9bf0ae84 --- /dev/null +++ b/python/examples/launch/federate_build_package/config/fedml_config.yaml @@ -0,0 +1,56 @@ +comm_args: + backend: MQTT_S3 + mqtt_config_path: config/mqtt_config.yaml + s3_config_path: config/s3_config.yaml +common_args: + random_seed: 0 + scenario: horizontal + training_type: cross_silo + using_mlops: false +data_args: + data_cache_dir: ~/fedml_data + dataset: mnist + partition_alpha: 0.5 + partition_method: hetero +device_args: + gpu_mapping_file: config/gpu_mapping.yaml + gpu_mapping_key: mapping_default + using_gpu: false + worker_num: 2 +environment_args: + bootstrap: config/bootstrap.sh +fedml_data_args: + dataset_name: mnist + dataset_path: /Users/alexliang/fedml_data + dataset_type: csv +fedml_entry_args: + arg_items: -m $FEDML_MODEL_NAME -mc $FEDML_MODEL_CACHE_PATH -mi $FEDML_MODEL_INPUT_DIM + -mo $FEDML_MODEL_OUTPUT_DIM -dn $FEDML_DATASET_NAME -dt $FEDML_DATASET_TYPE -dp + $FEDML_DATASET_PATH +fedml_model_args: + input_dim: '784' + model_cache_path: /Users/alexliang/fedml_models + model_name: lr + output_dim: '10' +model_args: + global_model_file_path: ./model_file_cache/global_model.pt + model: lr + model_file_cache_folder: ./model_file_cache +tracking_args: + enable_wandb: false + wandb_key: ee0b5f53d949c84cee7decbe7a629e63fb2f8408 + wandb_name: fedml_torch_fedavg_mnist_lr + wandb_project: fedml +train_args: + batch_size: 10 + client_id_list: null + client_num_in_total: 2 + client_num_per_round: 2 + client_optimizer: sgd + comm_round: 3 + epochs: 1 + federated_optimizer: FedAvg + learning_rate: 0.03 + weight_decay: 0.001 
+validation_args: + frequency_of_the_test: 1 diff --git a/python/examples/launch/hello_job.yaml b/python/examples/launch/hello_job.yaml index 9aa56ff0ff..9cf2bf5b0a 100755 --- a/python/examples/launch/hello_job.yaml +++ b/python/examples/launch/hello_job.yaml @@ -28,7 +28,6 @@ job: | # If you want to use the job created by the MLOps platform, # just uncomment the following three, then set job_id and config_id to your desired job id and related config. -# set job_name to your desired job name #job_args: # job_id: 2070 # config_id: 111 @@ -54,4 +53,15 @@ computing: maximum_cost_per_hour: $3000 # max cost per hour for your job per gpu card #allow_cross_cloud_resources: true # true, false #device_type: CPU # options: GPU, CPU, hybrid - resource_type: A100-80G # e.g., A100-80G, please check the resource type list by "fedml show-resource-type" or visiting URL: https://open.fedml.ai/accelerator_resource_type \ No newline at end of file + resource_type: A100-80G # e.g., A100-80G, please check the resource type list by "fedml show-resource-type" or visiting URL: https://open.fedml.ai/accelerator_resource_type + +fedml_data_args: + dataset_name: mnist + dataset_path: ./dataset + dataset_type: csv + +fedml_model_args: + input_dim: '784' + model_cache_path: /Users/alexliang/fedml_models + model_name: lr + output_dim: '10' \ No newline at end of file diff --git a/python/examples/launch/serve_job_mnist.yaml b/python/examples/launch/serve_job_mnist.yaml index 5f7d1b4076..cda7b7904d 100755 --- a/python/examples/launch/serve_job_mnist.yaml +++ b/python/examples/launch/serve_job_mnist.yaml @@ -26,7 +26,6 @@ task_type: deploy # options: train, deploy, federate # If you want to use the model created by the MLOps platform or create your own model card with a specified name, # just uncomment the following four lines, then set model_name to your desired model name or set your desired endpoint name #serving_args: -# model_name: serve_mnist_FedMLLaunchApp # model_name: "fedml-launch-sample-model" # Model card from MLOps platform or create your own model card with a specified name # model_version: "" # Model version from MLOps platform or set as empty string "" which will use the latest version. # endpoint_name: "fedml-launch-endpoint" # Set your end point name which will be deployed, it can be empty string "" which will be auto generated. diff --git a/python/examples/launch/train_build_package/README.md b/python/examples/launch/train_build_package/README.md new file mode 100644 index 0000000000..40bf480d0e --- /dev/null +++ b/python/examples/launch/train_build_package/README.md @@ -0,0 +1,97 @@ + +## Build the package for FEDML Train +``` +Usage: fedml train build [OPTIONS] + + Build training packages for the FedML® Launch platform (open.fedml.ai). + +Options: + -h, --help Show this message and exit. + -sf, --source_folder TEXT the source code folder path + -ep, --entry_point TEXT the entry point of the source code + -ea, --entry_args TEXT entry arguments of the entry point program + -cf, --config_folder TEXT the config folder path + -df, --dest_folder TEXT the destination package folder path + -ig, --ignore TEXT the ignore list for copying files, the format + is as follows: *.model,__pycache__,*.data*, + -m, --model_name TEXT model name for training. + -mc, --model_cache_path TEXT model cache path for training. + -mi, --input_dim TEXT input dimensions for training. + -mo, --output_dim TEXT output dimensions for training. + -dn, --dataset_name TEXT dataset name for training. 
+  -dt, --dataset_type TEXT      dataset type for training.
+  -dp, --dataset_path TEXT      dataset path for training.
+```
+
+First, you need to define your package properties as follows.
+If you want to ignore some folders or files, you may specify the ignore argument
+or add them to the .gitignore file in the source code folder.
+
+### Required arguments:
+source code folder, entry file, entry arguments,
+config folder, built destination folder
+
+### Optional arguments:
+You may define the model and data arguments using the command arguments as follows.
+```
+model name, model cache path, model input dimension, model output dimension,
+dataset name, dataset type, dataset path.
+```
+
+Also, you may define the model and data arguments using the file named fedml_config.yaml as follows.
+```
+fedml_data_args:
+  dataset_name: mnist
+  dataset_path: ./dataset
+  dataset_type: csv
+
+fedml_model_args:
+  input_dim: '784'
+  model_cache_path: /Users/alexliang/fedml_models
+  model_name: lr
+  output_dim: '10'
+```
+
+The above model and data arguments will be mapped to the equivalent environment variables as follows.
+```
+dataset_name = $FEDML_DATASET_NAME
+dataset_path = $FEDML_DATASET_PATH
+dataset_type = $FEDML_DATASET_TYPE
+model_name = $FEDML_MODEL_NAME
+model_cache_path = $FEDML_MODEL_CACHE_PATH
+input_dim = $FEDML_MODEL_INPUT_DIM
+output_dim = $FEDML_MODEL_OUTPUT_DIM
+```
+
+You may pass these environment variables as your entry arguments, e.g.,
+```
+ENTRY_ARGS_MODEL_DATA='-m $FEDML_MODEL_NAME -mc $FEDML_MODEL_CACHE_PATH -mi $FEDML_MODEL_INPUT_DIM -mo $FEDML_MODEL_OUTPUT_DIM -dn $FEDML_DATASET_NAME -dt $FEDML_DATASET_TYPE -dp $FEDML_DATASET_PATH'
+```
+
+### Examples
+```
+# Define the package properties
+SOURCE_FOLDER=.
+ENTRY_FILE=train.py
+ENTRY_ARGS='--epochs 1'
+ENTRY_ARGS_MODEL_DATA='-m $FEDML_MODEL_NAME -mc $FEDML_MODEL_CACHE_PATH -mi $FEDML_MODEL_INPUT_DIM -mo $FEDML_MODEL_OUTPUT_DIM -dn $FEDML_DATASET_NAME -dt $FEDML_DATASET_TYPE -dp $FEDML_DATASET_PATH'
+CONFIG_FOLDER=config
+DEST_FOLDER=./mlops
+MODEL_NAME=lr
+MODEL_CACHE=~/fedml_models
+MODEL_INPUT_DIM=784
+MODEL_OUTPUT_DIM=10
+DATASET_NAME=mnist
+DATASET_TYPE=csv
+DATASET_PATH=./dataset
+
+# Build the train package with the model and data arguments
+fedml train build -sf $SOURCE_FOLDER -ep $ENTRY_FILE -ea "$ENTRY_ARGS" \
+  -cf $CONFIG_FOLDER -df $DEST_FOLDER \
+  -m $MODEL_NAME -mc $MODEL_CACHE -mi $MODEL_INPUT_DIM -mo $MODEL_OUTPUT_DIM \
+  -dn $DATASET_NAME -dt $DATASET_TYPE -dp $DATASET_PATH
+
+# Build the train package without the model and data arguments
+fedml train build -sf $SOURCE_FOLDER -ep $ENTRY_FILE -ea "$ENTRY_ARGS" \
+  -cf $CONFIG_FOLDER -df $DEST_FOLDER
+```
diff --git a/python/examples/launch/train_build_package/build_train_package.sh b/python/examples/launch/train_build_package/build_train_package.sh
index e1817ac322..ff9891674f 100755
--- a/python/examples/launch/train_build_package/build_train_package.sh
+++ b/python/examples/launch/train_build_package/build_train_package.sh
@@ -1,9 +1,10 @@
-SOURCE=.
-ENTRY=train.py
+# Define the package properties
+SOURCE_FOLDER=.
+ENTRY_FILE=train.py ENTRY_ARGS='--epochs 1' -ENTRY_ARGS_MODEL_DATA='-m $FEDML_MODEL_NAME -mc $FEDML_MODEL_CACHE_PATH -mi $FEDML_MODEL_INPUT_DIM -mo $MODEL_OUTPUT_DIM -dn $FEDML_DATASET_NAME -dt $FEDML_DATASET_TYPE -dp $FEDML_DATASET_PATH' -CONFIG=config -DEST=./mlops +ENTRY_ARGS_MODEL_DATA='-m $FEDML_MODEL_NAME -mc $FEDML_MODEL_CACHE_PATH -mi $FEDML_MODEL_INPUT_DIM -mo $FEDML_MODEL_OUTPUT_DIM -dn $FEDML_DATASET_NAME -dt $FEDML_DATASET_TYPE -dp $FEDML_DATASET_PATH' +CONFIG_FOLDER=config +DEST_FOLDER=./mlops MODEL_NAME=lr MODEL_CACHE=~/fedml_models MODEL_INPUT_DIM=784 @@ -11,6 +12,12 @@ MODEL_OUTPUT_DIM=10 DATASET_NAME=mnist DATASET_TYPE=csv DATASET_PATH=./dataset -fedml train build -sf $SOURCE -ep $ENTRY -ea "$ENTRY_ARGS" \ - -cf $CONFIG -df $DEST -m $MODEL_NAME -mc $MODEL_CACHE \ - -mi $MODEL_INPUT_DIM -mo $MODEL_OUTPUT_DIM -dn $DATASET_NAME -dt $DATASET_TYPE -dp $DATASET_PATH + +# Build the train package with the model and data arguments +fedml train build -sf $SOURCE_FOLDER -ep $ENTRY_FILE -ea "$ENTRY_ARGS" \ + -cf $CONFIG_FOLDER -df $DEST_FOLDER -m $MODEL_NAME -mc $MODEL_CACHE \ + -mi $MODEL_INPUT_DIM -mo $MODEL_OUTPUT_DIM -dn $DATASET_NAME -dt $DATASET_TYPE -dp $DATASET_PATH + +# Build the train package without the model and data arguments +# fedml train build -sf $SOURCE_FOLDER -ep $ENTRY_FILE -ea "$ENTRY_ARGS" \ +# -cf $CONFIG_FOLDER -df $DEST_FOLDER \ No newline at end of file diff --git a/python/examples/launch/train_build_package/config/fedml_config.yaml b/python/examples/launch/train_build_package/config/fedml_config.yaml new file mode 100644 index 0000000000..0dc7ffbf1e --- /dev/null +++ b/python/examples/launch/train_build_package/config/fedml_config.yaml @@ -0,0 +1,11 @@ +fedml_data_args: + dataset_name: mnist + dataset_path: ./dataset + dataset_type: csv +fedml_entry_args: + arg_items: --epochs 1 +fedml_model_args: + input_dim: '784' + model_cache_path: /Users/alexliang/fedml_models + model_name: lr + output_dim: '10' diff --git a/python/fedml/__init__.py b/python/fedml/__init__.py index 7de5294cdd..384a8c9ec7 100644 --- a/python/fedml/__init__.py +++ b/python/fedml/__init__.py @@ -34,7 +34,7 @@ _global_training_type = None _global_comm_backend = None -__version__ = "0.8.8a151" +__version__ = "0.8.8a152" # This is the deployment environment used for different roles (RD/PM/BD/Public Developers). 
Potential VALUE: local, dev, test, release diff --git a/python/fedml/api/modules/build.py b/python/fedml/api/modules/build.py index 7c2f777a13..9594adb815 100644 --- a/python/fedml/api/modules/build.py +++ b/python/fedml/api/modules/build.py @@ -5,6 +5,7 @@ import click from fedml.api.modules.utils import build_mlops_package +from fedml.computing.scheduler.comm_utils import sys_utils from fedml.computing.scheduler.comm_utils.platform_utils import platform_is_valid from fedml.computing.scheduler.scheduler_entry.constants import Constants @@ -49,6 +50,12 @@ def build(platform, type, source_folder, entry_point, config_folder, dest_folder except Exception as e: pass + # Read the gitignore file + gitignore_file = os.path.join(source_folder, ".gitignore") + if os.path.exists(gitignore_file): + ignore_list_str = sys_utils.read_gitignore_file(gitignore_file) + ignore = f"{ignore},{ignore_list_str}" + ignore_list = "{},{}".format(ignore, Constants.FEDML_MLOPS_BUILD_PRE_IGNORE_LIST) pip_source_dir = os.path.dirname(__file__) pip_source_dir = os.path.dirname(pip_source_dir) diff --git a/python/fedml/api/modules/federate.py b/python/fedml/api/modules/federate.py index 84d19fc0c2..aca0b90ffd 100644 --- a/python/fedml/api/modules/federate.py +++ b/python/fedml/api/modules/federate.py @@ -1,7 +1,6 @@ import os from fedml.api.modules.constants import ModuleConstants -from fedml.computing.scheduler.comm_utils import sys_utils from fedml.computing.scheduler.comm_utils.sys_utils import generate_yaml_doc from fedml.computing.scheduler.comm_utils.yaml_utils import load_yaml_config from fedml.computing.scheduler.comm_utils.constants import SchedulerConstants @@ -51,12 +50,6 @@ def build(is_built_client_package, source_folder, entry_point, entry_args, confi # Save the updated config object into the config yaml file generate_yaml_doc(config_dict, config_file_path) - # Read the gitignore file - gitignore_file = os.path.join(source_folder, ".gitignore") - if os.path.exists(gitignore_file): - ignore_list_str = sys_utils.read_gitignore_file(gitignore_file) - ignore = f"{ignore},{ignore_list_str}" - # Build the package based on the updated config file fedml.api.modules.build.build( ModuleConstants.PLATFORM_NAME_LAUNCH, diff --git a/python/fedml/api/modules/train.py b/python/fedml/api/modules/train.py index 0126aa836e..05e53601a7 100644 --- a/python/fedml/api/modules/train.py +++ b/python/fedml/api/modules/train.py @@ -1,7 +1,6 @@ import os from fedml.api.modules.constants import ModuleConstants -from fedml.computing.scheduler.comm_utils import sys_utils from fedml.computing.scheduler.comm_utils.sys_utils import generate_yaml_doc from fedml.computing.scheduler.comm_utils.yaml_utils import load_yaml_config from fedml.computing.scheduler.comm_utils.constants import SchedulerConstants @@ -51,12 +50,6 @@ def build(source_folder, entry_point, entry_args, config_folder, dest_folder, ig # Save the updated config object into the config yaml file generate_yaml_doc(config_dict, config_file_path) - # Read the gitignore file - gitignore_file = os.path.join(source_folder, ".gitignore") - if os.path.exists(gitignore_file): - ignore_list_str = sys_utils.read_gitignore_file(gitignore_file) - ignore = f"{ignore},{ignore_list_str}" - # Build the package based on the updated config file fedml.api.modules.build.build(ModuleConstants.PLATFORM_NAME_LAUNCH, ModuleConstants.TRAIN_BUILD_PACKAGE_CLIENT_TYPE, source_folder, entry_point, config_folder, dest_folder, ignore, diff --git a/python/fedml/cli/README.md b/python/fedml/cli/README.md index 
5cffdb8d21..9373118fe7 100644
--- a/python/fedml/cli/README.md
+++ b/python/fedml/cli/README.md
@@ -142,15 +142,7 @@ Usage: fedml launch [OPTIONS] [YAML_FILE]...
   launch job at the MLOps platform
 
 Options:
-  -uname, --user_name TEXT     user name. If you do not specify this option, the fedml_account_name field from YAML_FILE will be used.
-  -uid, --user_id TEXT         user id. If you do not specify this option, the fedml_account_id field from YAML_FILE will be used.
   -k, --api_key TEXT           user api key.
-  -pf, --platform TEXT         The platform name at the MLOps platform (options: octopus, parrot, spider, beehive, launch, default is launch).
-  -jn, --job_name TEXT         The job name at the MLOps platform. If you don't specify here, the job name from the job yaml file will be used.
-  -ds, --devices_server TEXT   The server to run the launching job, for the launch platform, we do not need to set this option.
-  -de, --devices_edges TEXT    The edge devices to run the launching job. Seperated with ',', e.g. 705,704. For the launch platform, we do not need to set this option.
-  -nc, --no_confirmation       no confirmation after initiating launching request.
-  -v, --version TEXT           launch job to which version of MLOps platform. It should be dev, test or release
   --help                       Show this message and exit.
 ```
 
 At first, you need to define your job properties in the job yaml file, e.g. entry file, config file, command arguments, etc.
@@ -183,13 +175,22 @@ bootstrap: |
 
 computing:
   minimum_num_gpus: 1           # minimum # of GPUs to provision
-  maximum_cost_per_hour: $1.75  # max cost per hour for your job per machine
+
+  # max cost per hour of all machines for your job.
+  # E.g., if your job is assigned 2 x A100 nodes (8 GPUs each, 16 GPUs in total) and each GPU costs $1/GPU/hour, "maximum_cost_per_hour" = 16 * $1 = $16
+  maximum_cost_per_hour: $1.75
+
   allow_cross_cloud_resources: false # true, false
   device_type: GPU              # options: GPU, CPU, hybrid
   resource_type: A100-80G       # e.g., A100-80G, please check the resource type list by "fedml show-resource-type" or visiting URL: https://open.fedml.ai/accelerator_resource_type
 
-framework_type: fedml # options: fedml, deepspeed, pytorch, general
-task_type: train # options: serve, train, dev-environment
+job_type: train # options: train, deploy, federate
+framework_type: fedml # options: fedml, deepspeed, pytorch, general
+
+# train subtype: general_training, single_machine_training, cluster_distributed_training, cross_cloud_training
+# federate subtype: cross_silo, simulation, web, smart_phone
+# deploy subtype: none
+job_subtype: general_training
 
 # Running entry commands on the server side which will be executed as the job entry point.
 # Support multiple lines, which can not be empty.
@@ -197,6 +198,39 @@ server_job: |
   echo "Hello, Here is the server job."
   echo "Current directory is as follows."
   pwd
+
+# If you want to use the job created by the MLOps platform,
+# just uncomment the following three lines, then set job_id and config_id to your desired job id and related config.
+#job_args:
+#  job_id: 2070
+#  config_id: 111
+
+# If you want to create the job with a specific name, just uncomment the following line and set job_name to your desired job name.
+#job_name: cv_job
+
+# If you want to pass your API key to your job for calling FEDML APIs, you may uncomment the following line and set your API key here.
+# You may use the environment variable FEDML_RUN_API_KEY to get your API key in your job commands or scripts.
+#run_api_key: my_api_key
+
+# If you want to use the model created by the MLOps platform or create your own model card with a specified name,
+# just uncomment the following four lines, then set model_name to your desired model name and/or set your desired endpoint name.
+#serving_args:
+#  model_name: "fedml-launch-sample-model" # Model card from MLOps platform or create your own model card with a specified name
+#  model_version: "" # Model version from MLOps platform, or an empty string "" to use the latest version.
+#  endpoint_name: "fedml-launch-endpoint" # Set the endpoint name to be deployed; it can be an empty string "", in which case it will be auto-generated.
+
+# Dataset related arguments
+fedml_data_args:
+  dataset_name: mnist
+  dataset_path: ./dataset
+  dataset_type: csv
+
+# Model related arguments
+fedml_model_args:
+  input_dim: '784'
+  model_cache_path: /Users/alexliang/fedml_models
+  model_name: lr
+  output_dim: '10'
 ```
 
 You just need to customize the following config items.
@@ -212,5 +246,59 @@ Then you can use the following example CLI to launch the job at the MLOps platfo
 Example:
 ```
-fedml launch call_gpu.yaml
+fedml launch hello_job.yaml
 ```
+
+After the launch CLI is executed, the output is as follows. Here you may open the job URL to confirm and actually start the job.
+```
+Submitting your job to FedML® Launch platform: 100%|████████████████████████████████████████████████████████████████████████████████████████| 6.07k/6.07k [00:01<00:00, 4.94kB/s]
+
+Searched and matched the following GPU resource for your job:
++-----------+-------------------+---------+------------+-------------------------+---------+-------+----------+
+| Provider  | Instance          | vCPU(s) | Memory(GB) | GPU(s)                  | Region  | Cost  | Selected |
++-----------+-------------------+---------+------------+-------------------------+---------+-------+----------+
+| FedML Inc | fedml_a100_node_2 | 256     | 2003.9     | NVIDIA A100-SXM4-80GB:8 | DEFAULT | 40.00 | √        |
++-----------+-------------------+---------+------------+-------------------------+---------+-------+----------+
+
+You can also view the matched GPU resource with Web UI at:
+https://open.fedml.ai/gpu/projects/job/confirmStartJob?projectId=1692900612607447040&projectName=default-project&jobId=1696947481910317056
+
+Are you sure to launch it? [y/N]: y
+
+Your launch result is as follows:
++------------+---------------------+---------+---------------------+------------------+------+
+| Job Name   | Job ID              | Status  | Created             | Spend Time(hour) | Cost |
++------------+---------------------+---------+---------------------+------------------+------+
+| munch_clam | 1696947481910317056 | RUNNING | 2023-08-31 02:06:22 | None             | 0.0  |
++------------+---------------------+---------+---------------------+------------------+------+
+
+You can track your job running details at this URL:
+https://open.fedml.ai/gpu/projects/job/jobDetail?projectId=1692900612607447040&jobId=1696947481910317056
+
+For querying the realtime status of your job, please run the following command.
+fedml job logs -jid 1696947481910317056
+```
+
+### Supported Environment Variables
+You may use the following environment variables in your job commands or scripts.
+```
+$FEDML_CURRENT_JOB_ID, current run id for your job
+$FEDML_CURRENT_EDGE_ID, current edge device id for your job
+$FEDML_CLIENT_RANK, current device index for your job
+$FEDML_CURRENT_VERSION, current fedml config version, options: dev, test or release
+$FEDML_RUN_API_KEY, current API key from your job.yaml with the config item run_api_key
+```
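+
+For example, a minimal (hypothetical) sketch of rank-aware behavior in a Python job script:
+```
+import os
+
+# FEDML_CLIENT_RANK distinguishes devices when a job is scheduled onto multiple machines.
+rank = int(os.getenv("FEDML_CLIENT_RANK", "0"))
+job_id = os.getenv("FEDML_CURRENT_JOB_ID", "")
+if rank == 0:
+    print(f"[job {job_id}] device 0: acting as the coordinator")
+else:
+    print(f"[job {job_id}] device {rank}: acting as a worker")
+```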
+
+## 9. Login as the GPU supplier
+If you want to log in as a GPU supplier and join the FedML launch payment system, you just need to run the following command.
+```
+fedml login $YourApiKey -r gpu_supplier
+```
+
+Then you may find your GPU device in the FedML launch platform https://open.fedml.ai/gpu-supplier/gpus/index
+
+Then you may bind your FedML account to your payment account. Once your GPU device is scheduled to run any computing workload,
+you will get rewards from the GPU consumer with the `fedml launch` CLI.
+
diff --git a/python/fedml/computing/scheduler/comm_utils/job_utils.py b/python/fedml/computing/scheduler/comm_utils/job_utils.py
new file mode 100644
index 0000000000..2a3b838a25
--- /dev/null
+++ b/python/fedml/computing/scheduler/comm_utils/job_utils.py
@@ -0,0 +1,160 @@
+import os
+import platform
+
+from fedml.computing.scheduler.comm_utils.constants import SchedulerConstants
+from fedml.computing.scheduler.comm_utils.sys_utils import get_python_program
+
+
+class JobRunnerUtils:
+    FEDML_SUPPORTED_ENVIRONMENT_VARIABLES = ["$FEDML_MODEL_NAME", "$FEDML_MODEL_CACHE_PATH",
+                                             "$FEDML_MODEL_INPUT_DIM", "$FEDML_MODEL_OUTPUT_DIM",
+                                             "$FEDML_DATASET_NAME", "$FEDML_DATASET_PATH", "$FEDML_DATASET_TYPE"]
+
+    @staticmethod
+    def generate_job_execute_commands(run_id, edge_id, version,
+                                      package_type, executable_interpreter, entry_file_full_path,
+                                      conf_file_object, entry_args, assigned_gpu_ids,
+                                      job_api_key, client_rank, job_yaml=None):
+        shell_cmd_list = list()
+        entry_commands_origin = list()
+
+        # Read entry commands if job is from launch
+        if package_type == SchedulerConstants.JOB_PACKAGE_TYPE_LAUNCH or \
+                os.path.basename(entry_file_full_path) == SchedulerConstants.LAUNCH_JOB_DEFAULT_ENTRY_NAME:
+            with open(entry_file_full_path, 'r') as entry_file_handle:
+                entry_commands_origin.extend(entry_file_handle.readlines())
+                entry_file_handle.close()
+
+        # Generate the export env list for publishing environment variables
+        export_cmd = "set" if platform.system() == "Windows" else "export"
+        export_env_list, env_value_map = JobRunnerUtils.parse_config_args_as_env_variables(
+            export_cmd, conf_file_object, job_yaml=job_yaml)
+
+        # Replace entry commands with environment variable values
+        entry_commands = JobRunnerUtils.replace_entry_command_with_env_variable(entry_commands_origin, env_value_map)
+
+        # Replace entry arguments with environment variable values
+        entry_args = JobRunnerUtils.replace_entry_args_with_env_variable(entry_args, env_value_map)
+
+        # Add the export env list to the entry commands
+        if len(export_env_list) > 0:
+            entry_commands.extend(export_env_list)
+
+        # Add general environment variables
+        entry_commands.insert(0, f"{export_cmd} FEDML_CURRENT_EDGE_ID={edge_id}\n")
+        entry_commands.insert(0, f"{export_cmd} FEDML_CURRENT_JOB_ID={run_id}\n")
+        if assigned_gpu_ids is not None and assigned_gpu_ids != "":
+            entry_commands.insert(0, f"{export_cmd} CUDA_VISIBLE_DEVICES={assigned_gpu_ids}\n")
+        entry_commands.insert(0, f"{export_cmd} FEDML_CURRENT_VERSION={version}\n")
+        entry_commands.insert(0, f"{export_cmd} FEDML_USING_MLOPS=true\n")
+        entry_commands.insert(0, f"{export_cmd} 
FEDML_CLIENT_RANK={client_rank}\n") + if job_api_key is not None and str(job_api_key).strip() != "": + entry_commands.insert(0, f"{export_cmd} FEDML_RUN_API_KEY={job_api_key}\n") + + # Set -e for the entry script + entry_commands_filled = list() + if platform.system() == "Windows": + entry_file_full_path = entry_file_full_path.replace(".sh", ".bat") + for cmd in entry_commands: + entry_commands_filled.append(cmd) + entry_commands_filled.append("if %ERRORLEVEL% neq 0 EXIT %ERRORLEVEL%\n") + entry_commands_filled.append("EXIT %ERRORLEVEL%") + else: + entry_commands_filled = entry_commands + entry_commands_filled.insert(0, "set -e\n") + + # If the job type is not launch, we need to generate an entry script wrapping with entry commands + if package_type != SchedulerConstants.JOB_PACKAGE_TYPE_LAUNCH and \ + os.path.basename(entry_file_full_path) != SchedulerConstants.LAUNCH_JOB_DEFAULT_ENTRY_NAME: + python_program = get_python_program() + entry_commands_filled.append(f"{python_program} {entry_file_full_path} {entry_args}\n") + entry_file_full_path = os.path.join( + os.path.dirname(entry_file_full_path), os.path.basename(entry_file_full_path) + ".sh") + + # Write the entry commands to the entry script + with open(entry_file_full_path, 'w') as entry_file_handle: + entry_file_handle.writelines(entry_commands_filled) + entry_file_handle.close() + + # Generate the shell commands to be executed + shell_cmd_list.append(f"{executable_interpreter} {entry_file_full_path}") + + return shell_cmd_list + + @staticmethod + def replace_entry_command_with_env_variable(entry_commands, env_value_map): + entry_commands_replaced = list() + for entry_cmd in entry_commands: + for env_name in JobRunnerUtils.FEDML_SUPPORTED_ENVIRONMENT_VARIABLES: + env_value = env_value_map.get(env_name, None) + if env_value is None: + continue + entry_cmd = entry_cmd.replace(env_name, env_value) + + entry_commands_replaced.append(entry_cmd) + + return entry_commands_replaced + + @staticmethod + def replace_entry_args_with_env_variable(entry_args, env_value_map): + if entry_args is None: + return "" + for env_name in JobRunnerUtils.FEDML_SUPPORTED_ENVIRONMENT_VARIABLES: + env_value = env_value_map.get(env_name, None) + if env_value is None: + continue + entry_args = entry_args.replace(env_name, env_value) + + return entry_args + + @staticmethod + def parse_config_args_as_env_variables(export_cmd, run_params, job_yaml=None): + model_args = run_params.get("fedml_model_args", None) + if model_args is None: + model_args = job_yaml.get("fedml_model_args", {}) if job_yaml is not None else dict() + + data_args = run_params.get("fedml_data_args", None) + if data_args is None: + data_args = job_yaml.get("fedml_data_args", {}) if job_yaml is not None else dict() + + model_name = model_args.get("model_name", None) + model_cache_path = model_args.get("model_cache_path", None) + input_dim = model_args.get("input_dim", None) + output_dim = model_args.get("output_dim", None) + dataset_name = data_args.get("dataset_name", None) + dataset_path = data_args.get("dataset_path", None) + dataset_type = data_args.get("dataset_type", None) + + export_env_list = list() + env_value_map = dict() + + if model_name is not None and str(model_name).strip() != "": + export_env_list.append(f"{export_cmd} FEDML_MODEL_NAME={model_name}\n") + env_value_map["$FEDML_MODEL_NAME"] = model_name + + if model_cache_path is not None and str(model_cache_path).strip() != "": + export_env_list.append(f"{export_cmd} FEDML_MODEL_CACHE_PATH={model_cache_path}\n") + 
env_value_map["$FEDML_MODEL_CACHE_PATH"] = model_cache_path
+
+        if input_dim is not None and str(input_dim).strip() != "":
+            export_env_list.append(f"{export_cmd} FEDML_MODEL_INPUT_DIM={input_dim}\n")
+            env_value_map["$FEDML_MODEL_INPUT_DIM"] = input_dim
+
+        if output_dim is not None and str(output_dim).strip() != "":
+            export_env_list.append(f"{export_cmd} FEDML_MODEL_OUTPUT_DIM={output_dim}\n")
+            env_value_map["$FEDML_MODEL_OUTPUT_DIM"] = output_dim
+
+        if dataset_name is not None and str(dataset_name).strip() != "":
+            export_env_list.append(f"{export_cmd} FEDML_DATASET_NAME={dataset_name}\n")
+            env_value_map["$FEDML_DATASET_NAME"] = dataset_name
+
+        if dataset_path is not None and str(dataset_path).strip() != "":
+            export_env_list.append(f"{export_cmd} FEDML_DATASET_PATH={dataset_path}\n")
+            env_value_map["$FEDML_DATASET_PATH"] = dataset_path
+
+        if dataset_type is not None and str(dataset_type).strip() != "":
+            export_env_list.append(f"{export_cmd} FEDML_DATASET_TYPE={dataset_type}\n")
+            env_value_map["$FEDML_DATASET_TYPE"] = dataset_type
+
+        return export_env_list, env_value_map
+
diff --git a/python/fedml/computing/scheduler/master/server_runner.py b/python/fedml/computing/scheduler/master/server_runner.py
index b188158399..1f49a3cc48 100755
--- a/python/fedml/computing/scheduler/master/server_runner.py
+++ b/python/fedml/computing/scheduler/master/server_runner.py
@@ -24,6 +24,7 @@
 import requests
 
 from ..comm_utils.constants import SchedulerConstants
+from ..comm_utils.job_utils import JobRunnerUtils
 from ..comm_utils.run_process_utils import RunProcessUtils
 
 from ....core.mlops.mlops_runtime_log import MLOpsRuntimeLog
@@ -655,50 +656,28 @@ def execute_job_task(self, entry_file_full_path, conf_file_full_path, run_id):
         run_params = run_config.get("parameters", {})
         job_yaml = run_params.get("job_yaml", {})
         job_yaml_default_none = run_params.get("job_yaml", None)
+        job_api_key = job_yaml.get("run_api_key", None)
         assigned_gpu_ids = run_params.get("gpu_ids", None)
-        using_easy_mode = True
-        expert_mode = job_yaml.get("expert_mode", None)
         framework_type = job_yaml.get("framework_type", None)
         job_type = job_yaml.get("job_type", None)
         job_type = job_yaml.get("task_type", Constants.JOB_TASK_TYPE_TRAIN) if job_type is None else job_type
         conf_file_object = load_yaml_config(conf_file_full_path)
         entry_args_dict = conf_file_object.get("fedml_entry_args", {})
         entry_args = entry_args_dict.get("arg_items", None)
-        error_list = list()
-        if expert_mode is None:
-            executable_interpreter = ClientConstants.CLIENT_SHELL_PS \
-                if platform.system() == ClientConstants.PLATFORM_WINDOWS else ClientConstants.CLIENT_SHELL_BASH
-            executable_commands = job_yaml.get("job", "")
-        else:
-            using_easy_mode = False
-            executable_interpreter = expert_mode.get("executable_interpreter", "")
-            executable_file = expert_mode.get("executable_file", "")
-            executable_conf_option = expert_mode.get("executable_conf_option", "")
-            executable_conf_file = expert_mode.get("executable_conf_file", "")
-            executable_args = expert_mode.get("executable_args", "")
+
+        executable_interpreter = ClientConstants.CLIENT_SHELL_PS \
+            if platform.system() == ClientConstants.PLATFORM_WINDOWS else ClientConstants.CLIENT_SHELL_BASH
 
         if job_yaml_default_none is None:
+            # Generate the job executing commands for previous federated learning (Compatibility)
             python_program = get_python_program()
             logging.info("Run the server: {} {} --cf {} --rank 0 --role server".format(
                 python_program, entry_file_full_path, conf_file_full_path))
-            # process = 
ServerConstants.exec_console_with_shell_script_list( - # [ - # python_program, - # entry_file_full_path, - # "--cf", - # conf_file_full_path, - # "--rank ", - # "0", - # "--role", - # "server" - # ], - # should_capture_stdout=False, - # should_capture_stderr=True - # ) - entry_command = f"{python_program} {entry_file_full_path} --cf " \ f"{conf_file_full_path} --rank 0 --role server" shell_cmd_list = [entry_command] + + # Run the job executing commands for previous federated learning (Compatibility) process, error_list = ClientConstants.execute_commands_with_live_logs( shell_cmd_list, callback=self.callback_start_fl_job, should_write_log_file=False) is_launch_task = False @@ -709,87 +688,17 @@ def execute_job_task(self, entry_file_full_path, conf_file_full_path, run_id): ServerConstants.MSG_MLOPS_SERVER_STATUS_RUNNING, running_json=self.start_request_json) - shell_cmd_list = list() - if using_easy_mode: - entry_commands_origin = list() - if self.package_type == SchedulerConstants.JOB_PACKAGE_TYPE_LAUNCH or \ - os.path.basename(entry_file_full_path) == SchedulerConstants.LAUNCH_JOB_DEFAULT_ENTRY_NAME: - with open(entry_file_full_path, 'r') as entry_file_handle: - entry_commands_origin.extend(entry_file_handle.readlines()) - entry_file_handle.close() - - # Replace entry commands with environment variable values - export_cmd = "set" if platform.system() == "Windows" else "export" - entry_commands = list() - env_list, env_value_map = self.parse_config_args_as_env_variables(export_cmd, conf_file_object) - for entry_cmd in entry_commands_origin: - for env_name in ClientConstants.FEDML_SUPPORTED_ENVIRONMENT_VARIABLES: - env_value = env_value_map.get(env_name, None) - if env_value is None: - continue - entry_cmd = entry_cmd.replace(env_name, env_value) - - entry_commands.append(entry_cmd) - - # Replace entry arguments with environment variable values - for env_name in ClientConstants.FEDML_SUPPORTED_ENVIRONMENT_VARIABLES: - env_value = env_value_map.get(env_name, None) - if env_value is None: - continue - entry_args = entry_args.replace(env_name, env_value) - - # Export the environment variables - if len(env_list) > 0: - entry_commands.extend(env_list) - - # Add general environment variables - entry_commands.insert(0, f"{export_cmd} FEDML_CURRENT_EDGE_ID={self.edge_id}\n") - entry_commands.insert(0, f"{export_cmd} FEDML_CURRENT_JOB_ID={self.run_id}\n") - if assigned_gpu_ids is not None and assigned_gpu_ids != "": - entry_commands.insert(0, f"{export_cmd} CUDA_VISIBLE_DEVICES={assigned_gpu_ids}\n") - entry_commands.insert(0, f"{export_cmd} FEDML_CURRENT_VERSION={self.version}\n") - entry_commands.insert(0, f"{export_cmd} FEDML_USING_MLOPS=true\n") - entry_commands.insert(0, f"{export_cmd} FEDML_SERVER_RANK=0\n") - - entry_commands_filled = list() - if platform.system() == "Windows": - entry_file_full_path = entry_file_full_path.replace(".sh", ".bat") - for cmd in entry_commands: - entry_commands_filled.append(cmd) - entry_commands_filled.append("if %ERRORLEVEL% neq 0 EXIT %ERRORLEVEL%\n") - entry_commands_filled.append("EXIT %ERRORLEVEL%") - else: - entry_commands_filled = entry_commands - entry_commands_filled.insert(0, "set -e\n") + # Generate the job executing commands + job_executing_commands = JobRunnerUtils.generate_job_execute_commands( + self.run_id, self.edge_id, self.version, + self.package_type, executable_interpreter, entry_file_full_path, + conf_file_object, entry_args, assigned_gpu_ids, + job_api_key, 0, job_yaml=job_yaml_default_none) - if self.package_type != 
SchedulerConstants.JOB_PACKAGE_TYPE_LAUNCH and \ - os.path.basename(entry_file_full_path) != SchedulerConstants.LAUNCH_JOB_DEFAULT_ENTRY_NAME: - python_program = get_python_program() - entry_commands_filled.append(f"{python_program} {entry_file_full_path} {entry_args}\n") - entry_file_full_path = os.path.join( - os.path.dirname(entry_file_full_path), os.path.basename(entry_file_full_path) + ".sh") - - with open(entry_file_full_path, 'w') as entry_file_handle: - entry_file_handle.writelines(entry_commands_filled) - entry_file_handle.close() - - shell_cmd_list.append(f"{executable_interpreter} {entry_file_full_path}") - else: - shell_cmd_list.append(executable_interpreter) - if executable_file != "": - shell_cmd_list.append(entry_file_full_path) - if executable_conf_file != "" and executable_conf_option != "": - shell_cmd_list.append(executable_conf_option) - shell_cmd_list.append(conf_file_full_path) - shell_cmd_list.append(executable_args) - shell_cmd_list.append(f"--run_id {self.run_id}") - shell_cmd_list.append(f"--run_device_id {self.edge_id}") - shell_cmd_list.append(f"--rank 0") - shell_cmd_list.append(f"--role server") - shell_cmd_list.append("--using_mlops True") + # Run the job executing commands logging.info(f"Run the server job with job id {self.run_id}, device id {self.edge_id}.") process, error_list = ServerConstants.execute_commands_with_live_logs( - shell_cmd_list, callback=self.start_job_perf, error_processor=self.job_error_processor) + job_executing_commands, callback=self.start_job_perf, error_processor=self.job_error_processor) is_launch_task = True return process, is_launch_task, error_list @@ -805,50 +714,6 @@ def job_error_processor(self, error_list): error_str = "\n".join(error_list) raise Exception(f"Error occurs when running the job... 
{error_str}") - def parse_config_args_as_env_variables(self, export_cmd, run_params): - model_args = run_params.get("fedml_model_args", {}) - data_args = run_params.get("fedml_data_args", {}) - model_name = model_args.get("model_name", None) - model_cache_path = model_args.get("model_cache_path", None) - input_dim = model_args.get("input_dim", None) - output_dim = model_args.get("output_dim", None) - dataset_name = data_args.get("dataset_name", None) - dataset_path = data_args.get("dataset_path", None) - dataset_type = data_args.get("dataset_type", None) - - env_list = list() - env_value_map = dict() - - if model_name is not None and str(model_name).strip() != "": - env_list.append(f"{export_cmd} FEDML_MODEL_NAME={model_name}\n") - env_value_map["$FEDML_MODEL_NAME"] = model_name - - if model_cache_path is not None and str(model_cache_path).strip() != "": - env_list.append(f"{export_cmd} FEDML_MODEL_CACHE_PATH={model_cache_path}\n") - env_value_map["$FEDML_MODEL_CACHE_PATH"] = model_cache_path - - if input_dim is not None and str(input_dim).strip() != "": - env_list.append(f"{export_cmd} FEDML_MODEL_INPUT_DIM={input_dim}\n") - env_value_map["$FEDML_MODEL_INPUT_DIM"] = input_dim - - if output_dim is not None and str(output_dim).strip() != "": - env_list.append(f"{export_cmd} MODEL_OUTPUT_DIM={output_dim}\n") - env_value_map["$MODEL_OUTPUT_DIM"] = output_dim - - if dataset_name is not None and str(dataset_name).strip() != "": - env_list.append(f"{export_cmd} FEDML_DATASET_NAME={dataset_name}\n") - env_value_map["$FEDML_DATASET_NAME"] = dataset_name - - if dataset_path is not None and str(dataset_path).strip() != "": - env_list.append(f"{export_cmd} FEDML_DATASET_PATH={dataset_path}\n") - env_value_map["$FEDML_DATASET_PATH"] = dataset_path - - if dataset_type is not None and str(dataset_type).strip() != "": - env_list.append(f"{export_cmd} FEDML_DATASET_TYPE={dataset_type}\n") - env_value_map["$FEDML_DATASET_TYPE"] = dataset_type - - return env_list, env_value_map - def process_job_status(self, run_id): all_edges_is_finished = True any_edge_is_failed = False diff --git a/python/fedml/computing/scheduler/scheduler_entry/launch_manager.py b/python/fedml/computing/scheduler/scheduler_entry/launch_manager.py index 00b6aa7616..d949786792 100755 --- a/python/fedml/computing/scheduler/scheduler_entry/launch_manager.py +++ b/python/fedml/computing/scheduler/scheduler_entry/launch_manager.py @@ -446,6 +446,17 @@ def __init__(self, job_yaml_file, should_use_default_workspace=False): self.model_app_name = self.serving_model_name \ if self.serving_model_name is not None and self.serving_model_name != "" else self.application_name + data_args = self.job_config_dict.get("fedml_data_args", {}) + self.data_args_dataset_name = data_args.get("dataset_name", None) + self.data_args_dataset_path = data_args.get("dataset_path", None) + self.data_args_dataset_type = data_args.get("dataset_type", None) + + model_args = self.job_config_dict.get("fedml_model_args", {}) + self.model_args_model_name = model_args.get("model_name", None) + self.model_args_model_cache_path = model_args.get("model_cache_path", None) + self.model_args_input_dim = model_args.get("input_dim", None) + self.model_args_output_dim = model_args.get("output_dim", None) + self.gitignore_file = os.path.join( self.base_dir, workspace if workspace is not None and workspace != "" else random_workspace, ".gitignore") self.ignore_list_str = Constants.FEDML_MLOPS_BUILD_PRE_IGNORE_LIST diff --git a/python/fedml/computing/scheduler/slave/client_constants.py 
b/python/fedml/computing/scheduler/slave/client_constants.py index 57f6b49d95..3db7533bf1 100644 --- a/python/fedml/computing/scheduler/slave/client_constants.py +++ b/python/fedml/computing/scheduler/slave/client_constants.py @@ -71,9 +71,6 @@ class ClientConstants(object): FEDML_PARENT_PID_FILE = "fedml_parent_pid" - FEDML_SUPPORTED_ENVIRONMENT_VARIABLES = ["$FEDML_MODEL_NAME", "$FEDML_MODEL_CACHE_PATH", "$FEDML_MODEL_INPUT_DIM", "$MODEL_OUTPUT_DIM", - "$FEDML_DATASET_NAME", "$FEDML_DATASET_PATH", "$FEDML_DATASET_TYPE"] - LOCAL_CLIENT_API_PORT = 40800 LOGIN_MODE_CLIENT_INDEX = 0 diff --git a/python/fedml/computing/scheduler/slave/client_runner.py b/python/fedml/computing/scheduler/slave/client_runner.py index a5e08612dc..2353d81204 100755 --- a/python/fedml/computing/scheduler/slave/client_runner.py +++ b/python/fedml/computing/scheduler/slave/client_runner.py @@ -22,6 +22,7 @@ import fedml from ..comm_utils.constants import SchedulerConstants +from ..comm_utils.job_utils import JobRunnerUtils from ..comm_utils.run_process_utils import RunProcessUtils from ..scheduler_entry.constants import Constants from ....core.mlops.mlops_runtime_log import MLOpsRuntimeLog @@ -577,49 +578,25 @@ def execute_job_task(self, entry_file_full_path, conf_file_full_path, dynamic_ar job_yaml_default_none = run_params.get("job_yaml", None) job_api_key = job_yaml.get("run_api_key", None) assigned_gpu_ids = run_params.get("gpu_ids", None) - using_easy_mode = True - expert_mode = job_yaml.get("expert_mode", None) job_type = job_yaml.get("job_type", None) job_type = job_yaml.get("task_type", Constants.JOB_TASK_TYPE_TRAIN) if job_type is None else job_type conf_file_object = load_yaml_config(conf_file_full_path) entry_args_dict = conf_file_object.get("fedml_entry_args", {}) entry_args = entry_args_dict.get("arg_items", None) - error_list = list() - if expert_mode is None: - executable_interpreter = ClientConstants.CLIENT_SHELL_PS \ - if platform.system() == ClientConstants.PLATFORM_WINDOWS else ClientConstants.CLIENT_SHELL_BASH - executable_commands = job_yaml.get("job", "") - else: - using_easy_mode = False - executable_interpreter = expert_mode.get("executable_interpreter", "") - executable_file = expert_mode.get("executable_file", "") - executable_conf_option = expert_mode.get("executable_conf_option", "") - executable_conf_file = expert_mode.get("executable_conf_file", "") - executable_args = expert_mode.get("executable_args", "") + executable_interpreter = ClientConstants.CLIENT_SHELL_PS \ + if platform.system() == ClientConstants.PLATFORM_WINDOWS else ClientConstants.CLIENT_SHELL_BASH if job_yaml_default_none is None: + # Generate the job executing commands for previous federated learning (Compatibility) python_program = get_python_program() logging.info("Run the client: {} {} --cf {} --rank {} --role client".format( python_program, entry_file_full_path, conf_file_full_path, str(dynamic_args_config.get("rank", 1)))) - - # process = ClientConstants.exec_console_with_shell_script_list( - # [ - # python_program, - # entry_file_full_path, - # "--cf", - # conf_file_full_path, - # "--rank", - # str(dynamic_args_config.get("rank", 1)), - # "--role", - # "client" - # ], - # should_capture_stdout=False, - # should_capture_stderr=True - # ) rank = str(dynamic_args_config.get("rank", 1)) entry_command = f"{python_program} {entry_file_full_path} --cf " \ f"{conf_file_full_path} --rank {rank} --role client" shell_cmd_list = [entry_command] + + # Run the job executing commands for previous federated learning 
(Compatibility) process, error_list = ClientConstants.execute_commands_with_live_logs( shell_cmd_list, callback=self.callback_start_fl_job, should_write_log_file=False) is_launch_task = False @@ -629,92 +606,18 @@ def execute_job_task(self, entry_file_full_path, conf_file_full_path, dynamic_ar self.mlops_metrics.report_client_training_status(self.edge_id, ClientConstants.MSG_MLOPS_SERVER_DEVICE_STATUS_RUNNING, in_run_id=self.run_id) - shell_cmd_list = list() - if using_easy_mode: - entry_commands_origin = list() - if self.package_type == SchedulerConstants.JOB_PACKAGE_TYPE_LAUNCH or \ - os.path.basename(entry_file_full_path) == SchedulerConstants.LAUNCH_JOB_DEFAULT_ENTRY_NAME: - # Read commands if job is not from launch - with open(entry_file_full_path, 'r') as entry_file_handle: - entry_commands_origin.extend(entry_file_handle.readlines()) - entry_file_handle.close() - - # Replace entry commands with environment variable values - export_cmd = "set" if platform.system() == "Windows" else "export" - entry_commands = list() - env_list, env_value_map = self.parse_config_args_as_env_variables(export_cmd, conf_file_object) - for entry_cmd in entry_commands_origin: - for env_name in ClientConstants.FEDML_SUPPORTED_ENVIRONMENT_VARIABLES: - env_value = env_value_map.get(env_name, None) - if env_value is None: - continue - entry_cmd = entry_cmd.replace(env_name, env_value) - - entry_commands.append(entry_cmd) - - # Replace entry arguments with environment variable values - for env_name in ClientConstants.FEDML_SUPPORTED_ENVIRONMENT_VARIABLES: - env_value = env_value_map.get(env_name, None) - if env_value is None: - continue - entry_args = entry_args.replace(env_name, env_value) - - # Export the environment variables - if len(env_list) > 0: - entry_commands.extend(env_list) - - # Add general environment variables - entry_commands.insert(0, f"{export_cmd} FEDML_CURRENT_EDGE_ID={self.edge_id}\n") - entry_commands.insert(0, f"{export_cmd} FEDML_CURRENT_JOB_ID={self.run_id}\n") - if assigned_gpu_ids is not None and assigned_gpu_ids != "": - entry_commands.insert(0, f"{export_cmd} CUDA_VISIBLE_DEVICES={assigned_gpu_ids}\n") - entry_commands.insert(0, f"{export_cmd} FEDML_CURRENT_VERSION={self.version}\n") - entry_commands.insert(0, f"{export_cmd} FEDML_USING_MLOPS=true\n") - entry_commands.insert(0, f"{export_cmd} FEDML_CLIENT_RANK={client_rank}\n") - if job_api_key is not None and str(job_api_key).strip() != "": - entry_commands.insert(0, f"{export_cmd} FEDML_RUN_API_KEY={job_api_key}\n") - - # Set -e for the entry script - entry_commands_filled = list() - if platform.system() == "Windows": - entry_file_full_path = entry_file_full_path.replace(".sh", ".bat") - for cmd in entry_commands: - entry_commands_filled.append(cmd) - entry_commands_filled.append("if %ERRORLEVEL% neq 0 EXIT %ERRORLEVEL%\n") - entry_commands_filled.append("EXIT %ERRORLEVEL%") - else: - entry_commands_filled = entry_commands - entry_commands_filled.insert(0, "set -e\n") - - if self.package_type != SchedulerConstants.JOB_PACKAGE_TYPE_LAUNCH and \ - os.path.basename(entry_file_full_path) != SchedulerConstants.LAUNCH_JOB_DEFAULT_ENTRY_NAME: - python_program = get_python_program() - entry_commands_filled.append(f"{python_program} {entry_file_full_path} {entry_args}\n") - entry_file_full_path = os.path.join( - os.path.dirname(entry_file_full_path), os.path.basename(entry_file_full_path) + ".sh") - - # Write the entry commands to the entry file - with open(entry_file_full_path, 'w') as entry_file_handle: - 
diff --git a/python/fedml/core/mlops/mlops_device_perfs.py b/python/fedml/core/mlops/mlops_device_perfs.py
index 629064b652..8182616bda 100644
--- a/python/fedml/core/mlops/mlops_device_perfs.py
+++ b/python/fedml/core/mlops/mlops_device_perfs.py
@@ -101,7 +101,7 @@ def report_gpu_device_info(edge_id, mqtt_mgr=None):
         gpu_cores_available, sent_bytes, recv_bytes, gpu_available_ids = sys_utils.get_sys_realtime_stats()
 
     topic_name = "ml_client/mlops/gpu_device_info"
-    artifact_info_json = {
+    device_info_json = {
         "edgeId": edge_id,
         "memoryTotal": round(total_mem * MLOpsUtils.BYTES_TO_GB, 2),
         "memoryAvailable": round(free_mem * MLOpsUtils.BYTES_TO_GB, 2),
@@ -115,7 +115,7 @@ def report_gpu_device_info(edge_id, mqtt_mgr=None):
         "networkTraffic": sent_bytes + recv_bytes,
         "updateTime": int(MLOpsUtils.get_ntp_time())
     }
-    message_json = json.dumps(artifact_info_json)
+    message_json = json.dumps(device_info_json)
 
     if mqtt_mgr is not None:
         mqtt_mgr.send_message_json(topic_name, message_json)
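The rename from `artifact_info_json` to `device_info_json` is cosmetic but correct: the JSON published to the `ml_client/mlops/gpu_device_info` topic describes the edge device itself, not an artifact, and the payload is unchanged. A tiny sketch of the publish path with the clearer name; the edge id is a hypothetical stand-in and the commented call assumes the surrounding function's `mqtt_mgr`.
```
import json

# Dummy values; the real fields are built in report_gpu_device_info above.
device_info_json = {"edgeId": 1234, "memoryTotal": 64.0, "memoryAvailable": 32.0}
message_json = json.dumps(device_info_json)
# mqtt_mgr.send_message_json("ml_client/mlops/gpu_device_info", message_json)
```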
diff --git a/python/setup.py b/python/setup.py
index 844bb1ead1..7af82a06cd 100644
--- a/python/setup.py
+++ b/python/setup.py
@@ -94,7 +94,7 @@ def finalize_options(self):
 
 setup(
     name="fedml",
-    version="0.8.8a151",
+    version="0.8.8a152",
     author="FedML Team",
     author_email="ch@fedml.ai",
     description="A research and production integrated edge-cloud library for "