Merge pull request #2021 from FedML-AI/alexleung/dev_branch_online
Alexleung/dev branch online
fedml-alex authored Apr 6, 2024
2 parents 4ea1c7c + 46cbab2 commit 52b5b2d
Showing 128 changed files with 13,024 additions and 1,317 deletions.
26 changes: 26 additions & 0 deletions python/examples/deploy/dummy_failed/config.yaml
@@ -0,0 +1,26 @@
workspace: "./src"

inference_image_name: "raphaeljin/fedml"
enable_custom_image: true

bootstrap: |
  echo "Bootstrap start..."
  pwd
  ls -l
  echo "Check shell script"
  cat fedml-deploy-bootstrap-entry-auto-gen.sh
  echo "Check main script"
  cat serve_main.py
  echo "Bootstrap finished"

## Simulate a successful deployment
#job: |
#  python3 serve_main.py

# Then during update, simulate a failed deployment
job: |
  echo "Simulate a failed deployment"
  exit 1
auto_detect_public_ip: true
use_gpu: true
32 changes: 32 additions & 0 deletions python/examples/deploy/dummy_failed/src/serve_main.py
@@ -0,0 +1,32 @@
from fedml.serving import FedMLPredictor
from fedml.serving import FedMLInferenceRunner
import uuid
import torch

# Number of float32 elements that fit in 1 GiB (1,073,741,824 bytes / 4 bytes each)
num_elements = 1_073_741_824 // 4  # integer division for a whole element count


class DummyPredictor(FedMLPredictor):
    def __init__(self):
        super().__init__()
        # Create a tensor with that many elements
        tensor = torch.empty(num_elements, dtype=torch.float32)

        # Move the tensor to the GPU so it occupies ~1 GiB of device memory
        # (PyTorch's caching allocator keeps that memory reserved for this
        # process even though tensor_gpu is only a local variable)
        tensor_gpu = tensor.cuda()

        # for debugging
        with open("/tmp/dummy_gpu_occupier.txt", "w") as f:
            f.write("GPU is occupied")

        self.worker_id = uuid.uuid4()

    def predict(self, request):
        return {f"AlohaV0From{self.worker_id}": request}


if __name__ == "__main__":
    predictor = DummyPredictor()
    fedml_inference_runner = FedMLInferenceRunner(predictor)
    fedml_inference_runner.run()
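A quick way to exercise the predictor above outside the runner (a hypothetical smoke test, not part of the commit; it assumes a CUDA-capable machine with `fedml` and `torch` installed, and the request payload is illustrative):

```python
# Hypothetical local smoke test for DummyPredictor (assumes a CUDA GPU,
# since __init__ moves a ~1 GiB tensor onto the device).
from serve_main import DummyPredictor

predictor = DummyPredictor()
print(predictor.predict({"text": "hello"}))
# -> {"AlohaV0From<worker uuid>": {"text": "hello"}}
```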
9 changes: 9 additions & 0 deletions python/examples/deploy/dummy_gpu_occupier/config.yaml
@@ -0,0 +1,9 @@
workspace: "./src"
entry_point: "serve_main.py"
bootstrap: |
  echo "Bootstrap start..."
  sleep 5
  echo "Bootstrap finished"
auto_detect_public_ip: true
use_gpu: true
32 changes: 32 additions & 0 deletions python/examples/deploy/dummy_gpu_occupier/src/serve_main.py
@@ -0,0 +1,32 @@
from fedml.serving import FedMLPredictor
from fedml.serving import FedMLInferenceRunner
import uuid
import torch

# Number of float32 elements that fit in 1 GiB (1,073,741,824 bytes / 4 bytes each)
num_elements = 1_073_741_824 // 4  # integer division for a whole element count


class DummyPredictor(FedMLPredictor):
    def __init__(self):
        super().__init__()
        # Create a tensor with that many elements
        tensor = torch.empty(num_elements, dtype=torch.float32)

        # Move the tensor to the GPU so it occupies ~1 GiB of device memory
        # (PyTorch's caching allocator keeps that memory reserved for this
        # process even though tensor_gpu is only a local variable)
        tensor_gpu = tensor.cuda()

        # for debugging
        with open("/tmp/dummy_gpu_occupier.txt", "w") as f:
            f.write("GPU is occupied")

        self.worker_id = uuid.uuid4()

    def predict(self, request):
        return {f"AlohaV0From{self.worker_id}": request}


if __name__ == "__main__":
    predictor = DummyPredictor()
    fedml_inference_runner = FedMLInferenceRunner(predictor)
    fedml_inference_runner.run()
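To confirm the occupier is actually holding device memory, a small check like the following could be run in the same process (a sketch; `memory_allocated`/`memory_reserved` are standard `torch.cuda` calls):

```python
import torch

# Memory currently allocated by live tensors on GPU 0 (~1 GiB while the
# occupier tensor is referenced), and memory the caching allocator keeps
# reserved for this process (it stays put even after tensors are freed).
print(torch.cuda.memory_allocated(0))
print(torch.cuda.memory_reserved(0))
```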
4 changes: 2 additions & 2 deletions python/examples/deploy/dummy_job/config.yaml
@@ -2,5 +2,5 @@ workspace: "./src"
entry_point: "serve_main.py"
bootstrap: |
  echo "Bootstrap start..."
-  sleep 15
-  echo "Bootstrap finished"
+  sleep 5
+  echo "Bootstrap finished"
12 changes: 0 additions & 12 deletions python/examples/deploy/dummy_job/config/bootstrap.sh

This file was deleted.

10 changes: 10 additions & 0 deletions python/examples/launch/train_build_package/src/bootstrap.sh
@@ -0,0 +1,10 @@
### don't modify this part ###
set -x
##############################

pip install -r requirements.txt
echo "Bootstrap finished."

### don't modify this part ###
exit 0
##############################
9 changes: 3 additions & 6 deletions python/examples/launch/train_build_package/train_job.yaml
@@ -1,7 +1,7 @@
# Local directory where your source code resides.
# It should be the relative path to this job yaml file or the absolute path.
# If your job doesn't contain any source code, it can be empty.
-workspace: .
+workspace: "./src"

# Running entry commands which will be executed as the job entry point.
# If an error occurs, you should exit with a non-zero code, e.g. exit 1.
@@ -14,14 +14,11 @@ job_type: train # options: train, deploy, federate

# Bootstrap shell commands which will be executed before running entry commands.
# Support multiple lines, which can be empty.
-bootstrap: |
-  echo "Bootstrap finished."
+bootstrap: bash bootstrap.sh

computing:
  minimum_num_gpus: 1 # minimum # of GPUs to provision
  maximum_cost_per_hour: $3000 # max cost per hour for your job per gpu card
  #allow_cross_cloud_resources: true # true, false
  #device_type: CPU # options: GPU, CPU, hybrid
  resource_type: A100-80G # e.g., A100-80G, please check the resource type list by "fedml show-resource-type" or visiting URL: https://open.fedml.ai/accelerator_resource_type

data_args:
@@ -36,4 +33,4 @@ model_args:
  output_dim: '10'

training_params:
-  learning_rate: 0.004
\ No newline at end of file
+  learning_rate: 0.004
93 changes: 52 additions & 41 deletions python/examples/train/llm_train/README.md
@@ -2,12 +2,12 @@
<img src="assets/fedml_logo_light_mode.png" width="400px" alt="FedML logo">
</div>

-# LLM Fine-tune
+# LLM Training

-This repo contains an MLOps-supported training pipeline to help users build their own large language model (LLM) on proprietary/private
-data.
This repo aims to provide a minimalist example of efficient LLM training/fine-tuning
-and to illustrate how to use FedML Launch and fine-tuning.
+and to illustrate how to use FEDML Launch.
We leverage Pythia 7B by default and recently added support for Llama 2.

The repo contains:
@@ -18,41 +18,16 @@ The repo contains:
- Supports [DeepSpeed](https://www.deepspeed.ai/).
- Dataset implementation with [datasets](https://huggingface.co/docs/datasets/index).

-## How to Use Llama 2
-
-Our example uses Pythia by default, but we recently added support for Llama2.
-If you'd like to use Llama2, please see the following instructions before getting started.
-
-To use [Llama 2](https://ai.meta.com/llama/), you need to apply access from Meta and request Meta's private
-Hugging Face repo access.
-
-1. Make sure your `transformers` version is `4.31.0` or newer. You could update your transformers via
-   `pip install --upgrade transformers`.
-2. Please visit the [Meta website](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) and apply for
-   access.
-3. Apply for [Meta's private repo](https://huggingface.co/meta-llama/Llama-2-7b-hf)
-   on [Hugging Face](https://huggingface.co/meta-llama/Llama-2-7b-hf). See below image for detail.
-   ![Meta's private repo on Hugging Face](assets/Llama/huggingface_llama_repo.png)
-4. Once both access are granted, you can start using Llama by passing `--model_name "meta-llama/Llama-2-7b-hf"` to the training script.
-
-> **Warning**
-> Since Llama 2 is on a private Hugging Face repo, you need to either login to Hugging Face or provide your access token.
-> - To login to huggingface (see https://huggingface.co/settings/tokens for detail), run `huggingface-cli login` in
->   command line.
-> - To pass an access token, you need to do one of the following:
->   - Set environment variable `HUGGING_FACE_HUB_TOKEN="<your access token>"`
->   - For centralized/conventional training, pass `--auth_token "<your access token>"` in the command line.
## Getting Started

Clone the repo then go to the project directory:

```shell
# clone the repo
-git clone https://github.com/FedML-AI/llm-finetune.git
+git clone https://github.com/FedML-AI/FedML.git

# go to the project directory
-cd llm-finetune
+cd python/examples/train/llm_train
```

Install dependencies with the following command:
@@ -63,7 +38,7 @@ pip install -r requirements.txt

See [Dependencies](#dependencies) for more information on the dependency versions.

-### Conventional/Centralized Training
+### Training

[`run_train.py`](run_train.py) contains a minimal example for conventional/centralized LLM training and fine-tuning
on the [`databricks-dolly-15k`](https://huggingface.co/datasets/FedML/databricks-dolly-15k-niid) dataset.
@@ -84,6 +59,9 @@ bash scripts/train_deepspeed.sh \
... # additional arguments
```

+> **Note**
+> You can use `bash scripts/train.sh -h` to list all the supported CLI options.
+
> **Note**
> If you have an Ampere or newer GPU (e.g., RTX 3000 series or newer), you could turn on **bf16** for more
> efficient training by passing `--bf16 "True"` in the command line.
@@ -92,20 +70,53 @@ bash scripts/train_deepspeed.sh \
> when using PyTorch DDP with LoRA and gradient checkpointing, you need to turn off `find_unused_parameters`
> by passing `--ddp_find_unused_parameters "False"` in the command line.
### Train with FEDML Launch

If you have trouble finding computing resources, you can launch your training job via [FEDML Launch](https://doc.fedml.ai/launch) and let FEDML find the most cost-effective resources for your task.

```shell
# install fedml library
pip3 install fedml

# launch your training job
fedml launch job.yaml
```
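Equivalently, the launch can be driven from Python. The sketch below is hedged: it assumes `fedml.api.launch_job` wraps the `job()` entry point touched in this commit (see `python/fedml/api/modules/launch.py` below), and the exact signature and result fields may differ by version:

```python
# Hypothetical Python-API equivalent of `fedml launch job.yaml`.
import fedml.api

# launch_job is assumed here to forward to the job() function edited in
# this commit; LaunchResult carries run_id / result_code per the diff.
result = fedml.api.launch_job("job.yaml", api_key="<your API key>")
print(result.run_id, result.result_code)
```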

You can modify the training command in [job.yaml](job.yaml) by
- specifying training settings in the `job` section
- specifying environment setup settings in the `bootstrap` section
- specifying compute resources in the `computing` section

## How to Use Llama 2

Our example uses Pythia by default, but we recently added support for Llama 2.
If you'd like to use Llama 2, please see the following instructions before getting started.

To use [Llama 2](https://ai.meta.com/llama/), you need to apply for access from Meta and request access to Meta's
private Hugging Face repo.

1. Make sure your `transformers` version is `4.31.0` or newer. You can update it via
   `pip install --upgrade transformers`.
2. Please visit the [Meta website](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) and apply for
   access.
3. Apply for [Meta's private repo](https://huggingface.co/meta-llama/Llama-2-7b-hf)
   on [Hugging Face](https://huggingface.co/meta-llama/Llama-2-7b-hf). See the image below for details.
   ![Meta's private repo on Hugging Face](assets/Llama/huggingface_llama_repo.png)
4. Once both requests are granted, you can start using Llama by passing `--model_name "meta-llama/Llama-2-7b-hf"` to the training script.

> **Warning**
> Since Llama 2 is in a private Hugging Face repo, you need to either log in to Hugging Face or provide your access token.
> - To log in to Hugging Face (see https://huggingface.co/settings/tokens for details), run `huggingface-cli login` on the
>   command line.
> - To pass an access token, you need to do one of the following:
>   - Set the environment variable `HUGGING_FACE_HUB_TOKEN="<your access token>"`
>   - For centralized/conventional training, pass `--auth_token "<your access token>"` on the command line.

### Dependencies

We have tested our implementation with the following setup:

- Ubuntu `20.04.5 LTS` and `22.04.2 LTS`
- CUDA `12.2`, `11.8`, `11.7` and `11.6`
-- Python `3.8.13` and `3.9.16`
-- `fedml>=0.8.4a7`
-- `torch>=2.0.0,<=2.1.0`
-- `torchvision>=0.15.1,<=0.16.0`
-- `transformers>=4.31.0,<=4.34.0`
-- `peft>=0.4.0,<=0.5.0`
-- `datasets>=2.11.0,<=2.14.5`
-- `deepspeed>=0.9.1,<=0.10.3`
-- `numpy>=1.24.3,<=1.24.4`
-- `tensorboard>=2.12.2,<=2.13.0`
-- `mpi4py>=3.1.4,<=3.1.5`
+- Python `3.8.13`, `3.9.16` and `3.10.13`
2 changes: 1 addition & 1 deletion python/examples/train/llm_train/requirements.txt
@@ -8,7 +8,7 @@ transformers[torch]>=4.31.0
safetensors
datasets>=2.14.0
einops
-fedml[llm]>=0.8.17
+fedml[llm]>=0.8.18
tqdm
wandb
pyyaml
2 changes: 1 addition & 1 deletion python/fedml/__init__.py
@@ -34,7 +34,7 @@
_global_training_type = None
_global_comm_backend = None

__version__ = "0.8.29.dev4"
__version__ = "0.8.29.dev10"


# This is the deployment environment used for different roles (RD/PM/BD/Public Developers). Potential VALUE: local, dev, test, release
5 changes: 3 additions & 2 deletions python/fedml/api/modules/federate.py
@@ -5,6 +5,7 @@
from fedml.computing.scheduler.comm_utils.sys_utils import generate_yaml_doc
from fedml.computing.scheduler.comm_utils.yaml_utils import load_yaml_config
from fedml.computing.scheduler.comm_utils.constants import SchedulerConstants
+from fedml.computing.scheduler.scheduler_entry.constants import Constants as SchedulerEntryConstants
import fedml.api.modules.build
from fedml.computing.scheduler.scheduler_entry.launch_manager import FedMLLaunchManager

@@ -66,8 +67,8 @@ def build_with_job_yaml(job_yaml_file, dest_folder=None):
    shutil.copyfile(server_package, dest_package)
    print(f"Your server package file is located at: {dest_package}")

-   bootstrap_bat_file = os.path.join(job_dir_path, "bootstrap.bat")
-   bootstrap_sh_file = os.path.join(job_dir_path, "bootstrap.sh")
+   bootstrap_sh_file = os.path.join(job_dir_path, SchedulerEntryConstants.BOOTSTRAP_FILE_NAME)
+   bootstrap_bat_file = bootstrap_sh_file.rstrip(".sh") + ".bat"
    job_entry_bat_file = os.path.join(
        job_dir_path, SchedulerConstants.LAUNCH_JOB_DEFAULT_ENTRY_NAME.rstrip('.sh') + '.bat')
    job_entry_sh_file = os.path.join(job_dir_path, SchedulerConstants.LAUNCH_JOB_DEFAULT_ENTRY_NAME)
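A side note on the `rstrip(".sh")` pattern above: `str.rstrip` strips a trailing *character set* (here `{'.', 's', 'h'}`), not a literal suffix. It behaves as intended only because the stem "bootstrap" does not end in one of those characters. A minimal illustration, with `removesuffix` shown as a suffix-safe alternative available on Python 3.9+:

```python
# rstrip treats its argument as a set of characters, not a suffix.
print("bootstrap.sh".rstrip(".sh"))   # "bootstrap"  -- works for this stem
print("trash.sh".rstrip(".sh"))       # "tra"        -- over-stripped

# Suffix-safe spelling (Python 3.9+):
print("trash.sh".removesuffix(".sh") + ".bat")  # "trash.bat"
```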
10 changes: 6 additions & 4 deletions python/fedml/api/modules/launch.py
@@ -1,3 +1,4 @@
+import logging
import os
from typing import List

@@ -131,12 +132,13 @@ def job(

    inner_id = run_id if create_run_result.inner_id is None else create_run_result.inner_id

-   if (result_code == ApiConstants.ERROR_CODE[ApiConstants.LAUNCHED] or
-           result_code != ApiConstants.ERROR_CODE[ApiConstants.RESOURCE_MATCHED_STATUS_MATCHED]):
-       if create_run_result.inner_id is not None:
-           FedMLLaunchManager.get_instance().cleanup_launch(run_id, create_run_result.inner_id)
+   if result_code == ApiConstants.ERROR_CODE[ApiConstants.LAUNCHED]:
        return LaunchResult(result_code=result_code, result_message=result_message, run_id=run_id,
                            project_id=project_id, inner_id=inner_id, result_object=create_run_result)

+   if result_code != ApiConstants.ERROR_CODE[ApiConstants.RESOURCE_MATCHED_STATUS_MATCHED]:
+       if create_run_result.inner_id is not None:
+           logging.info("Job run id {} cannot match GPU resource".format(run_id))

    # Run Job
    run_result = run(create_run_result=create_run_result, api_key=api_key, device_server=device_server,