Merge pull request #2021 from FedML-AI/alexleung/dev_branch_online
Alexleung/dev branch online
fedml-alex authored Apr 6, 2024
2 parents 4ea1c7c + 46cbab2 commit 52b5b2d
Showing 128 changed files with 13,024 additions and 1,317 deletions.
26 changes: 26 additions & 0 deletions python/examples/deploy/dummy_failed/config.yaml
@@ -0,0 +1,26 @@
workspace: "./src"

inference_image_name: "raphaeljin/fedml"
enable_custom_image: true

bootstrap: |
  echo "Bootstrap start..."
  pwd
  ls -l
  echo "Check shell script"
  cat fedml-deploy-bootstrap-entry-auto-gen.sh
  echo "Check main script"
  cat serve_main.py
  echo "Bootstrap finished"

## Simulate a successful deployment
#job: |
#  python3 serve_main.py

# Then during update, simulate a failed deployment
job: |
  echo "Simulate a failed deployment"
  exit 1
auto_detect_public_ip: true
use_gpu: true
32 changes: 32 additions & 0 deletions python/examples/deploy/dummy_failed/src/serve_main.py
@@ -0,0 +1,32 @@
from fedml.serving import FedMLPredictor
from fedml.serving import FedMLInferenceRunner
import uuid
import torch

# Number of float32 elements that fit in 1 GiB (1,073,741,824 bytes / 4 bytes each)
num_elements = 1_073_741_824 // 4  # integer division for a whole element count


class DummyPredictor(FedMLPredictor):
    def __init__(self):
        super().__init__()
        # Create a tensor with that many elements
        tensor = torch.empty(num_elements, dtype=torch.float32)

        # Move the tensor to the GPU so it occupies ~1 GiB of device memory
        # (PyTorch's caching allocator keeps that memory reserved for this
        # process even though tensor_gpu is only a local variable)
        tensor_gpu = tensor.cuda()

        # for debugging
        with open("/tmp/dummy_gpu_occupier.txt", "w") as f:
            f.write("GPU is occupied")

        self.worker_id = uuid.uuid4()

    def predict(self, request):
        return {f"AlohaV0From{self.worker_id}": request}


if __name__ == "__main__":
    predictor = DummyPredictor()
    fedml_inference_runner = FedMLInferenceRunner(predictor)
    fedml_inference_runner.run()
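A quick way to exercise the predictor above outside the runner (a hypothetical smoke test, not part of the commit; it assumes a CUDA-capable machine with `fedml` and `torch` installed, and the request payload is illustrative):

```python
# Hypothetical local smoke test for DummyPredictor (assumes a CUDA GPU,
# since __init__ moves a ~1 GiB tensor onto the device).
from serve_main import DummyPredictor

predictor = DummyPredictor()
print(predictor.predict({"text": "hello"}))
# -> {"AlohaV0From<worker uuid>": {"text": "hello"}}
```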
9 changes: 9 additions & 0 deletions python/examples/deploy/dummy_gpu_occupier/config.yaml
@@ -0,0 +1,9 @@
workspace: "./src"
entry_point: "serve_main.py"
bootstrap: |
  echo "Bootstrap start..."
  sleep 5
  echo "Bootstrap finished"
auto_detect_public_ip: true
use_gpu: true
32 changes: 32 additions & 0 deletions python/examples/deploy/dummy_gpu_occupier/src/serve_main.py
@@ -0,0 +1,32 @@
from fedml.serving import FedMLPredictor
from fedml.serving import FedMLInferenceRunner
import uuid
import torch

# Number of float32 elements that fit in 1 GiB (1,073,741,824 bytes / 4 bytes each)
num_elements = 1_073_741_824 // 4  # integer division for a whole element count


class DummyPredictor(FedMLPredictor):
    def __init__(self):
        super().__init__()
        # Create a tensor with that many elements
        tensor = torch.empty(num_elements, dtype=torch.float32)

        # Move the tensor to the GPU so it occupies ~1 GiB of device memory
        # (PyTorch's caching allocator keeps that memory reserved for this
        # process even though tensor_gpu is only a local variable)
        tensor_gpu = tensor.cuda()

        # for debugging
        with open("/tmp/dummy_gpu_occupier.txt", "w") as f:
            f.write("GPU is occupied")

        self.worker_id = uuid.uuid4()

    def predict(self, request):
        return {f"AlohaV0From{self.worker_id}": request}


if __name__ == "__main__":
    predictor = DummyPredictor()
    fedml_inference_runner = FedMLInferenceRunner(predictor)
    fedml_inference_runner.run()
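To confirm the occupier is actually holding device memory, a small check like the following could be run in the same process (a sketch; `memory_allocated`/`memory_reserved` are standard `torch.cuda` calls):

```python
import torch

# Memory currently allocated by live tensors on GPU 0 (~1 GiB while the
# occupier tensor is referenced), and memory the caching allocator keeps
# reserved for this process (it stays put even after tensors are freed).
print(torch.cuda.memory_allocated(0))
print(torch.cuda.memory_reserved(0))
```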
4 changes: 2 additions & 2 deletions python/examples/deploy/dummy_job/config.yaml
@@ -2,5 +2,5 @@ workspace: "./src"
entry_point: "serve_main.py"
bootstrap: |
  echo "Bootstrap start..."
-  sleep 15
-  echo "Bootstrap finished"
+  sleep 5
+  echo "Bootstrap finished"
12 changes: 0 additions & 12 deletions python/examples/deploy/dummy_job/config/bootstrap.sh

This file was deleted.

10 changes: 10 additions & 0 deletions python/examples/launch/train_build_package/src/bootstrap.sh
@@ -0,0 +1,10 @@
### don't modify this part ###
set -x
##############################

pip install -r requirements.txt
echo "Bootstrap finished."

### don't modify this part ###
exit 0
##############################
9 changes: 3 additions & 6 deletions python/examples/launch/train_build_package/train_job.yaml
@@ -1,7 +1,7 @@
# Local directory where your source code resides.
# It should be the relative path to this job yaml file or the absolute path.
# If your job doesn't contain any source code, it can be empty.
-workspace: .
+workspace: "./src"

# Running entry commands which will be executed as the job entry point.
# If an error occurs, you should exit with a non-zero code, e.g. exit 1.
@@ -14,14 +14,11 @@ job_type: train # options: train, deploy, federate

# Bootstrap shell commands which will be executed before running entry commands.
# Support multiple lines, which can be empty.
-bootstrap: |
-  echo "Bootstrap finished."
+bootstrap: bash bootstrap.sh

computing:
  minimum_num_gpus: 1 # minimum # of GPUs to provision
  maximum_cost_per_hour: $3000 # max cost per hour for your job per gpu card
  #allow_cross_cloud_resources: true # true, false
  #device_type: CPU # options: GPU, CPU, hybrid
  resource_type: A100-80G # e.g., A100-80G, please check the resource type list by "fedml show-resource-type" or visiting URL: https://open.fedml.ai/accelerator_resource_type

data_args:
@@ -36,4 +33,4 @@ model_args:
  output_dim: '10'

training_params:
-  learning_rate: 0.004
\ No newline at end of file
+  learning_rate: 0.004
93 changes: 52 additions & 41 deletions python/examples/train/llm_train/README.md
@@ -2,12 +2,12 @@
<img src="assets/fedml_logo_light_mode.png" width="400px" alt="FedML logo">
</div>

-# LLM Fine-tune
+# LLM Training

-This repo contains an MLOps-supported training pipeline to help users build their own large language model (LLM) on proprietary/private
-data.
This repo aims to provide a minimalist example of efficient LLM training/fine-tuning
-and to illustrate how to use FedML Launch and fine-tuning.
+and to illustrate how to use FEDML Launch.
We leverage Pythia 7B by default and recently added support for Llama 2.

The repo contains:
@@ -18,41 +18,16 @@ The repo contains:
- Supports [DeepSpeed](https://www.deepspeed.ai/).
- Dataset implementation with [datasets](https://huggingface.co/docs/datasets/index).

-## How to Use Llama 2
-
-Our example uses Pythia by default, but we recently added support for Llama2.
-If you'd like to use Llama2, please see the following instructions before getting started.
-
-To use [Llama 2](https://ai.meta.com/llama/), you need to apply access from Meta and request Meta's private
-Hugging Face repo access.
-
-1. Make sure your `transformers` version is `4.31.0` or newer. You could update your transformers via
-   `pip install --upgrade transformers`.
-2. Please visit the [Meta website](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) and apply for
-   access.
-3. Apply for [Meta's private repo](https://huggingface.co/meta-llama/Llama-2-7b-hf)
-   on [Hugging Face](https://huggingface.co/meta-llama/Llama-2-7b-hf). See below image for detail.
-   ![Meta's private repo on Hugging Face](assets/Llama/huggingface_llama_repo.png)
-4. Once both access are granted, you can start using Llama by passing `--model_name "meta-llama/Llama-2-7b-hf"` to the training script.
-
-> **Warning**
-> Since Llama 2 is on a private Hugging Face repo, you need to either login to Hugging Face or provide your access token.
-> - To login to huggingface (see https://huggingface.co/settings/tokens for detail), run `huggingface-cli login` in
->   command line.
-> - To pass an access token, you need to do one of the following:
->   - Set environment variable `HUGGING_FACE_HUB_TOKEN="<your access token>"`
->   - For centralized/conventional training, pass `--auth_token "<your access token>"` in the command line.
## Getting Started

Clone the repo then go to the project directory:

```shell
# clone the repo
-git clone https://github.com/FedML-AI/llm-finetune.git
+git clone https://github.com/FedML-AI/FedML.git

# go to the project directory
-cd llm-finetune
+cd python/examples/train/llm_train
```

Install dependencies with the following command:
@@ -63,7 +38,7 @@ pip install -r requirements.txt

See [Dependencies](#dependencies) for more information on the dependency versions.

-### Conventional/Centralized Training
+### Training

[`run_train.py`](run_train.py) contains a minimal example for conventional/centralized LLM training and fine-tuning
on the [`databricks-dolly-15k`](https://huggingface.co/datasets/FedML/databricks-dolly-15k-niid) dataset.
@@ -84,6 +59,9 @@ bash scripts/train_deepspeed.sh \
... # additional arguments
```

+> **Note**
+> You can use `bash scripts/train.sh -h` to list all the supported CLI options.
+
> **Note**
> If you have an Ampere or newer GPU (e.g., RTX 3000 series or newer), you could turn on **bf16** for more
> efficient training by passing `--bf16 "True"` in the command line.
@@ -92,20 +70,53 @@ bash scripts/train_deepspeed.sh \
> when using PyTorch DDP with LoRA and gradient checkpointing, you need to turn off `find_unused_parameters`
> by passing `--ddp_find_unused_parameters "False"` in the command line.
### Train with FEDML Launch

If you have trouble finding computing resources, you can launch your training job via [FEDML Launch](https://doc.fedml.ai/launch) and let FEDML find the most cost-effective resources for your task.

```shell
# install fedml library
pip3 install fedml

# launch your training job
fedml launch job.yaml
```
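Equivalently, the launch can be driven from Python. The sketch below is hedged: it assumes `fedml.api.launch_job` wraps the `job()` entry point touched in this commit (see `python/fedml/api/modules/launch.py` below), and the exact signature and result fields may differ by version:

```python
# Hypothetical Python-API equivalent of `fedml launch job.yaml`.
import fedml.api

# launch_job is assumed here to forward to the job() function edited in
# this commit; LaunchResult carries run_id / result_code per the diff.
result = fedml.api.launch_job("job.yaml", api_key="<your API key>")
print(result.run_id, result.result_code)
```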

You can modify the training command in [job.yaml](job.yaml) by
- specifying training settings in the `job` section
- specifying environment setup settings in the `bootstrap` section
- specifying compute resources in the `computing` section

## How to Use Llama 2

Our example uses Pythia by default, but we recently added support for Llama 2.
If you'd like to use Llama 2, please see the following instructions before getting started.

To use [Llama 2](https://ai.meta.com/llama/), you need to apply for access from Meta and request access to Meta's
private Hugging Face repo.

1. Make sure your `transformers` version is `4.31.0` or newer. You can update it via
   `pip install --upgrade transformers`.
2. Please visit the [Meta website](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) and apply for
   access.
3. Apply for [Meta's private repo](https://huggingface.co/meta-llama/Llama-2-7b-hf)
   on [Hugging Face](https://huggingface.co/meta-llama/Llama-2-7b-hf). See the image below for details.
   ![Meta's private repo on Hugging Face](assets/Llama/huggingface_llama_repo.png)
4. Once both requests are granted, you can start using Llama by passing `--model_name "meta-llama/Llama-2-7b-hf"` to the training script.

> **Warning**
> Since Llama 2 is in a private Hugging Face repo, you need to either log in to Hugging Face or provide your access token.
> - To log in to Hugging Face (see https://huggingface.co/settings/tokens for details), run `huggingface-cli login` on the
>   command line.
> - To pass an access token, you need to do one of the following:
>   - Set the environment variable `HUGGING_FACE_HUB_TOKEN="<your access token>"`
>   - For centralized/conventional training, pass `--auth_token "<your access token>"` on the command line.

### Dependencies

We have tested our implementation with the following setup:

- Ubuntu `20.04.5 LTS` and `22.04.2 LTS`
- CUDA `12.2`, `11.8`, `11.7` and `11.6`
-- Python `3.8.13` and `3.9.16`
-- `fedml>=0.8.4a7`
-- `torch>=2.0.0,<=2.1.0`
-- `torchvision>=0.15.1,<=0.16.0`
-- `transformers>=4.31.0,<=4.34.0`
-- `peft>=0.4.0,<=0.5.0`
-- `datasets>=2.11.0,<=2.14.5`
-- `deepspeed>=0.9.1,<=0.10.3`
-- `numpy>=1.24.3,<=1.24.4`
-- `tensorboard>=2.12.2,<=2.13.0`
-- `mpi4py>=3.1.4,<=3.1.5`
+- Python `3.8.13`, `3.9.16` and `3.10.13`
2 changes: 1 addition & 1 deletion python/examples/train/llm_train/requirements.txt
@@ -8,7 +8,7 @@ transformers[torch]>=4.31.0
safetensors
datasets>=2.14.0
einops
-fedml[llm]>=0.8.17
+fedml[llm]>=0.8.18
tqdm
wandb
pyyaml
2 changes: 1 addition & 1 deletion python/fedml/__init__.py
@@ -34,7 +34,7 @@
_global_training_type = None
_global_comm_backend = None

__version__ = "0.8.29.dev4"
__version__ = "0.8.29.dev10"


# This is the deployment environment used for different roles (RD/PM/BD/Public Developers). Potential VALUE: local, dev, test, release
5 changes: 3 additions & 2 deletions python/fedml/api/modules/federate.py
@@ -5,6 +5,7 @@
from fedml.computing.scheduler.comm_utils.sys_utils import generate_yaml_doc
from fedml.computing.scheduler.comm_utils.yaml_utils import load_yaml_config
from fedml.computing.scheduler.comm_utils.constants import SchedulerConstants
+from fedml.computing.scheduler.scheduler_entry.constants import Constants as SchedulerEntryConstants
import fedml.api.modules.build
from fedml.computing.scheduler.scheduler_entry.launch_manager import FedMLLaunchManager

@@ -66,8 +67,8 @@ def build_with_job_yaml(job_yaml_file, dest_folder=None):
    shutil.copyfile(server_package, dest_package)
    print(f"Your server package file is located at: {dest_package}")

-   bootstrap_bat_file = os.path.join(job_dir_path, "bootstrap.bat")
-   bootstrap_sh_file = os.path.join(job_dir_path, "bootstrap.sh")
+   bootstrap_sh_file = os.path.join(job_dir_path, SchedulerEntryConstants.BOOTSTRAP_FILE_NAME)
+   bootstrap_bat_file = bootstrap_sh_file.rstrip(".sh") + ".bat"
    job_entry_bat_file = os.path.join(
        job_dir_path, SchedulerConstants.LAUNCH_JOB_DEFAULT_ENTRY_NAME.rstrip('.sh') + '.bat')
    job_entry_sh_file = os.path.join(job_dir_path, SchedulerConstants.LAUNCH_JOB_DEFAULT_ENTRY_NAME)
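A side note on the `rstrip(".sh")` pattern above: `str.rstrip` strips a trailing *character set* (here `{'.', 's', 'h'}`), not a literal suffix. It behaves as intended only because the stem "bootstrap" does not end in one of those characters. A minimal illustration, with `removesuffix` shown as a suffix-safe alternative available on Python 3.9+:

```python
# rstrip treats its argument as a set of characters, not a suffix.
print("bootstrap.sh".rstrip(".sh"))   # "bootstrap"  -- works for this stem
print("trash.sh".rstrip(".sh"))       # "tra"        -- over-stripped

# Suffix-safe spelling (Python 3.9+):
print("trash.sh".removesuffix(".sh") + ".bat")  # "trash.bat"
```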
10 changes: 6 additions & 4 deletions python/fedml/api/modules/launch.py
@@ -1,3 +1,4 @@
+import logging
import os
from typing import List

@@ -131,12 +132,13 @@ def job(

    inner_id = run_id if create_run_result.inner_id is None else create_run_result.inner_id

-   if (result_code == ApiConstants.ERROR_CODE[ApiConstants.LAUNCHED] or
-           result_code != ApiConstants.ERROR_CODE[ApiConstants.RESOURCE_MATCHED_STATUS_MATCHED]):
-       if create_run_result.inner_id is not None:
-           FedMLLaunchManager.get_instance().cleanup_launch(run_id, create_run_result.inner_id)
+   if result_code == ApiConstants.ERROR_CODE[ApiConstants.LAUNCHED]:
        return LaunchResult(result_code=result_code, result_message=result_message, run_id=run_id,
                            project_id=project_id, inner_id=inner_id, result_object=create_run_result)

+   if result_code != ApiConstants.ERROR_CODE[ApiConstants.RESOURCE_MATCHED_STATUS_MATCHED]:
+       if create_run_result.inner_id is not None:
+           logging.info("Job run id {} cannot match GPU resource".format(run_id))

    # Run Job
    run_result = run(create_run_result=create_run_result, api_key=api_key, device_server=device_server,