[usability] accelerate support - scripts update
wheresmyhair committed Feb 27, 2025
1 parent beba6ef commit ef083b6
Showing 62 changed files with 1,229 additions and 135 deletions.
29 changes: 29 additions & 0 deletions configs/accelerate_fsdp_config.yaml
@@ -0,0 +1,29 @@
compute_environment: LOCAL_MACHINE
debug: false
distributed_type: FSDP

fsdp_config:
  fsdp_auto_wrap_policy: SIZE_BASED_WRAP
  fsdp_min_num_params: 1000000
  fsdp_backward_prefetch: BACKWARD_PRE
  fsdp_forward_prefetch: false
  fsdp_cpu_ram_efficient_loading: true
  fsdp_offload_params: false
  fsdp_sharding_strategy: FULL_SHARD
  fsdp_state_dict_type: FULL_STATE_DICT
  fsdp_sync_module_states: true
  fsdp_use_orig_params: true

downcast_bf16: true
machine_rank: 0
main_training_function: main
mixed_precision: bf16
num_machines: 1
num_processes: 8
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false
main_process_port: 1204
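
For reference, accelerate consumes this file through the --config_file flag. A minimal launch sketch, assuming the repo's examples/finetune.py entry point with flags borrowed from scripts/archive/run_finetune.sh below (the exact combination shown is illustrative, not part of this commit):

# hypothetical FSDP launch; adjust num_processes in the yaml to match your GPU count
accelerate launch --config_file configs/accelerate_fsdp_config.yaml \
    examples/finetune.py \
    --model_name_or_path gpt2 \
    --dataset_path data/alpaca/train_conversation \
    --output_dir output_models/finetune_fsdp \
    --do_train

With num_processes: 8 and fsdp_sharding_strategy: FULL_SHARD, the launcher spawns eight workers on one machine and shards parameters, gradients, and optimizer state across them; fsdp_min_num_params: 1000000 makes the size-based wrap policy wrap any submodule with at least one million parameters.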
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
2 changes: 1 addition & 1 deletion contrib/rlhflow/run_reward_modeling.sh
@@ -30,7 +30,7 @@ deepspeed ${deepspeed_args} \
--block_size 512 \
--per_device_train_batch_size 1 \
--per_device_eval_batch_size 1 \
- --deepspeed configs/ds_config_zero2.json \
+ --deepspeed configs/archive/ds_config_zero2.json \
--bf16 \
--run_name rm_test \
--validation_split_percentage 10 \
2 changes: 1 addition & 1 deletion contrib/tool-finetune/run_function_call_finetune.sh
@@ -65,7 +65,7 @@ deepspeed ${deepspeed_args} \
--disable_group_texts 1 \
--block_size 1024 \
--per_device_train_batch_size 1 \
- --deepspeed configs/ds_config_zero3.json \
+ --deepspeed configs/archive/ds_config_zero3.json \
--fp16 \
--run_name finetune \
--validation_split_percentage 0 \
7 changes: 0 additions & 7 deletions examples/merge_lora.py
@@ -29,12 +29,6 @@ class MergeLoraArguments:
"help": "device to merge model on",
},
)
ds_config: str = field(
default='configs/ds_config_eval.json',
metadata={
"help": "deepspeed config file path",
},
)
output_model_path: Optional[str] = field(
default=None,
metadata={
@@ -64,7 +58,6 @@ def main():
        model_args,
        do_train=False,
        device=merge_lora_args.device,
-        ds_config=merge_lora_args.ds_config
    )
    model.activate_model_for_inference()
    model.merge_lora_weights()
2 changes: 1 addition & 1 deletion experimental/Hymba/run_finetune_hymba.sh
@@ -66,7 +66,7 @@ deepspeed ${deepspeed_args} \
--block_size 256 \
--trust_remote_code True \
--per_device_train_batch_size 1 \
- --deepspeed configs/ds_config_zero2_no_offload.json \
+ --deepspeed configs/archive/ds_config_zero2_no_offload.json \
--bf16 \
--run_name hymba_finetune \
--validation_split_percentage 0 \
File renamed without changes.
File renamed without changes.
@@ -36,14 +36,14 @@ fi
log_dir=output_dir/${model_name}_lmflow_chat_nll_eval
mkdir -p ${log_dir}
echo "[Evaluating] Evaluate on LMFlow_chat"
- ./scripts/run_benchmark.sh ${extra_args} --dataset_name lmflow_chat_nll_eval | tee ${log_dir}/benchmark.log 2> ${log_dir}/benchmark.err
+ ./scripts/archive/run_benchmark.sh ${extra_args} --dataset_name lmflow_chat_nll_eval | tee ${log_dir}/benchmark.log 2> ${log_dir}/benchmark.err

log_dir=output_dir/${model_name}_all_nll_eval
mkdir -p ${log_dir}
echo "[Evaluating] Evaluate on [commonsense, wiki, instruction_following (gpt4) ] nll evaluation"
- ./scripts/run_benchmark.sh ${extra_args} --dataset_name all_nll_eval | tee ${log_dir}/benchmark.log 2> ${log_dir}/benchmark.err
+ ./scripts/archive/run_benchmark.sh ${extra_args} --dataset_name all_nll_eval | tee ${log_dir}/benchmark.log 2> ${log_dir}/benchmark.err

log_dir=output_dir/${model_name}_commonsense_qa_eval
mkdir -p ${log_dir}
echo "[Evaluating] Evaluate on commonsense QA Accuracy evaluation"
- ./scripts/run_benchmark.sh ${extra_args} --dataset_name commonsense_qa_eval | tee ${log_dir}/benchmark.log 2> ${log_dir}/benchmark.err
+ ./scripts/archive/run_benchmark.sh ${extra_args} --dataset_name commonsense_qa_eval | tee ${log_dir}/benchmark.log 2> ${log_dir}/benchmark.err
2 changes: 1 addition & 1 deletion scripts/run_app.sh → scripts/archive/run_app.sh
@@ -1,6 +1,6 @@
#!/bin/bash

- CUDA_VISIBLE_DEVICES=0 accelerate launch --config_file configs/accelerator_singlegpu_config.yaml service/app.py \
+ CUDA_VISIBLE_DEVICES=0 accelerate launch --config_file configs/archive/accelerate_singlegpu_config.yaml service/app.py \
--model_name_or_path gpt2 \
--torch_dtype bfloat16 \
--max_new_tokens 200
File renamed without changes.
4 changes: 2 additions & 2 deletions scripts/run_chatbot.sh → scripts/archive/run_chatbot.sh
@@ -12,9 +12,9 @@ if [ $# -ge 2 ]; then
fi

# --temperature 0.7 \
- accelerate launch --config_file configs/accelerator_multigpu_config.yaml \
+ accelerate launch --config_file configs/archive/accelerate_multigpu_config.yaml \
examples/chatbot.py \
- --deepspeed configs/ds_config_chatbot.json \
+ --deepspeed configs/archive/ds_config_chatbot.json \
--model_name_or_path ${model} \
--max_new_tokens 256 \
--temperature 1.0 \
@@ -12,6 +12,6 @@ fi
CUDA_VISIBLE_DEVICES=0 \
deepspeed examples/chatbot.py \
--arch_type encoder_decoder \
- --deepspeed configs/ds_config_chatbot.json \
+ --deepspeed configs/archive/ds_config_chatbot.json \
--model_name_or_path ${model} \
${lora_args}
@@ -11,7 +11,7 @@ fi

CUDA_VISIBLE_DEVICES="" \
python examples/chatbot.py \
- --deepspeed configs/ds_config_chatbot.json \
+ --deepspeed configs/archive/ds_config_chatbot.json \
--model_name_or_path ${model} \
--device "cpu" \
${lora_args}
File renamed without changes.
File renamed without changes.
81 changes: 81 additions & 0 deletions scripts/archive/run_dpov2_align.sh
@@ -0,0 +1,81 @@
#!/bin/bash

# Parses arguments
run_name=dpov2_align
model_name_or_path=meta-llama/Meta-Llama-3-8B-Instruct
reference_model_name_or_path=meta-llama/Meta-Llama-3-8B-Instruct
dataset_path=data/iterative-prompt/train
eval_dataset_path=data/iterative-prompt/eval
output_dir=output_models/${run_name}

while [[ $# -ge 1 ]]; do
  key="$1"
  case ${key} in
    -r|--run_name)
      run_name="$2"
      shift
      ;;
    --model_name_or_path)
      model_name_or_path="$2"
      shift
      ;;
    --reference_model_name_or_path)
      reference_model_name_or_path="$2"
      shift
      ;;
    --dataset_path)
      dataset_path="$2"
      shift
      ;;
    --eval_dataset_path)
      eval_dataset_path="$2"
      shift
      ;;
    -o|--output_dir)
      output_dir="$2"
      shift
      ;;
    *)
      echo "error: unknown option \"${key}\"" 1>&2
      exit 1
  esac
  shift
done

# this script lives in scripts/archive/, so the project root is two levels up
project_dir=$(cd "$(dirname $0)"/../..; pwd)
log_dir=${project_dir}/log/${run_name}
mkdir -p ${output_dir} ${log_dir}

accelerate launch --config_file configs/accelerate_dsz3_config.yaml \
examples/dpov2_train.py \
--model_name_or_path ${model_name_or_path} \
--reference_model_name_or_path ${reference_model_name_or_path} \
--do_train True \
--dataset_path ${dataset_path} \
--eval_dataset_path ${eval_dataset_path} \
--bf16 True \
--learning_rate 5e-7 \
--lr_scheduler_type cosine \
--warmup_steps 100 \
--optim paged_adamw_32bit \
--per_device_train_batch_size 1 \
--per_device_eval_batch_size 1 \
--gradient_accumulation_steps 16 \
--gradient_checkpointing True \
--margin_scale 1.0 \
--max_prompt_length 1000 \
--num_train_epochs 2 \
--logging_steps 2 \
--save_strategy epoch \
--save_steps 5000 \
--evaluation_strategy steps \
--eval_steps 100 \
--loss_type sigmoid \
--output_dir ${output_dir} \
--run_name ${run_name} \
--sampling_paired_method max_min \
--report_to wandb \
--mask_prompt True \
--length_penalty 0 \
| tee ${log_dir}/train.log \
2> ${log_dir}/train.err
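
A sketch of a typical invocation from the project root; the run name and paths are illustrative, and the dataset directories are assumed to already exist locally:

# hypothetical usage; the defaults above are overridden via the parsed flags
./scripts/archive/run_dpov2_align.sh \
    --run_name my_dpov2_align \
    --model_name_or_path meta-llama/Meta-Llama-3-8B-Instruct \
    --dataset_path data/iterative-prompt/train \
    --eval_dataset_path data/iterative-prompt/eval \
    -o output_models/my_dpov2_align

Logs then land in log/my_dpov2_align/train.log and train.err via the tee at the end of the script.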
File renamed without changes.
@@ -4,7 +4,7 @@ if [ ! -d data/MedQA-USMLE ]; then
cd data && ./download.sh MedQA-USMLE && cd -
fi

- CUDA_VISIBLE_DEVICES=0 accelerate launch --config_file configs/accelerator_singlegpu_config.yaml examples/evaluation.py \
+ CUDA_VISIBLE_DEVICES=0 accelerate launch --config_file configs/archive/accelerate_singlegpu_config.yaml examples/evaluation.py \
--answer_type usmle \
--model_name_or_path gpt2-large \
--dataset_path data/MedQA-USMLE/validation \
File renamed without changes.
78 changes: 78 additions & 0 deletions scripts/archive/run_finetune.sh
@@ -0,0 +1,78 @@
#!/bin/bash
# Please run this script under ${project_id} in the project directory of
# https://github.com/shizhediao/llm-ft
# COMMIT: d5fecf30ba8011067b10cf51fede53a5ab6574e4

# Parses arguments
model_name_or_path=gpt2
dataset_path=data/alpaca/train_conversation
output_dir=output_models/finetune
deepspeed_args="--master_port=11000"
conversation_template=llama2

# Safety related arguments
trust_remote_code=0

while [[ $# -ge 1 ]]; do
  key="$1"
  case ${key} in
    -m|--model_name_or_path)
      model_name_or_path="$2"
      shift
      ;;
    -d|--dataset_path)
      dataset_path="$2"
      shift
      ;;
    -o|--output_model_path)
      output_dir="$2"
      shift
      ;;
    --conversation_template)
      conversation_template="$2"
      shift
      ;;
    --deepspeed_args)
      deepspeed_args="$2"
      shift
      ;;
    --trust_remote_code)
      trust_remote_code="$2"
      shift
      ;;
    *)
      echo "error: unknown option \"${key}\"" 1>&2
      exit 1
  esac
  shift
done

# Finetune
exp_id=finetune
# this script lives in scripts/archive/, so the project root is two levels up
project_dir=$(cd "$(dirname $0)"/../..; pwd)
log_dir=${project_dir}/log/${exp_id}
mkdir -p ${output_dir} ${log_dir}

deepspeed ${deepspeed_args} \
examples/finetune.py \
--model_name_or_path ${model_name_or_path} \
--trust_remote_code ${trust_remote_code} \
--dataset_path ${dataset_path} \
--output_dir ${output_dir} --overwrite_output_dir \
--conversation_template ${conversation_template} \
--num_train_epochs 0.01 \
--learning_rate 2e-5 \
--disable_group_texts 1 \
--block_size 256 \
--per_device_train_batch_size 1 \
--deepspeed configs/archive/ds_config_zero3.json \
--fp16 \
--run_name finetune \
--validation_split_percentage 0 \
--logging_steps 20 \
--do_train \
--ddp_timeout 72000 \
--save_steps 5000 \
--dataloader_num_workers 1 \
> >(tee ${log_dir}/train.log) \
2> >(tee ${log_dir}/train.err >&2)
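
And a matching sketch for the finetune script (the model, dataset, and deepspeed_args values are illustrative; --deepspeed_args is passed through to the deepspeed launcher):

# hypothetical usage; -m/-d/-o map to model, dataset, and output directory
./scripts/archive/run_finetune.sh \
    -m gpt2 \
    -d data/alpaca/train_conversation \
    -o output_models/finetune \
    --deepspeed_args "--master_port=11001"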