Merge pull request #23 from ddlBoJack/dev-mzy
update audio dataset and model
ddlBoJack authored Jan 10, 2024
2 parents e6f4517 + 9623db5 commit 6c5913e
Showing 19 changed files with 2,016 additions and 122 deletions.
103 changes: 103 additions & 0 deletions scripts/finetune_aac_llama.sh
@@ -0,0 +1,103 @@
#!/bin/bash
# export PYTHONPATH=/root/whisper:$PYTHONPATH
export PYTHONPATH=/root/fairseq:$PYTHONPATH
export CUDA_VISIBLE_DEVICES=0
# export CUDA_LAUNCH_BLOCKING=1
export OMP_NUM_THREADS=1

# debug setting for multiple gpus
# export NCCL_DEBUG=INFO
# export NCCL_DEBUG_SUBSYS=ALL
# export TORCH_DISTRIBUTED_DEBUG=INFO

cd /root/SLAM-LLM

# speech_encoder_path=/nfs/zhifu.gzf/ckpt/Whisper/large-v2.pt
# speech_encoder_path=/nfs/maziyang.mzy/models/Whisper/large-v2-qwen.pt
audio_encoder_path=/nfs/maziyang.mzy/models/BEATs/BEATs_iter3_plus_AS2M.pt

llm_path=/nfs/zhifu.gzf/ckpt/Llama-2-7b-hf
# llm_path=/nfs/maziyang.mzy/models/vicuna-13b-v1.5/vicuna-13b-v1.5

output_dir=/nfs/maziyang.mzy/exps/debug

# -m debugpy --listen 5678 --wait-for-client
if [[ $CUDA_VISIBLE_DEVICES != *","* ]]; then
python -m debugpy --listen 5678 --wait-for-client src/llama_recipes/pipeline/finetune.py \
--model_name aac \
--freeze_encoder \
--freeze_llm \
--llm_name llama-2-7b-hf \
--llm_path $llm_path \
--llm_dim 4096 \
--encoder_name beats \
--encoder_ds_rate 2 \
--encoder_path $audio_encoder_path \
--encoder_dim 768 \
--encoder_projector linear \
--encoder_projector_ds_rate 5 \
--dataset audio_dataset \
--audio_dataset.train_data_path /nfs/maziyang.mzy/data/librispeech/librispeech_train_960h.jsonl \
--audio_dataset.val_data_path /nfs/maziyang.mzy/data/librispeech/librispeech_dev_other_filtered.jsonl \
--batching_strategy custom \
--num_epochs 100 \
--batch_size_training 4 \
--val_batch_size 4 \
--num_workers_dataloader 1 \
--lr 1e-4 \
--output_dir $output_dir \
--metric acc \
# --log_file $output_dir/test.log \
# --use_wandb \
# --wandb_dir $output_dir \
# --wandb_entity_name zym22 \
# --wandb_project_name slam-llm \
# --wandb_exp_name test \
# --log_interval 5 \
# --ckpt_path "/nfs/maziyang.mzy/exps/llama-2-hf-finetune-asr-ds5-proj2048-lr1e-5-whisper-lora-prompt/asr/5/model.pt" \
# --peft_ckpt "/nfs/maziyang.mzy/exps/llama-2-hf-finetune-asr-ds5-proj2048-lr1e-5-whisper-lora-prompt/asr/5" \
# --use_peft --peft_method lora \

else
torchrun \
--nnodes 1 \
--nproc_per_node 2 \
src/llama_recipes/pipeline/finetune.py \
--model_name aac \
--freeze_encoder \
--freeze_llm \
--enable_fsdp \
--llm_name llama-2-7b-hf \
--llm_path $llm_path \
--llm_dim 4096 \
--encoder_name beats \
--encoder_ds_rate 2 \
--encoder_path $audio_encoder_path \
--encoder_dim 768 \
--encoder_projector linear \
--encoder_projector_ds_rate 5 \
--dataset audio_dataset \
--audio_dataset.train_data_path /nfs/maziyang.mzy/data/librispeech/librispeech_train_960h.jsonl \
--audio_dataset.val_data_path /nfs/maziyang.mzy/data/librispeech/librispeech_dev_other_filtered.jsonl \
--batching_strategy custom \
--num_epochs 100 \
--batch_size_training 4 \
--val_batch_size 4 \
--num_workers_dataloader 4 \
--lr 1e-4 \
--output_dir $output_dir \
--metric acc \
  --log_file $output_dir/train.log \
--use_wandb \
--wandb_dir $output_dir \
--wandb_entity_name zym22 \
--wandb_project_name slam-llm \
--wandb_exp_name test \
--log_interval 5 \
# --peft_ckpt "/nfs/maziyang.mzy/exps/llama-2-hf-finetune-asr-ds5-proj2048-lr1e-5-whisper-prompt-padding30-20231228/asr/4" \
# --ckpt_path "/nfs/maziyang.mzy/exps/llama-2-hf-finetune-asr-ds5-proj2048-lr1e-5-whisper-prompt-padding30-20231228/asr/4/model.pt" \
# --use_peft --peft_method lora \
fi

# {"key": "1001-134707-0000_ASR", "prompt": "<ASR>", "source": "/cpfs01/shared/Group-speech/beinian.lzr/data/open_data/librispeech_audio/audio/se_librispeech_1001-134707-0000.wav", "target": "1 little recks the laborer. How near his work is holding him to God, The loving laborer through space and time, after all, not to create, only or found only.", "target_len": 157, "source_len": 1581, "text-type": "Transcribe", "audio_language": "en", "text_language": "en", "task-type": "<ASR>"}
# {"key": "1688-142285-0005", "prompt": "<ASR>", "source": "/nfs/beinian.lzr/workspace/datasets/data/16k/opendata/librispeech/test_other/wav/1688-142285-0005.wav", "target": "YOU WHO WERE ALWAYS ACCUSING PEOPLE OF BEING SHOPPY AT HELSTONE", "target_len": 11, "source_len": 220, "text-type": "Transcribe", "audio_language": "en", "text_language": "en", "task-type": "<ASR>"}
23 changes: 10 additions & 13 deletions scripts/finetune_asr_llama.sh
@@ -1,7 +1,7 @@
#!/bin/bash
# export PYTHONPATH=/root/whisper:$PYTHONPATH
export PYTHONPATH=/root/fairseq:$PYTHONPATH
export CUDA_VISIBLE_DEVICES=0,1
export CUDA_VISIBLE_DEVICES=0
# export CUDA_LAUNCH_BLOCKING=1
export OMP_NUM_THREADS=1

@@ -18,7 +18,7 @@ speech_encoder_path=/nfs/zhifu.gzf/ckpt/Whisper/large-v2.pt
llm_path=/nfs/zhifu.gzf/ckpt/Llama-2-7b-hf
# llm_path=/nfs/maziyang.mzy/models/vicuna-13b-v1.5/vicuna-13b-v1.5

output_dir=/nfs/maziyang.mzy/exps/vicuna-13b-v1.5-finetune-asr-ds5-proj2048-lr1e-4-whisper-prompt-padding30-20240106-test
output_dir=/nfs/maziyang.mzy/exps/debug

# -m debugpy --listen 5678 --wait-for-client
if [[ $CUDA_VISIBLE_DEVICES != *","* ]]; then
@@ -35,10 +35,9 @@ python -m debugpy --listen 5678 --wait-for-client src/llama_recipes/pipeline/finetune.py \
--encoder_dim 1280 \
--encoder_projector linear \
--encoder_projector_ds_rate 5 \
--dataset custom_dataset \
--custom_dataset.file src/llama_recipes/datasets/speech_dataset.py:get_audio_dataset \
--custom_dataset.train_data_path /nfs/maziyang.mzy/data/librispeech/librispeech_train_960h.trans.jsonl \
--custom_dataset.val_data_path /nfs/maziyang.mzy/data/librispeech/librispeech_dev_other_filtered.jsonl \
--dataset speech_dataset \
--speech_dataset.train_data_path /nfs/maziyang.mzy/data/librispeech/librispeech_train_960h.jsonl \
--speech_dataset.val_data_path /nfs/maziyang.mzy/data/librispeech/librispeech_dev_other_filtered.jsonl \
--batching_strategy custom \
--num_epochs 100 \
--batch_size_training 4 \
@@ -66,21 +65,19 @@ src/llama_recipes/pipeline/finetune.py \
--model_name asr \
--freeze_encoder \
--freeze_llm \
--use_fp16 \
--enable_fsdp \
--llm_name vicuna-13b-v1.5 \
--llm_name llama-2-7b-hf \
--llm_path $llm_path \
--llm_dim 5120 \
--llm_dim 4096 \
--encoder_name whisper \
--encoder_ds_rate 2 \
--encoder_path $speech_encoder_path \
--encoder_dim 1280 \
--encoder_projector linear \
--encoder_projector_ds_rate 5 \
--dataset custom_dataset \
--custom_dataset.file src/llama_recipes/datasets/speech_dataset.py:get_audio_dataset \
--custom_dataset.train_data_path /nfs/maziyang.mzy/data/librispeech/librispeech_train_960h.trans.jsonl \
--custom_dataset.val_data_path /nfs/maziyang.mzy/data/librispeech/librispeech_dev_other_filtered.jsonl \
--dataset speech_dataset \
--speech_dataset.train_data_path /nfs/maziyang.mzy/data/librispeech/librispeech_train_960h.jsonl \
--speech_dataset.val_data_path /nfs/maziyang.mzy/data/librispeech/librispeech_dev_other_filtered.jsonl \
--batching_strategy custom \
--num_epochs 100 \
--batch_size_training 4 \
14 changes: 6 additions & 8 deletions scripts/finetune_asr_vicuna.sh
@@ -36,10 +36,9 @@ python -m debugpy --listen 5678 --wait-for-client src/llama_recipes/pipeline/finetune.py \
--encoder_dim 1280 \
--encoder_projector linear \
--encoder_projector_ds_rate 5 \
--dataset custom_dataset \
--custom_dataset.file src/llama_recipes/datasets/speech_dataset.py:get_audio_dataset \
--custom_dataset.train_data_path /nfs/maziyang.mzy/data/librispeech/librispeech_train_960h.trans.jsonl \
--custom_dataset.val_data_path /nfs/maziyang.mzy/data/librispeech/librispeech_dev_other_filtered.jsonl \
--dataset speech_dataset \
--speech_dataset.train_data_path /nfs/maziyang.mzy/data/librispeech/librispeech_train_960h.jsonl \
--speech_dataset.val_data_path /nfs/maziyang.mzy/data/librispeech/librispeech_dev_other_filtered.jsonl \
--batching_strategy custom \
--num_epochs 100 \
--batch_size_training 4 \
@@ -78,10 +77,9 @@ src/llama_recipes/pipeline/finetune.py \
--encoder_dim 1280 \
--encoder_projector linear \
--encoder_projector_ds_rate 5 \
--dataset custom_dataset \
--custom_dataset.file src/llama_recipes/datasets/speech_dataset.py:get_audio_dataset \
--custom_dataset.train_data_path /nfs/maziyang.mzy/data/librispeech/librispeech_train_960h.trans.jsonl \
--custom_dataset.val_data_path /nfs/maziyang.mzy/data/librispeech/librispeech_dev_other_filtered.jsonl \
--dataset speech_dataset \
--speech_dataset.train_data_path /nfs/maziyang.mzy/data/librispeech/librispeech_train_960h.jsonl \
--speech_dataset.val_data_path /nfs/maziyang.mzy/data/librispeech/librispeech_dev_other_filtered.jsonl \
--batching_strategy custom \
--num_epochs 100 \
--batch_size_training 8 \
18 changes: 9 additions & 9 deletions scripts/inference_asr_batch.sh
@@ -1,6 +1,6 @@
#!/bin/bash
#export PYTHONPATH=/root/whisper:$PYTHONPATH
export CUDA_VISIBLE_DEVICES=1
export CUDA_VISIBLE_DEVICES=0
# export CUDA_LAUNCH_BLOCKING=1

cd /root/SLAM-LLM
@@ -11,11 +11,11 @@ speech_encoder_path=/nfs/zhifu.gzf/ckpt/Whisper/large-v2.pt
# llm_path=/nfs/zhifu.gzf/ckpt/Llama-2-7b-hf
llm_path=/nfs/maziyang.mzy/models/vicuna-7b-v1.5

output_dir=nfs/maziyang.mzy/exps/vicuna-7b-v1.5-finetune-asr-ds5-proj2048-lr1e-4-whisper-prompt-padding30-20240106
ckpt_path=/nfs/maziyang.mzy/exps/vicuna-7b-v1.5-finetune-asr-ds5-proj2048-lr1e-4-whisper-prompt-padding30-20240106/asr/2
output_dir=/nfs/maziyang.mzy/exps/vicuna-7b-v1.5-finetune-asr-ds5-proj2048-lr1e-4-whisper-prompt-padding0-20240107
ckpt_path=$output_dir/asr/2
# peft_ckpt=/nfs/maziyang.mzy/exps/llama-2-hf-finetune-asr-ds5-proj2048-lr1e-4-whisper-lora-prompt-paddinglr-20240102/asr/4
val_data_path=/nfs/maziyang.mzy/data/librispeech/librispeech_test_other_filtered.jsonl
decode_log=$ckpt_path/decode_log_test_other_bs4_beam4_repetition_penalty1
val_data_path=/nfs/maziyang.mzy/data/librispeech/librispeech_test_clean_filtered.jsonl
decode_log=$ckpt_path/decode_log_test_clean_bs8_beam4_repetition_penalty1

# -m debugpy --listen 5678 --wait-for-client
python src/llama_recipes/pipeline/inference_batch.py \
@@ -30,12 +30,12 @@ python src/llama_recipes/pipeline/inference_batch.py \
--encoder_dim 1280 \
--encoder_projector linear \
--encoder_projector_ds_rate 5 \
--dataset custom_dataset \
--custom_dataset.file src/llama_recipes/datasets/speech_dataset_inference.py:get_audio_dataset \
--custom_dataset.val_data_path $val_data_path \
--dataset speech_dataset \
--speech_dataset.file src/llama_recipes/datasets/speech_dataset_inference.py:get_audio_dataset \
--speech_dataset.val_data_path $val_data_path \
--batching_strategy custom \
--num_epochs 1 \
--val_batch_size 4 \
--val_batch_size 8 \
--num_workers_dataloader 4 \
--output_dir $output_dir \
--ckpt_path $ckpt_path/model.pt \
22 changes: 19 additions & 3 deletions src/llama_recipes/configs/datasets.py
@@ -27,9 +27,9 @@ class alpaca_dataset:


@dataclass
class custom_dataset:
dataset: str = "custom_dataset"
file: str = "examples/custom_dataset.py"
class speech_dataset:
dataset: str = "speech_dataset"
file: str = "src/llama_recipes/datasets/speech_dataset.py:get_speech_dataset"
train_split: str = "train"
test_split: str = "validation"
data_path: str = None
@@ -41,6 +41,22 @@ class custom_dataset:
fix_length_audio: int = -1


@dataclass
class audio_dataset:
dataset: str = "audio_dataset"
file: str = "src/llama_recipes/datasets/audio_dataset.py:get_audio_dataset"
train_split: str = "train"
test_split: str = "validation"
data_path: str = None
fbank_mean: float = 15.41663
fbank_std: float = 6.55582
max_words: int = None
train_data_path: str = None
val_data_path: str = None
max_words: int = None
max_mel: int = None
fix_length_audio: int = -1


@dataclass
class avsr_dataset:
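Both new config classes point their file field at a "path/to/file.py:function_name" spec. A minimal sketch of how such a spec could be resolved to a dataset factory (generic importlib-based Python; the repository's actual loader and the factory signature are assumptions):

import importlib.util
from pathlib import Path

def load_dataset_factory(spec):
    """Resolve a "path/to/file.py:function_name" spec to a callable."""
    file_path, _, func_name = spec.partition(":")
    module_spec = importlib.util.spec_from_file_location(Path(file_path).stem, file_path)
    module = importlib.util.module_from_spec(module_spec)
    module_spec.loader.exec_module(module)
    return getattr(module, func_name)

# Hypothetical usage, mirroring the defaults above:
# get_audio_dataset = load_dataset_factory(
#     "src/llama_recipes/datasets/audio_dataset.py:get_audio_dataset")
# train_set = get_audio_dataset(dataset_config, tokenizer, split="train")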
