-
Notifications
You must be signed in to change notification settings - Fork 105
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #169 from texttron/tevatron-v2
tevatron-v2 update: unified toolkit across scale, language and modality
- Loading branch information
Showing
24 changed files
with
1,501 additions
and
97 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
{ | ||
"zero_optimization": { | ||
"stage": 0, | ||
"offload_optimizer": { | ||
"device": "none", | ||
"pin_memory": true | ||
}, | ||
"offload_param": { | ||
"device": "none", | ||
"pin_memory": true | ||
}, | ||
"overlap_comm": true, | ||
"contiguous_gradients": true, | ||
"sub_group_size": 1e9, | ||
"reduce_bucket_size": 1e6, | ||
"stage3_prefetch_bucket_size": "auto", | ||
"stage3_param_persistence_threshold": "auto", | ||
"stage3_max_live_parameters": 1e9, | ||
"stage3_max_reuse_distance": 1e9, | ||
"stage3_gather_16bit_weights_on_model_save": true | ||
}, | ||
"fp16": { | ||
"enabled": "auto", | ||
"loss_scale": 0, | ||
"initial_scale_power": 10, | ||
"loss_scale_window": 1000, | ||
"hysteresis": 2, | ||
"min_loss_scale": 1 | ||
}, | ||
"bf16": { | ||
"enabled": "auto", | ||
"loss_scale": 0, | ||
"initial_scale_power": 10, | ||
"loss_scale_window": 1000, | ||
"hysteresis": 2, | ||
"min_loss_scale": 1 | ||
}, | ||
"optimizer": { | ||
"type": "AdamW", | ||
"params": { | ||
"lr": "auto", | ||
"betas": "auto", | ||
"eps": "auto", | ||
"weight_decay": "auto", | ||
"torch_adam": true | ||
} | ||
}, | ||
|
||
"scheduler": { | ||
"type": "WarmupDecayLR", | ||
"params": { | ||
"warmup_min_lr": "auto", | ||
"warmup_max_lr": "auto", | ||
"warmup_num_steps": "auto", | ||
"total_num_steps": "auto" | ||
} | ||
}, | ||
|
||
"gradient_accumulation_steps": "auto", | ||
"gradient_clipping": "auto", | ||
"steps_per_print": 1000, | ||
"train_batch_size": "auto", | ||
"train_micro_batch_size_per_gpu": "auto", | ||
"wall_clock_breakdown": false | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,205 @@ | ||
# Unified Multi-modal and Multilingual Retrieval | ||
|
||
## Train | ||
```bash | ||
deepspeed --include localhost:0,1,2,3,4,5,6,7 --master_port 60000 --module tevatron.retriever.driver.train_mm \ | ||
--deepspeed deepspeed/ds_zero0_config.json \ | ||
--output_dir retriever-qwen25vl-bge-pixmo-colpali-wiki \ | ||
--model_name_or_path Qwen/Qwen2.5-VL-3B-Instruct \ | ||
--lora \ | ||
--lora_target_modules q_proj,k_proj,v_proj,o_proj,gate_proj,up_proj,down_proj \ | ||
--save_steps 500 \ | ||
--train_yaml dataset_config.yaml \ | ||
--query_prefix "Query: " \ | ||
--passage_prefix "" \ | ||
--bf16 \ | ||
--tf32 True \ | ||
--pooling eos \ | ||
--append_eos_token \ | ||
--normalize \ | ||
--temperature 0.02 \ | ||
--per_device_train_batch_size 16 \ | ||
--gradient_checkpointing \ | ||
--train_group_size 4 \ | ||
--learning_rate 1e-4 \ | ||
--query_max_len 512 \ | ||
--passage_max_len 512 \ | ||
--num_train_epochs 1 \ | ||
--logging_steps 1 \ | ||
--overwrite_output_dir \ | ||
--gradient_accumulation_steps 2 \ | ||
--warmup_ratio 0.005 \ | ||
--report_to wandb \ | ||
--dataloader_num_workers 4 | ||
``` | ||
|
||
## Inference and evaluation | ||
|
||
### BEIR (textual modality) | ||
|
||
#### Query Encode | ||
```bash | ||
|
||
CKPT=retriever-qwen25vl-bge-pixmo-colpali-wiki | ||
DATASET=scifact | ||
|
||
mkdir -p beir_embedding/${CKPT}/${DATASET} | ||
CUDA_VISIBLE_DEVICES=0 python -m tevatron.retriever.driver.encode_mm \ | ||
--output_dir=temp \ | ||
--model_name_or_path Qwen/Qwen2.5-VL-3B-Instruct \ | ||
--lora_name_or_path ${CKPT} \ | ||
--lora \ | ||
--bf16 \ | ||
--per_device_eval_batch_size 16 \ | ||
--normalize \ | ||
--pooling last \ | ||
--query_prefix "Query: " \ | ||
--passage_prefix "" \ | ||
--append_eos_token \ | ||
--query_max_len 512 \ | ||
--dataset_name Tevatron/beir \ | ||
--dataset_config ${DATASET} \ | ||
--dataset_split test \ | ||
--encode_output_path beir_embedding/${CKPT}/${DATASET}/queries.pkl \ | ||
--encode_is_query | ||
``` | ||
|
||
#### Document Encode | ||
```bash | ||
for s in 0 1 2 3; | ||
do | ||
CUDA_VISIBLE_DEVICES=$s python -m tevatron.retriever.driver.encode_mm \ | ||
--output_dir=temp \ | ||
--model_name_or_path Qwen/Qwen2.5-VL-3B-Instruct \ | ||
--lora_name_or_path ${CKPT} \ | ||
--lora \ | ||
--bf16 \ | ||
--per_device_eval_batch_size 16 \ | ||
--normalize \ | ||
--pooling last \ | ||
--passage_prefix "" \ | ||
--append_eos_token \ | ||
--passage_max_len 512 \ | ||
--dataset_name Tevatron/beir-corpus \ | ||
--dataset_config ${DATASET} \ | ||
--dataset_split train \ | ||
--encode_output_path beir_embedding/${CKPT}/${DATASET}/corpus.${s}.pkl \ | ||
--dataset_number_of_shards 4 \ | ||
--dataset_shard_index ${s} & | ||
done | ||
wait | ||
``` | ||
|
||
|
||
|
||
#### Search | ||
```bash | ||
mkdir -p beir_results/${CKPT}/${DATASET} | ||
python -m tevatron.retriever.driver.search \ | ||
--query_reps beir_embedding/${CKPT}/${DATASET}/queries.pkl \ | ||
--passage_reps beir_embedding/${CKPT}/${DATASET}/'corpus.*.pkl' \ | ||
--depth 100 \ | ||
--batch_size 64 \ | ||
--save_text \ | ||
--save_ranking_to beir_results/${CKPT}/${DATASET}/rank.scifact.txt | ||
``` | ||
|
||
#### Evaluate | ||
```bash | ||
python -m tevatron.utils.format.convert_result_to_trec \ | ||
--input beir_results/${CKPT}/${DATASET}/rank.scifact.txt \ | ||
--output beir_results/${CKPT}/${DATASET}/rank.scifact.trec \ | ||
--remove_query | ||
|
||
python -m pyserini.eval.trec_eval -c -mrecall.100 -mndcg_cut.10 beir-v1.0.0-${DATASET}-test \ | ||
beir_results/${CKPT}/${DATASET}/rank.scifact.trec | ||
``` | ||
|
||
### MIRACL (Multi-Lingual, Textual Modality) | ||
#### Query Encode | ||
```bash | ||
|
||
CKPT=retriever-qwen25vl-bge-pixmo-colpali-wiki | ||
DATASET=ar | ||
|
||
mkdir -p miracl_embedding/${CKPT}/${DATASET} | ||
CUDA_VISIBLE_DEVICES=0 python -m tevatron.retriever.driver.encode_mm \ | ||
--output_dir=temp \ | ||
--model_name_or_path Qwen/Qwen2.5-VL-3B-Instruct \ | ||
--lora_name_or_path ${CKPT} \ | ||
--lora \ | ||
--bf16 \ | ||
--per_device_eval_batch_size 16 \ | ||
--normalize \ | ||
--pooling last \ | ||
--query_prefix "Query: " \ | ||
--passage_prefix "" \ | ||
--append_eos_token \ | ||
--query_max_len 512 \ | ||
--dataset_name miracl/miracl \ | ||
--dataset_config $DATASET \ | ||
--dataset_split dev \ | ||
--encode_output_path miracl_embedding/${CKPT}/${DATASET}/queries.pkl \ | ||
--encode_is_query | ||
``` | ||
|
||
#### Document Encode | ||
```bash | ||
for s in 0 1 2 3; | ||
do | ||
CUDA_VISIBLE_DEVICES=$s python -m tevatron.retriever.driver.encode_mm \ | ||
--output_dir=temp \ | ||
--model_name_or_path Qwen/Qwen2.5-VL-3B-Instruct \ | ||
--lora_name_or_path ${CKPT} \ | ||
--lora \ | ||
--bf16 \ | ||
--per_device_eval_batch_size 16 \ | ||
--normalize \ | ||
--pooling last \ | ||
--passage_prefix "" \ | ||
--append_eos_token \ | ||
--passage_max_len 512 \ | ||
--dataset_name miracl/miracl-corpus \ | ||
--dataset_config ${DATASET} \ | ||
--dataset_split train \ | ||
--encode_output_path miracl_embedding/${CKPT}/${DATASET}/corpus.${s}.pkl \ | ||
--dataset_number_of_shards 4 \ | ||
--dataset_shard_index ${s} & | ||
done | ||
wait | ||
``` | ||
|
||
|
||
|
||
#### Search | ||
```bash | ||
mkdir -p miracl_results/${CKPT}/${DATASET} | ||
python -m tevatron.retriever.driver.search \ | ||
--query_reps miracl_embedding/${CKPT}/${DATASET}/queries.pkl \ | ||
--passage_reps miracl_embedding/${CKPT}/${DATASET}/'corpus.*.pkl' \ | ||
--depth 100 \ | ||
--batch_size 64 \ | ||
--save_text \ | ||
--save_ranking_to miracl_results/${CKPT}/${DATASET}/rank.${DATASET}.txt | ||
``` | ||
|
||
#### Evaluate | ||
```bash | ||
python -m tevatron.utils.format.convert_result_to_trec \ | ||
--input miracl_results/${CKPT}/${DATASET}/rank.${DATASET}.txt \ | ||
--output miracl_results/${CKPT}/${DATASET}/rank.${DATASET}.trec | ||
|
||
python -m pyserini.eval.trec_eval -c -mrecall.100 -mndcg_cut.10 miracl-v1.0-${DATASET}-dev \ | ||
miracl_results/${CKPT}/${DATASET}/rank.${DATASET}.trec | ||
``` | ||
|
||
### VIDORE Document screenshot retrieval (Cross modality) | ||
```bash | ||
CUDA_VISIBLE_DEVICES=0 python eval_vidore.py \ | ||
--model_name_or_path Qwen/Qwen2.5-VL-3B-Instruct \ | ||
--lora_name_or_path ${CKPT} \ | ||
--batch_size 4 \ | ||
--pooling last \ | ||
--normalize \ | ||
--query_prefix "Query: " | ||
``` |
Oops, something went wrong.