From f8acbecf73f32f6ad641d4ce338aeb04a2d093d9 Mon Sep 17 00:00:00 2001 From: Samantha-Zhan Date: Fri, 7 Feb 2025 22:26:48 -0500 Subject: [PATCH 01/16] added dataset transformation scripts for Colpali and Msmarco --- .../colpali_corpus_transformer.py | 42 ++++++++++++++++ .../colpali_transformer.py | 48 +++++++++++++++++++ .../msmarco_corpus_transformer.py | 29 +++++++++++ .../msmarco_transformer.py | 47 ++++++++++++++++++ 4 files changed, 166 insertions(+) create mode 100644 scripts/dataset_transform_scripts/colpali_corpus_transformer.py create mode 100644 scripts/dataset_transform_scripts/colpali_transformer.py create mode 100644 scripts/dataset_transform_scripts/msmarco_corpus_transformer.py create mode 100644 scripts/dataset_transform_scripts/msmarco_transformer.py diff --git a/scripts/dataset_transform_scripts/colpali_corpus_transformer.py b/scripts/dataset_transform_scripts/colpali_corpus_transformer.py new file mode 100644 index 00000000..20b042ce --- /dev/null +++ b/scripts/dataset_transform_scripts/colpali_corpus_transformer.py @@ -0,0 +1,42 @@ +from datasets import load_dataset, load_dataset_builder, Image, DatasetDict, Value, Sequence + + +def loadDatasets(): + # available splits: ['train', 'test'] + ds = load_dataset("vidore/colpali_train_set") + return ds + +def transformPassages(entry, indices): + num_row = len(entry["query"]) + return { + "docid": [str(i) for i in indices], + "text": [None] * num_row, + "source": ["colpali:"+str(source) for source in entry['image_filename']], + } + +def transformDataset(ds): + trans_ds = ds.map(transformPassages, batched=True, with_indices=True, num_proc=8) + + # update column attribute types + trans_ds = trans_ds.cast_column("text", Value("string")) + + # reorder columns + return trans_ds.select_columns(['docid', 'image', 'text', 'source']) + +def uploadDataset(new_dsDict): + new_dsDict.push_to_hub("SamanthaZJQ/colpali-passage-corpus-2.0") + +def main(): + dsDict = loadDatasets() + print(dsDict) + dsDict = {split: 
transformDataset(dsDict[split]) for split in dsDict} + print(dsDict["train"][0]) + # perform dataset update + uploadDataset(DatasetDict(dsDict)) + # verify feature + print("-------------------") + print(load_dataset_builder("SamanthaZJQ/colpali-passage-corpus-2.0").info.features) + + +if __name__=="__main__": + main() \ No newline at end of file diff --git a/scripts/dataset_transform_scripts/colpali_transformer.py b/scripts/dataset_transform_scripts/colpali_transformer.py new file mode 100644 index 00000000..9a0936d0 --- /dev/null +++ b/scripts/dataset_transform_scripts/colpali_transformer.py @@ -0,0 +1,48 @@ +from datasets import load_dataset, load_dataset_builder, Image, DatasetDict, Value, Sequence + + +def loadDatasets(): + # available splits: ['train', 'test'] + ds = load_dataset("vidore/colpali_train_set") + return ds + +def transformPassages(entry, indices): + num_row = len(entry["query"]) + return { + # transform positive/negative document to id + "positive_document_ids": [[str(i)] for i in indices], + "negative_document_ids": [[] for i in range(num_row)], + # add attributes + "query_id": [str(i) for i in indices], + "query_image": [None] * num_row, + "source": ["colpali: "+str(source) for source in entry['source']], + } + +def transformDataset(ds): + trans_ds = ds.map(transformPassages, remove_columns=["options", "page", "model", "prompt", "answer_type", "image", "image_filename"], batched=True, with_indices=True, num_proc=8) + + # rename attributes + trans_ds = trans_ds.rename_column("query", "query_text") + # update column attribute types + trans_ds = trans_ds.cast_column("query_image", Image()).cast_column("positive_document_ids", Sequence(Value("string"))).cast_column("negative_document_ids", Sequence(Value("string"))) + + # reorder columns + return trans_ds.select_columns(['query_id', 'query_text', 'query_image', 'positive_document_ids', 'negative_document_ids', 'answer', 'source']) + +def uploadDataset(new_dsDict): + 
new_dsDict.push_to_hub("SamanthaZJQ/colpali-passage-2.0") + +def main(): + dsDict = loadDatasets() + print(dsDict) + dsDict = {split: transformDataset(dsDict[split]) for split in dsDict} + print(dsDict["train"][0]) + # perform dataset update + uploadDataset(DatasetDict(dsDict)) + # verify feature + print("-------------------") + print(load_dataset_builder("SamanthaZJQ/colpali-passage-2.0").info.features) + + +if __name__=="__main__": + main() \ No newline at end of file diff --git a/scripts/dataset_transform_scripts/msmarco_corpus_transformer.py b/scripts/dataset_transform_scripts/msmarco_corpus_transformer.py new file mode 100644 index 00000000..1d4cb492 --- /dev/null +++ b/scripts/dataset_transform_scripts/msmarco_corpus_transformer.py @@ -0,0 +1,29 @@ +from datasets import load_dataset, load_dataset_builder, Image, DatasetDict + +def loadDatasets(): + # available splits: ['train'] + dsDict = load_dataset("Tevatron/msmarco-passage-corpus") + return dsDict + +def transformDataset(ds): + new_image_column = [None] * len(ds) + new_source_column = ["msmarco"] * len(ds) + trans_ds = ds.add_column("image", new_image_column).add_column("source", new_source_column) + return trans_ds.cast_column("image", Image()) + +def uploadDataset(new_dsDict): + new_dsDict.push_to_hub("SamanthaZJQ/msmarco-passage-corpus-2.0") + +def main(): + dsDict = loadDatasets() + print(dsDict) + dsDict = {split: transformDataset(dsDict[split]) for split in dsDict} + print(dsDict["train"][0]) + # perform dataset update + uploadDataset(DatasetDict(dsDict)) + # verify feature + print(load_dataset_builder("SamanthaZJQ/msmarco-passage-corpus-2.0").info.features) + + +if __name__=="__main__": + main() diff --git a/scripts/dataset_transform_scripts/msmarco_transformer.py b/scripts/dataset_transform_scripts/msmarco_transformer.py new file mode 100644 index 00000000..f2618134 --- /dev/null +++ b/scripts/dataset_transform_scripts/msmarco_transformer.py @@ -0,0 +1,47 @@ +from datasets import load_dataset, 
load_dataset_builder, Image, DatasetDict, Value, Sequence + +def loadDatasets(): + # available splits: ['train', 'dev', 'dl19', 'dl20'] + dsDict = load_dataset("Tevatron/msmarco-passage-aug") + return dsDict + + +def transformPassages(entry): + return { + # transform positive/negative document to id + "positive_document_ids": [[passage['docid'] for passage in passages] for passages in entry['positive_passages']], + "negative_document_ids": [[passage['docid'] for passage in passages] for passages in entry['negative_passages']], + # add query_image, source attributes, and answer + "query_image": [None] * len(entry["query"]), + "source": ["msmarco"] * len(entry["query"]), + "answer": [None] * len(entry["query"]), + } + +def transformDataset(ds): + # convert document_ids to store a list of string docid + trans_ds = ds.map(transformPassages, remove_columns=["positive_passages", "negative_passages"], batched=True, num_proc=8) + + # rename attributes + trans_ds = trans_ds.rename_column("query", "query_text") + # update old column attribute types + trans_ds = trans_ds.cast_column("answer", Value("string")).cast_column("query_image", Image()).cast_column("positive_document_ids", Sequence(Value("string"))).cast_column("negative_document_ids", Sequence(Value("string"))) + + # reorder columns + return trans_ds.select_columns(['query_id', 'query_text', 'query_image', 'positive_document_ids', 'negative_document_ids', 'answer', 'source']) + +def uploadDataset(new_dsDict): + new_dsDict.push_to_hub("SamanthaZJQ/msmarco-passage-aug-2.0") + +def main(): + dsDict = loadDatasets() + print(dsDict) + dsDict = {split: transformDataset(dsDict[split]) for split in dsDict} + # # perform dataset update + uploadDataset(DatasetDict(dsDict)) + # # verify feature + print("-------------------") + print(load_dataset_builder("SamanthaZJQ/msmarco-passage-aug-2.0").info.features) + + +if __name__=="__main__": + main() \ No newline at end of file From ce07901c1b9d6439c7eff3b6dc1d76b0fc1d49ab Mon Sep 17 
00:00:00 2001 From: Xueguang Ma Date: Sat, 8 Feb 2025 17:12:48 +0000 Subject: [PATCH 02/16] update for training --- deepspeed/ds_zero0_config.json | 65 ++++++++++++ src/tevatron/retriever/arguments.py | 18 +++- src/tevatron/retriever/collator.py | 105 +++++++++++++++++++- src/tevatron/retriever/dataset.py | 76 +++++++------- src/tevatron/retriever/driver/train_mm.py | 99 ++++++++++++++++++ src/tevatron/retriever/modeling/__init__.py | 2 +- src/tevatron/retriever/modeling/dense.py | 30 +++++- src/tevatron/retriever/modeling/encoder.py | 2 +- 8 files changed, 352 insertions(+), 45 deletions(-) create mode 100644 deepspeed/ds_zero0_config.json create mode 100644 src/tevatron/retriever/driver/train_mm.py diff --git a/deepspeed/ds_zero0_config.json b/deepspeed/ds_zero0_config.json new file mode 100644 index 00000000..6d1732f4 --- /dev/null +++ b/deepspeed/ds_zero0_config.json @@ -0,0 +1,65 @@ +{ + "zero_optimization": { + "stage": 0, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1e9, + "reduce_bucket_size": 1e6, + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1e9, + "stage3_max_reuse_distance": 1e9, + "stage3_gather_16bit_weights_on_model_save": true + }, + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "initial_scale_power": 10, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto", + "loss_scale": 0, + "initial_scale_power": 10, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "optimizer": { + "type": "AdamW", + "params": { + "lr": "auto", + "betas": "auto", + "eps": "auto", + "weight_decay": "auto", + "torch_adam": true + } + }, + + "scheduler": { + "type": "WarmupDecayLR", + "params": { + "warmup_min_lr": "auto", + "warmup_max_lr": "auto", + 
"warmup_num_steps": "auto", + "total_num_steps": "auto" + } + }, + + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 1000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false +} diff --git a/src/tevatron/retriever/arguments.py b/src/tevatron/retriever/arguments.py index def20db0..09b3e62b 100644 --- a/src/tevatron/retriever/arguments.py +++ b/src/tevatron/retriever/arguments.py @@ -43,7 +43,7 @@ class ModelArguments: ) lora_r: int = field( - default=8, + default=16, metadata={"help": "lora r"} ) @@ -94,6 +94,22 @@ class DataArguments: default=None, metadata={"help": "Where do you want to store the data downloaded from huggingface"} ) + corpus_name: str = field( + default='json', metadata={"help": "huggingface dataset name for corpus"} + ) + + corpus_config: str = field( + default=None, metadata={"help": "huggingface dataset config for corpus, useful for datasets with sub-datasets"} + ) + + corpus_path: str = field( + default=None, metadata={"help": "Path to local corpus files or directory"} + ) + + corpus_split: str = field( + default='train', metadata={"help": "corpus split"} + ) + dataset_number_of_shards: int = field( default=1, metadata={"help": "number of shards to split the dataset into"} ) diff --git a/src/tevatron/retriever/collator.py b/src/tevatron/retriever/collator.py index 076b3d64..6b0ea64f 100644 --- a/src/tevatron/retriever/collator.py +++ b/src/tevatron/retriever/collator.py @@ -1,7 +1,10 @@ import logging from typing import List, Tuple from dataclasses import dataclass -from transformers import PreTrainedTokenizer +from transformers import PreTrainedTokenizer, ProcessorMixin +from qwen_vl_utils import process_vision_info +from PIL import Image + from tevatron.retriever.arguments import DataArguments @@ -10,6 +13,9 @@ @dataclass class TrainCollator: + """ + simple collator for text only data. 
+ """ data_args: DataArguments tokenizer: PreTrainedTokenizer @@ -23,6 +29,8 @@ def __call__(self, features: List[Tuple[str, List[str]]]): all_passages = [] for f in features: all_passages.extend(f[1]) + all_queries = [q[0] for q in all_queries] + all_passages = [p[0] for p in all_passages] q_collated = self.tokenizer( all_queries, padding=False, @@ -63,6 +71,101 @@ def __call__(self, features: List[Tuple[str, List[str]]]): return q_collated, d_collated +@dataclass +class MultiModalTrainCollator: + """ + collator for text-visual data. + """ + data_args: DataArguments + processor: ProcessorMixin + + def __call__(self, features): + """ + Collate function for training. + :param features: list of (query, passages) tuples + :return: prepared model inputs + """ + all_queries = [f[0] for f in features] + all_passages = [] + for f in features: + all_passages.extend(f[1]) + + query_messages = [] + for query in all_queries: + text = query[0] + image = query[1] + content = [] + if text: + text = self.processor.tokenizer.decode( + self.processor.tokenizer.encode(text, max_length=self.data_args.query_max_len, truncation=True) + ) + content.append({'type': 'text', 'text': text}) + if image: + content.append({'type': 'image', 'image': image, 'resized_height': 784, 'resized_width': 784}) + message = [ + { + 'role': 'user', + 'content': content + } + ] + query_messages.append(message) + + passage_messages = [] + for idx in range(len(all_passages)): + text = all_passages[idx][0] + image = all_passages[idx][1] + content = [] + if text: + text = self.processor.tokenizer.decode( + self.processor.tokenizer.encode(text, max_length=self.data_args.passage_max_len, truncation=True) + ) + content.append({'type': 'text', 'text': text}) + if image: + content.append({'type': 'image', 'image': image, 'resized_height': 784, 'resized_width': 784}) + message = [ + { + 'role': 'user', + 'content': content + } + ] + passage_messages.append(message) + + query_texts = [ + 
self.processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=False) + for msg in query_messages + ] + + passage_texts = [ + self.processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=False) + for msg in passage_messages + ] + + if self.data_args.append_eos_token: + # should already have a eos token so not very necessary to have additional one. + query_texts = [x + '<|endoftext|>' for x in query_texts] + passage_texts = [x + '<|endoftext|>' for x in passage_texts] + + + query_image_inputs, query_video_inputs = process_vision_info(query_messages) + passage_image_inputs, passage_video_inputs = process_vision_info(passage_messages) + + query_inputs = self.processor( + text=query_texts, + images=query_image_inputs, + videos=query_video_inputs, + return_tensors="pt", + padding="longest", + ) + + passage_inputs = self.processor( + text=passage_texts, + images=passage_image_inputs, + videos=passage_video_inputs, + return_tensors="pt", + padding="longest", + ) + return query_inputs, passage_inputs + @dataclass class EncodeCollator: data_args: DataArguments diff --git a/src/tevatron/retriever/dataset.py b/src/tevatron/retriever/dataset.py index fd666b94..df94c419 100644 --- a/src/tevatron/retriever/dataset.py +++ b/src/tevatron/retriever/dataset.py @@ -4,20 +4,12 @@ from datasets import load_dataset from torch.utils.data import Dataset - +from PIL import Image from tevatron.retriever.arguments import DataArguments import logging logger = logging.getLogger(__name__) - -def format_query(query: str, prefix: str = '') -> str: - return f'{prefix}{query.strip()}'.strip() - -def format_passage(text: str, title: str = '', prefix: str = '') -> str: - return f'{prefix}{title.strip()} {text.strip()}'.strip() - - class TrainDataset(Dataset): def __init__(self, data_args: DataArguments, trainer = None): self.data_args = data_args @@ -28,54 +20,62 @@ def __init__(self, data_args: DataArguments, trainer = None): split=self.data_args.dataset_split, 
cache_dir=self.data_args.dataset_cache_dir, ) - if self.data_args.dataset_number_of_shards > 1: - self.encode_data = self.encode_data.shard( - num_shards=self.data_args.dataset_number_of_shards, - index=self.data_args.dataset_shard_index, - ) + self.corpus = load_dataset( + self.data_args.corpus_name, + self.data_args.corpus_config, + data_files=self.data_args.corpus_path, + split=self.data_args.corpus_split, + cache_dir=self.data_args.dataset_cache_dir, + ) self.trainer = trainer def __len__(self): return len(self.train_data) + + def _get_info_from_docid(self, docid, prefix): + document_info = self.corpus[int(docid)] + assert int(document_info['docid']) == int(docid) + image = None if 'image' not in document_info else document_info['image'] + image = image.convert('RGB') + text = None if 'text' not in document_info else document_info['text'] + text = '' if text is None else text + return prefix + text, image def __getitem__(self, item) -> Tuple[str, List[str]]: group = self.train_data[item] epoch = int(self.trainer.state.epoch) _hashed_seed = hash(item + self.trainer.args.seed) + query_id = group['query_id'] + query_text = group['query_text'] + query_text = '' if query_text is None else query_text + query_image = group['query_image'] + positive_document_ids = group['positive_document_ids'] + negative_document_ids = group['negative_document_ids'] - query = group['query'] - group_positives = group['positive_passages'] - group_negatives = group['negative_passages'] - - formated_query = format_query(query, self.data_args.query_prefix) - formated_passages = [] + formated_query = (self.data_args.query_prefix + query_text, query_image) + formated_documents = [] - if self.data_args.positive_passage_no_shuffle: - pos_psg = group_positives[0] - else: - pos_psg = group_positives[(_hashed_seed + epoch) % len(group_positives)] + selected_positive_document_id = positive_document_ids[(_hashed_seed + epoch) % len(positive_document_ids)] - 
formated_passages.append(format_passage(pos_psg['text'], pos_psg['title'], self.data_args.passage_prefix)) + formated_documents.append(self._get_info_from_docid(selected_positive_document_id, self.data_args.passage_prefix)) negative_size = self.data_args.train_group_size - 1 - if len(group_negatives) < negative_size: - negs = random.choices(group_negatives, k=negative_size) + if len(negative_document_ids) < negative_size: + selected_negative_document_ids = random.choices(negative_document_ids, k=negative_size) elif self.data_args.train_group_size == 1: - negs = [] - elif self.data_args.negative_passage_no_shuffle: - negs = group_negatives[:negative_size] + selected_negative_document_ids = [] else: - _offset = epoch * negative_size % len(group_negatives) - negs = [x for x in group_negatives] - random.Random(_hashed_seed).shuffle(negs) - negs = negs * 2 - negs = negs[_offset: _offset + negative_size] + _offset = epoch * negative_size % len(negative_document_ids) + selected_negative_document_ids = [x for x in negative_document_ids] + random.Random(_hashed_seed).shuffle(selected_negative_document_ids) + selected_negative_document_ids = selected_negative_document_ids * 2 + selected_negative_document_ids = selected_negative_document_ids[_offset: _offset + negative_size] - for neg_psg in negs: - formated_passages.append(format_passage(neg_psg['text'], neg_psg['title'], self.data_args.passage_prefix)) + for negative_document_id in selected_negative_document_ids: + formated_documents.append(self._get_info_from_docid(negative_document_id, self.data_args.passage_prefix)) - return formated_query, formated_passages + return formated_query, formated_documents class EncodeDataset(Dataset): diff --git a/src/tevatron/retriever/driver/train_mm.py b/src/tevatron/retriever/driver/train_mm.py new file mode 100644 index 00000000..e5c9e831 --- /dev/null +++ b/src/tevatron/retriever/driver/train_mm.py @@ -0,0 +1,99 @@ +import logging +import os +import sys + +from transformers import 
AutoProcessor +from transformers import ( + HfArgumentParser, + set_seed, +) +from transformers.trainer_utils import get_last_checkpoint + +from tevatron.retriever.arguments import ModelArguments, DataArguments, \ + TevatronTrainingArguments as TrainingArguments +from tevatron.retriever.dataset import TrainDataset +from tevatron.retriever.collator import MultiModalTrainCollator +from tevatron.retriever.modeling import MultiModalDenseModel +from tevatron.retriever.trainer import TevatronTrainer + +logger = logging.getLogger(__name__) + + +def main(): + parser = HfArgumentParser((ModelArguments, DataArguments, TrainingArguments)) + + if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): + model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) + else: + model_args, data_args, training_args = parser.parse_args_into_dataclasses() + model_args: ModelArguments + data_args: DataArguments + training_args: TrainingArguments + + if ( + os.path.exists(training_args.output_dir) + and os.listdir(training_args.output_dir) + and training_args.do_train + and not training_args.overwrite_output_dir + ): + raise ValueError( + f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome." 
+        )
+
+    # Setup logging
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
+    )
+    logger.warning(
+        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
+        training_args.local_rank,
+        training_args.device,
+        training_args.n_gpu,
+        bool(training_args.local_rank != -1),
+        training_args.fp16,
+    )
+    logger.info("Training/evaluation parameters %s", training_args)
+    logger.info("MODEL parameters %s", model_args)
+
+    set_seed(training_args.seed)
+
+    processor = AutoProcessor.from_pretrained(
+        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
+        cache_dir=model_args.cache_dir,
+        trust_remote_code=True,
+    )
+    if processor.tokenizer.pad_token_id is None:
+        processor.tokenizer.pad_token_id = processor.tokenizer.eos_token_id
+    processor.tokenizer.padding_side = "left"
+
+    model = MultiModalDenseModel.build(
+        model_args,
+        training_args,
+        cache_dir=model_args.cache_dir,
+    )
+
+    train_dataset = TrainDataset(data_args)
+    collator = MultiModalTrainCollator(data_args, processor)
+
+    trainer = TevatronTrainer(
+        model=model,
+        args=training_args,
+        train_dataset=train_dataset,
+        data_collator=collator
+    )
+    train_dataset.trainer = trainer
+
+    last_checkpoint = None
+    if os.path.isdir(training_args.output_dir):
+        last_checkpoint = get_last_checkpoint(training_args.output_dir)
+
+    trainer.train(resume_from_checkpoint=(last_checkpoint is not None))
+    trainer.save_model()
+    if trainer.is_world_process_zero():
+        processor.save_pretrained(training_args.output_dir)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/tevatron/retriever/modeling/__init__.py b/src/tevatron/retriever/modeling/__init__.py
index a48d9543..852ce614 100644
--- a/src/tevatron/retriever/modeling/__init__.py
+++ b/src/tevatron/retriever/modeling/__init__.py
@@ -1,4 +1,4 @@
 from .encoder import 
EncoderModel, EncoderOutput -from .dense import DenseModel +from .dense import DenseModel, MultiModalDenseModel from .unicoil import UniCoilModel from .splade import SpladeModel diff --git a/src/tevatron/retriever/modeling/dense.py b/src/tevatron/retriever/modeling/dense.py index d1691c2b..9d7e9cbb 100644 --- a/src/tevatron/retriever/modeling/dense.py +++ b/src/tevatron/retriever/modeling/dense.py @@ -1,5 +1,6 @@ import torch import logging +from transformers import Qwen2VLForConditionalGeneration from .encoder import EncoderModel logger = logging.getLogger(__name__) @@ -24,11 +25,34 @@ def _pooling(self, last_hidden_state, attention_mask): masked_hiddens = last_hidden_state.masked_fill(~attention_mask[..., None].bool(), 0.0) reps = masked_hiddens.sum(dim=1) / attention_mask.sum(dim=1)[..., None] elif self.pooling in ['last', 'eos']: - sequence_lengths = attention_mask.sum(dim=1) - 1 - batch_size = last_hidden_state.shape[0] - reps = last_hidden_state[torch.arange(batch_size, device=last_hidden_state.device), sequence_lengths] + # sequence_lengths = attention_mask.sum(dim=1) - 1 + # batch_size = last_hidden_state.shape[0] + # reps = last_hidden_state[torch.arange(batch_size, device=last_hidden_state.device), sequence_lengths] + # it is left padded, so the last token is the representation + reps = last_hidden_state[:, -1] else: raise ValueError(f'unknown pooling method: {self.pooling}') if self.normalize: reps = torch.nn.functional.normalize(reps, p=2, dim=-1) return reps + + +class MultiModalDenseModel(DenseModel): + TRANSFORMER_CLS = Qwen2VLForConditionalGeneration + + def __init__(self, encoder, pooling='eos', normalize=True, temperature=0.02): + super().__init__(encoder, pooling, normalize, temperature) + # freeze visual encoder + for param in self.encoder.visual.parameters(): + param.requires_grad = False + + def encode_query(self, qry): + cache_position = torch.arange(0, qry['input_ids'].shape[1], device=qry['input_ids'].device) + qry = 
self.encoder.prepare_inputs_for_generation(**qry, use_cache=True, cache_position=cache_position) + query_hidden_states = self.encoder(**qry, return_dict=True, output_hidden_states=True) + query_hidden_states = query_hidden_states.hidden_states[-1] + return self._pooling(query_hidden_states, qry['attention_mask']) + + def encode_passage(self, psg): + # encode passage is the same as encode query + return self.encode_query(psg) \ No newline at end of file diff --git a/src/tevatron/retriever/modeling/encoder.py b/src/tevatron/retriever/modeling/encoder.py index 726ecb7e..b102f134 100644 --- a/src/tevatron/retriever/modeling/encoder.py +++ b/src/tevatron/retriever/modeling/encoder.py @@ -116,7 +116,7 @@ def build( train_args: TrainingArguments, **hf_kwargs, ): - base_model = cls.TRANSFORMER_CLS.from_pretrained(model_args.model_name_or_path, **hf_kwargs) + base_model = cls.TRANSFORMER_CLS.from_pretrained(model_args.model_name_or_path, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2", **hf_kwargs) if base_model.config.pad_token_id is None: base_model.config.pad_token_id = 0 if model_args.lora or model_args.lora_name_or_path: From 5026f5eb27a641549739dd5271e5518e7b0be469 Mon Sep 17 00:00:00 2001 From: Samantha-Zhan Date: Sun, 9 Feb 2025 09:42:07 -0500 Subject: [PATCH 03/16] convert to snake style --- ...sformer.py => ColpaliCorpusTransformer.py} | 22 ++++++++--------- ...i_transformer.py => ColpaliTransformer.py} | 22 ++++++++--------- ...sformer.py => MsmarcoCorpusTransformer.py} | 22 ++++++++--------- ...o_transformer.py => MsmarcoTransformer.py} | 24 +++++++++---------- 4 files changed, 45 insertions(+), 45 deletions(-) rename scripts/dataset_transform_scripts/{colpali_corpus_transformer.py => ColpaliCorpusTransformer.py} (63%) rename scripts/dataset_transform_scripts/{colpali_transformer.py => ColpaliTransformer.py} (69%) rename scripts/dataset_transform_scripts/{msmarco_corpus_transformer.py => MsmarcoCorpusTransformer.py} (55%) rename 
scripts/dataset_transform_scripts/{msmarco_transformer.py => MsmarcoTransformer.py} (72%) diff --git a/scripts/dataset_transform_scripts/colpali_corpus_transformer.py b/scripts/dataset_transform_scripts/ColpaliCorpusTransformer.py similarity index 63% rename from scripts/dataset_transform_scripts/colpali_corpus_transformer.py rename to scripts/dataset_transform_scripts/ColpaliCorpusTransformer.py index 20b042ce..282a7034 100644 --- a/scripts/dataset_transform_scripts/colpali_corpus_transformer.py +++ b/scripts/dataset_transform_scripts/ColpaliCorpusTransformer.py @@ -1,12 +1,12 @@ from datasets import load_dataset, load_dataset_builder, Image, DatasetDict, Value, Sequence -def loadDatasets(): +def load_datasets(): # available splits: ['train', 'test'] ds = load_dataset("vidore/colpali_train_set") return ds -def transformPassages(entry, indices): +def transform_passages(entry, indices): num_row = len(entry["query"]) return { "docid": [str(i) for i in indices], @@ -14,8 +14,8 @@ def transformPassages(entry, indices): "source": ["colpali:"+str(source) for source in entry['image_filename']], } -def transformDataset(ds): - trans_ds = ds.map(transformPassages, batched=True, with_indices=True, num_proc=8) +def transform_dataset(ds): + trans_ds = ds.map(transform_passages, batched=True, with_indices=True, num_proc=8) # update column attribute types trans_ds = trans_ds.cast_column("text", Value("string")) @@ -23,16 +23,16 @@ def transformDataset(ds): # reorder columns return trans_ds.select_columns(['docid', 'image', 'text', 'source']) -def uploadDataset(new_dsDict): - new_dsDict.push_to_hub("SamanthaZJQ/colpali-passage-corpus-2.0") +def upload_dataset(new_ds_dict): + new_ds_dict.push_to_hub("SamanthaZJQ/colpali-passage-corpus-2.0") def main(): - dsDict = loadDatasets() - print(dsDict) - dsDict = {split: transformDataset(dsDict[split]) for split in dsDict} - print(dsDict["train"][0]) + ds_dict = load_datasets() + print(ds_dict) + ds_dict = {split: 
transform_dataset(ds_dict[split]) for split in ds_dict} + print(ds_dict["train"][0]) # perform dataset update - uploadDataset(DatasetDict(dsDict)) + upload_dataset(DatasetDict(ds_dict)) # verify feature print("-------------------") print(load_dataset_builder("SamanthaZJQ/colpali-passage-corpus-2.0").info.features) diff --git a/scripts/dataset_transform_scripts/colpali_transformer.py b/scripts/dataset_transform_scripts/ColpaliTransformer.py similarity index 69% rename from scripts/dataset_transform_scripts/colpali_transformer.py rename to scripts/dataset_transform_scripts/ColpaliTransformer.py index 9a0936d0..e7e65e8c 100644 --- a/scripts/dataset_transform_scripts/colpali_transformer.py +++ b/scripts/dataset_transform_scripts/ColpaliTransformer.py @@ -1,12 +1,12 @@ from datasets import load_dataset, load_dataset_builder, Image, DatasetDict, Value, Sequence -def loadDatasets(): +def load_datasets(): # available splits: ['train', 'test'] ds = load_dataset("vidore/colpali_train_set") return ds -def transformPassages(entry, indices): +def transform_passages(entry, indices): num_row = len(entry["query"]) return { # transform positive/negative document to id @@ -18,8 +18,8 @@ def transformPassages(entry, indices): "source": ["colpali: "+str(source) for source in entry['source']], } -def transformDataset(ds): - trans_ds = ds.map(transformPassages, remove_columns=["options", "page", "model", "prompt", "answer_type", "image", "image_filename"], batched=True, with_indices=True, num_proc=8) +def transform_dataset(ds): + trans_ds = ds.map(transform_passages, remove_columns=["options", "page", "model", "prompt", "answer_type", "image", "image_filename"], batched=True, with_indices=True, num_proc=8) # rename attributes trans_ds = trans_ds.rename_column("query", "query_text") @@ -29,16 +29,16 @@ def transformDataset(ds): # reorder columns return trans_ds.select_columns(['query_id', 'query_text', 'query_image', 'positive_document_ids', 'negative_document_ids', 'answer', 'source']) 
-def uploadDataset(new_dsDict): - new_dsDict.push_to_hub("SamanthaZJQ/colpali-passage-2.0") +def upload_dataset(new_ds_dict): + new_ds_dict.push_to_hub("SamanthaZJQ/colpali-passage-2.0") def main(): - dsDict = loadDatasets() - print(dsDict) - dsDict = {split: transformDataset(dsDict[split]) for split in dsDict} - print(dsDict["train"][0]) + ds_dict = load_datasets() + print(ds_dict) + ds_dict = {split: transform_dataset(ds_dict[split]) for split in ds_dict} + print(ds_dict["train"][0]) # perform dataset update - uploadDataset(DatasetDict(dsDict)) + upload_dataset(DatasetDict(ds_dict)) # verify feature print("-------------------") print(load_dataset_builder("SamanthaZJQ/colpali-passage-2.0").info.features) diff --git a/scripts/dataset_transform_scripts/msmarco_corpus_transformer.py b/scripts/dataset_transform_scripts/MsmarcoCorpusTransformer.py similarity index 55% rename from scripts/dataset_transform_scripts/msmarco_corpus_transformer.py rename to scripts/dataset_transform_scripts/MsmarcoCorpusTransformer.py index 1d4cb492..c1d6e0b1 100644 --- a/scripts/dataset_transform_scripts/msmarco_corpus_transformer.py +++ b/scripts/dataset_transform_scripts/MsmarcoCorpusTransformer.py @@ -1,26 +1,26 @@ from datasets import load_dataset, load_dataset_builder, Image, DatasetDict -def loadDatasets(): +def load_datasets(): # available splits: ['train'] - dsDict = load_dataset("Tevatron/msmarco-passage-corpus") - return dsDict + ds_dict = load_dataset("Tevatron/msmarco-passage-corpus") + return ds_dict -def transformDataset(ds): +def transform_dataset(ds): new_image_column = [None] * len(ds) new_source_column = ["msmarco"] * len(ds) trans_ds = ds.add_column("image", new_image_column).add_column("source", new_source_column) return trans_ds.cast_column("image", Image()) -def uploadDataset(new_dsDict): - new_dsDict.push_to_hub("SamanthaZJQ/msmarco-passage-corpus-2.0") +def upload_dataset(new_ds_dict): + new_ds_dict.push_to_hub("SamanthaZJQ/msmarco-passage-corpus-2.0") def main(): - 
dsDict = loadDatasets() - print(dsDict) - dsDict = {split: transformDataset(dsDict[split]) for split in dsDict} - print(dsDict["train"][0]) + ds_dict = load_datasets() + print(ds_dict) + ds_dict = {split: transform_dataset(ds_dict[split]) for split in ds_dict} + print(ds_dict["train"][0]) # perform dataset update - uploadDataset(DatasetDict(dsDict)) + upload_dataset(DatasetDict(ds_dict)) # verify feature print(load_dataset_builder("SamanthaZJQ/msmarco-passage-corpus-2.0").info.features) diff --git a/scripts/dataset_transform_scripts/msmarco_transformer.py b/scripts/dataset_transform_scripts/MsmarcoTransformer.py similarity index 72% rename from scripts/dataset_transform_scripts/msmarco_transformer.py rename to scripts/dataset_transform_scripts/MsmarcoTransformer.py index f2618134..631c577a 100644 --- a/scripts/dataset_transform_scripts/msmarco_transformer.py +++ b/scripts/dataset_transform_scripts/MsmarcoTransformer.py @@ -1,12 +1,12 @@ from datasets import load_dataset, load_dataset_builder, Image, DatasetDict, Value, Sequence -def loadDatasets(): +def load_datasets(): # available splits: ['train', 'dev', 'dl19', 'dl20'] - dsDict = load_dataset("Tevatron/msmarco-passage-aug") - return dsDict + ds_dict = load_dataset("Tevatron/msmarco-passage-aug") + return ds_dict -def transformPassages(entry): +def transform_passages(entry): return { # transform positive/negative document to id "positive_document_ids": [[passage['docid'] for passage in passages] for passages in entry['positive_passages']], @@ -17,9 +17,9 @@ def transformPassages(entry): "answer": [None] * len(entry["query"]), } -def transformDataset(ds): +def transform_dataset(ds): # convert document_ids to store a list of string docid - trans_ds = ds.map(transformPassages, remove_columns=["positive_passages", "negative_passages"], batched=True, num_proc=8) + trans_ds = ds.map(transform_passages, remove_columns=["positive_passages", "negative_passages"], batched=True, num_proc=8) # rename attributes trans_ds = 
trans_ds.rename_column("query", "query_text") @@ -29,15 +29,15 @@ def transformDataset(ds): # reorder columns return trans_ds.select_columns(['query_id', 'query_text', 'query_image', 'positive_document_ids', 'negative_document_ids', 'answer', 'source']) -def uploadDataset(new_dsDict): - new_dsDict.push_to_hub("SamanthaZJQ/msmarco-passage-aug-2.0") +def upload_dataset(new_ds_dict): + new_ds_dict.push_to_hub("SamanthaZJQ/msmarco-passage-aug-2.0") def main(): - dsDict = loadDatasets() - print(dsDict) - dsDict = {split: transformDataset(dsDict[split]) for split in dsDict} + ds_dict = load_datasets() + print(ds_dict) + ds_dict = {split: transform_dataset(ds_dict[split]) for split in ds_dict} # # perform dataset update - uploadDataset(DatasetDict(dsDict)) + upload_dataset(DatasetDict(ds_dict)) # # verify feature print("-------------------") print(load_dataset_builder("SamanthaZJQ/msmarco-passage-aug-2.0").info.features) From a4fb2a4597674f500bbeb05e4e5d800b58322dd5 Mon Sep 17 00:00:00 2001 From: Xueguang Ma Date: Sun, 9 Feb 2025 17:28:46 +0000 Subject: [PATCH 04/16] add encoding and multidataset --- src/tevatron/retriever/arguments.py | 4 ++ src/tevatron/retriever/collator.py | 80 +++++++++++++++++++--- src/tevatron/retriever/dataset.py | 77 +++++++++++++++++---- src/tevatron/retriever/driver/train_mm.py | 14 +++- src/tevatron/retriever/modeling/dense.py | 12 ++-- src/tevatron/retriever/modeling/encoder.py | 2 +- 6 files changed, 157 insertions(+), 32 deletions(-) diff --git a/src/tevatron/retriever/arguments.py b/src/tevatron/retriever/arguments.py index 09b3e62b..d5cb842d 100644 --- a/src/tevatron/retriever/arguments.py +++ b/src/tevatron/retriever/arguments.py @@ -110,6 +110,10 @@ class DataArguments: default='train', metadata={"help": "corpus split"} ) + train_yaml: str = field( + default=None, metadata={"help": "yaml file for training datasets, if there is more multiple datasets used for training"} + ) + dataset_number_of_shards: int = field( default=1, 
metadata={"help": "number of shards to split the dataset into"} ) diff --git a/src/tevatron/retriever/collator.py b/src/tevatron/retriever/collator.py index 6b0ea64f..1f8f1703 100644 --- a/src/tevatron/retriever/collator.py +++ b/src/tevatron/retriever/collator.py @@ -141,7 +141,6 @@ def __call__(self, features): ] if self.data_args.append_eos_token: - # should already have a eos token so not very necessary to have additional one. query_texts = [x + '<|endoftext|>' for x in query_texts] passage_texts = [x + '<|endoftext|>' for x in passage_texts] @@ -168,18 +167,23 @@ def __call__(self, features): @dataclass class EncodeCollator: + """ + simple collator for text only data. + """ data_args: DataArguments tokenizer: PreTrainedTokenizer - def __call__(self, features: List[Tuple[str, str]]): + def __call__(self, features): """ Collate function for encoding. - :param features: list of (id, text) tuples + :param features: list of (id, text, image) tuples + but in this case, it's just image is None """ - text_ids = [x[0] for x in features] + content_ids = [x[0] for x in features] texts = [x[1] for x in features] + images = [x[2] for x in features] # this will be ignored max_length = self.data_args.query_max_len if self.data_args.encode_is_query else self.data_args.passage_max_len - collated_texts = self.tokenizer( + collated_inputs = self.tokenizer( texts, padding=False, truncation=True, @@ -189,15 +193,73 @@ def __call__(self, features: List[Tuple[str, str]]): add_special_tokens=True, ) if self.data_args.append_eos_token: - collated_texts['input_ids'] = [x + [self.tokenizer.eos_token_id] for x in collated_texts['input_ids']] - collated_texts = self.tokenizer.pad( - collated_texts, + collated_inputs['input_ids'] = [x + [self.tokenizer.eos_token_id] for x in collated_inputs['input_ids']] + collated_inputs = self.tokenizer.pad( + collated_inputs, padding=True, pad_to_multiple_of=self.data_args.pad_to_multiple_of, return_attention_mask=True, return_tensors='pt', ) - return 
text_ids, collated_texts + return content_ids, collated_inputs + +@dataclass +class MultiModalEncodeCollator: + """ + collator for text-visual data. + """ + data_args: DataArguments + processor: ProcessorMixin + + def __call__(self, features): + """ + Collate function for encoding. + :param features: list of (id, text, image) tuples + but in this case, it's just image is None + """ + content_ids = [x[0] for x in features] + texts = [x[1] for x in features] + images = [x[2] for x in features] + messages = [] + max_length = self.data_args.query_max_len if self.data_args.encode_is_query else self.data_args.passage_max_len + for idx in range(len(texts)): + text = texts[idx] + image = images[idx] + content = [] + if text: + text = self.processor.tokenizer.decode( + self.processor.tokenizer.encode(text, max_length=max_length, truncation=True) + ) + content.append({'type': 'text', 'text': text}) + if image: + content.append({'type': 'image', 'image': image, 'resized_height': 784, 'resized_width': 784}) + # content.append({'type': 'text', 'text': 'What is shown in this image?'}) + message = [ + { + 'role': 'user', + 'content': content + } + ] + messages.append(message) + + texts = [ + self.processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=False) + for msg in messages + ] + + if self.data_args.append_eos_token: + texts = [x + '<|endoftext|>' for x in texts] + + image_inputs, video_inputs = process_vision_info(messages) + + collated_inputs = self.processor( + text=texts, + images=image_inputs, + videos=video_inputs, + return_tensors="pt", + padding="longest", + ) + return content_ids, collated_inputs @dataclass diff --git a/src/tevatron/retriever/dataset.py b/src/tevatron/retriever/dataset.py index df94c419..68fa62e3 100644 --- a/src/tevatron/retriever/dataset.py +++ b/src/tevatron/retriever/dataset.py @@ -1,4 +1,5 @@ import random +import os from typing import List, Tuple from datasets import load_dataset @@ -11,24 +12,27 @@ logger = 
logging.getLogger(__name__) class TrainDataset(Dataset): - def __init__(self, data_args: DataArguments, trainer = None): + def __init__(self, data_args: DataArguments, trainer = None, dataset_name = None, corpus_name = None, dataset_path = None, corpus_path = None): self.data_args = data_args self.train_data = load_dataset( - self.data_args.dataset_name, + self.data_args.dataset_name if dataset_name is None else dataset_name, self.data_args.dataset_config, - data_files=self.data_args.dataset_path, + data_files=self.data_args.dataset_path if dataset_path is None else dataset_path, split=self.data_args.dataset_split, cache_dir=self.data_args.dataset_cache_dir, ) self.corpus = load_dataset( - self.data_args.corpus_name, + self.data_args.corpus_name if corpus_name is None else corpus_name, self.data_args.corpus_config, - data_files=self.data_args.corpus_path, + data_files=self.data_args.corpus_path if corpus_path is None else corpus_path, split=self.data_args.corpus_split, cache_dir=self.data_args.dataset_cache_dir, ) self.trainer = trainer + def set_trainer(self, trainer): + self.trainer = trainer + def __len__(self): return len(self.train_data) @@ -36,12 +40,11 @@ def _get_info_from_docid(self, docid, prefix): document_info = self.corpus[int(docid)] assert int(document_info['docid']) == int(docid) image = None if 'image' not in document_info else document_info['image'] - image = image.convert('RGB') text = None if 'text' not in document_info else document_info['text'] text = '' if text is None else text return prefix + text, image - def __getitem__(self, item) -> Tuple[str, List[str]]: + def __getitem__(self, item): group = self.train_data[item] epoch = int(self.trainer.state.epoch) @@ -77,6 +80,47 @@ def __getitem__(self, item) -> Tuple[str, List[str]]: return formated_query, formated_documents +class MultiTrainDataset(Dataset): + + def __init__(self, data_args: DataArguments, dataset_list = None, corpus_list = None, trainer = None): + self.data_args = data_args + 
self.trainer = trainer + self.train_datasets = [] + for dataset_name_or_path, corpus_name_or_path in zip(dataset_list, corpus_list): + dataset_name = None + dataset_path = None + corpus_name = None + corpus_path = None + if os.path.isdir(dataset_name_or_path): + dataset_name = dataset_name_or_path + elif dataset_name_or_path.endswith('.jsonl'): + dataset_name = 'json' + dataset_path = dataset_name_or_path + else: + dataset_name = dataset_name_or_path + if os.path.isdir(corpus_name_or_path): + corpus_name = corpus_name_or_path + elif corpus_name_or_path.endswith('.jsonl'): + corpus_name = 'json' + corpus_path = corpus_name_or_path + else: + corpus_name = corpus_name_or_path + self.train_datasets.append(TrainDataset(data_args, trainer, dataset_name, corpus_name, dataset_path, corpus_path)) + + def __len__(self): + return sum([len(dataset) for dataset in self.train_datasets]) + + def __getitem__(self, item): + dataset_index = 0 + while item >= len(self.train_datasets[dataset_index]): + item -= len(self.train_datasets[dataset_index]) + dataset_index += 1 + return self.train_datasets[dataset_index][item] + + def set_trainer(self, trainer): + for dataset in self.train_datasets: + dataset.set_trainer(trainer) + class EncodeDataset(Dataset): @@ -98,12 +142,17 @@ def __init__(self, data_args: DataArguments): def __len__(self): return len(self.encode_data) - def __getitem__(self, item) -> Tuple[str, str]: - text = self.encode_data[item] + def __getitem__(self, item): + content = self.encode_data[item] if self.data_args.encode_is_query: - text_id = text['query_id'] - formated_text = format_query(text['query'], self.data_args.query_prefix) + content_id = content['query_id'] + content_text = content['query_text'] if 'query_text' in content else '' + content_text = self.data_args.query_prefix + content_text + content_image = content['query_image'] if 'query_image' in content else None else: - text_id = text['docid'] - formated_text = format_passage(text['text'], text['title'], 
self.data_args.passage_prefix) - return text_id, formated_text + content_id = content['docid'] + content_text = content['text'] if 'text' in content else None + content_text = '' if content_text is None else content_text + content_text = self.data_args.passage_prefix + content_text + content_image = content['image'] if 'image' in content else None + return content_id, content_text, content_image diff --git a/src/tevatron/retriever/driver/train_mm.py b/src/tevatron/retriever/driver/train_mm.py index e5c9e831..1bad4cab 100644 --- a/src/tevatron/retriever/driver/train_mm.py +++ b/src/tevatron/retriever/driver/train_mm.py @@ -1,6 +1,7 @@ import logging import os import sys +import yaml from transformers import AutoProcessor from transformers import ( @@ -11,7 +12,7 @@ from tevatron.retriever.arguments import ModelArguments, DataArguments, \ TevatronTrainingArguments as TrainingArguments -from tevatron.retriever.dataset import TrainDataset +from tevatron.retriever.dataset import TrainDataset, MultiTrainDataset from tevatron.retriever.collator import MultiModalTrainCollator from tevatron.retriever.modeling import MultiModalDenseModel from tevatron.retriever.trainer import TevatronTrainer @@ -74,7 +75,13 @@ def main(): cache_dir=model_args.cache_dir, ) - train_dataset = TrainDataset(data_args) + if data_args.train_yaml is not None: + with open(data_args.train_yaml, 'r') as f: + train_yaml = yaml.safe_load(f) + dataset_list = train_yaml['train'] + corpus_list = train_yaml['corpus'] + + train_dataset = MultiTrainDataset(data_args, dataset_list, corpus_list) if data_args.train_yaml is not None else TrainDataset(data_args) collator = MultiModalTrainCollator(data_args, processor) trainer = TevatronTrainer( @@ -83,7 +90,8 @@ def main(): train_dataset=train_dataset, data_collator=collator ) - train_dataset.trainer = trainer + + train_dataset.set_trainer(trainer) last_checkpoint = None if os.path.isdir(training_args.output_dir): diff --git 
a/src/tevatron/retriever/modeling/dense.py b/src/tevatron/retriever/modeling/dense.py index 9d7e9cbb..18ddb978 100644 --- a/src/tevatron/retriever/modeling/dense.py +++ b/src/tevatron/retriever/modeling/dense.py @@ -25,11 +25,13 @@ def _pooling(self, last_hidden_state, attention_mask): masked_hiddens = last_hidden_state.masked_fill(~attention_mask[..., None].bool(), 0.0) reps = masked_hiddens.sum(dim=1) / attention_mask.sum(dim=1)[..., None] elif self.pooling in ['last', 'eos']: - # sequence_lengths = attention_mask.sum(dim=1) - 1 - # batch_size = last_hidden_state.shape[0] - # reps = last_hidden_state[torch.arange(batch_size, device=last_hidden_state.device), sequence_lengths] - # it is left padded, so the last token is the representation - reps = last_hidden_state[:, -1] + left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0]) + if left_padding: + reps = last_hidden_state[:, -1] + else: + sequence_lengths = attention_mask.sum(dim=1) - 1 + batch_size = last_hidden_state.shape[0] + reps = last_hidden_state[torch.arange(batch_size, device=last_hidden_state.device), sequence_lengths] else: raise ValueError(f'unknown pooling method: {self.pooling}') if self.normalize: diff --git a/src/tevatron/retriever/modeling/encoder.py b/src/tevatron/retriever/modeling/encoder.py index b102f134..7800172c 100644 --- a/src/tevatron/retriever/modeling/encoder.py +++ b/src/tevatron/retriever/modeling/encoder.py @@ -158,7 +158,7 @@ def load(cls, normalize: bool = False, lora_name_or_path: str = None, **hf_kwargs): - base_model = cls.TRANSFORMER_CLS.from_pretrained(model_name_or_path, **hf_kwargs) + base_model = cls.TRANSFORMER_CLS.from_pretrained(model_name_or_path, attn_implementation="flash_attention_2", **hf_kwargs) if base_model.config.pad_token_id is None: base_model.config.pad_token_id = 0 if lora_name_or_path: From 93656cb4bea8b7e628355cb03d4f1ac3c8696307 Mon Sep 17 00:00:00 2001 From: Xueguang Ma Date: Thu, 13 Feb 2025 04:21:32 +0000 Subject: [PATCH 05/16] 
update training with mixed data --- dataset_config.yaml | 11 +++++ src/tevatron/retriever/arguments.py | 2 +- src/tevatron/retriever/dataset.py | 51 +++++++++++++++++++----- src/tevatron/retriever/driver/train.py | 2 +- src/tevatron/retriever/modeling/dense.py | 4 +- src/tevatron/retriever/searcher.py | 8 ++-- train_multimodal.sh | 27 +++++++++++++ 7 files changed, 87 insertions(+), 18 deletions(-) create mode 100644 dataset_config.yaml create mode 100644 train_multimodal.sh diff --git a/dataset_config.yaml b/dataset_config.yaml new file mode 100644 index 00000000..b6f1ee70 --- /dev/null +++ b/dataset_config.yaml @@ -0,0 +1,11 @@ +train: + - Tevatron/colpali + - Tevatron/bge-ir + - Tevatron/pixmo-docs + - Tevatron/wiki-ss-nq-new + +corpus: + - Tevatron/colpali-corpus + - null + - Tevatron/pixmo-docs-corpus + - Tevatron/wiki-ss-corpus-new \ No newline at end of file diff --git a/src/tevatron/retriever/arguments.py b/src/tevatron/retriever/arguments.py index d5cb842d..c4f7efa0 100644 --- a/src/tevatron/retriever/arguments.py +++ b/src/tevatron/retriever/arguments.py @@ -95,7 +95,7 @@ class DataArguments: ) corpus_name: str = field( - default='json', metadata={"help": "huggingface dataset name for corpus"} + default=None, metadata={"help": "huggingface dataset name for corpus"} ) corpus_config: str = field( diff --git a/src/tevatron/retriever/dataset.py b/src/tevatron/retriever/dataset.py index 68fa62e3..27b7b92d 100644 --- a/src/tevatron/retriever/dataset.py +++ b/src/tevatron/retriever/dataset.py @@ -21,13 +21,16 @@ def __init__(self, data_args: DataArguments, trainer = None, dataset_name = None split=self.data_args.dataset_split, cache_dir=self.data_args.dataset_cache_dir, ) - self.corpus = load_dataset( - self.data_args.corpus_name if corpus_name is None else corpus_name, - self.data_args.corpus_config, - data_files=self.data_args.corpus_path if corpus_path is None else corpus_path, - split=self.data_args.corpus_split, - cache_dir=self.data_args.dataset_cache_dir, 
- ) + if self.data_args.corpus_name is None and corpus_name is None: + self.corpus = None + else: + self.corpus = load_dataset( + self.data_args.corpus_name if corpus_name is None else corpus_name, + self.data_args.corpus_config, + data_files=self.data_args.corpus_path if corpus_path is None else corpus_path, + split=self.data_args.corpus_split, + cache_dir=self.data_args.dataset_cache_dir, + ) self.trainer = trainer def set_trainer(self, trainer): @@ -49,10 +52,36 @@ def __getitem__(self, item): epoch = int(self.trainer.state.epoch) _hashed_seed = hash(item + self.trainer.args.seed) + + if 'positive_passages' in group: + # this dataset is in old format text data, for backward compatibility + query_text = group['query'] + query_image = None + formated_query = (self.data_args.query_prefix + query_text, query_image) + formated_documents = [] + selected_positive_document = group['positive_passages'][(_hashed_seed + epoch) % len(group['positive_passages'])] + positive_document_text = selected_positive_document['title'] + ' ' + selected_positive_document['text'] if 'title' in selected_positive_document else selected_positive_document['text'] + formated_documents.append((self.data_args.passage_prefix + positive_document_text, None)) + negative_size = self.data_args.train_group_size - 1 + if len(group['negative_passages']) < negative_size: + selected_negative_documents = random.choices(group['negative_passages'], k=negative_size) + elif self.data_args.train_group_size == 1: + selected_negative_documents = [] + else: + _offset = epoch * negative_size % len(group['negative_passages']) + selected_negative_documents = [x for x in group['negative_passages']] + random.Random(_hashed_seed).shuffle(selected_negative_documents) + selected_negative_documents = selected_negative_documents * 2 + selected_negative_documents = selected_negative_documents[_offset: _offset + negative_size] + for negative_document in selected_negative_documents: + negative_document_text = 
negative_document['title'] + ' ' + negative_document['text'] if 'title' in negative_document else negative_document['text'] + formated_documents.append((self.data_args.passage_prefix + negative_document_text, None)) + return formated_query, formated_documents + query_id = group['query_id'] - query_text = group['query_text'] + query_text = '' if 'query_text' not in group else group['query_text'] query_text = '' if query_text is None else query_text - query_image = group['query_image'] + query_image = None if 'query_image' not in group else group['query_image'] positive_document_ids = group['positive_document_ids'] negative_document_ids = group['negative_document_ids'] @@ -98,7 +127,9 @@ def __init__(self, data_args: DataArguments, dataset_list = None, corpus_list = dataset_path = dataset_name_or_path else: dataset_name = dataset_name_or_path - if os.path.isdir(corpus_name_or_path): + if corpus_name_or_path is None: + corpus_name = None + elif os.path.isdir(corpus_name_or_path): corpus_name = corpus_name_or_path elif corpus_name_or_path.endswith('.jsonl'): corpus_name = 'json' diff --git a/src/tevatron/retriever/driver/train.py b/src/tevatron/retriever/driver/train.py index 24ade375..75fec023 100644 --- a/src/tevatron/retriever/driver/train.py +++ b/src/tevatron/retriever/driver/train.py @@ -85,7 +85,7 @@ def main(): train_dataset=train_dataset, data_collator=collator ) - train_dataset.trainer = trainer + train_dataset.set_trainer(trainer) last_checkpoint = None if os.path.isdir(training_args.output_dir): diff --git a/src/tevatron/retriever/modeling/dense.py b/src/tevatron/retriever/modeling/dense.py index 18ddb978..7b65b536 100644 --- a/src/tevatron/retriever/modeling/dense.py +++ b/src/tevatron/retriever/modeling/dense.py @@ -1,6 +1,6 @@ import torch import logging -from transformers import Qwen2VLForConditionalGeneration +from transformers import Qwen2_5_VLForConditionalGeneration from .encoder import EncoderModel logger = logging.getLogger(__name__) @@ -40,7 
+40,7 @@ def _pooling(self, last_hidden_state, attention_mask): class MultiModalDenseModel(DenseModel): - TRANSFORMER_CLS = Qwen2VLForConditionalGeneration + TRANSFORMER_CLS = Qwen2_5_VLForConditionalGeneration def __init__(self, encoder, pooling='eos', normalize=True, temperature=0.02): super().__init__(encoder, pooling, normalize, temperature) diff --git a/src/tevatron/retriever/searcher.py b/src/tevatron/retriever/searcher.py index f89510fa..6dab5f68 100644 --- a/src/tevatron/retriever/searcher.py +++ b/src/tevatron/retriever/searcher.py @@ -12,10 +12,10 @@ class FaissFlatSearcher: def __init__(self, init_reps: np.ndarray): index = faiss.IndexFlatIP(init_reps.shape[1]) # check if cuda is available - # if faiss.get_num_gpus() > 0: - # logger.info("Using GPU") - # res = faiss.StandardGpuResources() - # index = faiss.index_cpu_to_gpu(res, 0, index) + if faiss.get_num_gpus() > 0: + logger.info("Using GPU") + res = faiss.StandardGpuResources() + index = faiss.index_cpu_to_gpu(res, 0, index) self.index = index def add(self, p_reps: np.ndarray): diff --git a/train_multimodal.sh b/train_multimodal.sh new file mode 100644 index 00000000..41f2fdc5 --- /dev/null +++ b/train_multimodal.sh @@ -0,0 +1,27 @@ +deepspeed --include localhost:0,1,2,3 --master_port 60000 --module tevatron.retriever.driver.train_mm \ + --deepspeed deepspeed/ds_zero0_config.json \ + --output_dir retriever-qwen25vl-bge-ir \ + --model_name_or_path Qwen/Qwen2.5-VL-3B-Instruct \ + --lora \ + --lora_target_modules q_proj,k_proj,v_proj,o_proj,gate_proj,up_proj,down_proj \ + --save_steps 500 \ + --train_yaml dataset_config.yaml \ + --query_prefix "Query: " \ + --passage_prefix "" \ + --bf16 \ + --tf32 True \ + --pooling eos \ + --append_eos_token \ + --normalize \ + --temperature 0.02 \ + --per_device_train_batch_size 16 \ + --gradient_checkpointing \ + --train_group_size 4 \ + --learning_rate 1e-4 \ + --query_max_len 512 \ + --passage_max_len 512 \ + --num_train_epochs 1 \ + --logging_steps 1 \ + 
--overwrite_output_dir \ + --gradient_accumulation_steps 4 \ + --warmup_ratio 0.005 \ No newline at end of file From b3a3e895b0a7ff856c2c78cfa71e47c30847e52d Mon Sep 17 00:00:00 2001 From: Xueguang Ma Date: Thu, 13 Feb 2025 04:22:23 +0000 Subject: [PATCH 06/16] update --- train_multimodal.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/train_multimodal.sh b/train_multimodal.sh index 41f2fdc5..8d2ec2f4 100644 --- a/train_multimodal.sh +++ b/train_multimodal.sh @@ -1,6 +1,6 @@ deepspeed --include localhost:0,1,2,3 --master_port 60000 --module tevatron.retriever.driver.train_mm \ --deepspeed deepspeed/ds_zero0_config.json \ - --output_dir retriever-qwen25vl-bge-ir \ + --output_dir retriever-qwen25vl-bge-pixmo-colpali-wiki \ --model_name_or_path Qwen/Qwen2.5-VL-3B-Instruct \ --lora \ --lora_target_modules q_proj,k_proj,v_proj,o_proj,gate_proj,up_proj,down_proj \ From db0bf50338b7e5e505b1cf1e8b5bb5f91d309221 Mon Sep 17 00:00:00 2001 From: ArvinZhuang Date: Sun, 16 Feb 2025 13:22:29 +1000 Subject: [PATCH 07/16] new train config and add num_process --- dataset_config.yaml | 8 ++++++++ src/tevatron/retriever/arguments.py | 4 ++++ src/tevatron/retriever/dataset.py | 3 +++ train_multimodal.sh | 8 +++++--- 4 files changed, 20 insertions(+), 3 deletions(-) diff --git a/dataset_config.yaml b/dataset_config.yaml index b6f1ee70..1cbd7d17 100644 --- a/dataset_config.yaml +++ b/dataset_config.yaml @@ -1,10 +1,18 @@ train: + - Tevatron/colpali + - Tevatron/colpali + - Tevatron/colpali + - Tevatron/colpali - Tevatron/colpali - Tevatron/bge-ir - Tevatron/pixmo-docs - Tevatron/wiki-ss-nq-new corpus: + - Tevatron/colpali-corpus + - Tevatron/colpali-corpus + - Tevatron/colpali-corpus + - Tevatron/colpali-corpus - Tevatron/colpali-corpus - null - Tevatron/pixmo-docs-corpus diff --git a/src/tevatron/retriever/arguments.py b/src/tevatron/retriever/arguments.py index c4f7efa0..6ca3214d 100644 --- a/src/tevatron/retriever/arguments.py +++ 
b/src/tevatron/retriever/arguments.py @@ -170,6 +170,10 @@ class DataArguments: }, ) + num_proc: int = field( + default=1, metadata={"help": "number of processes to use for loading the dataset"} + ) + @dataclass class TevatronTrainingArguments(TrainingArguments): diff --git a/src/tevatron/retriever/dataset.py b/src/tevatron/retriever/dataset.py index 27b7b92d..4b3f3768 100644 --- a/src/tevatron/retriever/dataset.py +++ b/src/tevatron/retriever/dataset.py @@ -20,6 +20,7 @@ def __init__(self, data_args: DataArguments, trainer = None, dataset_name = None data_files=self.data_args.dataset_path if dataset_path is None else dataset_path, split=self.data_args.dataset_split, cache_dir=self.data_args.dataset_cache_dir, + num_proc=self.data_args.num_proc, ) if self.data_args.corpus_name is None and corpus_name is None: self.corpus = None @@ -30,6 +31,7 @@ def __init__(self, data_args: DataArguments, trainer = None, dataset_name = None data_files=self.data_args.corpus_path if corpus_path is None else corpus_path, split=self.data_args.corpus_split, cache_dir=self.data_args.dataset_cache_dir, + num_proc=self.data_args.num_proc, ) self.trainer = trainer @@ -163,6 +165,7 @@ def __init__(self, data_args: DataArguments): data_files=self.data_args.dataset_path, split=self.data_args.dataset_split, cache_dir=self.data_args.dataset_cache_dir, + num_proc=self.data_args.num_proc, ) if self.data_args.dataset_number_of_shards > 1: self.encode_data = self.encode_data.shard( diff --git a/train_multimodal.sh b/train_multimodal.sh index 8d2ec2f4..c185d0ef 100644 --- a/train_multimodal.sh +++ b/train_multimodal.sh @@ -1,4 +1,4 @@ -deepspeed --include localhost:0,1,2,3 --master_port 60000 --module tevatron.retriever.driver.train_mm \ +deepspeed --include localhost:0,1,2,3,4,5,6,7,8 --master_port 60000 --module tevatron.retriever.driver.train_mm \ --deepspeed deepspeed/ds_zero0_config.json \ --output_dir retriever-qwen25vl-bge-pixmo-colpali-wiki \ --model_name_or_path Qwen/Qwen2.5-VL-3B-Instruct 
\ @@ -23,5 +23,7 @@ deepspeed --include localhost:0,1,2,3 --master_port 60000 --module tevatron.retr --num_train_epochs 1 \ --logging_steps 1 \ --overwrite_output_dir \ - --gradient_accumulation_steps 4 \ - --warmup_ratio 0.005 \ No newline at end of file + --gradient_accumulation_steps 2 \ + --warmup_ratio 0.005 \ + --report_to wandb \ + --dataloader_num_workers 4 \ No newline at end of file From 027d5cffd6381a95218877f928da2514a743f531 Mon Sep 17 00:00:00 2001 From: ArvinZhuang Date: Mon, 17 Feb 2025 09:30:22 +1000 Subject: [PATCH 08/16] encode mm --- README.md | 2 +- examples/example_repllama.md | 4 +- examples/example_repllama_vllm.md | 4 +- src/tevatron/retriever/dataset.py | 14 ++- src/tevatron/retriever/driver/encode.py | 2 +- src/tevatron/retriever/driver/encode_mm.py | 115 +++++++++++++++++++++ src/tevatron/retriever/driver/search.py | 18 ++++ src/tevatron/retriever/driver/train_mm.py | 4 +- 8 files changed, 151 insertions(+), 12 deletions(-) create mode 100644 src/tevatron/retriever/driver/encode_mm.py diff --git a/README.md b/README.md index 85815741..58e686e6 100644 --- a/README.md +++ b/README.md @@ -21,7 +21,7 @@ Tevatron aims to provide a flexible and efficient toolkit that enables training ```bash pip install transformers datasets peft pip install deepspeed accelerate -pip install faiss +pip install faiss-cpu pip install -e . 
``` diff --git a/examples/example_repllama.md b/examples/example_repllama.md index 8ac68c67..9c53f187 100644 --- a/examples/example_repllama.md +++ b/examples/example_repllama.md @@ -23,8 +23,8 @@ CUDA_VISIBLE_DEVICES=0 python -m tevatron.retriever.driver.encode \ --dataset_config scifact \ --dataset_split train \ --encode_output_path beir_embedding_scifact/corpus_scifact.${s}.pkl \ - --encode_num_shard 4 \ - --encode_shard_index ${s} + --dataset_number_of_shards 4 \ + --dataset_shard_index ${s} done ``` diff --git a/examples/example_repllama_vllm.md b/examples/example_repllama_vllm.md index 390e5930..8213db1d 100644 --- a/examples/example_repllama_vllm.md +++ b/examples/example_repllama_vllm.md @@ -43,8 +43,8 @@ CUDA_VISIBLE_DEVICES=0 python -m tevatron.retriever.driver.vllm_encode \ --dataset_config scifact \ --dataset_split train \ --encode_output_path beir_embedding_scifact/corpus_scifact.${s}.pkl \ - --encode_num_shard 4 \ - --encode_shard_index ${s} + --dataset_number_of_shards 4 \ + --dataset_shard_index ${s} done ``` diff --git a/src/tevatron/retriever/dataset.py b/src/tevatron/retriever/dataset.py index 4b3f3768..9d9f8500 100644 --- a/src/tevatron/retriever/dataset.py +++ b/src/tevatron/retriever/dataset.py @@ -180,13 +180,19 @@ def __getitem__(self, item): content = self.encode_data[item] if self.data_args.encode_is_query: content_id = content['query_id'] - content_text = content['query_text'] if 'query_text' in content else '' + if 'query_text' in content: + content_text = content['query_text'] + elif 'query' in content: # integrate with tevatron-v1 format + content_text = content['query'] + else: + content_text = '' content_text = self.data_args.query_prefix + content_text + content_image = content['query_image'] if 'query_image' in content else None else: content_id = content['docid'] - content_text = content['text'] if 'text' in content else None - content_text = '' if content_text is None else content_text - content_text = 
self.data_args.passage_prefix + content_text + content_text = content['text'] if 'text' in content else '' + content_text = content['title'] + ' ' + content_text if 'title' in content else content_text + content_text = self.data_args.passage_prefix + content_text.strip() content_image = content['image'] if 'image' in content else None return content_id, content_text, content_image diff --git a/src/tevatron/retriever/driver/encode.py b/src/tevatron/retriever/driver/encode.py index 81576cc2..8b5635cc 100644 --- a/src/tevatron/retriever/driver/encode.py +++ b/src/tevatron/retriever/driver/encode.py @@ -93,7 +93,7 @@ def main(): for (batch_ids, batch) in tqdm(encode_loader): lookup_indices.extend(batch_ids) - with torch.cuda.amp.autocast() if training_args.fp16 or training_args.bf16 else nullcontext(): + with torch.amp.autocast('cuda') if training_args.fp16 or training_args.bf16 else nullcontext(): with torch.no_grad(): for k, v in batch.items(): batch[k] = v.to(training_args.device) diff --git a/src/tevatron/retriever/driver/encode_mm.py b/src/tevatron/retriever/driver/encode_mm.py new file mode 100644 index 00000000..2cb917b9 --- /dev/null +++ b/src/tevatron/retriever/driver/encode_mm.py @@ -0,0 +1,115 @@ +import logging +import os +import pickle +import sys +from contextlib import nullcontext + +import numpy as np +from tqdm import tqdm + +import torch + +from torch.utils.data import DataLoader +from transformers import AutoProcessor +from transformers import ( + HfArgumentParser, +) + +from tevatron.retriever.arguments import ModelArguments, DataArguments, \ + TevatronTrainingArguments as TrainingArguments +from tevatron.retriever.dataset import EncodeDataset +from tevatron.retriever.collator import MultiModalEncodeCollator +from tevatron.retriever.modeling import EncoderOutput, MultiModalDenseModel + +logger = logging.getLogger(__name__) + + +def main(): + parser = HfArgumentParser((ModelArguments, DataArguments, TrainingArguments)) + if len(sys.argv) == 2 and 
sys.argv[1].endswith(".json"): + model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) + else: + model_args, data_args, training_args = parser.parse_args_into_dataclasses() + model_args: ModelArguments + data_args: DataArguments + training_args: TrainingArguments + + if training_args.local_rank > 0 or training_args.n_gpu > 1: + raise NotImplementedError('Multi-GPU encoding is not supported.') + + # Setup logging + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN, + ) + + + processor = AutoProcessor.from_pretrained( + model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, + cache_dir=model_args.cache_dir, + trust_remote_code=True, + ) + if processor.tokenizer.pad_token_id is None: + processor.tokenizer.pad_token_id = processor.tokenizer.eos_token_id + processor.tokenizer.padding_side = "left" + + if training_args.bf16: + torch_dtype = torch.bfloat16 + elif training_args.fp16: + torch_dtype = torch.float16 + else: + torch_dtype = torch.float32 + + model = MultiModalDenseModel.load( + model_args.model_name_or_path, + pooling=model_args.pooling, + normalize=model_args.normalize, + lora_name_or_path=model_args.lora_name_or_path, + cache_dir=model_args.cache_dir, + torch_dtype=torch_dtype + ) + + encode_dataset = EncodeDataset( + data_args=data_args, + ) + + encode_collator = MultiModalEncodeCollator( + data_args=data_args, + processor=processor, + ) + + encode_loader = DataLoader( + encode_dataset, + batch_size=training_args.per_device_eval_batch_size, + collate_fn=encode_collator, + shuffle=False, + drop_last=False, + num_workers=training_args.dataloader_num_workers, + ) + encoded = [] + lookup_indices = [] + model = model.to(training_args.device) + model.eval() + + for (batch_ids, batch) in tqdm(encode_loader): + 
lookup_indices.extend(batch_ids) + with torch.amp.autocast('cuda') if training_args.fp16 or training_args.bf16 else nullcontext(): + with torch.no_grad(): + for k, v in batch.items(): + batch[k] = v.to(training_args.device) + if data_args.encode_is_query: + model_output: EncoderOutput = model(query=batch) + encoded.append(model_output.q_reps.cpu().detach().numpy()) + else: + model_output: EncoderOutput = model(passage=batch) + encoded.append(model_output.p_reps.cpu().detach().numpy()) + + encoded = np.concatenate(encoded) + + with open(data_args.encode_output_path, 'wb') as f: + pickle.dump((encoded, lookup_indices), f) + + +if __name__ == "__main__": + main() diff --git a/src/tevatron/retriever/driver/search.py b/src/tevatron/retriever/driver/search.py index 66d139c0..1f374eac 100644 --- a/src/tevatron/retriever/driver/search.py +++ b/src/tevatron/retriever/driver/search.py @@ -5,6 +5,7 @@ from argparse import ArgumentParser from itertools import chain from tqdm import tqdm +import faiss from tevatron.retriever.searcher import FaissFlatSearcher @@ -77,6 +78,23 @@ def main(): q_reps, q_lookup = pickle_load(args.query_reps) q_reps = q_reps + num_gpus = faiss.get_num_gpus() + if num_gpus == 0: + logger.info("No GPU found or using faiss-cpu. 
Back to CPU.") + else: + logger.info(f"Using {num_gpus} GPU") + if num_gpus == 1: + co = faiss.GpuClonerOptions() + co.useFloat16 = True + res = faiss.StandardGpuResources() + retriever.index = faiss.index_cpu_to_gpu(res, 0, retriever.index, co) + else: + co = faiss.GpuMultipleClonerOptions() + co.shard = True + co.useFloat16 = True + retriever.index = faiss.index_cpu_to_all_gpus(retriever.index, co, + ngpu=num_gpus) + logger.info('Index Search Start') all_scores, psg_indices = search_queries(retriever, q_reps, look_up, args) logger.info('Index Search Finished') diff --git a/src/tevatron/retriever/driver/train_mm.py b/src/tevatron/retriever/driver/train_mm.py index 1bad4cab..bb5b1978 100644 --- a/src/tevatron/retriever/driver/train_mm.py +++ b/src/tevatron/retriever/driver/train_mm.py @@ -66,7 +66,7 @@ def main(): trust_remote_code=True, ) if processor.tokenizer.pad_token_id is None: - processor.tokenizer.pad_token_id = tokenizer.eos_token_id + processor.tokenizer.pad_token_id = processor.tokenizer.eos_token_id processor.tokenizer.padding_side = "left" model = MultiModalDenseModel.build( @@ -100,7 +100,7 @@ def main(): trainer.train(resume_from_checkpoint=(last_checkpoint is not None)) trainer.save_model() if trainer.is_world_process_zero(): - tokenizer.save_pretrained(training_args.output_dir) + processor.save_pretrained(training_args.output_dir) if __name__ == "__main__": From 07da92f1fd285512bc35a3c8b0cf2aae4d8983f9 Mon Sep 17 00:00:00 2001 From: ArvinZhuang Date: Mon, 17 Feb 2025 10:24:21 +1000 Subject: [PATCH 09/16] encode mm --- examples/multimodal/README.md | 127 ++++++++++++++++ examples/multimodal/eval_vidore.py | 233 +++++++++++++++++++++++++++++ src/tevatron/retriever/searcher.py | 5 - 3 files changed, 360 insertions(+), 5 deletions(-) create mode 100644 examples/multimodal/README.md create mode 100644 examples/multimodal/eval_vidore.py diff --git a/examples/multimodal/README.md b/examples/multimodal/README.md new file mode 100644 index 
00000000..8ffadb63 --- /dev/null +++ b/examples/multimodal/README.md @@ -0,0 +1,127 @@ +# Multimodal Retrieval + +## Train +```bash +deepspeed --include localhost:0,1,2,3,4,5,6,7,8 --master_port 60000 --module tevatron.retriever.driver.train_mm \ + --deepspeed deepspeed/ds_zero0_config.json \ + --output_dir retriever-qwen25vl-bge-pixmo-colpali-wiki \ + --model_name_or_path Qwen/Qwen2.5-VL-3B-Instruct \ + --lora \ + --lora_target_modules q_proj,k_proj,v_proj,o_proj,gate_proj,up_proj,down_proj \ + --save_steps 500 \ + --train_yaml dataset_config.yaml \ + --query_prefix "Query: " \ + --passage_prefix "" \ + --bf16 \ + --tf32 True \ + --pooling eos \ + --append_eos_token \ + --normalize \ + --temperature 0.02 \ + --per_device_train_batch_size 16 \ + --gradient_checkpointing \ + --train_group_size 4 \ + --learning_rate 1e-4 \ + --query_max_len 512 \ + --passage_max_len 512 \ + --num_train_epochs 1 \ + --logging_steps 1 \ + --overwrite_output_dir \ + --gradient_accumulation_steps 2 \ + --warmup_ratio 0.005 \ + --report_to wandb \ + --dataloader_num_workers 4 +``` + +## Inference and evaluation + +### BEIR (textual modality) + +#### Query Encode +```bash + +CKPT=retriever-qwen25vl-bge-pixmo-colpali-wiki +DATASET=scifact + +mkdir -p beir_embedding/${CKPT}/${DATASET} +CUDA_VISIBLE_DEVICES=0 python -m tevatron.retriever.driver.encode_mm \ + --output_dir=temp \ + --model_name_or_path Qwen/Qwen2.5-VL-3B-Instruct \ + --lora_name_or_path ${CKPT} \ + --lora \ + --bf16 \ + --per_device_eval_batch_size 16 \ + --normalize \ + --pooling last \ + --query_prefix "Query: " \ + --passage_prefix "" \ + --append_eos_token \ + --query_max_len 512 \ + --dataset_name Tevatron/beir \ + --dataset_config scifact \ + --dataset_split test \ + --encode_output_path beir_embedding/${CKPT}/${DATASET}/queries.pkl \ + --encode_is_query +``` + +#### Document Encode +```bash +for s in 0 1 2 3; +do +CUDA_VISIBLE_DEVICES=$s python -m tevatron.retriever.driver.encode_mm \ + --output_dir=temp \ + 
--model_name_or_path Qwen/Qwen2.5-VL-3B-Instruct \ + --lora_name_or_path /scratch3/zhu042/tevatron/retriever-qwen25vl-bge-pixmo-colpali5-wiki/checkpoint-4000 \ + --lora \ + --bf16 \ + --per_device_eval_batch_size 16 \ + --normalize \ + --pooling last \ + --passage_prefix "" \ + --append_eos_token \ + --passage_max_len 512 \ + --dataset_name Tevatron/beir-corpus \ + --dataset_config scifact \ + --dataset_split train \ + --encode_output_path beir_embedding/${CKPT}/${DATASET}/corpus.${s}.pkl \ + --dataset_number_of_shards 4 \ + --dataset_shard_index ${s} & +done +wait +``` + + + +#### Search +```bash +mkdir -p beir_results/retriever-qwen25vl-bge-pixmo-colpali5-wiki/scifact +python -m tevatron.retriever.driver.search \ + --query_reps beir_embedding/${CKPT}/${DATASET}/queries.pkl \ + --passage_reps beir_embedding/${CKPT}/${DATASET}/'corpus.*.pkl' \ + --depth 100 \ + --batch_size 64 \ + --save_text \ + --save_ranking_to beir_results/${CKPT}/${DATASET}/rank.scifact.txt +``` + +#### Evaluate +```bash +python -m tevatron.utils.format.convert_result_to_trec \ +--input beir_results/${CKPT}/${DATASET}/rank.scifact.txt \ +--output beir_results/${CKPT}/${DATASET}/rank.scifact.trec \ +--remove_query + +python -m pyserini.eval.trec_eval -c -mrecall.100 -mndcg_cut.10 beir-v1.0.0-scifact-test \ +beir_results/${CKPT}/${DATASET}/rank.scifact.trec +``` + +### VIDORE Document screenshot retrieval (Cross modality) +```bash +CUDA_VISIBLE_DEVICES=0 python eval_vidore.py \ +--model_name_or_path Qwen/Qwen2.5-VL-3B-Instruct \ +--lora_name_or_path ${CKPT} \ +--batch_size 4 \ +--pooling last \ +--normalize \ +--query_prefix "Query: " +``` \ No newline at end of file diff --git a/examples/multimodal/eval_vidore.py b/examples/multimodal/eval_vidore.py new file mode 100644 index 00000000..bf6bc715 --- /dev/null +++ b/examples/multimodal/eval_vidore.py @@ -0,0 +1,233 @@ +from typing import Dict, Optional + +import torch +from datasets import load_dataset +from tqdm import tqdm +from 
tevatron.retriever.modeling import MultiModalDenseModel +from transformers import AutoProcessor + +from vidore_benchmark.evaluation.vidore_evaluators import ViDoReEvaluatorQA +from vidore_benchmark.retrievers import BaseVisionRetriever, VisionRetriever +from qwen_vl_utils import process_vision_info +from vidore_benchmark.utils.iter_utils import batched +import math +from vidore_benchmark.utils.data_utils import get_datasets_from_collection +from argparse import ArgumentParser + + +class TevatronVisionRetriever(BaseVisionRetriever): + """ + Dummy retriever that generates random dense embeddings. + """ + + def __init__( + self, + model: MultiModalDenseModel, + processor, + query_prefix: Optional[str] = None, + **kwargs, + ): + super().__init__(use_visual_embedding=True) + self.model = model + self.processor = processor + self.query_prefix = query_prefix + + def forward_queries( + self, + queries, + batch_size: int, + **kwargs, + ) -> torch.Tensor: + qs = [] + queries = [f'{self.query_prefix}{q}' for q in queries] + for batch_query in tqdm( + batched(queries, batch_size), + desc="Forwarding query batches", + total=math.ceil(len(queries) / batch_size), + leave=False, + ): + messages = [] + for idx in range(len(batch_query)): + text = batch_query[idx] + content = [] + if text: + text = self.processor.tokenizer.decode( + self.processor.tokenizer.encode(text, max_length=512, truncation=True) + ) + content.append({'type': 'text', 'text': text}) + message = [ + { + 'role': 'user', + 'content': content + } + ] + messages.append(message) + + texts = [ + self.processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=False) + for msg in messages + ] + texts = [x + '<|endoftext|>' for x in texts] + + image_inputs, video_inputs = process_vision_info(messages) + + collated_inputs = self.processor( + text=texts, + images=image_inputs, + videos=video_inputs, + return_tensors="pt", + padding="longest", + ) + with torch.no_grad(): + query_embeddings = 
self.model.encode_query(collated_inputs.to('cuda')) + qs.extend(list(torch.unbind(query_embeddings.to("cpu")))) + return qs + + + def forward_passages( + self, + passages, + batch_size: int, + **kwargs, + ) -> torch.Tensor: + + ps = [] + + for batch_passage in tqdm( + batched(passages, batch_size), + desc="Forwarding passage batches", + total=math.ceil(len(passages) / batch_size), + leave=False, + ): + messages = [] + for idx in range(len(batch_passage)): + image = batch_passage[idx] + content = [] + if image: + content.append({'type': 'image', 'image': image, 'resized_height': 784, 'resized_width': 784}) + # content.append({'type': 'text', 'text': 'What is shown in this image?'}) + message = [ + { + 'role': 'user', + 'content': content + } + ] + messages.append(message) + + texts = [ + self.processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=False) + for msg in messages + ] + + texts = [x + '<|endoftext|>' for x in texts] + + image_inputs, video_inputs = process_vision_info(messages) + + collated_inputs = self.processor( + text=texts, + images=image_inputs, + videos=video_inputs, + return_tensors="pt", + padding="longest", + ) + with torch.no_grad(): + passage_embeddings = self.model.encode_passage(collated_inputs.to('cuda')) + ps.extend(list(torch.unbind(passage_embeddings.to("cpu")))) + return ps + + + def get_scores( + self, + query_embeddings, + passage_embeddings, + batch_size: Optional[int] = None, + ) -> torch.Tensor: + """ + Dot-product similarity between queries and passages. 
+ """ + if isinstance(query_embeddings, list): + query_embeddings = torch.stack(query_embeddings) + if isinstance(passage_embeddings, list): + passage_embeddings = torch.stack(passage_embeddings) + + scores = torch.einsum("bd,cd->bc", query_embeddings, passage_embeddings) + + return scores + +def main(): + parser = ArgumentParser() + parser.add_argument('--model_name_or_path', required=True) + parser.add_argument('--cache_dir', default=None) + parser.add_argument('--batch_size', type=int, default=4) + parser.add_argument('--lora_name_or_path', default=None) + parser.add_argument('--pooling', default='last') + parser.add_argument('--normalize', action='store_true') + parser.add_argument('--query_prefix', default='') + args = parser.parse_args() + + # Load model and processor + model = MultiModalDenseModel.load( + args.model_name_or_path, + pooling=args.pooling, + normalize=args.normalize, + lora_name_or_path=args.lora_name_or_path, + cache_dir=args.cache_dir, + torch_dtype=torch.bfloat16, + ).eval().to('cuda') + + processor = AutoProcessor.from_pretrained( + args.model_name_or_path, + cache_dir=args.cache_dir, + trust_remote_code=True, + ) + if processor.tokenizer.pad_token_id is None: + processor.tokenizer.pad_token_id = processor.tokenizer.eos_token_id + processor.tokenizer.padding_side = "left" + + # Get retriever instance + vision_retriever = TevatronVisionRetriever( + model=model, + processor=processor, + query_prefix=args.query_prefix, + ) + + vidore_evaluator = ViDoReEvaluatorQA(vision_retriever) + + + # dataset_names = [ + # 'vidore/arxivqa_test_subsampled', + # 'vidore/docvqa_test_subsampled', + # 'vidore/infovqa_test_subsampled', + # 'vidore/tabfquad_test_subsampled', + # 'vidore/tatdqa_test', + # 'vidore/shiftproject_test', + # 'vidore/syntheticDocQA_artificial_intelligence_test', + # 'vidore/syntheticDocQA_energy_test', + # 'vidore/syntheticDocQA_government_reports_test', + # 'vidore/syntheticDocQA_healthcare_industry_test' + # ] + + collection_name = 
"vidore/vidore-benchmark-667173f98e70a1c0fa4db00d" # ViDoRe Benchmark + dataset_names = get_datasets_from_collection(collection_name) + + res = [] + for dataset_name in dataset_names: + print('Evaluating', dataset_name) + ds = load_dataset(dataset_name, split="test") + metrics_dataset = vidore_evaluator.evaluate_dataset( + ds=ds, + batch_query=args.batch_size, + batch_passage=args.batch_size, + batch_score=args.batch_size, + ) + print(dataset_name, f"ndcg@5: {metrics_dataset['ndcg_at_5']}") + res.append((dataset_name, f"ndcg@5: {metrics_dataset['ndcg_at_5']}")) + + print(res) + # average + print(sum([float(x[1].split(': ')[1]) for x in res]) / len(res) + ) + +if __name__ == '__main__': + main() + diff --git a/src/tevatron/retriever/searcher.py b/src/tevatron/retriever/searcher.py index 6dab5f68..0c174c8e 100644 --- a/src/tevatron/retriever/searcher.py +++ b/src/tevatron/retriever/searcher.py @@ -11,11 +11,6 @@ class FaissFlatSearcher: def __init__(self, init_reps: np.ndarray): index = faiss.IndexFlatIP(init_reps.shape[1]) - # check if cuda is available - if faiss.get_num_gpus() > 0: - logger.info("Using GPU") - res = faiss.StandardGpuResources() - index = faiss.index_cpu_to_gpu(res, 0, index) self.index = index def add(self, p_reps: np.ndarray): From d1840d93fbafadc460e369a3ee12e9a5ab0098e5 Mon Sep 17 00:00:00 2001 From: ArvinZhuang Date: Mon, 17 Feb 2025 10:27:40 +1000 Subject: [PATCH 10/16] move data config to example --- .../multimodal/dataset_config.yaml | 0 train_multimodal.sh | 29 ------------------- 2 files changed, 29 deletions(-) rename dataset_config.yaml => examples/multimodal/dataset_config.yaml (100%) delete mode 100644 train_multimodal.sh diff --git a/dataset_config.yaml b/examples/multimodal/dataset_config.yaml similarity index 100% rename from dataset_config.yaml rename to examples/multimodal/dataset_config.yaml diff --git a/train_multimodal.sh b/train_multimodal.sh deleted file mode 100644 index c185d0ef..00000000 --- a/train_multimodal.sh +++ 
/dev/null @@ -1,29 +0,0 @@ -deepspeed --include localhost:0,1,2,3,4,5,6,7,8 --master_port 60000 --module tevatron.retriever.driver.train_mm \ - --deepspeed deepspeed/ds_zero0_config.json \ - --output_dir retriever-qwen25vl-bge-pixmo-colpali-wiki \ - --model_name_or_path Qwen/Qwen2.5-VL-3B-Instruct \ - --lora \ - --lora_target_modules q_proj,k_proj,v_proj,o_proj,gate_proj,up_proj,down_proj \ - --save_steps 500 \ - --train_yaml dataset_config.yaml \ - --query_prefix "Query: " \ - --passage_prefix "" \ - --bf16 \ - --tf32 True \ - --pooling eos \ - --append_eos_token \ - --normalize \ - --temperature 0.02 \ - --per_device_train_batch_size 16 \ - --gradient_checkpointing \ - --train_group_size 4 \ - --learning_rate 1e-4 \ - --query_max_len 512 \ - --passage_max_len 512 \ - --num_train_epochs 1 \ - --logging_steps 1 \ - --overwrite_output_dir \ - --gradient_accumulation_steps 2 \ - --warmup_ratio 0.005 \ - --report_to wandb \ - --dataloader_num_workers 4 \ No newline at end of file From 50e214585f0a4c1018760b2c7988635f90013e55 Mon Sep 17 00:00:00 2001 From: ArvinZhuang Date: Mon, 17 Feb 2025 16:06:32 +1000 Subject: [PATCH 11/16] fix readme --- examples/multimodal/README.md | 4 +-- src/tevatron/retriever/collator.py | 51 +++++++++++++++++++++++++++++- 2 files changed, 52 insertions(+), 3 deletions(-) diff --git a/examples/multimodal/README.md b/examples/multimodal/README.md index 8ffadb63..6a5d8c30 100644 --- a/examples/multimodal/README.md +++ b/examples/multimodal/README.md @@ -71,7 +71,7 @@ do CUDA_VISIBLE_DEVICES=$s python -m tevatron.retriever.driver.encode_mm \ --output_dir=temp \ --model_name_or_path Qwen/Qwen2.5-VL-3B-Instruct \ - --lora_name_or_path /scratch3/zhu042/tevatron/retriever-qwen25vl-bge-pixmo-colpali5-wiki/checkpoint-4000 \ + --lora_name_or_path ${CKPT} \ --lora \ --bf16 \ --per_device_eval_batch_size 16 \ @@ -94,7 +94,7 @@ wait #### Search ```bash -mkdir -p beir_results/retriever-qwen25vl-bge-pixmo-colpali5-wiki/scifact +mkdir -p 
beir_results/retriever-qwen25vl-bge-pixmo-colpali-wiki/scifact python -m tevatron.retriever.driver.search \ --query_reps beir_embedding/${CKPT}/${DATASET}/queries.pkl \ --passage_reps beir_embedding/${CKPT}/${DATASET}/'corpus.*.pkl' \ diff --git a/src/tevatron/retriever/collator.py b/src/tevatron/retriever/collator.py index 1f8f1703..6b65ccbb 100644 --- a/src/tevatron/retriever/collator.py +++ b/src/tevatron/retriever/collator.py @@ -1,4 +1,5 @@ import logging +from inspect import classify_class_attrs from typing import List, Tuple from dataclasses import dataclass from transformers import PreTrainedTokenizer, ProcessorMixin @@ -279,4 +280,52 @@ def __call__(self, features: List[Tuple[str, str]]): ) if self.data_args.append_eos_token: collated_texts['input_ids'] = [x + [self.tokenizer.eos_token_id] for x in collated_texts['input_ids']] - return text_ids, collated_texts['input_ids'] \ No newline at end of file + return text_ids, collated_texts['input_ids'] + + +@dataclass +class VllmMultiModalEncodeCollator(MultiModalEncodeCollator): + def __call__(self, features): + """ + Collate function for encoding. 
+ :param features: list of (id, text, image) tuples + but in this case, it's just image is None + """ + content_ids = [x[0] for x in features] + texts = [x[1] for x in features] + images = [x[2] for x in features] + messages = [] + max_length = self.data_args.query_max_len if self.data_args.encode_is_query else self.data_args.passage_max_len + for idx in range(len(texts)): + text = texts[idx] + image = images[idx] + content = [] + if text: + text = self.processor.tokenizer.decode( + self.processor.tokenizer.encode(text, max_length=max_length, truncation=True) + ) + content.append({'type': 'text', 'text': text}) + if image: + content.append({'type': 'image', 'image': image, 'resized_height': 784, 'resized_width': 784}) + else: + image = Image.new('RGB', (28, 28)) + content.append({'type': 'image', 'image': image, 'resized_height': 1, 'resized_width': 1}) + # content.append({'type': 'text', 'text': 'What is shown in this image?'}) + message = [ + { + 'role': 'user', + 'content': content + } + ] + messages.append(message) + + texts = [ + self.processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=False) + for msg in messages + ] + + if self.data_args.append_eos_token: + texts = [x + '<|endoftext|>' for x in texts] + + return content_ids, texts, images + From df9790afb1653234fe855633932e41597a7c192f Mon Sep 17 00:00:00 2001 From: ArvinZhuang Date: Wed, 19 Feb 2025 07:57:39 +1000 Subject: [PATCH 12/16] vllm mm encode --- src/tevatron/retriever/collator.py | 1 - .../retriever/driver/vllm_encode_mm.py | 119 ++++++++++++++++++ 2 files changed, 119 insertions(+), 1 deletion(-) create mode 100644 src/tevatron/retriever/driver/vllm_encode_mm.py diff --git a/src/tevatron/retriever/collator.py b/src/tevatron/retriever/collator.py index 6b65ccbb..293ffa78 100644 --- a/src/tevatron/retriever/collator.py +++ b/src/tevatron/retriever/collator.py @@ -1,5 +1,4 @@ import logging -from inspect import classify_class_attrs from typing import List, Tuple from dataclasses 
import dataclass from transformers import PreTrainedTokenizer, ProcessorMixin diff --git a/src/tevatron/retriever/driver/vllm_encode_mm.py b/src/tevatron/retriever/driver/vllm_encode_mm.py new file mode 100644 index 00000000..83d920db --- /dev/null +++ b/src/tevatron/retriever/driver/vllm_encode_mm.py @@ -0,0 +1,119 @@ +import logging +import os +import pickle +import sys + +import numpy as np +from tqdm import tqdm + +import torch +from torch.utils.data import DataLoader +from transformers import AutoProcessor +from transformers import ( + HfArgumentParser, +) + +from tevatron.retriever.arguments import ModelArguments, DataArguments, \ + TevatronTrainingArguments as TrainingArguments +from tevatron.retriever.dataset import EncodeDataset +from tevatron.retriever.collator import VllmMultiModalEncodeCollator +from vllm import LLM +from vllm.config import PoolerConfig +from vllm.inputs import token_inputs +from PIL import Image + +logger = logging.getLogger(__name__) + + +def main(): + parser = HfArgumentParser((ModelArguments, DataArguments, TrainingArguments)) + if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): + model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) + else: + model_args, data_args, training_args = parser.parse_args_into_dataclasses() + model_args: ModelArguments + data_args: DataArguments + training_args: TrainingArguments + + if training_args.local_rank > 0 or training_args.n_gpu > 1: + raise NotImplementedError('Multi-GPU encoding is not supported.') + + # Setup logging + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN, + ) + + processor = AutoProcessor.from_pretrained( + model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, + cache_dir=model_args.cache_dir, + trust_remote_code=True, + ) + if 
processor.tokenizer.pad_token_id is None: + processor.tokenizer.pad_token_id = processor.tokenizer.eos_token_id + processor.tokenizer.padding_side = "left" + + min_pixels = 1 * 28 * 28 + max_pixels = 2560 * 28 * 28 + + if training_args.bf16: + torch_dtype = 'bfloat16' + elif training_args.fp16: + torch_dtype = 'float16' + else: + torch_dtype = 'float32' + + + pooler_config = PoolerConfig(pooling_type=model_args.pooling.upper(), + normalize=model_args.normalize) + + model = LLM( + model=model_args.model_name_or_path, + task="embed", + enforce_eager=True, + override_pooler_config=pooler_config, + dtype=torch_dtype + ) + + encode_dataset = EncodeDataset( + data_args=data_args, + ) + + encode_collator = VllmMultiModalEncodeCollator( + data_args=data_args, + processor=processor, + ) + + encode_loader = DataLoader( + encode_dataset, + batch_size=training_args.per_device_eval_batch_size, + collate_fn=encode_collator, + shuffle=False, + drop_last=False, + num_workers=training_args.dataloader_num_workers, + ) + + lookup_indices = [] + vllm_inputs = [] + for (batch_ids, texts, images) in tqdm(encode_loader, desc="Preprocessing"): + lookup_indices.extend(batch_ids) + for prompt, image in zip(texts, images): + vllm_inputs.append({ + "prompt": prompt, + "multi_modal_data": {'image': image if image is not None else Image.new('RGB', (28, 28))}, + "mm_processor_kwargs": {"min_pixels": min_pixels, "max_pixels": max_pixels}, + }) + + outputs = model.embed(vllm_inputs) + encoded = [] + for output in outputs: + encoded.append(output.outputs.embedding) + encoded = np.stack(encoded, dtype=np.float16) + + with open(data_args.encode_output_path, 'wb') as f: + pickle.dump((encoded, lookup_indices), f) + + +if __name__ == "__main__": + main() From ae014203b5a9fba6ae3d12cf8aa865dcb1793094 Mon Sep 17 00:00:00 2001 From: Xueguang Ma Date: Tue, 18 Feb 2025 22:25:25 +0000 Subject: [PATCH 13/16] add miracl script --- examples/multimodal/README.md | 81 ++++++++++++++++++++++++++++++++++- 1 file 
changed, 79 insertions(+), 2 deletions(-) diff --git a/examples/multimodal/README.md b/examples/multimodal/README.md index 6a5d8c30..bf039f6d 100644 --- a/examples/multimodal/README.md +++ b/examples/multimodal/README.md @@ -58,7 +58,7 @@ CUDA_VISIBLE_DEVICES=0 python -m tevatron.retriever.driver.encode_mm \ --append_eos_token \ --query_max_len 512 \ --dataset_name Tevatron/beir \ - --dataset_config scifact \ + --dataset_config ${DATASET} \ --dataset_split test \ --encode_output_path beir_embedding/${CKPT}/${DATASET}/queries.pkl \ --encode_is_query @@ -81,7 +81,7 @@ CUDA_VISIBLE_DEVICES=$s python -m tevatron.retriever.driver.encode_mm \ --append_eos_token \ --passage_max_len 512 \ --dataset_name Tevatron/beir-corpus \ - --dataset_config scifact \ + --dataset_config ${DATASET} \ --dataset_split train \ --encode_output_path beir_embedding/${CKPT}/${DATASET}/corpus.${s}.pkl \ --dataset_number_of_shards 4 \ @@ -115,6 +115,83 @@ python -m pyserini.eval.trec_eval -c -mrecall.100 -mndcg_cut.10 beir-v1.0.0-scif beir_results/${CKPT}/${DATASET}/rank.scifact.trec ``` +### MIRACL (Multi-Lingual, Textual Modality) +#### Query Encode +```bash + +CKPT=retriever-qwen25vl-bge-pixmo-colpali-wiki +DATASET=ar + +mkdir -p miracl_embedding/${CKPT}/${DATASET} +CUDA_VISIBLE_DEVICES=0 python -m tevatron.retriever.driver.encode_mm \ + --output_dir=temp \ + --model_name_or_path Qwen/Qwen2.5-VL-3B-Instruct \ + --lora_name_or_path ${CKPT} \ + --lora \ + --bf16 \ + --per_device_eval_batch_size 16 \ + --normalize \ + --pooling last \ + --query_prefix "Query: " \ + --passage_prefix "" \ + --append_eos_token \ + --query_max_len 512 \ + --dataset_name miracl/miracl \ + --dataset_config $DATASET \ + --dataset_split test \ + --encode_output_path miracl_embedding/${CKPT}/${DATASET}/queries.pkl \ + --encode_is_query +``` + +#### Document Encode +```bash +for s in 0 1 2 3; +do +CUDA_VISIBLE_DEVICES=$s python -m tevatron.retriever.driver.encode_mm \ + --output_dir=temp \ + --model_name_or_path 
Qwen/Qwen2.5-VL-3B-Instruct \ + --lora_name_or_path ${CKPT} \ + --lora \ + --bf16 \ + --per_device_eval_batch_size 16 \ + --normalize \ + --pooling last \ + --passage_prefix "" \ + --append_eos_token \ + --passage_max_len 512 \ + --dataset_name miracl/miracl-corpus \ + --dataset_config ${DATASET} \ + --dataset_split train \ + --encode_output_path miracl_embedding/${CKPT}/${DATASET}/corpus.${s}.pkl \ + --dataset_number_of_shards 4 \ + --dataset_shard_index ${s} & +done +wait +``` + + + +#### Search +```bash +mkdir -p miracl_results/retriever-qwen25vl-bge-pixmo-colpali-wiki/$DATASET +python -m tevatron.retriever.driver.search \ + --query_reps miracl_embedding/${CKPT}/${DATASET}/queries.pkl \ + --passage_reps miracl_embedding/${CKPT}/${DATASET}/'corpus.*.pkl' \ + --depth 100 \ + --batch_size 64 \ + --save_text \ + --save_ranking_to miracl_results/${CKPT}/${DATASET}/rank.${DATASET}.txt +``` + +#### Evaluate +```bash +python -m tevatron.utils.format.convert_result_to_trec \ +--input beir_results/${CKPT}/${DATASET}/rank.${DATASET}.txt \ +--output beir_results/${CKPT}/${DATASET}/rank.${DATASET}.trec + +python -m pyserini.eval.trec_eval -c -mrecall.100 -mndcg_cut.10 miracl-v1.0-${DATASET}-dev \ +miracl_results/${CKPT}/${DATASET}/rank.${DATASET}.trec + ### VIDORE Document screenshot retrieval (Cross modality) ```bash CUDA_VISIBLE_DEVICES=0 python eval_vidore.py \ From 940119cf3e05fcd9756a49f81cfe1064aecef0f2 Mon Sep 17 00:00:00 2001 From: ArvinZhuang Date: Wed, 19 Feb 2025 19:27:25 +1000 Subject: [PATCH 14/16] update readme --- README.md | 35 +++++++++++++++++------------------ examples/multimodal/README.md | 9 +++++---- 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/README.md b/README.md index 58e686e6..3fbdb251 100644 --- a/README.md +++ b/README.md @@ -1,14 +1,14 @@ -# Tevatron V1.5 -Tevatron aims to provide a flexible and efficient toolkit that enables training and inference for neural retrieval models at scale. 
+# Tevatron V2.0 +Tevatron: Unified Document Retrieval Toolkit across Scale, Language, and Modality. -> Some of the features in Tevatron v1 is not yet migrated to Tevatron v1.5. We are working on it. +> Some of the features in Tevatron v1 is not yet migrated to Tevatron v2.0. We are working on it. > If you are looking for the Tevatron v1 features, please pull the [v1 branch](https://github.com/texttron/tevatron/tree/tevatron-v1). ## Features - Training billion-scale LLM neural retriever on GPUs and TPUs. - Parameter efficient tuning with LoRA. -- Integration with DeepSpeed, flash attention, gradient accumulation, and other efficient training techniques. -- Self-contained datasets for neural retrieval and open-domain QA tasks. +- Integration with vLLM, DeepSpeed, FlashAttention, gradient accumulation, and other efficient training and inference techniques. +- Self-contained datasets for multi-modal and multilingual neural retrieval and open-domain QA tasks. - Direct loading and finetuning SoTA pre-trained models (BGE-Embbedding, Instruct-E5) from HuggingFace. ## Installation @@ -90,15 +90,10 @@ Tevatron takes training or inference data in `jsonl` format with each line organ ```json { "query_id": "", - "query": "", - "positive_passages": [ - {"docid": "", "title": "", "text": ""}, - ... - ], - "negative_passages": [ - {"docid": "", "title": "", "text": ""}, - ... - ] + "query_text": "", + "query_image": "", + "positive_document_ids": ["", ...], + "negative_document_ids": ["", ...], } ``` where the passages in `positive_passages` are the annotated relevant passages of the `query` @@ -107,12 +102,14 @@ and passages in `negative_passages` are usually non-relevant (hard negative) pas #### 2. Corpus Data ```json { - "docid": "", - "title": "", - "text": "" + "docid": "", + "document_text": "", + "document_image": "", } ``` -where each line represents a passage in the corpus. +where each line represents a document in the corpus. 
+ +Note that the image field for both training and corpus data are optional and can be omitted (i.e., pure textual modality retrieval). ### Self-Contained Dataset Tevatron self-contained several commonlly used datasets for neural retrieval. @@ -323,6 +320,8 @@ The output file is in the format of ` ` in each li +## Examples ++ [Unified multi-modal and multilingual retrieval](./examples/multimodal/README.md). ## Citation If you find Tevatron helpful, please consider citing our [paper](https://arxiv.org/abs/2203.05765). diff --git a/examples/multimodal/README.md b/examples/multimodal/README.md index bf039f6d..36f3de2b 100644 --- a/examples/multimodal/README.md +++ b/examples/multimodal/README.md @@ -1,4 +1,4 @@ -# Multimodal Retrieval +# Unified Multi-modal and Multilingual Retrieval ## Train ```bash @@ -94,7 +94,7 @@ wait #### Search ```bash -mkdir -p beir_results/retriever-qwen25vl-bge-pixmo-colpali-wiki/scifact +mkdir -p beir_results/${CKPT}/scifact python -m tevatron.retriever.driver.search \ --query_reps beir_embedding/${CKPT}/${DATASET}/queries.pkl \ --passage_reps beir_embedding/${CKPT}/${DATASET}/'corpus.*.pkl' \ @@ -186,11 +186,12 @@ python -m tevatron.retriever.driver.search \ #### Evaluate ```bash python -m tevatron.utils.format.convert_result_to_trec \ ---input beir_results/${CKPT}/${DATASET}/rank.${DATASET}.txt \ ---output beir_results/${CKPT}/${DATASET}/rank.${DATASET}.trec +--input miracl_results/${CKPT}/${DATASET}/rank.${DATASET}.txt \ +--output miracl_results/${CKPT}/${DATASET}/rank.${DATASET}.trec python -m pyserini.eval.trec_eval -c -mrecall.100 -mndcg_cut.10 miracl-v1.0-${DATASET}-dev \ miracl_results/${CKPT}/${DATASET}/rank.${DATASET}.trec +``` ### VIDORE Document screenshot retrieval (Cross modality) ```bash From a5f5f42d1ba2cce6bf9147cc481e3faabc33e570 Mon Sep 17 00:00:00 2001 From: ArvinZhuang Date: Wed, 19 Feb 2025 19:29:44 +1000 Subject: [PATCH 15/16] update readme --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff 
--git a/README.md b/README.md index 3fbdb251..e36c7622 100644 --- a/README.md +++ b/README.md @@ -321,7 +321,8 @@ The output file is in the format of ` ` in each li ## Examples -+ [Unified multi-modal and multilingual retrieval](./examples/multimodal/README.md). ++ [Unified multi-modal and multilingual retrieval](./examples/multimodal/README.md) ++ [vLLM encoding and retrieval](./examples/example_repllama_vllm.md) ## Citation If you find Tevatron helpful, please consider citing our [paper](https://arxiv.org/abs/2203.05765). From 2faaf852976535ffbe3ae46e8e0b373a93270b9a Mon Sep 17 00:00:00 2001 From: ArvinZhuang Date: Wed, 19 Feb 2025 19:43:10 +1000 Subject: [PATCH 16/16] update readme --- README.md | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index e36c7622..306a2b0e 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,14 @@ # Tevatron V2.0 + +
+ + + +PyPI Downloads + GitHub stars + +
+ Tevatron: Unified Document Retrieval Toolkit across Scale, Language, and Modality. > Some of the features in Tevatron v1 is not yet migrated to Tevatron v2.0. We are working on it. @@ -8,7 +18,7 @@ Tevatron: Unified Document Retrieval Toolkit across Scale, Language, and Modalit - Training billion-scale LLM neural retriever on GPUs and TPUs. - Parameter efficient tuning with LoRA. - Integration with vLLM, DeepSpeed, FlashAttention, gradient accumulation, and other efficient training and inference techniques. -- Self-contained datasets for multi-modal and multilingual neural retrieval and open-domain QA tasks. +- Self-contained [huggingface datasets](https://huggingface.co/Tevatron) for multi-modal and multilingual neural retrieval and open-domain QA tasks. - Direct loading and finetuning SoTA pre-trained models (BGE-Embbedding, Instruct-E5) from HuggingFace. ## Installation