diff --git a/data_prepare/README.md b/data_prepare/README.md
index 1ce5d1e5..846abdd9 100644
--- a/data_prepare/README.md
+++ b/data_prepare/README.md
@@ -150,4 +150,23 @@ We use the train split of ScienceQA. The image data of the train split can be ob
 
 ```bash
 huggingface-cli download Efficient-Large-Model/ScienceQA_train_12K --repo-type dataset --local-dir scienceqa --local-dir-use-symlinks False
-```
\ No newline at end of file
+```
+
+### IDEFICS2-SFT dataset
+
+We also provide scripts to preprocess the IDEFICS2-SFT dataset into a llava-SFT-like format.
+
+Please first download [HuggingFaceM4/the_cauldron](https://huggingface.co/datasets/HuggingFaceM4/the_cauldron) to `/home/jasonlu/workspace/idefics2-sft/the_cauldron`. Then, run the following scripts:
+
+```bash
+python preprocess_idefics2.py
+python merge_idefics2.py
+```
+
+A sample in the preprocessed dataset file will look like this:
+
+```json
+{"id": 0, "images": ["images/chart2text/0_0.png"], "conversations": [{"from": "human", "value": "<image>\nPlease clarify the meaning conveyed by this graph."}, {"from": "gpt", "value": "This statistic presents the reach of the most popular social networks among female beauty consumers in the United States as of August 2016. During the survey period, 62 percent of respondents had an Instagram account."}]}
+```
+
+Haotian's Note: Datasets overlapping with VFLAN / ShareGPT4V-SFT are removed. I also removed `plotqa` since it is too large, and `localized_narratives` since it seems to overlap somewhat with the captioning efforts within VILA. `websight` and `datikz` are two datasets that target code generation; since their outputs are very long and including them might slow down training, I temporarily removed these two as well, but feel free to add them back.
diff --git a/data_prepare/sft/merge_idefics2.py b/data_prepare/sft/merge_idefics2.py
new file mode 100644
index 00000000..cd579873
--- /dev/null
+++ b/data_prepare/sft/merge_idefics2.py
@@ -0,0 +1,32 @@
+import os
+import json
+
+dataset_path = "/home/jasonlu/workspace/idefics2-sft/the_cauldron"
+save_path = "/home/jasonlu/workspace/idefics2-sft/new-vflan/"
+metadata_path = os.path.join(save_path, "metadata")
+dataset_names = sorted(os.listdir(metadata_path))
+
+def load_jsonl(file_path):
+    data = []
+    with open(file_path, 'r', encoding='utf-8') as f:
+        for line in f:
+            data.append(json.loads(line))
+    return data
+
+all_data = []
+for dataset_name in dataset_names:
+    if "websight" in dataset_name or "datikz" in dataset_name:
+        # skip the code-generation datasets (websight, datikz) for now.
+        continue
+    loaded = load_jsonl(os.path.join(metadata_path, dataset_name))
+    # offset the per-dataset ids so they stay unique after merging
+    id_offset = len(all_data)
+    for item in loaded:
+        item["id"] += id_offset
+    all_data += loaded
+    print(dataset_name, len(all_data), all_data[-1])
+
+with open(os.path.join(save_path, "idefics2_sft_train.jsonl"), "w") as f:
+    for item in all_data:
+        json.dump(item, f)
+        f.write("\n")
\ No newline at end of file
diff --git a/data_prepare/sft/preprocess_idefics2.py b/data_prepare/sft/preprocess_idefics2.py
new file mode 100644
index 00000000..c925ffa9
--- /dev/null
+++ b/data_prepare/sft/preprocess_idefics2.py
@@ -0,0 +1,128 @@
+from datasets import load_dataset, concatenate_datasets
+import os
+import pickle
+import torch
+import json
+from tqdm import tqdm
+from PIL import Image
+from multiprocessing import Pool
+
+
+
+def general_conversation_preprocessor(item, dataset_name, id):
+    # process the conversation item to llava format.
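+    # Concretely (as implemented below): the sample's "images" (a list of
+    # PIL images) are saved under images/<dataset_name>/ and referenced by
+    # relative path, and the "texts" turns ({"user": ..., "assistant": ...})
+    # are mapped to llava-style "human"/"gpt" conversations, with one <image>
+    # token per image prepended to the first user turn.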
+    conversations = []
+    ret_item = dict(id=id)
+    # ret_item["images"] = item["images"]
+    img_paths = []
+    for img_idx, img in enumerate(item["images"]):
+        save_path_to_append = os.path.join("images", dataset_name, f"{id}_{img_idx}.png")
+        img_path = os.path.join(save_path, save_path_to_append)
+        if img.mode == 'CMYK':
+            img = img.convert('RGB')
+        img.save(img_path)
+        img_paths.append(save_path_to_append)
+    ret_item["images"] = img_paths
+    old_conversations = item["texts"]
+    for idx, conv in enumerate(old_conversations):
+        if "user" in conv:
+            if idx > 0:
+                cur_conv = conv["user"]
+                new_conv = {
+                    "from": "human",
+                    "value": cur_conv
+                }
+            else:
+                cur_conv = conv["user"]
+                new_conv = {
+                    "from": "human",
+                    "value": "<image>\n" * len(item["images"]) + cur_conv
+                }
+            conversations.append(new_conv)
+        if "assistant" in conv:
+            cur_conv = conv["assistant"]
+            if cur_conv.startswith("Answer: "):
+                cur_conv = cur_conv[len("Answer: "):]  # strip only the leading prefix
+            new_conv = {
+                "from": "gpt",
+                "value": cur_conv
+            }
+            conversations.append(new_conv)
+    ret_item["conversations"] = conversations
+    return ret_item
+
+def process_dataset(args):
+    dataset_name, dataset_path, metadata_path, save_path = args
+    if os.path.exists(os.path.join(metadata_path, dataset_name + "_train.jsonl")):
+        return
+    print("Processing", dataset_name, "...")
+    loaded = load_dataset(dataset_path, dataset_name)["train"]
+    dataset = list(loaded)
+    cnt = 0
+    cur_llava_format_dataset = []
+    for item in tqdm(dataset):
+        new_item = general_conversation_preprocessor(item, dataset_name, cnt)
+        if cnt == 0:
+            print(item["texts"], item["images"][0], new_item)
+            print(new_item)
+        cnt += 1
+        cur_llava_format_dataset.append(new_item)
+
+    with open(os.path.join(metadata_path, dataset_name + "_train.jsonl"), "w") as f:
+        for item in cur_llava_format_dataset:
+            json.dump(item, f)
+            f.write("\n")
+
+# download HuggingFaceM4/the_cauldron to the dataset_path directory
+dataset_path = "/home/jasonlu/workspace/idefics2-sft/the_cauldron"
+save_path = "/home/jasonlu/workspace/idefics2-sft/new-vflan/"
+metadata_path = os.path.join(save_path, "metadata")
+os.makedirs(metadata_path, exist_ok=True)
+
+skipped_datasets = [
+    "ai2d",  # internvl-sft
+    "chartqa",  # internvl-sft
+    "clevr",  # vflan, HAS BUG
+    "clevr_math",  # HAS BUG
+    "docvqa",  # internvl-sft
+    "dvqa",  # internvl-sft
+    "nlvr2",  # vflan
+    "ocrvqa",  # vflan
+    "st_vqa",  # vflan
+    "textcaps",  # vflan, llava1.5
+    "visualmrc",  # vflan
+    "vqav2",  # vflan, llava1.5
+    "okvqa",  # llava1.5
+    "aokvqa",  # llava1.5
+    "plotqa",  # has problem to load (very slow)
+    "localized_narratives",  # has problem to load (very slow)
+]
+
+_dataset_names = sorted(os.listdir(dataset_path))
+dataset_names = []
+for name in _dataset_names:
+    if name.startswith("."):
+        continue
+    if name in skipped_datasets:
+        continue
+    if os.path.isdir(os.path.join(dataset_path, name)):
+        dataset_names.append(name)
+        os.makedirs(os.path.join(save_path, "images", name), exist_ok=True)
+print(dataset_names, len(dataset_names))
+
+# sequential version
+# for dataset_name in dataset_names:
+#     process_dataset((dataset_name, dataset_path, metadata_path, save_path))
+# parallel version
+with Pool(processes=min(48, len(dataset_names))) as pool:
+    # Prepare the arguments for the process_dataset function
+    args = [(dataset_name, dataset_path, metadata_path, save_path) for dataset_name in dataset_names]
+
+    # Map the process_dataset function to the arguments
+    for _ in tqdm(pool.imap_unordered(process_dataset, args), total=len(args), desc="Processing datasets"):
+        pass
\ No newline at end of file
diff --git a/llava/data/dataset.py b/llava/data/dataset.py
index 3564adfa..87d50a95 100644
--- a/llava/data/dataset.py
+++ b/llava/data/dataset.py
@@ -660,6 +660,13 @@ def __getitem__(self, i) -> Dict[str, torch.Tensor]:
             image_file = self.list_data_dict[i]["image"]
             image = process_image(image_file, self.data_args, self.image_folder)
             sources = preprocess_multimodal(copy.deepcopy([e["conversations"] for e in sources]), self.data_args)
+        elif "images" in sources[0]:
+            all_images = []
+            for image_file in self.list_data_dict[i]["images"]:
+                image = process_image(image_file, self.data_args, self.image_folder)
+                all_images.append(image)
+            image_tensor = torch.stack(all_images)
+            sources = preprocess_multimodal(copy.deepcopy([e["conversations"] for e in sources]), self.data_args)
         else:
             sources = copy.deepcopy([e["conversations"] for e in sources])
 
@@ -668,6 +675,7 @@ def __getitem__(self, i) -> Dict[str, torch.Tensor]:
             self.tokenizer,
             has_image=(
                 "image" in self.list_data_dict[i]
+                or "images" in self.list_data_dict[i]
                 or "video" in self.list_data_dict[i]
                 or "video_id" in self.list_data_dict[i]
             ),
@@ -678,6 +686,8 @@ def __getitem__(self, i) -> Dict[str, torch.Tensor]:
         # image exist in the data
         if "image" in self.list_data_dict[i]:
             data_dict["image"] = image.unsqueeze(0)
+        elif "images" in self.list_data_dict[i]:
+            data_dict["image"] = image_tensor
         else:
             data_dict["image"] = None
         return data_dict
@@ -771,6 +781,13 @@ def __getitem__(self, i) -> Dict[str, torch.Tensor]:
             else:
                 image = process_image(image_file, self.data_args, self.image_folder)
             sources = preprocess_multimodal(copy.deepcopy([e["conversations"] for e in sources]), self.data_args)
+        elif "images" in sources[0]:
+            all_images = []
+            for image_file in self.list_data_dict[i]["images"]:
+                image = process_image(image_file, self.data_args, self.image_folder)
+                all_images.append(image)
+            image_tensor = torch.stack(all_images)
+            sources = preprocess_multimodal(copy.deepcopy([e["conversations"] for e in sources]), self.data_args)
         elif ("video" in sources[0]) or ("video_id" in sources[0]):
             num_video_frames = self.data_args.num_video_frames
             if "video" in sources[0]:
@@ -822,6 +839,7 @@ def __getitem__(self, i) -> Dict[str, torch.Tensor]:
             self.tokenizer,
             has_image=(
                 "image" in self.list_data_dict[i]
+                or "images" in self.list_data_dict[i]
                 or "video" in self.list_data_dict[i]
                 or "video_id" in self.list_data_dict[i]
             ),
@@ -835,6 +853,8 @@ def __getitem__(self, i) -> Dict[str, torch.Tensor]:
                 data_dict["image"] = image
             else:
                 data_dict["image"] = image.unsqueeze(0)
+        elif "images" in self.list_data_dict[i]:
+            data_dict["image"] = image_tensor
         elif ("video" in self.list_data_dict[i]) or ("video_id" in self.list_data_dict[i]):
             data_dict["image"] = image_tensor
             if not video_loading_succeed:
diff --git a/llava/data/datasets_mixture.py b/llava/data/datasets_mixture.py
index c194de5f..348168ad 100644
--- a/llava/data/datasets_mixture.py
+++ b/llava/data/datasets_mixture.py
@@ -629,6 +629,14 @@ def register_datasets_mixtures():
     )
     add_dataset(synthdog_en)
 
+    idefics2_sft = Dataset(
+        dataset_name="idefics2_sft",
+        dataset_type="torch",
+        data_path="/home/jasonlu/workspace/idefics2-sft/new-vflan/idefics2_sft_train.jsonl",
+        image_path="/home/jasonlu/workspace/idefics2-sft/new-vflan",
+        description="",
+    )
+    add_dataset(idefics2_sft)
     test = Dataset(
         dataset_name="test",
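Note: once the patch is applied, the registration can be smoke-tested from the repo root. A minimal sketch (it assumes `add_dataset()` records each `Dataset` in a module-level `DATASETS` dict keyed by `dataset_name`; adjust the lookup if the registry is named differently in your checkout):

```python
from llava.data import datasets_mixture

# Populate the registry with every mixture defined in datasets_mixture.py.
datasets_mixture.register_datasets_mixtures()

# Look up the newly registered IDEFICS2-SFT entry and print its paths.
ds = datasets_mixture.DATASETS["idefics2_sft"]  # DATASETS registry name is an assumption
print(ds.dataset_type, ds.data_path, ds.image_path)
```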