Merge pull request NVlabs#70 from Efficient-Large-Model/dev/idefics2
IDEFICS2-SFT
Efficient-Large-Language-Model authored May 4, 2024
2 parents 4f906a9 + db6701a commit 4b28396
Showing 5 changed files with 202 additions and 1 deletion.
21 changes: 20 additions & 1 deletion data_prepare/README.md
@@ -150,4 +150,23 @@ We use the train split of ScienceQA. The image data of the train split can be ob

```bash
huggingface-cli download Efficient-Large-Model/ScienceQA_train_12K --repo-type dataset --local-dir scienceqa --local-dir-use-symlinks False
```

### IDEFICS2-SFT dataset

We also provide scripts to preprocess the IDEFICS2-SFT dataset into a LLaVA-SFT-like format.

Please first download [HuggingFaceM4/the_cauldron](https://huggingface.co/datasets/HuggingFaceM4/the_cauldron) to `/home/jasonlu/workspace/idefics2-sft/the_cauldron`. Then, run the following scripts:

```bash
python preprocess_idefics2.py
python merge_idefics2.py
```
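
For the download step above, `huggingface_hub` offers a Python alternative to the CLI; a minimal sketch, assuming the same target path as the scripts:

```python
from huggingface_hub import snapshot_download

# Download HuggingFaceM4/the_cauldron to the path hard-coded in the scripts.
snapshot_download(
    repo_id="HuggingFaceM4/the_cauldron",
    repo_type="dataset",
    local_dir="/home/jasonlu/workspace/idefics2-sft/the_cauldron",
)
```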

A sample in the preprocessed dataset file will look like this:

```json
{"id": 0, "images": ["images/chart2text/0_0.png"], "conversations": [{"from": "human", "value": "<image>\nPlease clarify the meaning conveyed by this graph."}, {"from": "gpt", "value": "This statistic presents the reach of the most popular social networks among female beauty consumers in the United States as of August 2016. During the survey period, 62 percent of respondents had an Instagram account."}]}
```
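
To inspect the merged file, load it line by line as JSONL. A minimal sketch (assuming the hard-coded `save_path` from the scripts) that also checks every referenced image was saved:

```python
import json
import os

save_path = "/home/jasonlu/workspace/idefics2-sft/new-vflan/"  # hard-coded in the scripts

# Load the merged JSONL produced by merge_idefics2.py.
with open(os.path.join(save_path, "idefics2_sft_train.jsonl")) as f:
    samples = [json.loads(line) for line in f]

# Check that every relative image path resolves to a saved file.
missing = [
    p
    for sample in samples
    for p in sample["images"]
    if not os.path.exists(os.path.join(save_path, p))
]
print(len(samples), "samples,", len(missing), "missing images")
```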

Haotian's Note: Datasets overlapping with VFLAN / ShareGPT4V-SFT are removed. I also removed `plotqa` since it is too large, and `localized_narratives` since it seems to overlap with captioning efforts within VILA. `websight` and `datikz` are two datasets that target code generation; since their outputs are very long and including them might slow down training, I temporarily removed them as well, but feel free to add them back.
31 changes: 31 additions & 0 deletions data_prepare/sft/merge_idefics2.py
@@ -0,0 +1,31 @@
import json
import os

# Merge the per-dataset metadata files written by preprocess_idefics2.py
# into a single LLaVA-style SFT training file.
dataset_path = "/home/jasonlu/workspace/idefics2-sft/the_cauldron"
save_path = "/home/jasonlu/workspace/idefics2-sft/new-vflan/"
metadata_path = os.path.join(save_path, "metadata")
dataset_names = sorted(os.listdir(metadata_path))

def load_jsonl(file_path):
data = []
with open(file_path, 'r', encoding='utf-8') as f:
for line in f:
data.append(json.loads(line))
return data

all_data = []
for dataset_name in dataset_names:
if "websight" in dataset_name or "datikz" in dataset_name:
# skip the snapshot => code datasets for now.
continue
loaded = load_jsonl(os.path.join(metadata_path, dataset_name))
    # Offset ids so they remain globally unique across the merged datasets.
    id_offset = len(all_data)
    for item in loaded:
        item["id"] += id_offset
all_data += loaded
print(dataset_name, len(all_data), all_data[-1])

with open(os.path.join(save_path, "idefics2_sft_train.jsonl"), "w") as f:
for item in all_data:
json.dump(item, f)
f.write("\n")
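
The merge step offsets each sample's `id` by the number of samples already collected, so ids stay globally unique and contiguous across datasets. A quick check of that invariant, as a sketch assuming the per-dataset files were produced by `preprocess_idefics2.py` below:

```python
import json

path = "/home/jasonlu/workspace/idefics2-sft/new-vflan/idefics2_sft_train.jsonl"
with open(path) as f:
    ids = [json.loads(line)["id"] for line in f]

# Each per-dataset file numbers its samples 0..n-1 and merging adds the
# running offset, so the merged ids should be exactly 0..N-1 in order.
assert ids == list(range(len(ids))), "ids are not unique and contiguous"
```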
123 changes: 123 additions & 0 deletions data_prepare/sft/preprocess_idefics2.py
@@ -0,0 +1,123 @@
import json
import os
from multiprocessing import Pool

from datasets import load_dataset
from tqdm import tqdm

def general_conversation_preprocessor(item, dataset_name, id):
    # Convert one the_cauldron item into the LLaVA conversation format.
    conversations = []
    ret_item = dict(id=id)
    # Save each image to disk and record its path relative to save_path (global).
    img_paths = []
    for img_idx, img in enumerate(item["images"]):
        save_path_to_append = os.path.join("images", dataset_name, f"{id}_{img_idx}.png")
        img_path = os.path.join(save_path, save_path_to_append)
        if img.mode == "CMYK":
            img = img.convert("RGB")
        img.save(img_path)
        img_paths.append(save_path_to_append)
    ret_item["images"] = img_paths
    for idx, conv in enumerate(item["texts"]):
        if "user" in conv:
            cur_conv = conv["user"]
            if idx == 0:
                # Prepend one <image> token per image to the first human turn.
                cur_conv = "<image>\n" * len(item["images"]) + cur_conv
            conversations.append({"from": "human", "value": cur_conv})
        if "assistant" in conv:
            cur_conv = conv["assistant"]
            # Strip the leading "Answer: " prefix used by some QA subsets
            # (slicing on startswith avoids clobbering later occurrences).
            if cur_conv.startswith("Answer: "):
                cur_conv = cur_conv[len("Answer: "):]
            conversations.append({"from": "gpt", "value": cur_conv})
    ret_item["conversations"] = conversations
    return ret_item

def process_dataset(args):
    dataset_name, dataset_path, metadata_path, save_path = args
    # Skip datasets that have already been converted.
    if os.path.exists(os.path.join(metadata_path, dataset_name + "_train.jsonl")):
        return
    print("Processing", dataset_name, "...")
    loaded = load_dataset(dataset_path, dataset_name)["train"]
    dataset = list(loaded)
    cur_llava_format_dataset = []
    for cnt, item in enumerate(tqdm(dataset)):
        new_item = general_conversation_preprocessor(item, dataset_name, cnt)
        if cnt == 0:
            # Print the first raw and converted samples as a sanity check.
            print(item["texts"], item["images"][0], new_item)
        cur_llava_format_dataset.append(new_item)

    with open(os.path.join(metadata_path, dataset_name + "_train.jsonl"), "w") as f:
        for item in cur_llava_format_dataset:
            json.dump(item, f)
            f.write("\n")

# Download HuggingFaceM4/the_cauldron to the dataset_path directory.
dataset_path = "/home/jasonlu/workspace/idefics2-sft/the_cauldron"
save_path = "/home/jasonlu/workspace/idefics2-sft/new-vflan/"
metadata_path = os.path.join(save_path, "metadata")
os.makedirs(metadata_path, exist_ok=True)

skipped_datasets = [
"ai2d", #internvl-sft
"chartqa", #internvl-sft
"clevr", #vflan, HAS BUG
"clevr_math", # HAS BUG
"docvqa", #internvl-sft
"dvqa", #internvl-sft
"nlvr2", #vflan
"ocrvqa", #vflan
"st_vqa", #vflan
"textcaps", #vflan, llava1.5
"visualmrc", #vflan
"vqav2", #vflan, llava1.5
"okvqa", #llava1.5
"aokvqa", #llava1.5
"plotqa", # has problem to load (very slow)
"localized_narratives", # has problem to load (very slow)
]

_dataset_names = sorted(os.listdir(dataset_path))
dataset_names = []
for name in _dataset_names:
if name.startswith("."):
continue
if name in skipped_datasets:
continue
if os.path.isdir(os.path.join(dataset_path, name)):
dataset_names.append(name)
os.makedirs(os.path.join(save_path, "images", name), exist_ok=True)
print(dataset_names, len(dataset_names))

# sequential version
# for dataset_name in dataset_names:
# process_dataset((dataset_name, dataset_path, metadata_path, save_path))
# parallel version
with Pool(processes=min(48, len(dataset_names))) as pool:
# Prepare the arguments for the process_dataset function
args = [(dataset_name, dataset_path, metadata_path, save_path) for dataset_name in dataset_names]

# Map the process_dataset function to the arguments
for _ in tqdm(pool.imap_unordered(process_dataset, args), total=len(args), desc="Processing datasets"):
pass
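
Each the_cauldron row carries a list of PIL images in `images` and a list of user/assistant turns in `texts`; the converter rewrites these into LLaVA-style `conversations`, prepending one `<image>` token per image to the first human turn. A toy example (hypothetical `toy_dataset` name; assumes the script's globals such as `save_path` are in scope so the image directory can be created):

```python
from PIL import Image

os.makedirs(os.path.join(save_path, "images", "toy_dataset"), exist_ok=True)
toy_item = {
    "images": [Image.new("RGB", (64, 64))],
    "texts": [{"user": "What is shown?", "assistant": "Answer: A blank square.", "source": "toy"}],
}
print(general_conversation_preprocessor(toy_item, "toy_dataset", 0)["conversations"])
# [{'from': 'human', 'value': '<image>\nWhat is shown?'},
#  {'from': 'gpt', 'value': 'A blank square.'}]
```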
20 changes: 20 additions & 0 deletions llava/data/dataset.py
@@ -660,6 +660,13 @@ def __getitem__(self, i) -> Dict[str, torch.Tensor]:
image_file = self.list_data_dict[i]["image"]
image = process_image(image_file, self.data_args, self.image_folder)
sources = preprocess_multimodal(copy.deepcopy([e["conversations"] for e in sources]), self.data_args)
elif "images" in sources[0]:
all_images = []
for image_file in self.list_data_dict[i]["images"]:
image = process_image(image_file, self.data_args, self.image_folder)
all_images.append(image)
image_tensor = torch.stack(all_images)
sources = preprocess_multimodal(copy.deepcopy([e["conversations"] for e in sources]), self.data_args)
else:
sources = copy.deepcopy([e["conversations"] for e in sources])

@@ -668,6 +675,7 @@ def __getitem__(self, i) -> Dict[str, torch.Tensor]:
self.tokenizer,
has_image=(
"image" in self.list_data_dict[i]
or "images" in self.list_data_dict[i]
or "video" in self.list_data_dict[i]
or "video_id" in self.list_data_dict[i]
),
@@ -678,6 +686,8 @@ def __getitem__(self, i) -> Dict[str, torch.Tensor]:
# image exist in the data
if "image" in self.list_data_dict[i]:
data_dict["image"] = image.unsqueeze(0)
elif ("images" in self.list_data_dict[i]):
data_dict["image"] = image_tensor
else:
data_dict["image"] = None
return data_dict
@@ -771,6 +781,13 @@ def __getitem__(self, i) -> Dict[str, torch.Tensor]:
else:
image = process_image(image_file, self.data_args, self.image_folder)
sources = preprocess_multimodal(copy.deepcopy([e["conversations"] for e in sources]), self.data_args)
elif "images" in sources[0]:
all_images = []
for image_file in self.list_data_dict[i]["images"]:
image = process_image(image_file, self.data_args, self.image_folder)
all_images.append(image)
image_tensor = torch.stack(all_images)
sources = preprocess_multimodal(copy.deepcopy([e["conversations"] for e in sources]), self.data_args)
elif ("video" in sources[0]) or ("video_id" in sources[0]):
num_video_frames = self.data_args.num_video_frames
if "video" in sources[0]:
@@ -822,6 +839,7 @@ def __getitem__(self, i) -> Dict[str, torch.Tensor]:
self.tokenizer,
has_image=(
"image" in self.list_data_dict[i]
or "images" in self.list_data_dict[i]
or "video" in self.list_data_dict[i]
or "video_id" in self.list_data_dict[i]
),
@@ -835,6 +853,8 @@ def __getitem__(self, i) -> Dict[str, torch.Tensor]:
data_dict["image"] = image
else:
data_dict["image"] = image.unsqueeze(0)
elif ("images" in self.list_data_dict[i]):
data_dict["image"] = image_tensor
elif ("video" in self.list_data_dict[i]) or ("video_id" in self.list_data_dict[i]):
data_dict["image"] = image_tensor
if not video_loading_succeed:
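
For multi-image samples, the new `images` branch processes each file and stacks the results, so `data_dict["image"]` is always a 4-D tensor with a leading image-count dimension, matching the single-image `unsqueeze(0)` and video branches. A sketch of that shape contract, assuming `process_image` returns a fixed-size `(C, H, W)` tensor (variable names here are illustrative):

```python
import torch

# Single image: (C, H, W) -> (1, C, H, W).
single = process_image(image_file, data_args, image_folder).unsqueeze(0)

# Multiple images: n x (C, H, W) -> (n, C, H, W); torch.stack requires that
# process_image returns identically shaped tensors for every image.
multi = torch.stack([process_image(f, data_args, image_folder) for f in image_files])
```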
8 changes: 8 additions & 0 deletions llava/data/datasets_mixture.py
@@ -629,6 +629,14 @@ def register_datasets_mixtures():
)
add_dataset(synthdog_en)

idefics2_sft = Dataset(
dataset_name="idefics2_sft",
dataset_type="torch",
data_path="/home/jasonlu/workspace/idefics2-sft/new-vflan/idefics2_sft_train.jsonl",
image_path="/home/jasonlu/workspace/idefics2-sft/new-vflan",
description="",
)
add_dataset(idefics2_sft)

test = Dataset(
dataset_name="test",
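
Once registered, the dataset can be referenced by its `dataset_name` when composing a training mixture. A hypothetical lookup sketch, assuming `datasets_mixture.py` keeps a module-level `DATASETS` registry populated by `add_dataset` and that mixtures are specified as `+`-joined names such as `idefics2_sft+sharegpt4v_sft`:

```python
from llava.data import datasets_mixture

datasets_mixture.register_datasets_mixtures()
# DATASETS is an assumed registry name; the exact attribute depends on the module.
dataset = datasets_mixture.DATASETS["idefics2_sft"]
print(dataset.data_path, dataset.image_path)
```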
