Merge pull request NVlabs#70 from Efficient-Large-Model/dev/idefics2
IDEFICS2-SFT
Efficient-Large-Language-Model authored May 4, 2024
2 parents 4f906a9 + db6701a commit 4b28396
Showing 5 changed files with 202 additions and 1 deletion.
21 changes: 20 additions & 1 deletion data_prepare/README.md
@@ -150,4 +150,23 @@ We use the train split of ScienceQA. The image data of the train split can be ob

```bash
huggingface-cli download Efficient-Large-Model/ScienceQA_train_12K --repo-type dataset --local-dir scienceqa --local-dir-use-symlinks False
```

### IDEFICS2-SFT dataset

We also provide scripts to preprocess the IDEFICS2-SFT dataset into a LLaVA-SFT-like format.

Please first download [HuggingFaceM4/the_cauldron](https://huggingface.co/datasets/HuggingFaceM4/the_cauldron) to `/home/jasonlu/workspace/idefics2-sft/the_cauldron`. Then, run the following scripts:

```bash
python preprocess_idefics2.py
python merge_idefics2.py
```
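
For the download step above, `huggingface_hub` offers a Python alternative to the CLI; a minimal sketch, assuming the same target path as the scripts:

```python
from huggingface_hub import snapshot_download

# Download HuggingFaceM4/the_cauldron to the path hard-coded in the scripts.
snapshot_download(
    repo_id="HuggingFaceM4/the_cauldron",
    repo_type="dataset",
    local_dir="/home/jasonlu/workspace/idefics2-sft/the_cauldron",
)
```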

A sample in the preprocessed dataset file will look like this:

```json
{"id": 0, "images": ["images/chart2text/0_0.png"], "conversations": [{"from": "human", "value": "<image>\nPlease clarify the meaning conveyed by this graph."}, {"from": "gpt", "value": "This statistic presents the reach of the most popular social networks among female beauty consumers in the United States as of August 2016. During the survey period, 62 percent of respondents had an Instagram account."}]}
```
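
To inspect the merged file, load it line by line as JSONL. A minimal sketch (assuming the hard-coded `save_path` from the scripts) that also checks every referenced image was saved:

```python
import json
import os

save_path = "/home/jasonlu/workspace/idefics2-sft/new-vflan/"  # hard-coded in the scripts

# Load the merged JSONL produced by merge_idefics2.py.
with open(os.path.join(save_path, "idefics2_sft_train.jsonl")) as f:
    samples = [json.loads(line) for line in f]

# Check that every relative image path resolves to a saved file.
missing = [
    p
    for sample in samples
    for p in sample["images"]
    if not os.path.exists(os.path.join(save_path, p))
]
print(len(samples), "samples,", len(missing), "missing images")
```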

Haotian's Note: Datasets overlapping with VFLAN / ShareGPT4V-SFT are removed. I also removed `plotqa` since it is too large, and `localized_narratives` since it seems to overlap with captioning efforts within VILA. `websight` and `datikz` are two datasets that target code generation; since their outputs are very long and including them might slow down training, I temporarily removed them as well, but feel free to add them back.
31 changes: 31 additions & 0 deletions data_prepare/sft/merge_idefics2.py
@@ -0,0 +1,31 @@
import json
import os

# Merge the per-dataset metadata files written by preprocess_idefics2.py
# into a single LLaVA-style SFT training file.
dataset_path = "/home/jasonlu/workspace/idefics2-sft/the_cauldron"
save_path = "/home/jasonlu/workspace/idefics2-sft/new-vflan/"
metadata_path = os.path.join(save_path, "metadata")
dataset_names = sorted(os.listdir(metadata_path))

def load_jsonl(file_path):
data = []
with open(file_path, 'r', encoding='utf-8') as f:
for line in f:
data.append(json.loads(line))
return data

all_data = []
for dataset_name in dataset_names:
if "websight" in dataset_name or "datikz" in dataset_name:
# skip the snapshot => code datasets for now.
continue
loaded = load_jsonl(os.path.join(metadata_path, dataset_name))
    # Offset ids so they remain globally unique across the merged datasets.
    id_offset = len(all_data)
    for item in loaded:
        item["id"] += id_offset
all_data += loaded
print(dataset_name, len(all_data), all_data[-1])

with open(os.path.join(save_path, "idefics2_sft_train.jsonl"), "w") as f:
for item in all_data:
json.dump(item, f)
f.write("\n")
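
The merge step offsets each sample's `id` by the number of samples already collected, so ids stay globally unique and contiguous across datasets. A quick check of that invariant, as a sketch assuming the per-dataset files were produced by `preprocess_idefics2.py` below:

```python
import json

path = "/home/jasonlu/workspace/idefics2-sft/new-vflan/idefics2_sft_train.jsonl"
with open(path) as f:
    ids = [json.loads(line)["id"] for line in f]

# Each per-dataset file numbers its samples 0..n-1 and merging adds the
# running offset, so the merged ids should be exactly 0..N-1 in order.
assert ids == list(range(len(ids))), "ids are not unique and contiguous"
```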
123 changes: 123 additions & 0 deletions data_prepare/sft/preprocess_idefics2.py
@@ -0,0 +1,123 @@
import json
import os
from multiprocessing import Pool

from datasets import load_dataset
from tqdm import tqdm

def general_conversation_preprocessor(item, dataset_name, id):
    # Convert one the_cauldron item into the LLaVA conversation format.
    conversations = []
    ret_item = dict(id=id)
    # Save each image to disk and record its path relative to save_path (global).
    img_paths = []
    for img_idx, img in enumerate(item["images"]):
        save_path_to_append = os.path.join("images", dataset_name, f"{id}_{img_idx}.png")
        img_path = os.path.join(save_path, save_path_to_append)
        if img.mode == "CMYK":
            img = img.convert("RGB")
        img.save(img_path)
        img_paths.append(save_path_to_append)
    ret_item["images"] = img_paths
    for idx, conv in enumerate(item["texts"]):
        if "user" in conv:
            cur_conv = conv["user"]
            if idx == 0:
                # Prepend one <image> token per image to the first human turn.
                cur_conv = "<image>\n" * len(item["images"]) + cur_conv
            conversations.append({"from": "human", "value": cur_conv})
        if "assistant" in conv:
            cur_conv = conv["assistant"]
            # Strip the leading "Answer: " prefix used by some QA subsets
            # (slicing on startswith avoids clobbering later occurrences).
            if cur_conv.startswith("Answer: "):
                cur_conv = cur_conv[len("Answer: "):]
            conversations.append({"from": "gpt", "value": cur_conv})
    ret_item["conversations"] = conversations
    return ret_item

def process_dataset(args):
    dataset_name, dataset_path, metadata_path, save_path = args
    # Skip datasets that have already been converted.
    if os.path.exists(os.path.join(metadata_path, dataset_name + "_train.jsonl")):
        return
    print("Processing", dataset_name, "...")
    loaded = load_dataset(dataset_path, dataset_name)["train"]
    dataset = list(loaded)
    cur_llava_format_dataset = []
    for cnt, item in enumerate(tqdm(dataset)):
        new_item = general_conversation_preprocessor(item, dataset_name, cnt)
        if cnt == 0:
            # Print the first raw and converted samples as a sanity check.
            print(item["texts"], item["images"][0], new_item)
        cur_llava_format_dataset.append(new_item)

    with open(os.path.join(metadata_path, dataset_name + "_train.jsonl"), "w") as f:
        for item in cur_llava_format_dataset:
            json.dump(item, f)
            f.write("\n")

# Download HuggingFaceM4/the_cauldron to the dataset_path directory.
dataset_path = "/home/jasonlu/workspace/idefics2-sft/the_cauldron"
save_path = "/home/jasonlu/workspace/idefics2-sft/new-vflan/"
metadata_path = os.path.join(save_path, "metadata")
os.makedirs(metadata_path, exist_ok=True)

skipped_datasets = [
"ai2d", #internvl-sft
"chartqa", #internvl-sft
"clevr", #vflan, HAS BUG
"clevr_math", # HAS BUG
"docvqa", #internvl-sft
"dvqa", #internvl-sft
"nlvr2", #vflan
"ocrvqa", #vflan
"st_vqa", #vflan
"textcaps", #vflan, llava1.5
"visualmrc", #vflan
"vqav2", #vflan, llava1.5
"okvqa", #llava1.5
"aokvqa", #llava1.5
"plotqa", # has problem to load (very slow)
"localized_narratives", # has problem to load (very slow)
]

_dataset_names = sorted(os.listdir(dataset_path))
dataset_names = []
for name in _dataset_names:
if name.startswith("."):
continue
if name in skipped_datasets:
continue
if os.path.isdir(os.path.join(dataset_path, name)):
dataset_names.append(name)
os.makedirs(os.path.join(save_path, "images", name), exist_ok=True)
print(dataset_names, len(dataset_names))

# sequential version
# for dataset_name in dataset_names:
# process_dataset((dataset_name, dataset_path, metadata_path, save_path))
# parallel version
with Pool(processes=min(48, len(dataset_names))) as pool:
# Prepare the arguments for the process_dataset function
args = [(dataset_name, dataset_path, metadata_path, save_path) for dataset_name in dataset_names]

# Map the process_dataset function to the arguments
for _ in tqdm(pool.imap_unordered(process_dataset, args), total=len(args), desc="Processing datasets"):
pass
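
Each the_cauldron row carries a list of PIL images in `images` and a list of user/assistant turns in `texts`; the converter rewrites these into LLaVA-style `conversations`, prepending one `<image>` token per image to the first human turn. A toy example (hypothetical `toy_dataset` name; assumes the script's globals such as `save_path` are in scope so the image directory can be created):

```python
from PIL import Image

os.makedirs(os.path.join(save_path, "images", "toy_dataset"), exist_ok=True)
toy_item = {
    "images": [Image.new("RGB", (64, 64))],
    "texts": [{"user": "What is shown?", "assistant": "Answer: A blank square.", "source": "toy"}],
}
print(general_conversation_preprocessor(toy_item, "toy_dataset", 0)["conversations"])
# [{'from': 'human', 'value': '<image>\nWhat is shown?'},
#  {'from': 'gpt', 'value': 'A blank square.'}]
```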
20 changes: 20 additions & 0 deletions llava/data/dataset.py
@@ -660,6 +660,13 @@ def __getitem__(self, i) -> Dict[str, torch.Tensor]:
image_file = self.list_data_dict[i]["image"]
image = process_image(image_file, self.data_args, self.image_folder)
sources = preprocess_multimodal(copy.deepcopy([e["conversations"] for e in sources]), self.data_args)
elif "images" in sources[0]:
all_images = []
for image_file in self.list_data_dict[i]["images"]:
image = process_image(image_file, self.data_args, self.image_folder)
all_images.append(image)
image_tensor = torch.stack(all_images)
sources = preprocess_multimodal(copy.deepcopy([e["conversations"] for e in sources]), self.data_args)
else:
sources = copy.deepcopy([e["conversations"] for e in sources])

@@ -668,6 +675,7 @@ def __getitem__(self, i) -> Dict[str, torch.Tensor]:
self.tokenizer,
has_image=(
"image" in self.list_data_dict[i]
or "images" in self.list_data_dict[i]
or "video" in self.list_data_dict[i]
or "video_id" in self.list_data_dict[i]
),
@@ -678,6 +686,8 @@ def __getitem__(self, i) -> Dict[str, torch.Tensor]:
# image exist in the data
if "image" in self.list_data_dict[i]:
data_dict["image"] = image.unsqueeze(0)
elif ("images" in self.list_data_dict[i]):
data_dict["image"] = image_tensor
else:
data_dict["image"] = None
return data_dict
@@ -771,6 +781,13 @@ def __getitem__(self, i) -> Dict[str, torch.Tensor]:
else:
image = process_image(image_file, self.data_args, self.image_folder)
sources = preprocess_multimodal(copy.deepcopy([e["conversations"] for e in sources]), self.data_args)
elif "images" in sources[0]:
all_images = []
for image_file in self.list_data_dict[i]["images"]:
image = process_image(image_file, self.data_args, self.image_folder)
all_images.append(image)
image_tensor = torch.stack(all_images)
sources = preprocess_multimodal(copy.deepcopy([e["conversations"] for e in sources]), self.data_args)
elif ("video" in sources[0]) or ("video_id" in sources[0]):
num_video_frames = self.data_args.num_video_frames
if "video" in sources[0]:
@@ -822,6 +839,7 @@ def __getitem__(self, i) -> Dict[str, torch.Tensor]:
self.tokenizer,
has_image=(
"image" in self.list_data_dict[i]
or "images" in self.list_data_dict[i]
or "video" in self.list_data_dict[i]
or "video_id" in self.list_data_dict[i]
),
@@ -835,6 +853,8 @@ def __getitem__(self, i) -> Dict[str, torch.Tensor]:
data_dict["image"] = image
else:
data_dict["image"] = image.unsqueeze(0)
elif ("images" in self.list_data_dict[i]):
data_dict["image"] = image_tensor
elif ("video" in self.list_data_dict[i]) or ("video_id" in self.list_data_dict[i]):
data_dict["image"] = image_tensor
if not video_loading_succeed:
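
For multi-image samples, the new `images` branch processes each file and stacks the results, so `data_dict["image"]` is always a 4-D tensor with a leading image-count dimension, matching the single-image `unsqueeze(0)` and video branches. A sketch of that shape contract, assuming `process_image` returns a fixed-size `(C, H, W)` tensor (variable names here are illustrative):

```python
import torch

# Single image: (C, H, W) -> (1, C, H, W).
single = process_image(image_file, data_args, image_folder).unsqueeze(0)

# Multiple images: n x (C, H, W) -> (n, C, H, W); torch.stack requires that
# process_image returns identically shaped tensors for every image.
multi = torch.stack([process_image(f, data_args, image_folder) for f in image_files])
```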
8 changes: 8 additions & 0 deletions llava/data/datasets_mixture.py
@@ -629,6 +629,14 @@ def register_datasets_mixtures():
)
add_dataset(synthdog_en)

idefics2_sft = Dataset(
dataset_name="idefics2_sft",
dataset_type="torch",
data_path="/home/jasonlu/workspace/idefics2-sft/new-vflan/idefics2_sft_train.jsonl",
image_path="/home/jasonlu/workspace/idefics2-sft/new-vflan",
description="",
)
add_dataset(idefics2_sft)

test = Dataset(
dataset_name="test",
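
Once registered, the dataset can be referenced by its `dataset_name` when composing a training mixture. A hypothetical lookup sketch, assuming `datasets_mixture.py` keeps a module-level `DATASETS` registry populated by `add_dataset` and that mixtures are specified as `+`-joined names such as `idefics2_sft+sharegpt4v_sft`:

```python
from llava.data import datasets_mixture

datasets_mixture.register_datasets_mixtures()
# DATASETS is an assumed registry name; the exact attribute depends on the module.
dataset = datasets_mixture.DATASETS["idefics2_sft"]
print(dataset.data_path, dataset.image_path)
```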
