Skip to content

Commit

Permalink
docs
Browse files Browse the repository at this point in the history
  • Loading branch information
pufanyi committed Jun 19, 2024
1 parent ce6f889 commit 19f9bd6
Showing 1 changed file with 17 additions and 75 deletions.
92 changes: 17 additions & 75 deletions tools/make_image_hf_dataset.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -28,38 +28,15 @@
},
{
"cell_type": "code",
"execution_count": 45,
"execution_count": null,
"metadata": {
"vscode": {
"languageId": "bat"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"--2024-06-19 14:09:51-- https://huggingface.co/datasets/pufanyi/VQAv2_TOY/resolve/main/source_data/sample_data.zip\n",
"Resolving huggingface.co (huggingface.co)... 13.33.30.114, 13.33.30.49, 13.33.30.76, ...\n",
"Connecting to huggingface.co (huggingface.co)|13.33.30.114|:443... connected.\n",
"HTTP request sent, awaiting response... 302 Found\n",
"Location: https://cdn-lfs-us-1.huggingface.co/repos/c9/82/c9827770a5c0b13c1b646a275968813f8705db30ac0de29f118bb316c2b2a4eb/8cc2e821b7c6e4b5726a6feeb6214cd2d4810d53f568a5f3565d78e6d1ee5403?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27sample_data.zip%3B+filename%3D%22sample_data.zip%22%3B&response-content-type=application%2Fzip&Expires=1719036591&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcxOTAzNjU5MX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmh1Z2dpbmdmYWNlLmNvL3JlcG9zL2M5LzgyL2M5ODI3NzcwYTVjMGIxM2MxYjY0NmEyNzU5Njg4MTNmODcwNWRiMzBhYzBkZTI5ZjExOGJiMzE2YzJiMmE0ZWIvOGNjMmU4MjFiN2M2ZTRiNTcyNmE2ZmVlYjYyMTRjZDJkNDgxMGQ1M2Y1NjhhNWYzNTY1ZDc4ZTZkMWVlNTQwMz9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSomcmVzcG9uc2UtY29udGVudC10eXBlPSoifV19&Signature=kppoby2Wg9BYA-L2HJ0uShfMSULqTXjtN3cbdBdZTvMf4NvNXBJxc0mcPSiz-sqV7d7hJn32IzHze2JnnTGxrVrozYdHeoTuG0EtF%7ERgQz17PbzbEps-MPzl-h4G9d5RImWDBNN3OYTWyvSxFzn12d-owQKrkdEXejUZEkGdzvHgECzLPpuMw%7EXIctwxBBbxrHRtBNU57K2KBwOqw5rujHtQevhMaCeRgxRFlpfc3FDxsl4rUVHrCM79UhPwutpEAtOh%7Ep6%7EdgLOXal6oZKCnejCQg3AjgvuMe4Eot3J37a7yUGToRtx6XX8Q9I1SC2nScXIWwZndOQY-1VNSL1s-A__&Key-Pair-Id=K2FPYV99P2N66Q [following]\n",
"--2024-06-19 14:09:51-- https://cdn-lfs-us-1.huggingface.co/repos/c9/82/c9827770a5c0b13c1b646a275968813f8705db30ac0de29f118bb316c2b2a4eb/8cc2e821b7c6e4b5726a6feeb6214cd2d4810d53f568a5f3565d78e6d1ee5403?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27sample_data.zip%3B+filename%3D%22sample_data.zip%22%3B&response-content-type=application%2Fzip&Expires=1719036591&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcxOTAzNjU5MX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmh1Z2dpbmdmYWNlLmNvL3JlcG9zL2M5LzgyL2M5ODI3NzcwYTVjMGIxM2MxYjY0NmEyNzU5Njg4MTNmODcwNWRiMzBhYzBkZTI5ZjExOGJiMzE2YzJiMmE0ZWIvOGNjMmU4MjFiN2M2ZTRiNTcyNmE2ZmVlYjYyMTRjZDJkNDgxMGQ1M2Y1NjhhNWYzNTY1ZDc4ZTZkMWVlNTQwMz9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSomcmVzcG9uc2UtY29udGVudC10eXBlPSoifV19&Signature=kppoby2Wg9BYA-L2HJ0uShfMSULqTXjtN3cbdBdZTvMf4NvNXBJxc0mcPSiz-sqV7d7hJn32IzHze2JnnTGxrVrozYdHeoTuG0EtF%7ERgQz17PbzbEps-MPzl-h4G9d5RImWDBNN3OYTWyvSxFzn12d-owQKrkdEXejUZEkGdzvHgECzLPpuMw%7EXIctwxBBbxrHRtBNU57K2KBwOqw5rujHtQevhMaCeRgxRFlpfc3FDxsl4rUVHrCM79UhPwutpEAtOh%7Ep6%7EdgLOXal6oZKCnejCQg3AjgvuMe4Eot3J37a7yUGToRtx6XX8Q9I1SC2nScXIWwZndOQY-1VNSL1s-A__&Key-Pair-Id=K2FPYV99P2N66Q\n",
"Resolving cdn-lfs-us-1.huggingface.co (cdn-lfs-us-1.huggingface.co)... 3.165.102.80, 3.165.102.25, 3.165.102.95, ...\n",
"Connecting to cdn-lfs-us-1.huggingface.co (cdn-lfs-us-1.huggingface.co)|3.165.102.80|:443... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 2678607 (2.6M) [application/zip]\n",
"Saving to: ‘data/sample_data.zip’\n",
"\n",
"sample_data.zip 100%[===================>] 2.55M 7.46MB/s in 0.3s \n",
"\n",
"2024-06-19 14:09:52 (7.46 MB/s) - ‘data/sample_data.zip’ saved [2678607/2678607]\n",
"\n"
]
}
],
"outputs": [],
"source": [
"!wget https://huggingface.co/datasets/pufanyi/VQAv2_TOY/resolve/main/source_data/sample_data.zip -P data\n",
"!wget https://huggingface.co/datasets/lmms-lab/VQAv2_TOY/resolve/main/source_data/sample_data.zip -P data\n",
"!unzip data/sample_data.zip -d data"
]
},
Expand Down Expand Up @@ -107,14 +84,10 @@
"\n",
"features = datasets.Features(\n",
" {\n",
" \"question\": datasets.Value(\"string\"),\n",
" \"question_id\": datasets.Value(\"int64\"),\n",
" \"question\": datasets.Value(\"string\"),\n",
" \"image_id\": datasets.Value(\"string\"),\n",
" \"image\": datasets.Image(),\n",
" \"answers\": datasets.Sequence(datasets.Sequence(feature={\"answer\": datasets.Value(\"string\"), \"answer_confidence\": datasets.Value(\"string\"), \"answer_id\": datasets.Value(\"int64\")})),\n",
" \"answer_type\": datasets.Value(\"string\"),\n",
" \"multiple_choice_answer\": datasets.Value(\"string\"),\n",
" \"question_type\": datasets.Value(\"string\"),\n",
" }\n",
")"
]
Expand Down Expand Up @@ -144,26 +117,15 @@
"import json\n",
"from PIL import Image\n",
"\n",
"KEYS = [\"question\", \"question_id\", \"image_id\", \"answers\", \"answer_type\", \"multiple_choice_answer\", \"question_type\"]\n",
"\n",
"def generator(qa_file, image_folder, image_prefix):\n",
" # Open and load the question-answer file\n",
" with open(qa_file, \"r\") as f:\n",
" data = json.load(f)\n",
" qa = data[\"questions\"]\n",
"\n",
" for q in qa:\n",
" # Get the image id\n",
" image_id = q[\"image_id\"]\n",
" # Construct the image path\n",
" image_path = os.path.join(image_folder, f\"{image_prefix}_{image_id:012}.jpg\")\n",
" # Open the image and add it to the question-answer dictionary\n",
" q[\"image\"] = Image.open(image_path)\n",
" # Check if all keys are present in the question-answer dictionary, if not add them with None value\n",
" for key in KEYS:\n",
" if key not in q:\n",
" q[key] = None\n",
" # Yield the question-answer dictionary\n",
" yield q"
]
},
Expand All @@ -189,33 +151,34 @@
"data_val = datasets.Dataset.from_generator(\n",
" generator,\n",
" gen_kwargs={\n",
" \"qa_file\": \"data/questions/v2_OpenEnded_mscoco_val2014_questions.json\",\n",
" \"image_folder\": \"data/images/val2014\",\n",
" \"qa_file\": \"data/questions/vqav2_toy_questions_val2014.json\",\n",
" \"image_folder\": \"data/images\",\n",
" \"image_prefix\": \"COCO_val2014\",\n",
" },\n",
" features=features,\n",
" # For this dataset, there is no need to specify the features, as all cells are non-null and all splits have the same schema\n",
" # features=features,\n",
" num_proc=NUM_PROC,\n",
")\n",
"\n",
"data_test = datasets.Dataset.from_generator(\n",
" generator,\n",
" gen_kwargs={\n",
" \"qa_file\": \"data/questions/v2_OpenEnded_mscoco_test2015_questions.json\",\n",
" \"image_folder\": \"data/images/test2015\",\n",
" \"qa_file\": \"data/questions/vqav2_toy_questions_test2015.json\",\n",
" \"image_folder\": \"data/images\",\n",
" \"image_prefix\": \"COCO_test2015\",\n",
" },\n",
" features=features,\n",
" # features=features,\n",
" num_proc=NUM_PROC,\n",
")\n",
"\n",
"data_test_dev = datasets.Dataset.from_generator(\n",
" generator,\n",
" gen_kwargs={\n",
" \"qa_file\": \"data/questions/v2_OpenEnded_mscoco_test-dev2015_questions.json\",\n",
" \"image_folder\": \"data/images/test2015\",\n",
" \"qa_file\": \"data/questions/vqav2_toy_questions_test-dev2015.json\",\n",
" \"image_folder\": \"data/images\",\n",
" \"image_prefix\": \"COCO_test2015\",\n",
" },\n",
" features=features,\n",
" # features=features,\n",
" num_proc=NUM_PROC,\n",
")"
]
Expand Down Expand Up @@ -244,35 +207,14 @@
"metadata": {},
"outputs": [],
"source": [
"data.push_to_hub(\"pufanyi/VQAv2\")"
"data.push_to_hub(\"lmms-lab/VQAv2_TOY\") # replace lmms-lab to your username"
]
},
{
"cell_type": "code",
"execution_count": 44,
"cell_type": "markdown",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"CommitInfo(commit_url='https://huggingface.co/datasets/pufanyi/VQAv2_TOY/commit/b057eff450520a6e3fc7e6be88c3a172c4b5d99b', commit_message='Upload source_data/sample_data.zip with huggingface_hub', commit_description='', oid='b057eff450520a6e3fc7e6be88c3a172c4b5d99b', pr_url=None, pr_revision=None, pr_num=None)"
]
},
"execution_count": 44,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from huggingface_hub import HfApi\n",
"\n",
"api = HfApi()\n",
"api.upload_file(\n",
" path_or_fileobj=\"/data/pufanyi/project/lmms-eval-public/tools/data/sample_data.zip\",\n",
" path_in_repo=\"source_data/sample_data.zip\",\n",
" repo_id=\"pufanyi/VQAv2_TOY\",\n",
" repo_type=\"dataset\",\n",
")"
"Now, you can check the dataset on the [Hugging Face dataset hub](https://huggingface.co/datasets/lmms-lab/VQAv2_TOY)."
]
},
{
Expand Down

0 comments on commit 19f9bd6

Please sign in to comment.