From 81e8801082fb5934b336e3e9ad5506e584d25cbf Mon Sep 17 00:00:00 2001 From: Honaker Date: Fri, 20 Oct 2023 13:54:59 -0400 Subject: [PATCH] adding 2 dataset notebook --- .../tools/huggingface_eurosat_images.ipynb | 652 ++++++++++ .../huggingface_xview_images_reduced.ipynb | 1087 +++++++++++++++++ 2 files changed, 1739 insertions(+) create mode 100644 src/charmory/tools/huggingface_eurosat_images.ipynb create mode 100644 src/charmory/tools/huggingface_xview_images_reduced.ipynb diff --git a/src/charmory/tools/huggingface_eurosat_images.ipynb b/src/charmory/tools/huggingface_eurosat_images.ipynb new file mode 100644 index 000000000..934719f9f --- /dev/null +++ b/src/charmory/tools/huggingface_eurosat_images.ipynb @@ -0,0 +1,652 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from datasets import Image as Im\n", + "import pyarrow as pa\n", + "import pandas as pd\n", + "from datasets import Dataset\n", + "from datasets import load_dataset\n", + "import datasets\n", + "from PIL import Image\n", + "import numpy as np\n", + "import jatic_toolbox" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Found cached dataset imagefolder (/home/chris/.cache/huggingface/datasets/tanganke___imagefolder/tanganke--EuroSAT-11e1b29eb44b9d39/0.0.0/37fbb85cc714a338bea574ac6c7d0b5be5aff46c1862c1989b20e0771199e93f)\n", + "Loading cached processed dataset at /home/chris/.cache/huggingface/datasets/tanganke___imagefolder/tanganke--EuroSAT-11e1b29eb44b9d39/0.0.0/37fbb85cc714a338bea574ac6c7d0b5be5aff46c1862c1989b20e0771199e93f/cache-49e74981f0392efc.arrow\n", + "Found cached dataset imagefolder (/home/chris/.cache/huggingface/datasets/tanganke___imagefolder/tanganke--EuroSAT-11e1b29eb44b9d39/0.0.0/37fbb85cc714a338bea574ac6c7d0b5be5aff46c1862c1989b20e0771199e93f)\n", + "Loading cached processed dataset at /home/chris/.cache/huggingface/datasets/tanganke___imagefolder/tanganke--EuroSAT-11e1b29eb44b9d39/0.0.0/37fbb85cc714a338bea574ac6c7d0b5be5aff46c1862c1989b20e0771199e93f/cache-f719f85a814ecbae.arrow\n", + "Found cached dataset imagefolder (/home/chris/.cache/huggingface/datasets/tanganke___imagefolder/tanganke--EuroSAT-11e1b29eb44b9d39/0.0.0/37fbb85cc714a338bea574ac6c7d0b5be5aff46c1862c1989b20e0771199e93f)\n", + "Loading cached processed dataset at /home/chris/.cache/huggingface/datasets/tanganke___imagefolder/tanganke--EuroSAT-11e1b29eb44b9d39/0.0.0/37fbb85cc714a338bea574ac6c7d0b5be5aff46c1862c1989b20e0771199e93f/cache-edf6901f5ee348c8.arrow\n" + ] + } + ], + "source": [ + "def load_huggingface_dataset():\n", + " validation_dataset = (jatic_toolbox.load_dataset)(\n", + " provider=\"huggingface\",\n", + " dataset_name=\"tanganke/EuroSAT\",\n", + " task=\"image-classification\",\n", + " split=\"validation\",\n", + " )\n", + " train_dataset = (jatic_toolbox.load_dataset)(\n", + " provider=\"huggingface\",\n", + " dataset_name=\"tanganke/EuroSAT\",\n", + " task=\"image-classification\",\n", + " split=\"train\",\n", + " )\n", + " test_dataset = (jatic_toolbox.load_dataset)(\n", + " provider=\"huggingface\",\n", + " dataset_name=\"tanganke/EuroSAT\",\n", + " task=\"image-classification\",\n", + " split=\"test\",\n", + " )\n", + "\n", + " return validation_dataset, train_dataset, test_dataset\n", + "validation_dataset, train_dataset, test_dataset = load_huggingface_dataset()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(,\n", + " ,\n", + " )" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "validation_dataset, train_dataset, test_dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "jatic_toolbox._internals.interop.huggingface.datasets.HuggingFaceVisionDataset" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "type(validation_dataset)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "NewDataset = datasets.DatasetDict({\"train\":train_dataset._dataset,\"validation\":validation_dataset._dataset, \"test\":test_dataset._dataset})\n" + ] + }, + { + "cell_type": "code", + "execution_count": 124, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Dataset({\n", + " features: ['image', 'labels'],\n", + " num_rows: 21600\n", + "})" + ] + }, + "execution_count": 124, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "NewDataset['train'] " + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Pushing split train to the Hub.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "03a9158606c148d58a8389ecfcac24ef", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Map: 0%| | 0/2160 [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
iscrowdimage_idbboxsegmentationcategory_ididarea
001043[427, 19, 11, 10][[427, 19, 427, 29, 438, 29, 438, 19]]327015110
101043[506, 0, 6, 9][[506, 9, 512, 9, 512, 0, 506, 0]]32701654
201043[131, 234, 70, 60][[131, 234, 131, 294, 201, 294, 201, 234]]34270174200
301043[110, 139, 50, 39][[110, 139, 110, 178, 160, 178, 160, 139]]34270181950
401043[12, 71, 69, 50][[12, 71, 12, 121, 81, 121, 81, 71]]34270193450
........................
484901768[103, 451, 333, 61][[436, 451, 103, 451, 103, 512, 436, 512]]344973820313
485001768[269, 134, 141, 117][[269, 134, 269, 251, 410, 251, 410, 134]]344973916497
485101768[190, 242, 73, 52][[190, 242, 190, 294, 263, 294, 263, 242]]34497403796
485201768[0, 287, 235, 217][[235, 504, 235, 287, 0, 287, 0, 504]]344974150995
485301768[379, 13, 133, 144][[379, 13, 379, 157, 512, 157, 512, 13]]344974219152
\n", + "

4854 rows × 7 columns

\n", + "" + ], + "text/plain": [ + " iscrowd image_id bbox \\\n", + "0 0 1043 [427, 19, 11, 10] \n", + "1 0 1043 [506, 0, 6, 9] \n", + "2 0 1043 [131, 234, 70, 60] \n", + "3 0 1043 [110, 139, 50, 39] \n", + "4 0 1043 [12, 71, 69, 50] \n", + "... ... ... ... \n", + "4849 0 1768 [103, 451, 333, 61] \n", + "4850 0 1768 [269, 134, 141, 117] \n", + "4851 0 1768 [190, 242, 73, 52] \n", + "4852 0 1768 [0, 287, 235, 217] \n", + "4853 0 1768 [379, 13, 133, 144] \n", + "\n", + " segmentation category_id id area \n", + "0 [[427, 19, 427, 29, 438, 29, 438, 19]] 3 27015 110 \n", + "1 [[506, 9, 512, 9, 512, 0, 506, 0]] 3 27016 54 \n", + "2 [[131, 234, 131, 294, 201, 294, 201, 234]] 34 27017 4200 \n", + "3 [[110, 139, 110, 178, 160, 178, 160, 139]] 34 27018 1950 \n", + "4 [[12, 71, 12, 121, 81, 121, 81, 71]] 34 27019 3450 \n", + "... ... ... ... ... \n", + "4849 [[436, 451, 103, 451, 103, 512, 436, 512]] 34 49738 20313 \n", + "4850 [[269, 134, 269, 251, 410, 251, 410, 134]] 34 49739 16497 \n", + "4851 [[190, 242, 190, 294, 263, 294, 263, 242]] 34 49740 3796 \n", + "4852 [[235, 504, 235, 287, 0, 287, 0, 504]] 34 49741 50995 \n", + "4853 [[379, 13, 379, 157, 512, 157, 512, 13]] 34 49742 19152 \n", + "\n", + "[4854 rows x 7 columns]" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
heightwidthidfile_name
1445125121719126_384_0_896_512.jpg
1455125121720126_768_0_1280_512.jpg
1465125121721126_1152_0_1664_512.jpg
1475125121722126_1536_0_2048_512.jpg
1485125121723126_1920_0_2432_512.jpg
1495125121724126_2304_0_2816_512.jpg
1505125121725126_2582_0_3094_512.jpg
1515125121726126_0_384_512_896.jpg
1525125121727126_384_384_896_896.jpg
1535125121728126_768_384_1280_896.jpg
1545125121729126_1152_384_1664_896.jpg
1555125121730126_1536_384_2048_896.jpg
1565125121731126_1920_384_2432_896.jpg
1575125121732126_2304_384_2816_896.jpg
1585125121733126_2582_384_3094_896.jpg
1595125121734126_0_768_512_1280.jpg
1605125121735126_384_768_896_1280.jpg
1615125121736126_768_768_1280_1280.jpg
1625125121737126_1152_768_1664_1280.jpg
1635125121738126_1536_768_2048_1280.jpg
1645125121739126_1920_768_2432_1280.jpg
1655125121740126_2304_768_2816_1280.jpg
1665125121741126_2582_768_3094_1280.jpg
1675125121742126_0_1152_512_1664.jpg
1685125121743126_384_1152_896_1664.jpg
1695125121744126_768_1152_1280_1664.jpg
1705125121745126_1536_1152_2048_1664.jpg
1715125121746126_1920_1152_2432_1664.jpg
1725125121747126_2304_1152_2816_1664.jpg
1735125121748126_2582_1152_3094_1664.jpg
1745125121749126_0_1536_512_2048.jpg
1755125121750126_384_1536_896_2048.jpg
1765125121751126_768_1536_1280_2048.jpg
1775125121752126_1152_1536_1664_2048.jpg
1785125121753126_1536_1536_2048_2048.jpg
1795125121754126_1920_1536_2432_2048.jpg
1805125121755126_2304_1536_2816_2048.jpg
1815125121756126_2582_1536_3094_2048.jpg
1825125121757126_0_1920_512_2432.jpg
1835125121758126_384_1920_896_2432.jpg
1845125121759126_768_1920_1280_2432.jpg
1855125121760126_1152_1920_1664_2432.jpg
1865125121761126_1536_1920_2048_2432.jpg
1875125121762126_1920_1920_2432_2432.jpg
1885125121763126_0_2095_512_2607.jpg
1895125121764126_384_2095_896_2607.jpg
1905125121765126_768_2095_1280_2607.jpg
1915125121766126_1152_2095_1664_2607.jpg
1925125121767126_1536_2095_2048_2607.jpg
1935125121768126_1920_2095_2432_2607.jpg
\n", + "
" + ], + "text/plain": [ + " height width id file_name\n", + "144 512 512 1719 126_384_0_896_512.jpg\n", + "145 512 512 1720 126_768_0_1280_512.jpg\n", + "146 512 512 1721 126_1152_0_1664_512.jpg\n", + "147 512 512 1722 126_1536_0_2048_512.jpg\n", + "148 512 512 1723 126_1920_0_2432_512.jpg\n", + "149 512 512 1724 126_2304_0_2816_512.jpg\n", + "150 512 512 1725 126_2582_0_3094_512.jpg\n", + "151 512 512 1726 126_0_384_512_896.jpg\n", + "152 512 512 1727 126_384_384_896_896.jpg\n", + "153 512 512 1728 126_768_384_1280_896.jpg\n", + "154 512 512 1729 126_1152_384_1664_896.jpg\n", + "155 512 512 1730 126_1536_384_2048_896.jpg\n", + "156 512 512 1731 126_1920_384_2432_896.jpg\n", + "157 512 512 1732 126_2304_384_2816_896.jpg\n", + "158 512 512 1733 126_2582_384_3094_896.jpg\n", + "159 512 512 1734 126_0_768_512_1280.jpg\n", + "160 512 512 1735 126_384_768_896_1280.jpg\n", + "161 512 512 1736 126_768_768_1280_1280.jpg\n", + "162 512 512 1737 126_1152_768_1664_1280.jpg\n", + "163 512 512 1738 126_1536_768_2048_1280.jpg\n", + "164 512 512 1739 126_1920_768_2432_1280.jpg\n", + "165 512 512 1740 126_2304_768_2816_1280.jpg\n", + "166 512 512 1741 126_2582_768_3094_1280.jpg\n", + "167 512 512 1742 126_0_1152_512_1664.jpg\n", + "168 512 512 1743 126_384_1152_896_1664.jpg\n", + "169 512 512 1744 126_768_1152_1280_1664.jpg\n", + "170 512 512 1745 126_1536_1152_2048_1664.jpg\n", + "171 512 512 1746 126_1920_1152_2432_1664.jpg\n", + "172 512 512 1747 126_2304_1152_2816_1664.jpg\n", + "173 512 512 1748 126_2582_1152_3094_1664.jpg\n", + "174 512 512 1749 126_0_1536_512_2048.jpg\n", + "175 512 512 1750 126_384_1536_896_2048.jpg\n", + "176 512 512 1751 126_768_1536_1280_2048.jpg\n", + "177 512 512 1752 126_1152_1536_1664_2048.jpg\n", + "178 512 512 1753 126_1536_1536_2048_2048.jpg\n", + "179 512 512 1754 126_1920_1536_2432_2048.jpg\n", + "180 512 512 1755 126_2304_1536_2816_2048.jpg\n", + "181 512 512 1756 126_2582_1536_3094_2048.jpg\n", + "182 512 512 1757 126_0_1920_512_2432.jpg\n", + "183 512 512 1758 126_384_1920_896_2432.jpg\n", + "184 512 512 1759 126_768_1920_1280_2432.jpg\n", + "185 512 512 1760 126_1152_1920_1664_2432.jpg\n", + "186 512 512 1761 126_1536_1920_2048_2432.jpg\n", + "187 512 512 1762 126_1920_1920_2432_2432.jpg\n", + "188 512 512 1763 126_0_2095_512_2607.jpg\n", + "189 512 512 1764 126_384_2095_896_2607.jpg\n", + "190 512 512 1765 126_768_2095_1280_2607.jpg\n", + "191 512 512 1766 126_1152_2095_1664_2607.jpg\n", + "192 512 512 1767 126_1536_2095_2048_2607.jpg\n", + "193 512 512 1768 126_1920_2095_2432_2607.jpg" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_image[-50:]" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "supercategory\n", + "Engineering Vehicle 14\n", + "Maritime Vessel 10\n", + "Other 9\n", + "Truck 9\n", + "Railway Vehicle 6\n", + "Building 6\n", + "Fixed-wing Aircraft 3\n", + "Passenger Vehicle 3\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_cat.supercategory.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [], + "source": [ + "df_cat['index'] = range(0, len(df_cat))" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [], + "source": [ + "df['new_cat'] = df['category_id'].apply(lambda x: df_cat[df_cat.id == x].supercategory.values[0])\n", + "df['image_id_file'] = df['image_id'].apply(lambda x: df_image[df_image.id == x].file_name.values[0])\n", + "df['list_of_cooridates'] = df['bbox'].apply(lambda x: [int(x[0]),int(x[1]),int(x[0])+int(x[2]),int(x[1]) +int(x[3])])" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "df['unique_id'] = pd.factorize(df['new_cat'])[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "unique_id\n", + "1 3555\n", + "0 1154\n", + "3 111\n", + "2 18\n", + "4 11\n", + "5 5\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.unique_id.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "new_cat\n", + "Building 3555\n", + "Passenger Vehicle 1154\n", + "Truck 111\n", + "Other 18\n", + "Engineering Vehicle 11\n", + "Maritime Vessel 5\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.new_cat.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "finished with 20\n", + "finished with 41\n", + "finished with 62\n", + "finished with 83\n" + ] + } + ], + "source": [ + "df['shape'] = 0\n", + "df_final = pd.DataFrame()\n", + "LIST =[]; i = 0\n", + "for group_name, df2 in df.groupby('image_id_file'):\n", + " #df2 = df2.drop(['level_0'],axis=1)\n", + " df2 = df2.reset_index()\n", + " df_append = pd.DataFrame(index=range(1),columns=['image_id','image','width','height','objects'])\n", + " \n", + " height, width, _ = np.array(Image.open('basic/'+str(group_name))).shape\n", + " df_append.at[0,'image_id'] = df2['image_id_file'][0]\n", + " df_append.at[0,'image'] = 'basic/'+str(group_name)\n", + " df_append.at[0,'width'] = width\n", + " df_append.at[0,'height'] = height\n", + " df_append.at[0,'objects'] = dict({\n", + " 'id': df2['index'].tolist(),\n", + " 'area': df2['area'].tolist(),\n", + " 'bbox': df2['list_of_cooridates'].tolist(),\n", + " 'category': df2['unique_id'].tolist()\n", + " }\n", + " )\n", + " \n", + " LIST.append(df_append)\n", + " if len(LIST) > 20:\n", + " df_concat = pd.concat(LIST)\n", + " df_final = pd.concat([df_final,df_concat])\n", + " LIST = []\n", + " print('finished with ' + str(i))\n", + " i += 1\n", + "if len(LIST) > 0:\n", + " df_concat = pd.concat(LIST)\n", + " df_final = pd.concat([df_final,df_concat])\n", + " \n", + "df = df_final.reset_index()\n", + "del df_final, df2,df_concat, df_append" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "hg_dataset = Dataset(pa.Table.from_pandas(df))\n", + "NewDataset = datasets.DatasetDict({\"train\":hg_dataset})#,\"val\":test_dataset})\n", + "newdata = NewDataset['train'].cast_column(\"image\", Im())\n", + "NewDataset = datasets.DatasetDict({\"train\":newdata})#,\"val\":test_dataset['train']})\n", + "del newdata, hg_dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.000300263" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "NewDataset['train'].data.nbytes / 1e9" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "DatasetDict({\n", + " train: Dataset({\n", + " features: ['index', 'image_id', 'image', 'width', 'height', 'objects'],\n", + " num_rows: 84\n", + " })\n", + "})" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "NewDataset" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'index': 0,\n", + " 'image_id': '125_0_1536_512_2048.jpg',\n", + " 'image': ,\n", + " 'width': 512,\n", + " 'height': 512,\n", + " 'objects': {'area': [1005],\n", + " 'bbox': [[497, 423, 512, 490]],\n", + " 'category': [1],\n", + " 'id': [4136]}}" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "NewDataset['train'][0]" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Map: 100%|██████████| 2/2 [00:00<00:00, 199.51 examples/s]\n", + "Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 177.18ba/s]\n", + "Map: 100%|██████████| 2/2 [00:00<00:00, 504.76 examples/s]| 1/50 [00:01<01:02, 1.28s/it]\n", + "Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 454.62ba/s]\n", + "Map: 100%|██████████| 2/2 [00:00<00:00, 385.88 examples/s]| 2/50 [00:01<00:39, 1.23it/s]\n", + "Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 816.65ba/s]\n", + "Map: 100%|██████████| 2/2 [00:00<00:00, 212.39 examples/s]| 3/50 [00:02<00:32, 1.46it/s]\n", + "Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 367.89ba/s]\n", + "Map: 100%|██████████| 2/2 [00:00<00:00, 227.83 examples/s]| 4/50 [00:02<00:28, 1.60it/s]\n", + "Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 370.33ba/s]\n", + "Map: 100%|██████████| 2/2 [00:00<00:00, 571.12 examples/s]| 5/50 [00:04<00:40, 1.10it/s]\n", + "Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 817.60ba/s]\n", + "Map: 100%|██████████| 2/2 [00:00<00:00, 324.31 examples/s]| 6/50 [00:04<00:34, 1.29it/s]\n", + "Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 498.31ba/s]\n", + "Map: 100%|██████████| 2/2 [00:00<00:00, 444.97 examples/s]| 7/50 [00:05<00:30, 1.41it/s]\n", + "Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 1297.74ba/s]\n", + "Map: 100%|██████████| 2/2 [00:00<00:00, 411.41 examples/s]| 8/50 [00:05<00:26, 1.58it/s]\n", + "Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 658.34ba/s]\n", + "Map: 100%|██████████| 2/2 [00:00<00:00, 251.14 examples/s]| 9/50 [00:06<00:24, 1.69it/s]\n", + "Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 310.87ba/s]\n", + "Map: 100%|██████████| 2/2 [00:00<00:00, 267.65 examples/s]| 10/50 [00:07<00:25, 1.55it/s]\n", + "Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 654.03ba/s]\n", + "Map: 100%|██████████| 2/2 [00:00<00:00, 556.75 examples/s]| 11/50 [00:07<00:22, 1.73it/s]\n", + "Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 827.12ba/s]\n", + "Map: 100%|██████████| 2/2 [00:00<00:00, 394.70 examples/s]| 12/50 [00:07<00:21, 1.80it/s]\n", + "Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 621.75ba/s]\n", + "Map: 100%|██████████| 2/2 [00:00<00:00, 458.54 examples/s]| 13/50 [00:08<00:18, 2.01it/s]\n", + "Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 780.34ba/s]\n", + "Map: 100%|██████████| 2/2 [00:00<00:00, 495.72 examples/s]| 14/50 [00:08<00:18, 1.97it/s]\n", + "Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 579.40ba/s]\n", + "Map: 100%|██████████| 2/2 [00:00<00:00, 256.14 examples/s]| 15/50 [00:09<00:17, 2.05it/s]\n", + "Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 1015.32ba/s]\n", + "Map: 100%|██████████| 2/2 [00:00<00:00, 215.77 examples/s]| 16/50 [00:09<00:16, 2.12it/s]\n", + "Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 333.97ba/s]\n", + "Map: 100%|██████████| 2/2 [00:00<00:00, 458.64 examples/s]| 17/50 [00:10<00:15, 2.14it/s]\n", + "Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 876.00ba/s]\n", + "Map: 100%|██████████| 2/2 [00:00<00:00, 240.95 examples/s]| 18/50 [00:10<00:16, 1.96it/s]\n", + "Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 651.80ba/s]\n", + "Map: 100%|██████████| 2/2 [00:00<00:00, 285.24 examples/s]| 19/50 [00:11<00:15, 1.98it/s]\n", + "Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 367.57ba/s]\n", + "Map: 100%|██████████| 2/2 [00:00<00:00, 224.69 examples/s]| 20/50 [00:11<00:14, 2.14it/s]\n", + "Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 522.13ba/s]\n", + "Map: 100%|██████████| 2/2 [00:00<00:00, 236.60 examples/s]| 21/50 [00:12<00:13, 2.18it/s]\n", + "Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 771.72ba/s]\n", + "Map: 100%|██████████| 2/2 [00:00<00:00, 373.84 examples/s]| 22/50 [00:12<00:12, 2.29it/s]\n", + "Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 915.79ba/s]\n", + "Map: 100%|██████████| 2/2 [00:00<00:00, 229.33 examples/s]| 23/50 [00:12<00:11, 2.33it/s]\n", + "Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 267.55ba/s]\n", + "Map: 100%|██████████| 2/2 [00:00<00:00, 267.03 examples/s]| 24/50 [00:13<00:12, 2.02it/s]\n", + "Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 473.08ba/s]\n", + "Map: 100%|██████████| 2/2 [00:00<00:00, 286.27 examples/s]| 25/50 [00:14<00:12, 1.96it/s]\n", + "Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 642.61ba/s]\n", + "Map: 100%|██████████| 2/2 [00:00<00:00, 229.26 examples/s]| 26/50 [00:14<00:12, 2.00it/s]\n", + "Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 440.02ba/s]\n", + "Map: 100%|██████████| 2/2 [00:00<00:00, 247.52 examples/s]| 27/50 [00:15<00:10, 2.14it/s]\n", + "Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 547.70ba/s]\n", + "Map: 100%|██████████| 2/2 [00:00<00:00, 262.87 examples/s]| 28/50 [00:15<00:10, 2.04it/s]\n", + "Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 600.99ba/s]\n", + "Map: 100%|██████████| 2/2 [00:00<00:00, 597.05 examples/s]| 29/50 [00:16<00:11, 1.82it/s]\n", + "Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 1049.36ba/s]\n", + "Map: 100%|██████████| 2/2 [00:00<00:00, 255.86 examples/s]| 30/50 [00:16<00:10, 1.95it/s]\n", + "Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 460.20ba/s]\n", + "Map: 100%|██████████| 2/2 [00:00<00:00, 293.24 examples/s]| 31/50 [00:17<00:09, 2.04it/s]\n", + "Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 601.42ba/s]\n", + "Map: 100%|██████████| 2/2 [00:00<00:00, 281.53 examples/s]| 32/50 [00:17<00:08, 2.06it/s]\n", + "Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 293.99ba/s]\n", + "Map: 100%|██████████| 2/2 [00:00<00:00, 301.35 examples/s]| 33/50 [00:18<00:08, 2.01it/s]\n", + "Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 554.22ba/s]\n", + "Map: 100%|██████████| 1/1 [00:00<00:00, 107.45 examples/s]| 34/50 [00:18<00:09, 1.73it/s]\n", + "Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 260.34ba/s]\n", + "Map: 100%|██████████| 1/1 [00:00<00:00, 142.84 examples/s]| 35/50 [00:19<00:07, 1.89it/s]\n", + "Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 319.25ba/s]\n", + "Map: 100%|██████████| 1/1 [00:00<00:00, 120.66 examples/s]| 36/50 [00:19<00:06, 2.02it/s]\n", + "Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 400.30ba/s]\n", + "Map: 100%|██████████| 1/1 [00:00<00:00, 131.26 examples/s]| 37/50 [00:20<00:06, 2.12it/s]\n", + "Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 382.03ba/s]\n", + "Map: 100%|██████████| 1/1 [00:00<00:00, 149.10 examples/s]| 38/50 [00:20<00:05, 2.10it/s]\n", + "Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 498.97ba/s]\n", + "Map: 100%|██████████| 1/1 [00:00<00:00, 121.19 examples/s]| 39/50 [00:21<00:05, 2.07it/s]\n", + "Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 569.34ba/s]\n", + "Map: 100%|██████████| 1/1 [00:00<00:00, 123.25 examples/s]| 40/50 [00:21<00:05, 1.88it/s]\n", + "Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 451.49ba/s]\n", + "Map: 100%|██████████| 1/1 [00:00<00:00, 141.83 examples/s]| 41/50 [00:22<00:04, 1.92it/s]\n", + "Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 793.92ba/s]\n", + "Map: 100%|██████████| 1/1 [00:00<00:00, 125.42 examples/s]| 42/50 [00:22<00:04, 1.81it/s]\n", + "Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 378.65ba/s]\n", + "Map: 100%|██████████| 1/1 [00:00<00:00, 130.16 examples/s]| 43/50 [00:23<00:03, 1.93it/s]\n", + "Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 435.55ba/s]\n", + "Map: 100%|██████████| 1/1 [00:00<00:00, 131.85 examples/s]| 44/50 [00:23<00:03, 1.93it/s]\n", + "Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 368.18ba/s]\n", + "Map: 100%|██████████| 1/1 [00:00<00:00, 145.13 examples/s]| 45/50 [00:24<00:02, 1.90it/s]\n", + "Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 416.76ba/s]\n", + "Map: 100%|██████████| 1/1 [00:00<00:00, 131.33 examples/s]| 46/50 [00:24<00:02, 1.98it/s]\n", + "Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 404.74ba/s]\n", + "Map: 100%|██████████| 1/1 [00:00<00:00, 126.32 examples/s]| 47/50 [00:25<00:01, 1.84it/s]\n", + "Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 415.36ba/s]\n", + "Map: 100%|██████████| 1/1 [00:00<00:00, 217.10 examples/s]| 48/50 [00:25<00:01, 1.93it/s]\n", + "Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 1036.65ba/s]\n", + "Map: 100%|██████████| 1/1 [00:00<00:00, 125.22 examples/s]| 49/50 [00:26<00:00, 2.05it/s]\n", + "Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 684.45ba/s]\n", + "Pushing dataset shards to the dataset hub: 100%|██████████| 50/50 [00:26<00:00, 1.87it/s]\n", + "Downloading metadata: 100%|██████████| 21.0/21.0 [00:00<00:00, 63.6kB/s]\n" + ] + } + ], + "source": [ + "#NewDataset.push_to_hub('Honaker/xview_dataset_subset', num_shards={'train': 50, 'test': 0})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "pytorch101", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +}