From ee3f6c6d881783d8b75d9d4ce6e746c8b40565b5 Mon Sep 17 00:00:00 2001 From: Thiago Teixeira Santos Date: Sat, 25 May 2019 15:06:51 -0300 Subject: [PATCH] Fixing training and test splits. --- WGISD.ipynb | 176 ++++++++++++++++++----------------------------- test.txt | 2 - test_masked.txt | 3 +- train.txt | 2 + train_masked.txt | 2 + 5 files changed, 71 insertions(+), 114 deletions(-) diff --git a/WGISD.ipynb b/WGISD.ipynb index 3299ce4..6c55c98 100644 --- a/WGISD.ipynb +++ b/WGISD.ipynb @@ -56,12 +56,11 @@ "instances = {v: [] for v in varietals}\n", "\n", "for dirname, dirnames, filenames in os.walk('.'):\n", - " for filename in [f for f in filenames if f.endswith('.txt')]:\n", - " for v in varietals:\n", - " if filename.startswith(v):\n", - " instances[v].append(filename[:-4])\n", - " \n", - "\n" + " if dirname == '.':\n", + " for filename in [f for f in filenames if f.endswith('.txt')]:\n", + " for v in varietals:\n", + " if filename.startswith(v):\n", + " instances[v].append(filename[:-4])" ] }, { @@ -136,7 +135,7 @@ { "data": { "text/plain": [ - "{'CDY': 838, 'CFR': 1069, 'CSV': 640, 'SVB': 1313, 'SYH': 559}" + "{'CDY': 840, 'CFR': 1069, 'CSV': 643, 'SVB': 1317, 'SYH': 563}" ] }, "execution_count": 7, @@ -198,7 +197,7 @@ { "data": { "text/plain": [ - "{'CDY': 306, 'CFR': 513, 'CSV': 303, 'SVB': 604, 'SYH': 281}" + "{'CDY': 308, 'CFR': 513, 'CSV': 306, 'SVB': 608, 'SYH': 285}" ] }, "execution_count": 10, @@ -275,8 +274,8 @@ " \n", " CDY\n", " 65\n", - " 838\n", - " 306\n", + " 840\n", + " 308\n", " \n", " \n", " CFR\n", @@ -287,20 +286,20 @@ " \n", " CSV\n", " 57\n", - " 640\n", - " 303\n", + " 643\n", + " 306\n", " \n", " \n", " SVB\n", " 65\n", - " 1313\n", - " 604\n", + " 1317\n", + " 608\n", " \n", " \n", " SYH\n", " 48\n", - " 559\n", - " 281\n", + " 563\n", + " 285\n", " \n", " \n", "\n", @@ -308,11 +307,11 @@ ], "text/plain": [ " Images BoxedBunches MaskedBunches\n", - "CDY 65 838 306\n", + "CDY 65 840 308\n", "CFR 65 1069 513\n", - "CSV 57 640 303\n", - "SVB 65 1313 604\n", - "SYH 48 559 281" + "CSV 57 643 306\n", + "SVB 65 1317 608\n", + "SYH 48 563 285" ] }, "execution_count": 13, @@ -339,8 +338,8 @@ "data": { "text/plain": [ "Images 300\n", - "BoxedBunches 4419\n", - "MaskedBunches 2007\n", + "BoxedBunches 4432\n", + "MaskedBunches 2020\n", "dtype: int64" ] }, @@ -368,7 +367,7 @@ { "data": { "text/plain": [ - "240" + "242" ] }, "execution_count": 15, @@ -392,7 +391,7 @@ { "data": { "text/plain": [ - "60" + "58" ] }, "execution_count": 16, @@ -408,10 +407,37 @@ "len(test)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Assert train and test are _disjoints_ :" + ] + }, { "cell_type": "code", "execution_count": 17, "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "set()" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train.intersection(test)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, "outputs": [], "source": [ "data = []\n", @@ -435,7 +461,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 20, "metadata": {}, "outputs": [ { @@ -473,15 +499,15 @@ " \n", " \n", " Test\n", - " 60\n", - " 864\n", - " 422\n", + " 58\n", + " 850\n", + " 408\n", " \n", " \n", " Train\n", - " 240\n", - " 3555\n", - " 1585\n", + " 242\n", + " 3582\n", + " 1612\n", " \n", " \n", "\n", @@ -490,11 +516,11 @@ "text/plain": [ " Image BoxedBunches MaskedBunches\n", "Split \n", - "Test 60 864 422\n", - "Train 240 3555 1585" + "Test 58 850 408\n", + "Train 242 3582 1612" ] }, - "execution_count": 18, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -505,19 +531,19 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Image 300\n", - "BoxedBunches 4419\n", - "MaskedBunches 2007\n", + "BoxedBunches 4432\n", + "MaskedBunches 2020\n", "dtype: int64" ] }, - "execution_count": 19, + "execution_count": 27, "metadata": {}, "output_type": "execute_result" } @@ -525,83 +551,13 @@ "source": [ "dfi.groupby(['Split']).sum().sum()" ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
ImageBoxedBunchesMaskedBunches
Split
Test29422422
Train10815851585
\n", - "
" - ], - "text/plain": [ - " Image BoxedBunches MaskedBunches\n", - "Split \n", - "Test 29 422 422\n", - "Train 108 1585 1585" - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "dfi[dfi['MaskedBunches'] > 0].groupby(['Split']).sum()" - ] } ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 2", "language": "python", - "name": "python3" + "name": "python2" }, "language_info": { "codemirror_mode": { diff --git a/test.txt b/test.txt index 0935187..550dc54 100644 --- a/test.txt +++ b/test.txt @@ -6,7 +6,6 @@ CFR_1641 CFR_1638 SYH_2017-04-27_1269 SYH_2017-04-27_1322 -CDY_20180427_153310519_BURST000_COVER_TOP CSV_1877 CSV_20180427_144723166_HDR CDY_2040 @@ -20,7 +19,6 @@ CFR_1639 CFR_1643 CFR_1651 SVB_1972 -CDY_20180427_153144437_BURST000_COVER_TOP CDY_2046 SVB_1944 CDY_2038 diff --git a/test_masked.txt b/test_masked.txt index 4e5e189..845da3f 100644 --- a/test_masked.txt +++ b/test_masked.txt @@ -13,7 +13,6 @@ SVB_1972 CSV_1877 CDY_2015 SYH_2017-04-27_1269 -CDY_20180427_153144437_BURST000_COVER_TOP CFR_1638 CFR_1641 CFR_1643 @@ -26,4 +25,4 @@ SVB_1944 CDY_2043 CDY_2046 SYH_2017-04-27_1304 -CDY_20180427_153310519_BURST000_COVER_TOP + diff --git a/train.txt b/train.txt index 50e4c6d..66633ed 100644 --- a/train.txt +++ b/train.txt @@ -238,3 +238,5 @@ SYH_2017-04-27_1326 CFR_1630 SYH_2017-04-27_1268 CFR_20180427_141150696 +CDY_20180427_153310519_BURST000_COVER_TOP +CDY_20180427_153144437_BURST000_COVER_TOP diff --git a/train_masked.txt b/train_masked.txt index 115eaa6..9e1878c 100644 --- a/train_masked.txt +++ b/train_masked.txt @@ -106,3 +106,5 @@ CDY_2018 CFR_1647 SVB_1966 CFR_1670 +CDY_20180427_153310519_BURST000_COVER_TOP +CDY_20180427_153144437_BURST000_COVER_TOP