Skip to content

Commit

Permalink
Fixing training and test splits.
Browse files Browse the repository at this point in the history
  • Loading branch information
thsant committed May 25, 2019
1 parent c671353 commit ee3f6c6
Show file tree
Hide file tree
Showing 5 changed files with 71 additions and 114 deletions.
176 changes: 66 additions & 110 deletions WGISD.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -56,12 +56,11 @@
"instances = {v: [] for v in varietals}\n",
"\n",
"for dirname, dirnames, filenames in os.walk('.'):\n",
" for filename in [f for f in filenames if f.endswith('.txt')]:\n",
" for v in varietals:\n",
" if filename.startswith(v):\n",
" instances[v].append(filename[:-4])\n",
" \n",
"\n"
" if dirname == '.':\n",
" for filename in [f for f in filenames if f.endswith('.txt')]:\n",
" for v in varietals:\n",
" if filename.startswith(v):\n",
" instances[v].append(filename[:-4])"
]
},
{
Expand Down Expand Up @@ -136,7 +135,7 @@
{
"data": {
"text/plain": [
"{'CDY': 838, 'CFR': 1069, 'CSV': 640, 'SVB': 1313, 'SYH': 559}"
"{'CDY': 840, 'CFR': 1069, 'CSV': 643, 'SVB': 1317, 'SYH': 563}"
]
},
"execution_count": 7,
Expand Down Expand Up @@ -198,7 +197,7 @@
{
"data": {
"text/plain": [
"{'CDY': 306, 'CFR': 513, 'CSV': 303, 'SVB': 604, 'SYH': 281}"
"{'CDY': 308, 'CFR': 513, 'CSV': 306, 'SVB': 608, 'SYH': 285}"
]
},
"execution_count": 10,
Expand Down Expand Up @@ -275,8 +274,8 @@
" <tr>\n",
" <th>CDY</th>\n",
" <td>65</td>\n",
" <td>838</td>\n",
" <td>306</td>\n",
" <td>840</td>\n",
" <td>308</td>\n",
" </tr>\n",
" <tr>\n",
" <th>CFR</th>\n",
Expand All @@ -287,32 +286,32 @@
" <tr>\n",
" <th>CSV</th>\n",
" <td>57</td>\n",
" <td>640</td>\n",
" <td>303</td>\n",
" <td>643</td>\n",
" <td>306</td>\n",
" </tr>\n",
" <tr>\n",
" <th>SVB</th>\n",
" <td>65</td>\n",
" <td>1313</td>\n",
" <td>604</td>\n",
" <td>1317</td>\n",
" <td>608</td>\n",
" </tr>\n",
" <tr>\n",
" <th>SYH</th>\n",
" <td>48</td>\n",
" <td>559</td>\n",
" <td>281</td>\n",
" <td>563</td>\n",
" <td>285</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Images BoxedBunches MaskedBunches\n",
"CDY 65 838 306\n",
"CDY 65 840 308\n",
"CFR 65 1069 513\n",
"CSV 57 640 303\n",
"SVB 65 1313 604\n",
"SYH 48 559 281"
"CSV 57 643 306\n",
"SVB 65 1317 608\n",
"SYH 48 563 285"
]
},
"execution_count": 13,
Expand All @@ -339,8 +338,8 @@
"data": {
"text/plain": [
"Images 300\n",
"BoxedBunches 4419\n",
"MaskedBunches 2007\n",
"BoxedBunches 4432\n",
"MaskedBunches 2020\n",
"dtype: int64"
]
},
Expand Down Expand Up @@ -368,7 +367,7 @@
{
"data": {
"text/plain": [
"240"
"242"
]
},
"execution_count": 15,
Expand All @@ -392,7 +391,7 @@
{
"data": {
"text/plain": [
"60"
"58"
]
},
"execution_count": 16,
Expand All @@ -408,10 +407,37 @@
"len(test)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Check that train and test are _disjoint_ (their intersection must be empty):"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"set()"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train.intersection(test)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"data = []\n",
Expand All @@ -435,7 +461,7 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 20,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -473,15 +499,15 @@
" <tbody>\n",
" <tr>\n",
" <th>Test</th>\n",
" <td>60</td>\n",
" <td>864</td>\n",
" <td>422</td>\n",
" <td>58</td>\n",
" <td>850</td>\n",
" <td>408</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Train</th>\n",
" <td>240</td>\n",
" <td>3555</td>\n",
" <td>1585</td>\n",
" <td>242</td>\n",
" <td>3582</td>\n",
" <td>1612</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
Expand All @@ -490,11 +516,11 @@
"text/plain": [
" Image BoxedBunches MaskedBunches\n",
"Split \n",
"Test 60 864 422\n",
"Train 240 3555 1585"
"Test 58 850 408\n",
"Train 242 3582 1612"
]
},
"execution_count": 18,
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -505,103 +531,33 @@
},
{
"cell_type": "code",
"execution_count": 19,
"execution_count": 27,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Image 300\n",
"BoxedBunches 4419\n",
"MaskedBunches 2007\n",
"BoxedBunches 4432\n",
"MaskedBunches 2020\n",
"dtype: int64"
]
},
"execution_count": 19,
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dfi.groupby(['Split']).sum().sum()"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Image</th>\n",
" <th>BoxedBunches</th>\n",
" <th>MaskedBunches</th>\n",
" </tr>\n",
" <tr>\n",
" <th>Split</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>Test</th>\n",
" <td>29</td>\n",
" <td>422</td>\n",
" <td>422</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Train</th>\n",
" <td>108</td>\n",
" <td>1585</td>\n",
" <td>1585</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Image BoxedBunches MaskedBunches\n",
"Split \n",
"Test 29 422 422\n",
"Train 108 1585 1585"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dfi[dfi['MaskedBunches'] > 0].groupby(['Split']).sum()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"display_name": "Python 2",
"language": "python",
"name": "python3"
"name": "python2"
},
"language_info": {
"codemirror_mode": {
Expand Down
2 changes: 0 additions & 2 deletions test.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@ CFR_1641
CFR_1638
SYH_2017-04-27_1269
SYH_2017-04-27_1322
CDY_20180427_153310519_BURST000_COVER_TOP
CSV_1877
CSV_20180427_144723166_HDR
CDY_2040
Expand All @@ -20,7 +19,6 @@ CFR_1639
CFR_1643
CFR_1651
SVB_1972
CDY_20180427_153144437_BURST000_COVER_TOP
CDY_2046
SVB_1944
CDY_2038
Expand Down
3 changes: 1 addition & 2 deletions test_masked.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ SVB_1972
CSV_1877
CDY_2015
SYH_2017-04-27_1269
CDY_20180427_153144437_BURST000_COVER_TOP
CFR_1638
CFR_1641
CFR_1643
Expand All @@ -26,4 +25,4 @@ SVB_1944
CDY_2043
CDY_2046
SYH_2017-04-27_1304
CDY_20180427_153310519_BURST000_COVER_TOP

2 changes: 2 additions & 0 deletions train.txt
Original file line number Diff line number Diff line change
Expand Up @@ -238,3 +238,5 @@ SYH_2017-04-27_1326
CFR_1630
SYH_2017-04-27_1268
CFR_20180427_141150696
CDY_20180427_153310519_BURST000_COVER_TOP
CDY_20180427_153144437_BURST000_COVER_TOP
2 changes: 2 additions & 0 deletions train_masked.txt
Original file line number Diff line number Diff line change
Expand Up @@ -106,3 +106,5 @@ CDY_2018
CFR_1647
SVB_1966
CFR_1670
CDY_20180427_153310519_BURST000_COVER_TOP
CDY_20180427_153144437_BURST000_COVER_TOP

0 comments on commit ee3f6c6

Please sign in to comment.