From ee3f6c6d881783d8b75d9d4ce6e746c8b40565b5 Mon Sep 17 00:00:00 2001
From: Thiago Teixeira Santos <thiago.santos@embrapa.br>
Date: Sat, 25 May 2019 15:06:51 -0300
Subject: [PATCH] Fixing training and test splits.

---
 WGISD.ipynb      | 176 ++++++++++++++++++-----------------------------
 test.txt         |   2 -
 test_masked.txt  |   3 +-
 train.txt        |   2 +
 train_masked.txt |   2 +
 5 files changed, 71 insertions(+), 114 deletions(-)
diff --git a/WGISD.ipynb b/WGISD.ipynb
index 3299ce4..6c55c98 100644
--- a/WGISD.ipynb
+++ b/WGISD.ipynb
@@ -56,12 +56,11 @@
     "instances = {v: [] for v in varietals}\n",
     "\n",
     "for dirname, dirnames, filenames in os.walk('.'):\n",
-    "    for filename in [f for f in filenames if f.endswith('.txt')]:\n",
-    "        for v in varietals:\n",
-    "            if filename.startswith(v):\n",
-    "                instances[v].append(filename[:-4])\n",
-    "        \n",
-    "\n"
+    "    if dirname == '.':\n",
+    "        for filename in [f for f in filenames if f.endswith('.txt')]:\n",
+    "            for v in varietals:\n",
+    "                if filename.startswith(v):\n",
+    "                    instances[v].append(filename[:-4])"
    ]
   },
   {
@@ -136,7 +135,7 @@
     {
      "data": {
       "text/plain": [
-       "{'CDY': 838, 'CFR': 1069, 'CSV': 640, 'SVB': 1313, 'SYH': 559}"
+       "{'CDY': 840, 'CFR': 1069, 'CSV': 643, 'SVB': 1317, 'SYH': 563}"
       ]
      },
      "execution_count": 7,
@@ -198,7 +197,7 @@
     {
      "data": {
       "text/plain": [
-       "{'CDY': 306, 'CFR': 513, 'CSV': 303, 'SVB': 604, 'SYH': 281}"
+       "{'CDY': 308, 'CFR': 513, 'CSV': 306, 'SVB': 608, 'SYH': 285}"
       ]
      },
      "execution_count": 10,
@@ -275,8 +274,8 @@
        "    <tr>\n",
        "      <th>CDY</th>\n",
        "      <td>65</td>\n",
-       "      <td>838</td>\n",
-       "      <td>306</td>\n",
+       "      <td>840</td>\n",
+       "      <td>308</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>CFR</th>\n",
@@ -287,20 +286,20 @@
        "    <tr>\n",
        "      <th>CSV</th>\n",
        "      <td>57</td>\n",
-       "      <td>640</td>\n",
-       "      <td>303</td>\n",
+       "      <td>643</td>\n",
+       "      <td>306</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>SVB</th>\n",
        "      <td>65</td>\n",
-       "      <td>1313</td>\n",
-       "      <td>604</td>\n",
+       "      <td>1317</td>\n",
+       "      <td>608</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>SYH</th>\n",
        "      <td>48</td>\n",
-       "      <td>559</td>\n",
-       "      <td>281</td>\n",
+       "      <td>563</td>\n",
+       "      <td>285</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
@@ -308,11 +307,11 @@
       ],
       "text/plain": [
        "     Images  BoxedBunches  MaskedBunches\n",
-       "CDY      65           838            306\n",
+       "CDY      65           840            308\n",
        "CFR      65          1069            513\n",
-       "CSV      57           640            303\n",
-       "SVB      65          1313            604\n",
-       "SYH      48           559            281"
+       "CSV      57           643            306\n",
+       "SVB      65          1317            608\n",
+       "SYH      48           563            285"
       ]
      },
      "execution_count": 13,
@@ -339,8 +338,8 @@
      "data": {
       "text/plain": [
        "Images            300\n",
-       "BoxedBunches     4419\n",
-       "MaskedBunches    2007\n",
+       "BoxedBunches     4432\n",
+       "MaskedBunches    2020\n",
        "dtype: int64"
       ]
      },
@@ -368,7 +367,7 @@
     {
      "data": {
       "text/plain": [
-       "240"
+       "242"
       ]
      },
      "execution_count": 15,
@@ -392,7 +391,7 @@
     {
      "data": {
       "text/plain": [
-       "60"
+       "58"
       ]
      },
      "execution_count": 16,
@@ -408,10 +407,37 @@
     "len(test)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Assert that the train and test sets are _disjoint_:"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 17,
    "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "set()"
+      ]
+     },
+     "execution_count": 17,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "train.intersection(test)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
    "outputs": [],
    "source": [
     "data = []\n",
@@ -435,7 +461,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 20,
    "metadata": {},
    "outputs": [
     {
@@ -473,15 +499,15 @@
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>Test</th>\n",
-       "      <td>60</td>\n",
-       "      <td>864</td>\n",
-       "      <td>422</td>\n",
+       "      <td>58</td>\n",
+       "      <td>850</td>\n",
+       "      <td>408</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>Train</th>\n",
-       "      <td>240</td>\n",
-       "      <td>3555</td>\n",
-       "      <td>1585</td>\n",
+       "      <td>242</td>\n",
+       "      <td>3582</td>\n",
+       "      <td>1612</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
@@ -490,11 +516,11 @@
       "text/plain": [
        "       Image  BoxedBunches  MaskedBunches\n",
        "Split                                    \n",
-       "Test      60           864            422\n",
-       "Train    240          3555           1585"
+       "Test      58           850            408\n",
+       "Train    242          3582           1612"
       ]
      },
-     "execution_count": 18,
+     "execution_count": 20,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -505,19 +531,19 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 27,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
        "Image             300\n",
-       "BoxedBunches     4419\n",
-       "MaskedBunches    2007\n",
+       "BoxedBunches     4432\n",
+       "MaskedBunches    2020\n",
        "dtype: int64"
       ]
      },
-     "execution_count": 19,
+     "execution_count": 27,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -525,83 +551,13 @@
    "source": [
     "dfi.groupby(['Split']).sum().sum()"
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 20,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>Image</th>\n",
-       "      <th>BoxedBunches</th>\n",
-       "      <th>MaskedBunches</th>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>Split</th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>Test</th>\n",
-       "      <td>29</td>\n",
-       "      <td>422</td>\n",
-       "      <td>422</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>Train</th>\n",
-       "      <td>108</td>\n",
-       "      <td>1585</td>\n",
-       "      <td>1585</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "       Image  BoxedBunches  MaskedBunches\n",
-       "Split                                    \n",
-       "Test      29           422            422\n",
-       "Train    108          1585           1585"
-      ]
-     },
-     "execution_count": 20,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "dfi[dfi['MaskedBunches'] > 0].groupby(['Split']).sum()"
-   ]
   }
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 2",
    "language": "python",
-   "name": "python3"
+   "name": "python2"
   },
   "language_info": {
    "codemirror_mode": {
diff --git a/test.txt b/test.txt
index 0935187..550dc54 100644
--- a/test.txt
+++ b/test.txt
@@ -6,7 +6,6 @@ CFR_1641
 CFR_1638
 SYH_2017-04-27_1269
 SYH_2017-04-27_1322
-CDY_20180427_153310519_BURST000_COVER_TOP
 CSV_1877
 CSV_20180427_144723166_HDR
 CDY_2040
@@ -20,7 +19,6 @@ CFR_1639
 CFR_1643
 CFR_1651
 SVB_1972
-CDY_20180427_153144437_BURST000_COVER_TOP
 CDY_2046
 SVB_1944
 CDY_2038
diff --git a/test_masked.txt b/test_masked.txt
index 4e5e189..845da3f 100644
--- a/test_masked.txt
+++ b/test_masked.txt
@@ -13,7 +13,6 @@ SVB_1972
 CSV_1877
 CDY_2015
 SYH_2017-04-27_1269
-CDY_20180427_153144437_BURST000_COVER_TOP
 CFR_1638
 CFR_1641
 CFR_1643
@@ -26,4 +25,4 @@ SVB_1944
 CDY_2043
 CDY_2046
 SYH_2017-04-27_1304
-CDY_20180427_153310519_BURST000_COVER_TOP
+
diff --git a/train.txt b/train.txt
index 50e4c6d..66633ed 100644
--- a/train.txt
+++ b/train.txt
@@ -238,3 +238,5 @@ SYH_2017-04-27_1326
 CFR_1630
 SYH_2017-04-27_1268
 CFR_20180427_141150696
+CDY_20180427_153310519_BURST000_COVER_TOP
+CDY_20180427_153144437_BURST000_COVER_TOP
diff --git a/train_masked.txt b/train_masked.txt
index 115eaa6..9e1878c 100644
--- a/train_masked.txt
+++ b/train_masked.txt
@@ -106,3 +106,5 @@ CDY_2018
 CFR_1647
 SVB_1966
 CFR_1670
+CDY_20180427_153310519_BURST000_COVER_TOP
+CDY_20180427_153144437_BURST000_COVER_TOP