dreamquark-ai · eduardocarvp · Mar 23, 2022 · Mar 3, 2022 · Mar 8, 2022 · Mar 23, 2022
diff --git a/Makefile b/Makefile
@@ -9,7 +9,7 @@ NO_COLOR=\\e[39m
 OK_COLOR=\\e[32m
 ERROR_COLOR=\\e[31m
 WARN_COLOR=\\e[33m
-PORT=8889
+PORT=8887
 .SILENT: ;
 default: help;   # default target
 

diff --git a/README.md b/README.md
@@ -192,6 +192,12 @@ A complete example can be found within the notebook `pretraining_example.ipynb`.
 
 /!\ : current implementation is trying to reconstruct the original inputs, but Batch Normalization applies a random transformation that can't be deduced by a single line, making the reconstruction harder. Lowering the `batch_size` might make the pretraining easier.
 
+# Data augmentation on the fly
+
+It is now possible to apply a custom data augmentation pipeline during training.
+Templates for `ClassificationSMOTE` and `RegressionSMOTE` have been added in `pytorch_tabnet/augmentations.py` and can be used as is: pass an instance to `fit` through the `augmentations` argument.
+
+
 # Easy saving and loading
 
 It's really easy to save and re-load a trained model, this makes TabNet production ready.

diff --git a/census_example.ipynb b/census_example.ipynb
@@ -205,6 +205,16 @@
     "max_epochs = 100 if not os.getenv(\"CI\", False) else 2"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pytorch_tabnet.augmentations import ClassificationSMOTE\n",
+    "aug = ClassificationSMOTE(p=0.2)"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -225,10 +235,11 @@
     "        batch_size=1024, virtual_batch_size=128,\n",
     "        num_workers=0,\n",
     "        weights=1,\n",
-    "        drop_last=False\n",
+    "        drop_last=False,\n",
+    "        augmentations=aug, #aug, None\n",
     "    )\n",
     "    save_history.append(clf.history[\"valid_auc\"])\n",
-    "    \n",
+    "\n",
     "assert(np.all(np.array(save_history[0]==np.array(save_history[1]))))"
    ]
   },

diff --git a/forest_example.ipynb b/forest_example.ipynb
@@ -237,7 +237,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "max_epochs = 5 if not os.getenv(\"CI\", False) else 2"
+    "max_epochs = 50 if not os.getenv(\"CI\", False) else 2"
    ]
   },
   {
@@ -248,12 +248,16 @@
    },
    "outputs": [],
    "source": [
+    "from pytorch_tabnet.augmentations import ClassificationSMOTE\n",
+    "aug = ClassificationSMOTE(p=0.2)\n",
+    "\n",
     "clf.fit(\n",
     "    X_train=X_train, y_train=y_train,\n",
     "    eval_set=[(X_train, y_train), (X_valid, y_valid)],\n",
     "    eval_name=['train', 'valid'],\n",
     "    max_epochs=max_epochs, patience=100,\n",
-    "    batch_size=16384, virtual_batch_size=256\n",
+    "    batch_size=16384, virtual_batch_size=256,\n",
+    "    augmentations=aug\n",
     ") "
    ]
   },