From db86ec414ea4ba54a4a96027970e2bf9bf3cdbd8 Mon Sep 17 00:00:00 2001
From: Jakub Czakon <jakub.czakon@pascal01.intra.codilime.com>
Date: Sat, 14 Jul 2018 15:27:32 +0200
Subject: [PATCH 1/2] added fraction features to eda and feature extraction,
 updated configs

---
 configs/neptune.yaml               |   3 +-
 configs/neptune_random_search.yaml |   1 +
 configs/neptune_stacking.yaml      |   5 +-
 notebooks/eda-installments.ipynb   | 220 +++++++++++++++++++++++++----
 src/feature_extraction.py          |  37 ++++-
 src/pipeline_config.py             |   4 +-
 src/utils.py                       |   6 +
 7 files changed, 236 insertions(+), 40 deletions(-)

diff --git a/configs/neptune.yaml b/configs/neptune.yaml
index f172663..0347d35 100644
--- a/configs/neptune.yaml
+++ b/configs/neptune.yaml
@@ -52,7 +52,8 @@ parameters:
 
 # Feature Extraction
   installments__last_k_trend_periods: '[10, 50, 100, 500]'
-  installments__last_k_agg_periods: '[1, 5, 10, 50, 100, 500]'
+  installments__last_k_agg_periods: '[1, 5, 10, 20, 50, 100]'
+  installments__last_k_agg_period_fractions: '[(5,20),(5,50),(10,50),(10,100),(20,100)]'
   application_aggregation__use_diffs_only: True
   use_nan_count: True
 
diff --git a/configs/neptune_random_search.yaml b/configs/neptune_random_search.yaml
index 3f6b0dd..8db4ea5 100644
--- a/configs/neptune_random_search.yaml
+++ b/configs/neptune_random_search.yaml
@@ -53,6 +53,7 @@ parameters:
 # Feature Extraction
   installments__last_k_trend_periods: '[10, 50, 100, 500]'
   installments__last_k_agg_periods: '[1, 5, 10, 50, 100, 500]'
+  installments__last_k_agg_period_fractions: '[(5,20),(5,50),(10,50),(10,100),(20,100)]'
   application_aggregation__use_diffs_only: True
   use_nan_count: True
 
diff --git a/configs/neptune_stacking.yaml b/configs/neptune_stacking.yaml
index 74c7c9b..6d2c53d 100644
--- a/configs/neptune_stacking.yaml
+++ b/configs/neptune_stacking.yaml
@@ -51,8 +51,9 @@ parameters:
   fill_value: 0
 
 # Feature Extraction
-  installments__last_k_trend_periods: '[10, 50, 100, 500]'
-  installments__last_k_agg_periods: '[1, 5, 10, 50, 100, 500]'
+  installments__last_k_trend_periods: None
+  installments__last_k_agg_periods: None
+  installments__last_k_agg_period_fractions: None
   application_aggregation__use_diffs_only: True
   use_nan_count: True
 
diff --git a/notebooks/eda-installments.ipynb b/notebooks/eda-installments.ipynb
index 71ab29a..db2ec2b 100644
--- a/notebooks/eda-installments.ipynb
+++ b/notebooks/eda-installments.ipynb
@@ -3,7 +3,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "%load_ext autoreload\n",
@@ -24,7 +26,7 @@
     "\n",
     "sys.path.append('../')\n",
     "from src.utils import parallel_apply\n",
-    "from src.feature_extraction import add_features, add_features_in_group\n",
+    "from src.feature_extraction import add_features_in_group\n",
     "\n",
     "warnings.filterwarnings('ignore')\n",
     "\n",
@@ -34,7 +36,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "description = pd.read_csv(os.path.join(DIR,'data/HomeCredit_columns_description.csv'),encoding = 'latin1')\n",
@@ -45,7 +49,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [],
    "source": [
     "installments.head()"
@@ -77,7 +83,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "INSTALLMENTS_PAYMENTS_AGGREGATION_RECIPIES = []\n",
@@ -96,7 +104,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [],
    "source": [
     "groupby_aggregate_names = []\n",
@@ -118,7 +128,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [],
    "source": [
     "application.head()"
@@ -127,7 +139,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "application_agg = application[groupby_aggregate_names + ['TARGET']]\n",
@@ -137,7 +151,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [],
    "source": [
     "application_agg_corr.sort_values('TARGET', ascending=False)['TARGET']"
@@ -153,7 +169,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [],
    "source": [
     "positive_ID = application[application['TARGET']==1]['SK_ID_CURR'].tolist()\n",
@@ -163,7 +181,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "value_counts = installments[installments['SK_ID_CURR'].isin(positive_ID)]['SK_ID_CURR'].value_counts()"
@@ -172,7 +192,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [],
    "source": [
     "value_counts.head()"
@@ -181,7 +203,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [],
    "source": [
     "sns.distplot(value_counts)"
@@ -190,7 +214,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "installments_one = installments[installments['SK_ID_CURR']==328162]"
@@ -199,7 +225,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "installments_one.sort_values(['DAYS_INSTALMENT'],ascending=False).head(10)"
@@ -208,7 +236,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "# installments_ = installments[installments['SK_ID_CURR'].isin(positive_ID[:100])]\n",
@@ -222,7 +252,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "def add_features(feature_name, aggs, features, feature_names, groupby):\n",
@@ -265,13 +297,15 @@
     "            features['{}{}_iqr'.format(prefix, feature_name)] = iqr(gr_[feature_name])\n",
     "        elif agg == 'median':\n",
     "            features['{}{}_median'.format(prefix, feature_name)] = gr_[feature_name].median()\n",
-    "        return features"
+    "    return features"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "features = pd.DataFrame({'SK_ID_CURR':installments_['SK_ID_CURR'].unique()})\n",
@@ -281,7 +315,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [],
    "source": [
     "installments_.head()"
@@ -297,7 +333,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [],
    "source": [
     "feature_names = []\n",
@@ -333,7 +371,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "def last_k_instalment_features(gr, periods):\n",
@@ -360,7 +400,7 @@
     "                                         'last_{}_'.format(period))\n",
     "        features = add_features_in_group(features,gr_period,'instalment_paid_over', \n",
     "                                     ['count','mean'],\n",
-    "                                         'last_{}_'.format(period))\n",
+    "                                         'last_{}_'.format(period))        \n",
     "    \n",
     "    return features"
    ]
@@ -368,7 +408,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "func = partial(last_k_instalment_features, periods=[1,5,10,20,50,100])\n",
@@ -390,7 +432,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "from sklearn.linear_model import LinearRegression"
@@ -399,7 +443,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "def trend_in_last_k_instalment_features(gr, periods):\n",
@@ -436,7 +482,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "func = partial(trend_in_last_k_instalment_features, periods=[10,50,100,500])\n",
@@ -451,7 +499,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "X = application.merge(features, on='SK_ID_CURR',how='left')\n",
@@ -462,7 +512,117 @@
   {
    "cell_type": "code",
    "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "X_corr = abs(X.corr())\n",
+    "X_corr.sort_values('TARGET', ascending=False)['TARGET']"
+   ]
+  },
+  {
+   "cell_type": "markdown",
    "metadata": {},
+   "source": [
+    "# Solution 5\n",
+    "## Period fractions"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "def last_k_instalment_features_with_fractions(gr, periods, fraction_periods):\n",
+    "    gr_ = gr.copy()\n",
+    "    gr_.sort_values(['DAYS_INSTALMENT'],ascending=False, inplace=True)\n",
+    "    \n",
+    "    features = {}\n",
+    "\n",
+    "    for period in periods:\n",
+    "        gr_period = gr_.iloc[:period]\n",
+    "\n",
+    "        features = add_features_in_group(features,gr_period, 'NUM_INSTALMENT_VERSION', \n",
+    "                                       ['sum','mean','max','min','std', 'median','skew', 'kurt','iqr'],\n",
+    "                                         'last_{}_'.format(period))\n",
+    "        \n",
+    "        features = add_features_in_group(features,gr_period, 'instalment_paid_late_in_days', \n",
+    "                                       ['sum','mean','max','min','std', 'median','skew', 'kurt','iqr'],\n",
+    "                                         'last_{}_'.format(period))\n",
+    "        features = add_features_in_group(features,gr_period ,'instalment_paid_late', \n",
+    "                                     ['count','mean'],\n",
+    "                                         'last_{}_'.format(period))\n",
+    "        features = add_features_in_group(features,gr_period ,'instalment_paid_over_amount', \n",
+    "                                       ['sum','mean','max','min','std', 'median','skew', 'kurt','iqr'],\n",
+    "                                         'last_{}_'.format(period))\n",
+    "        features = add_features_in_group(features,gr_period,'instalment_paid_over', \n",
+    "                                     ['count','mean'],\n",
+    "                                         'last_{}_'.format(period))        \n",
+    "    \n",
+    "    for short_period, long_period in fraction_periods:\n",
+    "        short_feature_names = _get_feature_names(features, short_period)\n",
+    "        long_feature_names = _get_feature_names(features, long_period)\n",
+    "        \n",
+    "        for short_feature, long_feature in zip(short_feature_names, long_feature_names):\n",
+    "            old_name_chunk = '_{}_'.format(short_period)\n",
+    "            new_name_chunk ='_{}by{}_fraction_'.format(short_period, long_period)\n",
+    "            fraction_feature_name = short_feature.replace(old_name_chunk, new_name_chunk)\n",
+    "            features[fraction_feature_name] = safe_div(features[short_feature], features[long_feature])\n",
+    "    return pd.Series(features)\n",
+    "\n",
+    "def _get_feature_names(features, period):\n",
+    "    return sorted([feat for feat in features.keys() if '_{}_'.format(period) in feat])\n",
+    "\n",
+    "\n",
+    "def safe_div(a,b):\n",
+    "    try:\n",
+    "        return float(a)/float(b)\n",
+    "    except:\n",
+    "        return 0.0"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "func = partial(last_k_instalment_features_with_fractions, \n",
+    "               periods=[1,5,10,20,50,100],\n",
+    "               fraction_periods=[(5,20),(5,50),(10,100)])\n",
+    "\n",
+    "g = parallel_apply(groupby, func, index_name='SK_ID_CURR',\n",
+    "                   num_workers=16, chunk_size=1000).reset_index()\n",
+    "features = features.merge(g, on='SK_ID_CURR', how='left')\n",
+    "\n",
+    "display(features.head())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "X = application.merge(features, on='SK_ID_CURR',how='left')\n",
+    "X = X[features.columns.drop('SK_ID_CURR').tolist()+['TARGET']]\n",
+    "X.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [],
    "source": [
     "X_corr = abs(X.corr())\n",
@@ -472,7 +632,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": []
   }
diff --git a/src/feature_extraction.py b/src/feature_extraction.py
index 692e1a4..47987e0 100644
--- a/src/feature_extraction.py
+++ b/src/feature_extraction.py
@@ -11,7 +11,7 @@
 from steppy.base import BaseTransformer
 from steppy.utils import get_logger
 
-from .utils import parallel_apply
+from .utils import parallel_apply, safe_div
 
 logger = get_logger()
 
@@ -473,11 +473,11 @@ def fit(self, prev_applications, **kwargs):
         g = prev_app_sorted.groupby(by=['SK_ID_CURR'])['previous_application_prev_was_refused'].mean().reset_index()
         g.rename(index=str, columns={
             'previous_application_prev_was_refused': 'previous_application_fraction_of_refused_applications'},
-                            inplace=True)
+                 inplace=True)
         features = features.merge(g, on=['SK_ID_CURR'], how='left')
 
         prev_app_sorted['prev_applications_prev_was_revolving_loan'] = (
-                prev_app_sorted['NAME_CONTRACT_TYPE'] == 'Revolving loans').astype('int')
+            prev_app_sorted['NAME_CONTRACT_TYPE'] == 'Revolving loans').astype('int')
         g = prev_app_sorted.groupby(by=['SK_ID_CURR'])[
             'prev_applications_prev_was_revolving_loan'].last().reset_index()
         features = features.merge(g, on=['SK_ID_CURR'], how='left')
@@ -513,9 +513,10 @@ def fit(self, prev_applications, **kwargs):
 
 
 class InstallmentPaymentsFeatures(BasicHandCraftedFeatures):
-    def __init__(self, last_k_agg_periods, last_k_trend_periods, num_workers=1, **kwargs):
+    def __init__(self, last_k_agg_periods, last_k_agg_period_fractions, last_k_trend_periods, num_workers=1, **kwargs):
         super().__init__(num_workers=num_workers)
         self.last_k_agg_periods = last_k_agg_periods
+        self.last_k_agg_period_fractions = last_k_agg_period_fractions
         self.last_k_trend_periods = last_k_trend_periods
 
         self.num_workers = num_workers
@@ -533,6 +534,7 @@ def fit(self, installments, **kwargs):
 
         func = partial(InstallmentPaymentsFeatures.generate_features,
                        agg_periods=self.last_k_agg_periods,
+                       period_fractions=self.last_k_agg_period_fractions,
                        trend_periods=self.last_k_trend_periods)
         g = parallel_apply(groupby, func, index_name='SK_ID_CURR', num_workers=self.num_workers).reset_index()
         features = features.merge(g, on='SK_ID_CURR', how='left')
@@ -541,9 +543,11 @@ def fit(self, installments, **kwargs):
         return self
 
     @staticmethod
-    def generate_features(gr, agg_periods, trend_periods):
-        all = InstallmentPaymentsFeatures.last_k_installment_features(gr, periods=[10e16])
-        agg = InstallmentPaymentsFeatures.last_k_installment_features(gr, agg_periods)
+    def generate_features(gr, agg_periods, trend_periods, period_fractions):
+        all = InstallmentPaymentsFeatures.all_installment_features(gr)
+        agg = InstallmentPaymentsFeatures.last_k_installment_features_with_fractions(gr,
+                                                                                     agg_periods,
+                                                                                     period_fractions)
         trend = InstallmentPaymentsFeatures.trend_in_last_k_installment_features(gr, trend_periods)
         last = InstallmentPaymentsFeatures.last_loan_features(gr)
         features = {**all, **agg, **trend, **last}
@@ -553,6 +557,39 @@ def generate_features(gr, agg_periods, trend_periods):
     def all_installment_features(gr):
         return InstallmentPaymentsFeatures.last_k_installment_features(gr, periods=[10e16])
 
+    @staticmethod
+    def last_k_installment_features_with_fractions(gr, periods, period_fractions):
+        """Compute last-k aggregate features plus short/long period ratios.
+
+        Args:
+            gr: group passed in by the caller; forwarded unchanged to
+                ``last_k_installment_features``.
+            periods: periods forwarded to ``last_k_installment_features``.
+            period_fractions: iterable of ``(short_period, long_period)``
+                pairs; ``None`` or empty adds no ratio features.
+
+        Returns:
+            The mapping returned by ``last_k_installment_features`` extended
+            with one ``..._{short}by{long}_fraction_...`` entry for every
+            feature name that exists for both periods of a pair.
+        """
+        features = InstallmentPaymentsFeatures.last_k_installment_features(gr, periods)
+
+        for short_period, long_period in period_fractions or []:
+            short_chunk = '_{}_'.format(short_period)
+            long_chunk = '_{}_'.format(long_period)
+            fraction_chunk = '_{}by{}_fraction_'.format(short_period, long_period)
+
+            for short_feature in get_feature_names_by_period(features, short_period):
+                # Pair by name instead of by sorted position so that a period
+                # missing some (or all) features cannot mis-align the ratios.
+                long_feature = short_feature.replace(short_chunk, long_chunk)
+                if long_feature not in features:
+                    continue
+                fraction_feature_name = short_feature.replace(short_chunk, fraction_chunk)
+                features[fraction_feature_name] = safe_div(features[short_feature], features[long_feature])
+        return features
+
     @staticmethod
     def last_k_installment_features(gr, periods):
         gr_ = gr.copy()
@@ -675,3 +694,17 @@ def add_trend_feature(features, gr, feature_name, prefix):
         trend = np.nan
     features['{}{}'.format(prefix, feature_name)] = trend
     return features
+
+
+def get_feature_names_by_period(features, period):
+    """Return the sorted feature names that contain ``_<period>_``.
+
+    Args:
+        features: mapping keyed by feature name; only the keys are read.
+        period: value formatted into the ``_<period>_`` marker to look for.
+
+    Returns:
+        Sorted list of matching keys (empty when nothing matches).
+    """
+    period_marker = '_{}_'.format(period)
+    return sorted(name for name in features.keys() if period_marker in name)
diff --git a/src/pipeline_config.py b/src/pipeline_config.py
index 35f4ea5..ebabe7e 100644
--- a/src/pipeline_config.py
+++ b/src/pipeline_config.py
@@ -144,7 +144,7 @@
 aggregation_pairs = [(col, agg) for col in cols_to_agg for agg in aggs]
 
 APPLICATION_AGGREGATION_RECIPIES = [
-    (['NAME_EDUCATION_TYPE', 'CODE_GENDER'],  aggregation_pairs),
+    (['NAME_EDUCATION_TYPE', 'CODE_GENDER'], aggregation_pairs),
     (['NAME_FAMILY_STATUS', 'NAME_EDUCATION_TYPE'], aggregation_pairs),
     (['NAME_FAMILY_STATUS', 'CODE_GENDER'], aggregation_pairs),
     (['CODE_GENDER', 'ORGANIZATION_TYPE'], [('AMT_ANNUITY', 'mean'),
@@ -292,6 +292,8 @@
                               'id_columns': ('SK_ID_CURR', 'SK_ID_CURR'),
                               'groupby_aggregations': INSTALLMENTS_PAYMENTS_AGGREGATION_RECIPIES,
                               'last_k_agg_periods': parameter_eval(params.installments__last_k_agg_periods),
+                              'last_k_agg_period_fractions': parameter_eval(
+                                  params.installments__last_k_agg_period_fractions),
                               'last_k_trend_periods': parameter_eval(params.installments__last_k_trend_periods),
                               'num_workers': params.num_workers
                               },
diff --git a/src/utils.py b/src/utils.py
index 35d8921..63a2d43 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -150,3 +150,22 @@ def _clean_columns(df, keep_colnames):
     for i, colname in enumerate(feature_colnames):
         new_colnames.append('model_{}'.format(i))
     return new_colnames
+
+
+def safe_div(a, b):
+    """Divide ``a`` by ``b`` as floats, falling back to 0.0 on failure.
+
+    Args:
+        a: numerator; anything ``float()`` accepts.
+        b: denominator; anything ``float()`` accepts.
+
+    Returns:
+        ``float(a) / float(b)``, or ``0.0`` when either operand cannot be
+        converted to float or the denominator is zero.
+    """
+    try:
+        return float(a) / float(b)
+    except (TypeError, ValueError, ZeroDivisionError, OverflowError):
+        # Deliberately narrow: a bare ``except`` would also swallow
+        # KeyboardInterrupt and SystemExit.
+        return 0.0
\ No newline at end of file

From 213ab48e6b4309dcb3a39b77978294b4a6af0620 Mon Sep 17 00:00:00 2001
From: Jakub Czakon <jakub.czakon@pascal01.intra.codilime.com>
Date: Mon, 16 Jul 2018 07:40:10 +0200
Subject: [PATCH 2/2] updated hyperparams

---
 configs/neptune.yaml    | 6 +++---
 src/pipeline_manager.py | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/configs/neptune.yaml b/configs/neptune.yaml
index 0347d35..5dd9ca8 100644
--- a/configs/neptune.yaml
+++ b/configs/neptune.yaml
@@ -69,12 +69,12 @@ parameters:
   lgbm__max_bin: 300
   lgbm__max_depth: -1
   lgbm__num_leaves: 35
-  lgbm__min_child_samples: 50
+  lgbm__min_child_samples: 70
   lgbm__subsample: 1.0
   lgbm__subsample_freq: 1
-  lgbm__colsample_bytree: 0.2
+  lgbm__colsample_bytree: 0.05
   lgbm__min_gain_to_split: 0.5
-  lgbm__reg_lambda: 100.0
+  lgbm__reg_lambda: 100
   lgbm__reg_alpha: 0.0
   lgbm__scale_pos_weight: 1
 
diff --git a/src/pipeline_manager.py b/src/pipeline_manager.py
index 6d466b0..7f0459b 100644
--- a/src/pipeline_manager.py
+++ b/src/pipeline_manager.py
@@ -401,7 +401,7 @@ def _read_data(dev_mode, read_train=True, read_test=False):
 
     if read_test:
         raw_data['application_test'] = pd.read_csv(params.test_filepath, nrows=nrows)
-
+    
     raw_data['bureau'] = pd.read_csv(params.bureau_filepath, nrows=nrows)
     raw_data['credit_card_balance'] = pd.read_csv(params.credit_card_balance_filepath, nrows=nrows)
     raw_data['pos_cash_balance'] = pd.read_csv(params.POS_CASH_balance_filepath, nrows=nrows)