Dynamic features - previous application (#108)

* previous_application handcrafted features * previous application cleaning * Update neptune.yaml * code improvement * Update notebook
minerva-ml · Jul 6, 2018 · 7cd2071 · 7cd2071
1 parent f3cd0b6
commit 7cd2071
Show file tree

Hide file tree

Showing 6 changed files with 290 additions and 10 deletions.
diff --git a/neptune.yaml b/neptune.yaml
@@ -46,8 +46,8 @@ parameters:
   verbose: 1
 
 # Preprocessing
-  fill_missing: False
-  fill_value: None
+  fill_missing: True
+  fill_value: 0
 
 # Feature Extraction
   installments__last_k_trend_periods: '[10, 50, 100, 500]'

diff --git a/notebooks/eda-previous_application.ipynb b/notebooks/eda-previous_application.ipynb
@@ -48,7 +48,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Aggregations"
+    "### Aggregations"
    ]
   },
   {
@@ -123,6 +123,154 @@
     "application_agg_corr.sort_values('TARGET', ascending=False)['TARGET']"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Solution 4\n",
+    "### Hand crafted features"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "numbers_of_applications = [1, 3, 5]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "features = pd.DataFrame({'SK_ID_CURR': previous_application['SK_ID_CURR'].unique()})\n",
+    "prev_applications_sorted = previous_application.sort_values(['SK_ID_CURR', 'DAYS_DECISION'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "group_object = prev_applications_sorted.groupby(by=['SK_ID_CURR'])['SK_ID_PREV'].nunique().reset_index()\n",
+    "group_object.rename(index=str,\n",
+    "                    columns={'SK_ID_PREV': 'previous_application_number_of_prev_application'},\n",
+    "                    inplace=True)\n",
+    "features = features.merge(group_object, on=['SK_ID_CURR'], how='left')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "prev_applications_sorted['previous_application_prev_was_approved'] = (\n",
+    "        prev_applications_sorted['NAME_CONTRACT_STATUS'] == 'Approved').astype('int')\n",
+    "group_object = prev_applications_sorted.groupby(by=['SK_ID_CURR'])[\n",
+    "    'previous_application_prev_was_approved'].last().reset_index()\n",
+    "features = features.merge(group_object, on=['SK_ID_CURR'], how='left')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "prev_applications_sorted['previous_application_prev_was_refused'] = (\n",
+    "        prev_applications_sorted['NAME_CONTRACT_STATUS'] == 'Refused').astype('int')\n",
+    "group_object = prev_applications_sorted.groupby(by=['SK_ID_CURR'])[\n",
+    "    'previous_application_prev_was_refused'].last().reset_index()\n",
+    "features = features.merge(group_object, on=['SK_ID_CURR'], how='left')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for number in numbers_of_applications:\n",
+    "    prev_applications_tail = prev_applications_sorted.groupby(by=['SK_ID_CURR']).tail(number)\n",
+    "\n",
+    "    group_object = prev_applications_tail.groupby(by=['SK_ID_CURR'])['CNT_PAYMENT'].mean().reset_index()\n",
+    "    group_object.rename(index=str, columns={\n",
+    "        'CNT_PAYMENT': 'previous_application_term_of_last_{}_credits_mean'.format(number)},\n",
+    "                        inplace=True)\n",
+    "    features = features.merge(group_object, on=['SK_ID_CURR'], how='left')\n",
+    "\n",
+    "    group_object = prev_applications_tail.groupby(by=['SK_ID_CURR'])['DAYS_DECISION'].mean().reset_index()\n",
+    "    group_object.rename(index=str, columns={\n",
+    "        'DAYS_DECISION': 'previous_application_days_decision_about_last_{}_credits_mean'.format(number)},\n",
+    "                        inplace=True)\n",
+    "    features = features.merge(group_object, on=['SK_ID_CURR'], how='left')\n",
+    "\n",
+    "    group_object = prev_applications_tail.groupby(by=['SK_ID_CURR'])['DAYS_FIRST_DRAWING'].mean().reset_index()\n",
+    "    group_object.rename(index=str, columns={\n",
+    "        'DAYS_FIRST_DRAWING': 'previous_application_days_first_drawing_last_{}_credits_mean'.format(number)},\n",
+    "                        inplace=True)\n",
+    "    features = features.merge(group_object, on=['SK_ID_CURR'], how='left')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "features.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "application = application.merge(features,\n",
+    "                                left_on=['SK_ID_CURR'],\n",
+    "                                right_on=['SK_ID_CURR'],\n",
+    "                                how='left',\n",
+    "                                validate='one_to_one')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "engineered_numerical_columns = list(features.columns)\n",
+    "engineered_numerical_columns.remove('SK_ID_CURR')\n",
+    "credit_eng = application[engineered_numerical_columns + ['TARGET']]\n",
+    "credit_eng_corr = abs(credit_eng.corr())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "credit_eng_corr.sort_values('TARGET', ascending=False)['TARGET']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sns.heatmap(credit_eng_corr, \n",
+    "            xticklabels=credit_eng_corr.columns,\n",
+    "            yticklabels=credit_eng_corr.columns)"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -133,9 +281,9 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "cpu py3",
+   "display_name": "Python 3",
    "language": "python",
-   "name": "cpu_py3"
+   "name": "python3"
   },
   "language_info": {
    "codemirror_mode": {

diff --git a/src/data_cleaning.py b/src/data_cleaning.py
@@ -31,3 +31,17 @@ def transform(self, bureau):
             bureau['CNT_CREDIT_PROLONG'].fillna(self.fill_value, inplace=True)
 
         return {'bureau': bureau}
+
+
+class PreviousApplicationCleaning(BaseTransformer):
+    def __init__(self, **kwargs):
+        super().__init__()
+
+    def transform(self, previous_application):
+        previous_application['DAYS_FIRST_DRAWING'].replace(365243, np.nan, inplace=True)
+        previous_application['DAYS_FIRST_DUE'].replace(365243, np.nan, inplace=True)
+        previous_application['DAYS_LAST_DUE_1ST_VERSION'].replace(365243, np.nan, inplace=True)
+        previous_application['DAYS_LAST_DUE'].replace(365243, np.nan, inplace=True)
+        previous_application['DAYS_TERMINATION'].replace(365243, np.nan, inplace=True)
+
+        return {'previous_application': previous_application}
diff --git a/src/feature_extraction.py b/src/feature_extraction.py
@@ -4,7 +4,7 @@
 import category_encoders as ce
 import numpy as np
 import pandas as pd
-from scipy.stats import kurtosis, iqr
+from scipy.stats import kurtosis, iqr, skew
 from sklearn.externals import joblib
 from sklearn.linear_model import LinearRegression
 from steppy.base import BaseTransformer
@@ -477,6 +477,81 @@ def persist(self, filepath):
         joblib.dump(self.features, filepath)
 
 
+class PreviousApplicationFeatures(BaseTransformer):
+    def __init__(self, numbers_of_applications=[], **kwargs):
+        self.features = None
+        self.numbers_of_applications = numbers_of_applications
+
+    @property
+    def feature_names(self):
+        feature_names = list(self.features.columns)
+        feature_names.remove('SK_ID_CURR')
+        return feature_names
+
+    def fit(self, X, prev_applications, **kwargs):
+        features = pd.DataFrame({'SK_ID_CURR': prev_applications['SK_ID_CURR'].unique()})
+
+        prev_applications_sorted = prev_applications.sort_values(['SK_ID_CURR', 'DAYS_DECISION'])
+
+        group_object = prev_applications_sorted.groupby(by=['SK_ID_CURR'])['SK_ID_PREV'].nunique().reset_index()
+        group_object.rename(index=str,
+                            columns={'SK_ID_PREV': 'previous_application_number_of_prev_application'},
+                            inplace=True)
+        features = features.merge(group_object, on=['SK_ID_CURR'], how='left')
+
+        prev_applications_sorted['previous_application_prev_was_approved'] = (
+                prev_applications_sorted['NAME_CONTRACT_STATUS'] == 'Approved').astype('int')
+        group_object = prev_applications_sorted.groupby(by=['SK_ID_CURR'])[
+            'previous_application_prev_was_approved'].last().reset_index()
+        features = features.merge(group_object, on=['SK_ID_CURR'], how='left')
+
+        prev_applications_sorted['previous_application_prev_was_refused'] = (
+                prev_applications_sorted['NAME_CONTRACT_STATUS'] == 'Refused').astype('int')
+        group_object = prev_applications_sorted.groupby(by=['SK_ID_CURR'])[
+            'previous_application_prev_was_refused'].last().reset_index()
+        features = features.merge(group_object, on=['SK_ID_CURR'], how='left')
+
+        for number in self.numbers_of_applications:
+            prev_applications_tail = prev_applications_sorted.groupby(by=['SK_ID_CURR']).tail(number)
+
+            group_object = prev_applications_tail.groupby(by=['SK_ID_CURR'])['CNT_PAYMENT'].mean().reset_index()
+            group_object.rename(index=str, columns={
+                'CNT_PAYMENT': 'previous_application_term_of_last_{}_credits_mean'.format(number)},
+                                inplace=True)
+            features = features.merge(group_object, on=['SK_ID_CURR'], how='left')
+
+            group_object = prev_applications_tail.groupby(by=['SK_ID_CURR'])['DAYS_DECISION'].mean().reset_index()
+            group_object.rename(index=str, columns={
+                'DAYS_DECISION': 'previous_application_days_decision_about_last_{}_credits_mean'.format(number)},
+                                inplace=True)
+            features = features.merge(group_object, on=['SK_ID_CURR'], how='left')
+
+            group_object = prev_applications_tail.groupby(by=['SK_ID_CURR'])['DAYS_FIRST_DRAWING'].mean().reset_index()
+            group_object.rename(index=str, columns={
+                'DAYS_FIRST_DRAWING': 'previous_application_days_first_drawing_last_{}_credits_mean'.format(number)},
+                                inplace=True)
+            features = features.merge(group_object, on=['SK_ID_CURR'], how='left')
+
+        self.features = features
+        return self
+
+    def transform(self, X, **kwargs):
+        X = X.merge(self.features,
+                    left_on=['SK_ID_CURR'],
+                    right_on=['SK_ID_CURR'],
+                    how='left',
+                    validate='one_to_one')
+
+        return {'numerical_features': X[self.feature_names]}
+
+    def load(self, filepath):
+        self.features = joblib.load(filepath)
+        return self
+
+    def persist(self, filepath):
+        joblib.dump(self.features, filepath)
+
+
 class InstallmentPaymentsFeatures(BaseTransformer):
     def __init__(self, last_k_agg_periods, last_k_trend_periods, num_workers=1, **kwargs):
         self.last_k_agg_periods = last_k_agg_periods