Skip to content
This repository has been archived by the owner on Jun 22, 2022. It is now read-only.

Commit

Permalink
Dynamic features - previous application (#108)
Browse files Browse the repository at this point in the history
* previous_application handcrafted features

* previous application cleaning

* Update neptune.yaml

* code improvement

* Update notebook
  • Loading branch information
pknut authored and jakubczakon committed Jul 6, 2018
1 parent f3cd0b6 commit 7cd2071
Show file tree
Hide file tree
Showing 6 changed files with 290 additions and 10 deletions.
4 changes: 2 additions & 2 deletions neptune.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,8 @@ parameters:
verbose: 1

# Preprocessing
fill_missing: False
fill_value: None
fill_missing: True
fill_value: 0

# Feature Extraction
installments__last_k_trend_periods: '[10, 50, 100, 500]'
Expand Down
154 changes: 151 additions & 3 deletions notebooks/eda-previous_application.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"## Aggregations"
"### Aggregations"
]
},
{
Expand Down Expand Up @@ -123,6 +123,154 @@
"application_agg_corr.sort_values('TARGET', ascending=False)['TARGET']"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Solution 4\n",
"### Hand crafted features"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"numbers_of_applications = [1, 3, 5]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"features = pd.DataFrame({'SK_ID_CURR': previous_application['SK_ID_CURR'].unique()})\n",
"prev_applications_sorted = previous_application.sort_values(['SK_ID_CURR', 'DAYS_DECISION'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"group_object = prev_applications_sorted.groupby(by=['SK_ID_CURR'])['SK_ID_PREV'].nunique().reset_index()\n",
"group_object.rename(index=str,\n",
" columns={'SK_ID_PREV': 'previous_application_number_of_prev_application'},\n",
" inplace=True)\n",
"features = features.merge(group_object, on=['SK_ID_CURR'], how='left')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"prev_applications_sorted['previous_application_prev_was_approved'] = (\n",
" prev_applications_sorted['NAME_CONTRACT_STATUS'] == 'Approved').astype('int')\n",
"group_object = prev_applications_sorted.groupby(by=['SK_ID_CURR'])[\n",
" 'previous_application_prev_was_approved'].last().reset_index()\n",
"features = features.merge(group_object, on=['SK_ID_CURR'], how='left')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"prev_applications_sorted['previous_application_prev_was_refused'] = (\n",
" prev_applications_sorted['NAME_CONTRACT_STATUS'] == 'Refused').astype('int')\n",
"group_object = prev_applications_sorted.groupby(by=['SK_ID_CURR'])[\n",
" 'previous_application_prev_was_refused'].last().reset_index()\n",
"features = features.merge(group_object, on=['SK_ID_CURR'], how='left')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"for number in numbers_of_applications:\n",
" prev_applications_tail = prev_applications_sorted.groupby(by=['SK_ID_CURR']).tail(number)\n",
"\n",
" group_object = prev_applications_tail.groupby(by=['SK_ID_CURR'])['CNT_PAYMENT'].mean().reset_index()\n",
" group_object.rename(index=str, columns={\n",
" 'CNT_PAYMENT': 'previous_application_term_of_last_{}_credits_mean'.format(number)},\n",
" inplace=True)\n",
" features = features.merge(group_object, on=['SK_ID_CURR'], how='left')\n",
"\n",
" group_object = prev_applications_tail.groupby(by=['SK_ID_CURR'])['DAYS_DECISION'].mean().reset_index()\n",
" group_object.rename(index=str, columns={\n",
" 'DAYS_DECISION': 'previous_application_days_decision_about_last_{}_credits_mean'.format(number)},\n",
" inplace=True)\n",
" features = features.merge(group_object, on=['SK_ID_CURR'], how='left')\n",
"\n",
" group_object = prev_applications_tail.groupby(by=['SK_ID_CURR'])['DAYS_FIRST_DRAWING'].mean().reset_index()\n",
" group_object.rename(index=str, columns={\n",
" 'DAYS_FIRST_DRAWING': 'previous_application_days_first_drawing_last_{}_credits_mean'.format(number)},\n",
" inplace=True)\n",
" features = features.merge(group_object, on=['SK_ID_CURR'], how='left')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"features.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"application = application.merge(features,\n",
" left_on=['SK_ID_CURR'],\n",
" right_on=['SK_ID_CURR'],\n",
" how='left',\n",
" validate='one_to_one')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"engineered_numerical_columns = list(features.columns)\n",
"engineered_numerical_columns.remove('SK_ID_CURR')\n",
"credit_eng = application[engineered_numerical_columns + ['TARGET']]\n",
"credit_eng_corr = abs(credit_eng.corr())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"credit_eng_corr.sort_values('TARGET', ascending=False)['TARGET']"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sns.heatmap(credit_eng_corr, \n",
" xticklabels=credit_eng_corr.columns,\n",
" yticklabels=credit_eng_corr.columns)"
]
},
{
"cell_type": "code",
"execution_count": null,
Expand All @@ -133,9 +281,9 @@
],
"metadata": {
"kernelspec": {
"display_name": "cpu py3",
"display_name": "Python 3",
"language": "python",
"name": "cpu_py3"
"name": "python3"
},
"language_info": {
"codemirror_mode": {
Expand Down
14 changes: 14 additions & 0 deletions src/data_cleaning.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,3 +31,17 @@ def transform(self, bureau):
bureau['CNT_CREDIT_PROLONG'].fillna(self.fill_value, inplace=True)

return {'bureau': bureau}


class PreviousApplicationCleaning(BaseTransformer):
    """Replace the 365243 placeholder in previous-application day columns with NaN.

    The transformer is stateless: it accepts (and ignores) arbitrary keyword
    arguments so it can be configured like the other cleaning steps.
    """

    def __init__(self, **kwargs):
        super().__init__()

    def transform(self, previous_application):
        """Blank out the placeholder value in the ``DAYS_*`` columns.

        Args:
            previous_application: DataFrame holding the previous applications
                table. It is modified in place (the cleaned columns are
                written back into it) and also returned.

        Returns:
            dict: ``{'previous_application': previous_application}`` with the
            cleaned frame.

        Raises:
            KeyError: if any of the expected ``DAYS_*`` columns is missing.
        """
        # NOTE(review): 365243 (~1000 years in days) is presumably the
        # dataset's marker for "no date available" -- confirm against the
        # data description.
        placeholder = 365243
        days_columns = [
            'DAYS_FIRST_DRAWING',
            'DAYS_FIRST_DUE',
            'DAYS_LAST_DUE_1ST_VERSION',
            'DAYS_LAST_DUE',
            'DAYS_TERMINATION',
        ]

        for column in days_columns:
            # Assign the result back instead of calling
            # ``df[column].replace(..., inplace=True)``: in-place mutation of a
            # column selection is chained assignment and does not reach the
            # parent frame when pandas Copy-on-Write is active.
            previous_application[column] = previous_application[column].replace(placeholder, np.nan)

        return {'previous_application': previous_application}
77 changes: 76 additions & 1 deletion src/feature_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import category_encoders as ce
import numpy as np
import pandas as pd
from scipy.stats import kurtosis, iqr
from scipy.stats import kurtosis, iqr, skew
from sklearn.externals import joblib
from sklearn.linear_model import LinearRegression
from steppy.base import BaseTransformer
Expand Down Expand Up @@ -477,6 +477,81 @@ def persist(self, filepath):
joblib.dump(self.features, filepath)


class PreviousApplicationFeatures(BaseTransformer):
    """Hand-crafted per-client features built from the previous applications table.

    ``fit`` computes one row of features per ``SK_ID_CURR`` and stores it in
    ``self.features``; ``transform`` left-joins those features onto the main
    table and returns only the engineered columns.

    Args:
        numbers_of_applications: iterable of window sizes ``k``; for every
            ``k`` the mean of several columns over each client's last ``k``
            previous applications is added. Defaults to no windows.
        **kwargs: accepted for configuration compatibility and ignored.
    """

    def __init__(self, numbers_of_applications=None, **kwargs):
        super().__init__()
        self.features = None
        # ``None`` sentinel instead of a ``[]`` default so instances never
        # share (and accidentally mutate) one list object.
        self.numbers_of_applications = (
            [] if numbers_of_applications is None else list(numbers_of_applications))

    @property
    def feature_names(self):
        """Names of the engineered columns, i.e. everything except ``SK_ID_CURR``."""
        feature_names = list(self.features.columns)
        feature_names.remove('SK_ID_CURR')
        return feature_names

    @staticmethod
    def _last_k_means(prev_applications_sorted, number):
        """Return per-client means over each client's last ``number`` rows.

        The input must already be sorted so that a client's most recent
        applications come last within its group.
        """
        # Insertion order of this mapping fixes the order of the output columns.
        renamed_columns = {
            'CNT_PAYMENT':
                'previous_application_term_of_last_{}_credits_mean'.format(number),
            'DAYS_DECISION':
                'previous_application_days_decision_about_last_{}_credits_mean'.format(number),
            'DAYS_FIRST_DRAWING':
                'previous_application_days_first_drawing_last_{}_credits_mean'.format(number),
        }
        prev_applications_tail = prev_applications_sorted.groupby(by=['SK_ID_CURR']).tail(number)
        means = prev_applications_tail.groupby(by=['SK_ID_CURR'])[list(renamed_columns)].mean()
        return means.reset_index().rename(columns=renamed_columns)

    def fit(self, X, prev_applications, **kwargs):
        """Compute the per-client feature table from ``prev_applications``.

        Args:
            X: main table; not used while fitting.
            prev_applications: DataFrame with at least ``SK_ID_CURR``,
                ``SK_ID_PREV``, ``DAYS_DECISION``, ``NAME_CONTRACT_STATUS``
                and, when windows are configured, ``CNT_PAYMENT`` and
                ``DAYS_FIRST_DRAWING``.
            **kwargs: ignored.

        Returns:
            self, with ``self.features`` populated.
        """
        features = pd.DataFrame({'SK_ID_CURR': prev_applications['SK_ID_CURR'].unique()})

        # ``sort_values`` returns a new frame, so the helper columns added
        # below never leak into the caller's table. Sorting by DAYS_DECISION
        # ascending puts each client's largest DAYS_DECISION value last.
        prev_applications_sorted = prev_applications.sort_values(['SK_ID_CURR', 'DAYS_DECISION'])

        status_flags = {
            'previous_application_prev_was_approved': 'Approved',
            'previous_application_prev_was_refused': 'Refused',
        }
        for flag_column, status in status_flags.items():
            prev_applications_sorted[flag_column] = (
                prev_applications_sorted['NAME_CONTRACT_STATUS'] == status).astype('int')

        # Group only after the flag columns exist so they can be selected.
        by_client = prev_applications_sorted.groupby(by=['SK_ID_CURR'])

        # Number of distinct previous applications per client.
        application_counts = by_client['SK_ID_PREV'].nunique().reset_index().rename(
            columns={'SK_ID_PREV': 'previous_application_number_of_prev_application'})
        features = features.merge(application_counts, on=['SK_ID_CURR'], how='left')

        # Status flags of the last row in each client's sorted group.
        for flag_column in status_flags:
            last_flag = by_client[flag_column].last().reset_index()
            features = features.merge(last_flag, on=['SK_ID_CURR'], how='left')

        for number in self.numbers_of_applications:
            window_means = self._last_k_means(prev_applications_sorted, number)
            features = features.merge(window_means, on=['SK_ID_CURR'], how='left')

        self.features = features
        return self

    def transform(self, X, **kwargs):
        """Left-join the fitted features onto ``X`` by ``SK_ID_CURR``.

        Args:
            X: main table containing an ``SK_ID_CURR`` column.
            **kwargs: ignored.

        Returns:
            dict: ``{'numerical_features': DataFrame}`` holding only the
            engineered columns, one row per row of ``X``.

        Raises:
            pandas.errors.MergeError: if ``SK_ID_CURR`` is not unique on both
                sides (enforced by ``validate='one_to_one'``).
        """
        X = X.merge(self.features,
                    left_on=['SK_ID_CURR'],
                    right_on=['SK_ID_CURR'],
                    how='left',
                    validate='one_to_one')

        return {'numerical_features': X[self.feature_names]}

    def load(self, filepath):
        """Restore the fitted feature table from ``filepath`` and return self."""
        self.features = joblib.load(filepath)
        return self

    def persist(self, filepath):
        """Dump the fitted feature table to ``filepath`` with joblib."""
        joblib.dump(self.features, filepath)


class InstallmentPaymentsFeatures(BaseTransformer):
def __init__(self, last_k_agg_periods, last_k_trend_periods, num_workers=1, **kwargs):
self.last_k_agg_periods = last_k_agg_periods
Expand Down
Loading

0 comments on commit 7cd2071

Please sign in to comment.