From c9c9c696724892ca32e1ba2c178e4ed923023a99 Mon Sep 17 00:00:00 2001 From: "karol.strzalkowski" Date: Mon, 16 Jul 2018 13:37:19 +0200 Subject: [PATCH] Dev (#134) * age/employment dummies (#104) * added diff features * New handcrafted features (#102) * Dynamic features * Smart features (#61) * Update README.md * Update README.md * Update * Smart features update * More descriptive transformer name * Reading all data in main * More application features * Transformer for cleaning * Multiinput data dictionary * Fix (#63) * fixed configs * dropped redundand steps, moved stuff to cleaning, refactored groupby (#64) * dropped redundand steps, moved stuff to cleanining, refactored groupby * restructured, added stacking + CV * Fix format string * Update pipeline_manager.py clipped prediction -> prediction * added stratified kfold option (#77) * Update config (#79) * dropped redundand steps, moved stuff to cleanining, refactored groupby * restructured, added stacking + CV * Update pipeline_config.py * Dev review (#81) * dropped feature by type split, refactored pipleine_config * dropped feature by type split method * explored application features * trash * reverted refactor of aggs * fixed/updated bureau features * cleared notebooks * agg features added to notebook bureau * credit card cleaned * added other feature notebooks * added rank mean * updated model arch * reverted to old params * fixed rank mean calculations * ApplicationCleaning update (#84) * Cleaning - application * Clear output in notebook * clenaed names in steps, refactored mergeaggregate transformer, changed caching/saving specs (#85) * local trash * External sources notebook (#86) * Update * External sources notebook * Dev lgbm params (#88) * local trash * updated configs * dropped comment * updated lgb params * Dev app agg fix (#90) * dropped app_aggs * app agg features fixed * cleaned leftovers * dropped fast read-in for debug * External_sources statistics (#89) * Speed-up ext_src notebook * exernal_sources statistics * Weighted mean and notebook fix * application notebook update * clear notebook output * Fix auto submission (#95) * CreditCardBalance monthly diff mean * POSCASH remaining installments * POSCASH completed_contracts * notebook update * Resolve conflicts * Fix * Update neptune.yaml * Update neptune_random_search.yaml * Split static and dynamic features - credit card balance * Dev nan count (#105) * added nan_count * added nan count with parameter * Dev fe installments (#106) * added simple features, parallel groupby, last-installment features * refactored last_installment features * added features for the very last installment * Dev fe instalments dynamic (#107) * added dynamic-trend features * formated configs * added skew/iqr features * added number of credit agreement change features (#109) * added number of credit agreement change features * reverted sample size * Dynamic features - previous application (#108) * previous_application handcrafted features * previous application cleaning * Update neptune.yaml * code improvement * Update notebook * Notebook - feature importance (#112) * Dev speed up (#111) * refactored aggs to calculate only once per training, sped up installment and credit card (only single index groupby) * sped up all hand crafted * fixed bureau worker errors * fixed isntallment names * fixed isntallment names * fixed bureau and prev_app naming bugs * reverted to vectorized where possible * updated hyperparams * updated early stopping params to meet convergence * reverted to old fallback neptune file * updated paths * updated paths, explored prev-app features * dropped duplicated agg * POS_CASH added features * POS CASH features added * POS_CASH_balance feature cleaning * Yaml adjustment * Path change --- configs/neptune.yaml | 4 +- configs/neptune_random_search.yaml | 2 + configs/neptune_stacking.yaml | 2 + notebooks/eda-pos_cash_balance.ipynb | 254 ++++++++++++++++++++++++++- src/feature_extraction.py | 124 +++++++++++-- src/pipeline_config.py | 2 + 6 files changed, 370 insertions(+), 18 deletions(-) diff --git a/configs/neptune.yaml b/configs/neptune.yaml index 5dd9ca8..8bc0937 100644 --- a/configs/neptune.yaml +++ b/configs/neptune.yaml @@ -1,7 +1,7 @@ project: ORGANIZATION/home-credit name: home-credit-default-risk -tags: [solution-4, dev] +tags: [solution-5, dev] metric: channel: 'ROC_AUC' @@ -54,6 +54,8 @@ parameters: installments__last_k_trend_periods: '[10, 50, 100, 500]' installments__last_k_agg_periods: '[1, 5, 10, 20, 50, 100]' installments__last_k_agg_period_fractions: '[(5,20),(5,50),(10,50),(10,100),(20,100)]' + pos_cash__last_k_trend_periods: '[6, 12]' + pos_cash__last_k_agg_periods: '[6, 12, 30]' application_aggregation__use_diffs_only: True use_nan_count: True diff --git a/configs/neptune_random_search.yaml b/configs/neptune_random_search.yaml index 8db4ea5..4abde4a 100644 --- a/configs/neptune_random_search.yaml +++ b/configs/neptune_random_search.yaml @@ -54,6 +54,8 @@ parameters: installments__last_k_trend_periods: '[10, 50, 100, 500]' installments__last_k_agg_periods: '[1, 5, 10, 50, 100, 500]' installments__last_k_agg_period_fractions: '[(5,20),(5,50),(10,50),(10,100),(20,100)]' + pos_cash__last_k_trend_periods: '[6, 12]' + pos_cash__last_k_agg_periods: '[6, 12, 30]' application_aggregation__use_diffs_only: True use_nan_count: True diff --git a/configs/neptune_stacking.yaml b/configs/neptune_stacking.yaml index 6d2c53d..c0aff1b 100644 --- a/configs/neptune_stacking.yaml +++ b/configs/neptune_stacking.yaml @@ -54,6 +54,8 @@ parameters: installments__last_k_trend_periods: None installments__last_k_agg_periods: None installments__last_k_agg_period_fractions: None + pos_cash__last_k_trend_periods: None + pos_cash__last_k_agg_periods: None application_aggregation__use_diffs_only: True use_nan_count: True diff --git a/notebooks/eda-pos_cash_balance.ipynb b/notebooks/eda-pos_cash_balance.ipynb index c23851e..75f4f91 100644 --- a/notebooks/eda-pos_cash_balance.ipynb +++ b/notebooks/eda-pos_cash_balance.ipynb @@ -7,13 +7,21 @@ "outputs": [], "source": [ "import os\n", + "import sys\n", "import pandas as pd\n", + "import numpy as np\n", "from tqdm import tqdm_notebook as tqdm\n", + "from functools import partial\n", "from sklearn.externals import joblib\n", "%matplotlib inline\n", "import seaborn as sns\n", + "from sklearn.linear_model import LinearRegression\n", "\n", - "DIR = '/mnt/ml-team/minerva/open-solutions/home-credit'\n", + "sys.path.append('../')\n", + "from src.utils import parallel_apply\n", + "from src.feature_extraction import add_features_in_group\n", + "\n", + "DIR = 'PATH/TO/YOUR/DATA'\n", "description = pd.read_csv(os.path.join(DIR,'data/HomeCredit_columns_description.csv'),encoding = 'latin1')\n", "application = pd.read_csv(os.path.join(DIR, 'files/unzipped_data/application_train.csv'))\n", "pos_cash_balance = pd.read_csv(os.path.join(DIR, 'files/unzipped_data/POS_CASH_balance.csv'))" @@ -170,11 +178,10 @@ "metadata": {}, "outputs": [], "source": [ - "application = application.merge(features,\n", - " left_on=['SK_ID_CURR'],\n", - " right_on=['SK_ID_CURR'],\n", + "X = application.merge(features, left_on=['SK_ID_CURR'], right_on=['SK_ID_CURR'],\n", " how='left',\n", - " validate='one_to_one')" + " validate='one_to_one')\n", + "X = X[features.columns.tolist()+['TARGET']]" ] }, { @@ -185,7 +192,7 @@ "source": [ "engineered_numerical_columns = list(features.columns)\n", "engineered_numerical_columns.remove('SK_ID_CURR')\n", - "credit_eng = application[engineered_numerical_columns + ['TARGET']]\n", + "credit_eng = X[engineered_numerical_columns + ['TARGET']]\n", "credit_eng_corr = abs(credit_eng.corr())" ] }, @@ -209,6 +216,241 @@ " yticklabels=credit_eng_corr.columns)" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "features.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Solution 5\n", + "\n", + "### Hand crafted features" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pos_cash_balance['pos_cash_paid_late'] = (pos_cash_balance['SK_DPD'] > 0).astype(int)\n", + "pos_cash_balance['pos_cash_paid_late_with_tolerance'] = (pos_cash_balance['SK_DPD_DEF'] > 0).astype(int)\n", + "groupby = pos_cash_balance.groupby(['SK_ID_CURR'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def last_k_installment_features(gr, periods):\n", + " gr_ = gr.copy()\n", + " gr_.sort_values(['MONTHS_BALANCE'], ascending=False, inplace=True)\n", + "\n", + " features = {}\n", + " for period in periods:\n", + " if period > 10e10:\n", + " period_name = 'all_installment_'\n", + " gr_period = gr_.copy()\n", + " else:\n", + " period_name = 'last_{}_'.format(period)\n", + " gr_period = gr_.iloc[:period]\n", + "\n", + " features = add_features_in_group(features, gr_period, 'pos_cash_paid_late',\n", + " ['count', 'mean'],\n", + " period_name)\n", + " features = add_features_in_group(features, gr_period, 'pos_cash_paid_late_with_tolerance',\n", + " ['count', 'mean'],\n", + " period_name)\n", + " features = add_features_in_group(features, gr_period, 'SK_DPD',\n", + " ['sum', 'mean', 'max', 'min', 'median'],\n", + " period_name)\n", + " features = add_features_in_group(features, gr_period, 'SK_DPD_DEF',\n", + " ['sum', 'mean', 'max', 'min','median'],\n", + " period_name)\n", + " return features" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "features = pd.DataFrame({'SK_ID_CURR': pos_cash_balance['SK_ID_CURR'].unique()})\n", + "func = partial(last_k_installment_features, periods=[1, 10, 50, 10e16])\n", + "g = parallel_apply(groupby, func, index_name='SK_ID_CURR', num_workers=10, chunk_size=10000).reset_index()\n", + "features = features.merge(g, on='SK_ID_CURR', how='left')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "features.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X = X.merge(features, on='SK_ID_CURR',how='left')\n", + "X.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Last loan features" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def last_loan_features(gr):\n", + " gr_ = gr.copy()\n", + " gr_.sort_values(['MONTHS_BALANCE'], ascending=False, inplace=True)\n", + " last_installment_id = gr_['SK_ID_PREV'].iloc[0]\n", + " gr_ = gr_[gr_['SK_ID_PREV'] == last_installment_id]\n", + "\n", + " features={}\n", + " features = add_features_in_group(features, gr_, 'pos_cash_paid_late',\n", + " ['count', 'sum', 'mean'],\n", + " 'last_loan_')\n", + " features = add_features_in_group(features, gr_, 'pos_cash_paid_late_with_tolerance',\n", + " ['sum', 'mean'],\n", + " 'last_loan_')\n", + " features = add_features_in_group(features, gr_, 'SK_DPD',\n", + " ['sum', 'mean', 'max', 'min', 'std'],\n", + " 'last_loan_')\n", + " features = add_features_in_group(features, gr_, 'SK_DPD_DEF',\n", + " ['sum', 'mean', 'max', 'min', 'std'],\n", + " 'last_loan_')\n", + " return features" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "features = pd.DataFrame({'SK_ID_CURR': pos_cash_balance['SK_ID_CURR'].unique()})\n", + "g = parallel_apply(groupby, last_loan_features, index_name='SK_ID_CURR', num_workers=10, chunk_size=10000).reset_index()\n", + "features = features.merge(g, on='SK_ID_CURR', how='left')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "features.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X = X.merge(features, on='SK_ID_CURR',how='left')\n", + "X.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Trend features" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def trend_in_last_k_installment_features(gr, periods):\n", + " gr_ = gr.copy()\n", + " gr_.sort_values(['MONTHS_BALANCE'], ascending=False, inplace=True)\n", + "\n", + " features = {}\n", + " for period in periods:\n", + " gr_period = gr_.iloc[:period]\n", + "\n", + " features = add_trend_feature(features, gr_period,\n", + " 'SK_DPD', '{}_period_trend_'.format(period)\n", + " )\n", + " features = add_trend_feature(features, gr_period,\n", + " 'SK_DPD_DEF', '{}_period_trend_'.format(period)\n", + " )\n", + " return features\n", + "\n", + "def add_trend_feature(features, gr, feature_name, prefix):\n", + " y = gr[feature_name].values\n", + " try:\n", + " x = np.arange(0, len(y)).reshape(-1, 1)\n", + " lr = LinearRegression()\n", + " lr.fit(x, y)\n", + " trend = lr.coef_[0]\n", + " except:\n", + " trend = np.nan\n", + " features['{}{}'.format(prefix, feature_name)] = trend\n", + " return features" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "features = pd.DataFrame({'SK_ID_CURR': pos_cash_balance['SK_ID_CURR'].unique()})\n", + "func = partial(trend_in_last_k_installment_features, periods=[1,6,12,30,60])\n", + "g = parallel_apply(groupby, func, index_name='SK_ID_CURR', num_workers=10, chunk_size=10000).reset_index()\n", + "features = features.merge(g, on='SK_ID_CURR', how='left')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "features.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X = X.merge(features, on='SK_ID_CURR',how='left')\n", + "X_corr = abs(X.corr())\n", + "X_corr.sort_values('TARGET', ascending=False)['TARGET']" + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/src/feature_extraction.py b/src/feature_extraction.py index 47987e0..9c2d158 100644 --- a/src/feature_extraction.py +++ b/src/feature_extraction.py @@ -425,25 +425,126 @@ def _dynamic_features(self, credit_card, **kwargs): class POSCASHBalanceFeatures(BasicHandCraftedFeatures): + def __init__(self, last_k_agg_periods, last_k_trend_periods, num_workers=1, **kwargs): + super().__init__(num_workers=num_workers) + self.last_k_agg_periods = last_k_agg_periods + self.last_k_trend_periods = last_k_trend_periods + + self.num_workers = num_workers + self.features = None + def fit(self, pos_cash, **kwargs): pos_cash['is_contract_status_completed'] = pos_cash['NAME_CONTRACT_STATUS'] == 'Completed' - pos_cash_sorted = pos_cash.sort_values(['SK_ID_CURR', 'MONTHS_BALANCE']) + pos_cash['pos_cash_paid_late'] = (pos_cash['SK_DPD'] > 0).astype(int) + pos_cash['pos_cash_paid_late_with_tolerance'] = (pos_cash['SK_DPD_DEF'] > 0).astype(int) features = pd.DataFrame({'SK_ID_CURR': pos_cash['SK_ID_CURR'].unique()}) - - g = pos_cash_sorted.groupby('SK_ID_CURR')['CNT_INSTALMENT_FUTURE'].last().reset_index() - g.rename(index=str, - columns={'CNT_INSTALMENT_FUTURE': 'pos_cash_remaining_installments'}, - inplace=True) - features = features.merge(g, on=['SK_ID_CURR'], how='left') - groupby = pos_cash.groupby(['SK_ID_CURR']) - g = groupby['is_contract_status_completed'].agg('sum').reset_index() - features = features.merge(g, on=['SK_ID_CURR'], how='left') + func = partial(POSCASHBalanceFeatures.generate_features, + agg_periods=self.last_k_agg_periods, + trend_periods=self.last_k_trend_periods) + g = parallel_apply(groupby, func, index_name='SK_ID_CURR', num_workers=self.num_workers).reset_index() + features = features.merge(g, on='SK_ID_CURR', how='left') self.features = features return self + @staticmethod + def generate_features(gr, agg_periods, trend_periods): + one_time = POSCASHBalanceFeatures.one_time_features(gr) + all = POSCASHBalanceFeatures.all_installment_features(gr) + agg = POSCASHBalanceFeatures.last_k_installment_features(gr, agg_periods) + trend = POSCASHBalanceFeatures.trend_in_last_k_installment_features(gr, trend_periods) + last = POSCASHBalanceFeatures.last_loan_features(gr) + features = {**one_time, **all, **agg, **trend, **last} + return pd.Series(features) + + @staticmethod + def one_time_features(gr): + gr_ = gr.copy() + gr_.sort_values(['MONTHS_BALANCE'], inplace=True) + features = {} + + features['pos_cash_remaining_installments'] = gr_['CNT_INSTALMENT_FUTURE'].tail(1) + features['pos_cash_completed_contracts'] = gr_['is_contract_status_completed'].agg('sum') + + return features + + @staticmethod + def all_installment_features(gr): + return POSCASHBalanceFeatures.last_k_installment_features(gr, periods=[10e16]) + + @staticmethod + def last_k_installment_features(gr, periods): + gr_ = gr.copy() + gr_.sort_values(['MONTHS_BALANCE'], ascending=False, inplace=True) + + features = {} + for period in periods: + if period > 10e10: + period_name = 'all_installment_' + gr_period = gr_.copy() + else: + period_name = 'last_{}_'.format(period) + gr_period = gr_.iloc[:period] + + features = add_features_in_group(features, gr_period, 'pos_cash_paid_late', + ['count', 'mean'], + period_name) + features = add_features_in_group(features, gr_period, 'pos_cash_paid_late_with_tolerance', + ['count', 'mean'], + period_name) + features = add_features_in_group(features, gr_period, 'SK_DPD', + ['sum', 'mean', 'max', 'std', 'skew', 'kurt'], + period_name) + features = add_features_in_group(features, gr_period, 'SK_DPD_DEF', + ['sum', 'mean', 'max', 'std', 'skew', 'kurt'], + period_name) + return features + + @staticmethod + def trend_in_last_k_installment_features(gr, periods): + gr_ = gr.copy() + gr_.sort_values(['MONTHS_BALANCE'], ascending=False, inplace=True) + + features = {} + for period in periods: + gr_period = gr_.iloc[:period] + + features = add_trend_feature(features, gr_period, + 'SK_DPD', '{}_period_trend_'.format(period) + ) + features = add_trend_feature(features, gr_period, + 'SK_DPD_DEF', '{}_period_trend_'.format(period) + ) + features = add_trend_feature(features, gr_period, + 'CNT_INSTALMENT_FUTURE', '{}_period_trend_'.format(period) + ) + return features + + @staticmethod + def last_loan_features(gr): + gr_ = gr.copy() + gr_.sort_values(['MONTHS_BALANCE'], ascending=False, inplace=True) + last_installment_id = gr_['SK_ID_PREV'].iloc[0] + gr_ = gr_[gr_['SK_ID_PREV'] == last_installment_id] + + features={} + features = add_features_in_group(features, gr_, 'pos_cash_paid_late', + ['count', 'sum', 'mean'], + 'last_loan_') + features = add_features_in_group(features, gr_, 'pos_cash_paid_late_with_tolerance', + ['mean'], + 'last_loan_') + features = add_features_in_group(features, gr_, 'SK_DPD', + ['sum', 'mean', 'max', 'std'], + 'last_loan_') + features = add_features_in_group(features, gr_, 'SK_DPD_DEF', + ['sum', 'mean', 'max', 'std'], + 'last_loan_') + + return features + class PreviousApplicationFeatures(BasicHandCraftedFeatures): def __init__(self, numbers_of_applications=[], num_workers=1, **kwargs): @@ -580,7 +681,7 @@ def last_k_installment_features(gr, periods): features = {} for period in periods: if period > 10e10: - period_name = 'all_installment' + period_name = 'all_installment_' gr_period = gr_.copy() else: period_name = 'last_{}_'.format(period) @@ -680,6 +781,7 @@ def add_features_in_group(features, gr_, feature_name, aggs, prefix): features['{}{}_iqr'.format(prefix, feature_name)] = iqr(gr_[feature_name]) elif agg == 'median': features['{}{}_median'.format(prefix, feature_name)] = gr_[feature_name].median() + return features diff --git a/src/pipeline_config.py b/src/pipeline_config.py index ebabe7e..e44d406 100644 --- a/src/pipeline_config.py +++ b/src/pipeline_config.py @@ -301,6 +301,8 @@ 'pos_cash_balance': {'table_name': 'POS_CASH_balance', 'id_columns': ('SK_ID_CURR', 'SK_ID_CURR'), 'groupby_aggregations': POS_CASH_BALANCE_AGGREGATION_RECIPIES, + 'last_k_agg_periods': parameter_eval(params.pos_cash__last_k_agg_periods), + 'last_k_trend_periods': parameter_eval(params.pos_cash__last_k_trend_periods), 'num_workers': params.num_workers },