From db86ec414ea4ba54a4a96027970e2bf9bf3cdbd8 Mon Sep 17 00:00:00 2001 From: Jakub Czakon Date: Sat, 14 Jul 2018 15:27:32 +0200 Subject: [PATCH 1/2] added fraction features to eda and feature extraction, updated configs --- configs/neptune.yaml | 3 +- configs/neptune_random_search.yaml | 1 + configs/neptune_stacking.yaml | 5 +- notebooks/eda-installments.ipynb | 220 +++++++++++++++++++++++++---- src/feature_extraction.py | 37 ++++- src/pipeline_config.py | 4 +- src/utils.py | 6 + 7 files changed, 236 insertions(+), 40 deletions(-) diff --git a/configs/neptune.yaml b/configs/neptune.yaml index f172663..0347d35 100644 --- a/configs/neptune.yaml +++ b/configs/neptune.yaml @@ -52,7 +52,8 @@ parameters: # Feature Extraction installments__last_k_trend_periods: '[10, 50, 100, 500]' - installments__last_k_agg_periods: '[1, 5, 10, 50, 100, 500]' + installments__last_k_agg_periods: '[1, 5, 10, 20, 50, 100]' + installments__last_k_agg_period_fractions: '[(5,20),(5,50),(10,50),(10,100),(20,100)]' application_aggregation__use_diffs_only: True use_nan_count: True diff --git a/configs/neptune_random_search.yaml b/configs/neptune_random_search.yaml index 3f6b0dd..8db4ea5 100644 --- a/configs/neptune_random_search.yaml +++ b/configs/neptune_random_search.yaml @@ -53,6 +53,7 @@ parameters: # Feature Extraction installments__last_k_trend_periods: '[10, 50, 100, 500]' installments__last_k_agg_periods: '[1, 5, 10, 50, 100, 500]' + installments__last_k_agg_period_fractions: '[(5,20),(5,50),(10,50),(10,100),(20,100)]' application_aggregation__use_diffs_only: True use_nan_count: True diff --git a/configs/neptune_stacking.yaml b/configs/neptune_stacking.yaml index 74c7c9b..6d2c53d 100644 --- a/configs/neptune_stacking.yaml +++ b/configs/neptune_stacking.yaml @@ -51,8 +51,9 @@ parameters: fill_value: 0 # Feature Extraction - installments__last_k_trend_periods: '[10, 50, 100, 500]' - installments__last_k_agg_periods: '[1, 5, 10, 50, 100, 500]' + installments__last_k_trend_periods: None + installments__last_k_agg_periods: None + installments__last_k_agg_period_fractions: None application_aggregation__use_diffs_only: True use_nan_count: True diff --git a/notebooks/eda-installments.ipynb b/notebooks/eda-installments.ipynb index 71ab29a..db2ec2b 100644 --- a/notebooks/eda-installments.ipynb +++ b/notebooks/eda-installments.ipynb @@ -3,7 +3,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "%load_ext autoreload\n", @@ -24,7 +26,7 @@ "\n", "sys.path.append('../')\n", "from src.utils import parallel_apply\n", - "from src.feature_extraction import add_features, add_features_in_group\n", + "from src.feature_extraction import add_features_in_group\n", "\n", "warnings.filterwarnings('ignore')\n", "\n", @@ -34,7 +36,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "description = pd.read_csv(os.path.join(DIR,'data/HomeCredit_columns_description.csv'),encoding = 'latin1')\n", @@ -45,7 +49,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [], "source": [ "installments.head()" @@ -77,7 +83,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "INSTALLMENTS_PAYMENTS_AGGREGATION_RECIPIES = []\n", @@ -96,7 +104,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [], "source": [ "groupby_aggregate_names = []\n", @@ -118,7 +128,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [], "source": [ "application.head()" @@ -127,7 +139,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "application_agg = application[groupby_aggregate_names + ['TARGET']]\n", @@ -137,7 +151,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [], "source": [ "application_agg_corr.sort_values('TARGET', ascending=False)['TARGET']" @@ -153,7 +169,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [], "source": [ "positive_ID = application[application['TARGET']==1]['SK_ID_CURR'].tolist()\n", @@ -163,7 +181,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "value_counts = installments[installments['SK_ID_CURR'].isin(positive_ID)]['SK_ID_CURR'].value_counts()" @@ -172,7 +192,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [], "source": [ "value_counts.head()" @@ -181,7 +203,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [], "source": [ "sns.distplot(value_counts)" @@ -190,7 +214,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "installments_one = installments[installments['SK_ID_CURR']==328162]" @@ -199,7 +225,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "installments_one.sort_values(['DAYS_INSTALMENT'],ascending=False).head(10)" @@ -208,7 +236,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "# installments_ = installments[installments['SK_ID_CURR'].isin(positive_ID[:100])]\n", @@ -222,7 +252,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "def add_features(feature_name, aggs, features, feature_names, groupby):\n", @@ -265,13 +297,15 @@ " features['{}{}_iqr'.format(prefix, feature_name)] = iqr(gr_[feature_name])\n", " elif agg == 'median':\n", " features['{}{}_median'.format(prefix, feature_name)] = gr_[feature_name].median()\n", - " return features" + " return features" ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "features = pd.DataFrame({'SK_ID_CURR':installments_['SK_ID_CURR'].unique()})\n", @@ -281,7 +315,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [], "source": [ "installments_.head()" @@ -297,7 +333,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [], "source": [ "feature_names = []\n", @@ -333,7 +371,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "def last_k_instalment_features(gr, periods):\n", @@ -360,7 +400,7 @@ " 'last_{}_'.format(period))\n", " features = add_features_in_group(features,gr_period,'instalment_paid_over', \n", " ['count','mean'],\n", - " 'last_{}_'.format(period))\n", + " 'last_{}_'.format(period)) \n", " \n", " return features" ] @@ -368,7 +408,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "func = partial(last_k_instalment_features, periods=[1,5,10,20,50,100])\n", @@ -390,7 +432,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "from sklearn.linear_model import LinearRegression" @@ -399,7 +443,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "def trend_in_last_k_instalment_features(gr, periods):\n", @@ -436,7 +482,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "func = partial(trend_in_last_k_instalment_features, periods=[10,50,100,500])\n", @@ -451,7 +499,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "X = application.merge(features, on='SK_ID_CURR',how='left')\n", @@ -462,7 +512,117 @@ { "cell_type": "code", "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "X_corr = abs(X.corr())\n", + "X_corr.sort_values('TARGET', ascending=False)['TARGET']" + ] + }, + { + "cell_type": "markdown", "metadata": {}, + "source": [ + "# Solution 5\n", + "## Period fractions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def last_k_instalment_features_with_fractions(gr, periods, fraction_periods):\n", + " gr_ = gr.copy()\n", + " gr_.sort_values(['DAYS_INSTALMENT'],ascending=False, inplace=True)\n", + " \n", + " features = {}\n", + "\n", + " for period in periods:\n", + " gr_period = gr_.iloc[:period]\n", + "\n", + " features = add_features_in_group(features,gr_period, 'NUM_INSTALMENT_VERSION', \n", + " ['sum','mean','max','min','std', 'median','skew', 'kurt','iqr'],\n", + " 'last_{}_'.format(period))\n", + " \n", + " features = add_features_in_group(features,gr_period, 'instalment_paid_late_in_days', \n", + " ['sum','mean','max','min','std', 'median','skew', 'kurt','iqr'],\n", + " 'last_{}_'.format(period))\n", + " features = add_features_in_group(features,gr_period ,'instalment_paid_late', \n", + " ['count','mean'],\n", + " 'last_{}_'.format(period))\n", + " features = add_features_in_group(features,gr_period ,'instalment_paid_over_amount', \n", + " ['sum','mean','max','min','std', 'median','skew', 'kurt','iqr'],\n", + " 'last_{}_'.format(period))\n", + " features = add_features_in_group(features,gr_period,'instalment_paid_over', \n", + " ['count','mean'],\n", + " 'last_{}_'.format(period)) \n", + " \n", + " for short_period, long_period in fraction_periods:\n", + " short_feature_names = _get_feature_names(features, short_period)\n", + " long_feature_names = _get_feature_names(features, long_period)\n", + " \n", + " for short_feature, long_feature in zip(short_feature_names, long_feature_names):\n", + " old_name_chunk = '_{}_'.format(short_period)\n", + " new_name_chunk ='_{}by{}_fraction_'.format(short_period, long_period)\n", + " fraction_feature_name = short_feature.replace(old_name_chunk, new_name_chunk)\n", + " features[fraction_feature_name] = safe_div(features[short_feature], features[long_feature])\n", + " return pd.Series(features)\n", + "\n", + "def _get_feature_names(features, period):\n", + " return sorted([feat for feat in features.keys() if '_{}_'.format(period) in feat])\n", + "\n", + "\n", + "def safe_div(a,b):\n", + " try:\n", + " return float(a)/float(b)\n", + " except:\n", + " return 0.0" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "func = partial(last_k_instalment_features_with_fractions, \n", + " periods=[1,5,10,20,50,100],\n", + " fraction_periods=[(5,20),(5,50),(10,100)])\n", + "\n", + "g = parallel_apply(groupby, func, index_name='SK_ID_CURR',\n", + " num_workers=16, chunk_size=1000).reset_index()\n", + "features = features.merge(g, on='SK_ID_CURR', how='left')\n", + "\n", + "display(features.head())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "X = application.merge(features, on='SK_ID_CURR',how='left')\n", + "X = X[features.columns.drop('SK_ID_CURR').tolist()+['TARGET']]\n", + "X.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, "outputs": [], "source": [ "X_corr = abs(X.corr())\n", @@ -472,7 +632,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [] } diff --git a/src/feature_extraction.py b/src/feature_extraction.py index 692e1a4..47987e0 100644 --- a/src/feature_extraction.py +++ b/src/feature_extraction.py @@ -11,7 +11,7 @@ from steppy.base import BaseTransformer from steppy.utils import get_logger -from .utils import parallel_apply +from .utils import parallel_apply, safe_div logger = get_logger() @@ -473,11 +473,11 @@ def fit(self, prev_applications, **kwargs): g = prev_app_sorted.groupby(by=['SK_ID_CURR'])['previous_application_prev_was_refused'].mean().reset_index() g.rename(index=str, columns={ 'previous_application_prev_was_refused': 'previous_application_fraction_of_refused_applications'}, - inplace=True) + inplace=True) features = features.merge(g, on=['SK_ID_CURR'], how='left') prev_app_sorted['prev_applications_prev_was_revolving_loan'] = ( - prev_app_sorted['NAME_CONTRACT_TYPE'] == 'Revolving loans').astype('int') + prev_app_sorted['NAME_CONTRACT_TYPE'] == 'Revolving loans').astype('int') g = prev_app_sorted.groupby(by=['SK_ID_CURR'])[ 'prev_applications_prev_was_revolving_loan'].last().reset_index() features = features.merge(g, on=['SK_ID_CURR'], how='left') @@ -513,9 +513,10 @@ def fit(self, prev_applications, **kwargs): class InstallmentPaymentsFeatures(BasicHandCraftedFeatures): - def __init__(self, last_k_agg_periods, last_k_trend_periods, num_workers=1, **kwargs): + def __init__(self, last_k_agg_periods, last_k_agg_period_fractions, last_k_trend_periods, num_workers=1, **kwargs): super().__init__(num_workers=num_workers) self.last_k_agg_periods = last_k_agg_periods + self.last_k_agg_period_fractions = last_k_agg_period_fractions self.last_k_trend_periods = last_k_trend_periods self.num_workers = num_workers @@ -533,6 +534,7 @@ def fit(self, installments, **kwargs): func = partial(InstallmentPaymentsFeatures.generate_features, agg_periods=self.last_k_agg_periods, + period_fractions=self.last_k_agg_period_fractions, trend_periods=self.last_k_trend_periods) g = parallel_apply(groupby, func, index_name='SK_ID_CURR', num_workers=self.num_workers).reset_index() features = features.merge(g, on='SK_ID_CURR', how='left') @@ -541,9 +543,11 @@ def fit(self, installments, **kwargs): return self @staticmethod - def generate_features(gr, agg_periods, trend_periods): - all = InstallmentPaymentsFeatures.last_k_installment_features(gr, periods=[10e16]) - agg = InstallmentPaymentsFeatures.last_k_installment_features(gr, agg_periods) + def generate_features(gr, agg_periods, trend_periods, period_fractions): + all = InstallmentPaymentsFeatures.all_installment_features(gr) + agg = InstallmentPaymentsFeatures.last_k_installment_features_with_fractions(gr, + agg_periods, + period_fractions) trend = InstallmentPaymentsFeatures.trend_in_last_k_installment_features(gr, trend_periods) last = InstallmentPaymentsFeatures.last_loan_features(gr) features = {**all, **agg, **trend, **last} @@ -553,6 +557,21 @@ def generate_features(gr, agg_periods, trend_periods): def all_installment_features(gr): return InstallmentPaymentsFeatures.last_k_installment_features(gr, periods=[10e16]) + @staticmethod + def last_k_installment_features_with_fractions(gr, periods, period_fractions): + features = InstallmentPaymentsFeatures.last_k_installment_features(gr, periods) + + for short_period, long_period in period_fractions: + short_feature_names = get_feature_names_by_period(features, short_period) + long_feature_names = get_feature_names_by_period(features, long_period) + + for short_feature, long_feature in zip(short_feature_names, long_feature_names): + old_name_chunk = '_{}_'.format(short_period) + new_name_chunk = '_{}by{}_fraction_'.format(short_period, long_period) + fraction_feature_name = short_feature.replace(old_name_chunk, new_name_chunk) + features[fraction_feature_name] = safe_div(features[short_feature], features[long_feature]) + return features + @staticmethod def last_k_installment_features(gr, periods): gr_ = gr.copy() @@ -675,3 +694,7 @@ def add_trend_feature(features, gr, feature_name, prefix): trend = np.nan features['{}{}'.format(prefix, feature_name)] = trend return features + + +def get_feature_names_by_period(features, period): + return sorted([feat for feat in features.keys() if '_{}_'.format(period) in feat]) diff --git a/src/pipeline_config.py b/src/pipeline_config.py index 35f4ea5..ebabe7e 100644 --- a/src/pipeline_config.py +++ b/src/pipeline_config.py @@ -144,7 +144,7 @@ aggregation_pairs = [(col, agg) for col in cols_to_agg for agg in aggs] APPLICATION_AGGREGATION_RECIPIES = [ - (['NAME_EDUCATION_TYPE', 'CODE_GENDER'], aggregation_pairs), + (['NAME_EDUCATION_TYPE', 'CODE_GENDER'], aggregation_pairs), (['NAME_FAMILY_STATUS', 'NAME_EDUCATION_TYPE'], aggregation_pairs), (['NAME_FAMILY_STATUS', 'CODE_GENDER'], aggregation_pairs), (['CODE_GENDER', 'ORGANIZATION_TYPE'], [('AMT_ANNUITY', 'mean'), @@ -292,6 +292,8 @@ 'id_columns': ('SK_ID_CURR', 'SK_ID_CURR'), 'groupby_aggregations': INSTALLMENTS_PAYMENTS_AGGREGATION_RECIPIES, 'last_k_agg_periods': parameter_eval(params.installments__last_k_agg_periods), + 'last_k_agg_period_fractions': parameter_eval( + params.installments__last_k_agg_period_fractions), 'last_k_trend_periods': parameter_eval(params.installments__last_k_trend_periods), 'num_workers': params.num_workers }, diff --git a/src/utils.py b/src/utils.py index 35d8921..63a2d43 100644 --- a/src/utils.py +++ b/src/utils.py @@ -150,3 +150,9 @@ def _clean_columns(df, keep_colnames): for i, colname in enumerate(feature_colnames): new_colnames.append('model_{}'.format(i)) return new_colnames + +def safe_div(a, b): + try: + return float(a) / float(b) + except: + return 0.0 \ No newline at end of file From 213ab48e6b4309dcb3a39b77978294b4a6af0620 Mon Sep 17 00:00:00 2001 From: Jakub Czakon Date: Mon, 16 Jul 2018 07:40:10 +0200 Subject: [PATCH 2/2] updated hyperparams --- configs/neptune.yaml | 6 +++--- src/pipeline_manager.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/configs/neptune.yaml b/configs/neptune.yaml index 0347d35..5dd9ca8 100644 --- a/configs/neptune.yaml +++ b/configs/neptune.yaml @@ -69,12 +69,12 @@ parameters: lgbm__max_bin: 300 lgbm__max_depth: -1 lgbm__num_leaves: 35 - lgbm__min_child_samples: 50 + lgbm__min_child_samples: 70 lgbm__subsample: 1.0 lgbm__subsample_freq: 1 - lgbm__colsample_bytree: 0.2 + lgbm__colsample_bytree: 0.05 lgbm__min_gain_to_split: 0.5 - lgbm__reg_lambda: 100.0 + lgbm__reg_lambda: 100 lgbm__reg_alpha: 0.0 lgbm__scale_pos_weight: 1 diff --git a/src/pipeline_manager.py b/src/pipeline_manager.py index 6d466b0..7f0459b 100644 --- a/src/pipeline_manager.py +++ b/src/pipeline_manager.py @@ -401,7 +401,7 @@ def _read_data(dev_mode, read_train=True, read_test=False): if read_test: raw_data['application_test'] = pd.read_csv(params.test_filepath, nrows=nrows) - + raw_data['bureau'] = pd.read_csv(params.bureau_filepath, nrows=nrows) raw_data['credit_card_balance'] = pd.read_csv(params.credit_card_balance_filepath, nrows=nrows) raw_data['pos_cash_balance'] = pd.read_csv(params.POS_CASH_balance_filepath, nrows=nrows)