From 5ec73e22f18f020f4df59ccbe66d993a08f196ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mi=C5=82osz=20Michta?= Date: Mon, 9 Jul 2018 10:06:27 +0200 Subject: [PATCH] Notebook - feature importance (#112) --- notebooks/eda_feature_importance.ipynb | 402 +++++++++++++++++++++++++ 1 file changed, 402 insertions(+) create mode 100644 notebooks/eda_feature_importance.ipynb diff --git a/notebooks/eda_feature_importance.ipynb b/notebooks/eda_feature_importance.ipynb new file mode 100644 index 0000000..1cd1248 --- /dev/null +++ b/notebooks/eda_feature_importance.ipynb @@ -0,0 +1,402 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Libraries" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import lightgbm as lgb\n", + "from sklearn.externals import joblib\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "import ipywidgets as ipy\n", + "from random import sample\n", + "import shap\n", + "shap.initjs()\n", + "%matplotlib inline\n", + "plt.style.use('seaborn-whitegrid')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "solution_path = 'solution_4_installments_simple_794'\n", + "\n", + "application = pd.read_csv('/mnt/ml-team/minerva/open-solutions/home-credit/files/unzipped_data/application_train.csv')\n", + "oof_train = pd.read_csv('/mnt/ml-team/minerva/open-solutions/home-credit/kuba/experiments/{}/lightGBM_out_of_fold_train_predictions.csv'.format(solution_path))\n", + "model = joblib.load('/mnt/ml-team/minerva/open-solutions/home-credit/kuba/experiments/{}/transformers//light_gbm_fold_0'.format(solution_path))\n", + "features = joblib.load('/mnt/ml-team/minerva/open-solutions/home-credit/kuba/experiments/{}/outputs/feature_joiner_valid_fold_0'.format(solution_path))\n", + "description = pd.read_csv('/mnt/ml-team/minerva/open-solutions/home-credit/data/HomeCredit_columns_description.csv', encoding='latin1')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "index_list = oof_train[oof_train.fold_id==0]['SK_ID_CURR']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "oof_train_0 = oof_train[oof_train['SK_ID_CURR'].isin(index_list)]\n", + "application_0 = application[application['SK_ID_CURR'].isin(index_list)]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "oof_train_0 = oof_train[oof_train['SK_ID_CURR'].isin(index_list)]\n", + "application_0 = application[application['SK_ID_CURR'].isin(index_list)]\n", + "features_df = features['features']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "exploration_data = oof_train_0.copy()\n", + "exploration_data['target'] = application_0['TARGET'].values\n", + "exploration_data['diff_abs'] = np.abs(exploration_data['lightGBM_prediction'] - exploration_data['target'])\n", + "exploration_data['diff'] = exploration_data['lightGBM_prediction'] - exploration_data['target']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Features description" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "description.head()\n", + "@ipy.interact(\n", + " cols = ipy.SelectMultiple(\n", + " options=description.Row,\n", + " rows=10,\n", + " value=(description.Row[0],),\n", + " description='Features',\n", + " layout=ipy.Layout(width='90%')\n", + " )\n", + ")\n", + "def func(cols):\n", + " for i, col in enumerate(cols):\n", + " display('{} --- {}'.format(col, description[description.Row==col]['Description'].values[0]))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Shap - Feature impact on a model\n", + "\n", + "https://github.com/slundberg/shap \n", + "\n", + "https://arxiv.org/pdf/1802.03888.pdf \n", + "\n", + "http://papers.nips.cc/paper/7062-a-unified-approach-to-interpreting-model-predictions.pdf " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Our first step is to compute SHAP values for each example in our dataset. $base\\_value$ is mean of our predictions and in our dataset equals $0.04942$ and will be flagged on plots below. The Shap value tells us how certain feature moved our prediction on ceratin example from expected value of all predictions.\n", + "$$ output\\_value (x) = base\\_value + \\sum\\limits_{i=1}^{M} \\phi_{i}z_{i}(x) $$,\n", + "\n", + "where $ z_i(x) \\in \\{0, 1\\}$ describes if $i$-th feature-value(e.g. SEX='Male') occurs at example $x$ and $\\phi_i$ is SHAP value of given feature." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "shap_values = shap.TreeExplainer(model).shap_values(features_df)\n", + "global_shap_vals = np.abs(shap_values).mean(0)[:-1]\n", + "inds = np.argsort(global_shap_vals)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "display(shap.force_plot(shap_values[0,:], features_df.iloc[0,:], link=\"logit\"))\n", + "display(shap.force_plot(shap_values[1,:], features_df.iloc[1,:], link=\"logit\"))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Feature importance" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "@ipy.interact(cols=ipy.IntRangeSlider(\n", + " value=(1, 20),\n", + " min=1,\n", + " max=features_df.shape[1],\n", + " description='Features:',\n", + " continuous_update=False,\n", + " layout=ipy.Layout(width='90%', height='30px')\n", + " ))\n", + "def func(cols):\n", + " min_index = -cols[1]\n", + " max_index = -cols[0]\n", + " y_pos = np.arange(features_df.shape[1])\n", + " plt.title(\"Feature importance: mean(|SHAP|)\")\n", + " plt.barh(y_pos[min_index:max_index], global_shap_vals[inds][min_index:max_index], color=\"#1E88E5\")\n", + " plt.yticks(y_pos[min_index:max_index], features_df.columns[inds][min_index:max_index])\n", + " plt.gca().spines['right'].set_visible(False)\n", + " plt.gca().spines['top'].set_visible(False)\n", + " plt.xlabel(\"mean SHAP value magnitude (change in log odds)\")\n", + " plt.gcf().set_size_inches(11, (cols[1]-cols[0])//2)\n", + " plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "display(shap.summary_plot(shap_values, features_df, max_display=20))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "@ipy.interact(\n", + " cols = ipy.SelectMultiple(\n", + " options=list(reversed(features_df.columns[inds])),\n", + " rows=10,\n", + " value=(list(reversed(features_df.columns[inds]))[0], ),\n", + " description='Corr Columns',\n", + " layout=ipy.Layout(width='90%')\n", + " ),\n", + " num_samples=ipy.IntSlider(\n", + " value=250,\n", + " min=100,\n", + " max=1000,\n", + " step = 50,\n", + " continuous_update=False,\n", + " description='Samples:',\n", + " layout=ipy.Layout(width='90%', height='30px')\n", + " )\n", + ")\n", + "def func(cols, num_samples):\n", + " smp = sample(range(len(shap_values)), num_samples)\n", + " for col in cols:\n", + " display(shap.dependence_plot(col, shap_values[smp], features_df.loc[smp,:]))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Predictions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plt.figure(figsize=(17,5))\n", + "\n", + "plt.title(\"Distribution of |Difference between predictions and target|\")\n", + "sns.distplot(exploration_data[exploration_data['target']==0]['diff_abs'], \n", + " label='Target_0', \n", + " color='#1587E8',\n", + " hist_kws={'alpha': 0.8},\n", + " bins=100);\n", + "sns.distplot(exploration_data[exploration_data['target']==1]['diff_abs'], \n", + " label='Target_1',\n", + " color='#F02958',\n", + " hist_kws={'alpha': 0.8},\n", + " bins=100);\n", + "plt.legend()\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "feature_analysis = features_df.copy()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "feature_analysis['INDEX'] = index_list\n", + "feature_analysis['TARGET'] = exploration_data['target']\n", + "feature_analysis['DIFF'] = exploration_data['diff']\n", + "feature_analysis['DIFF_ABS'] = exploration_data['diff_abs']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "@ipy.interact(\n", + " cols = ipy.SelectMultiple(\n", + " options=list(reversed(features_df.columns[inds])),\n", + " rows=10,\n", + " value=(list(reversed(features_df.columns[inds]))[0], ),\n", + " description='Corr Columns',\n", + " layout=ipy.Layout(width='90%')\n", + " ),\n", + " num_samples=ipy.IntSlider(\n", + " value=250,\n", + " min=100,\n", + " max=1000,\n", + " step = 50,\n", + " continuous_update=False,\n", + " description='Samples:',\n", + " layout=ipy.Layout(width='90%', height='30px')\n", + " )\n", + ")\n", + "def func(cols, num_samples):\n", + " for col in cols:\n", + " fig = plt.figure(figsize=(16, 8));\n", + " target_1 = feature_analysis[feature_analysis['TARGET']==1]\n", + " target_0 = feature_analysis[feature_analysis['TARGET']==0]\n", + " smp_1 = sample(range(len(target_1)), min(num_samples, len(target_1)))\n", + " smp_0 = sample(range(len(target_0)), min(num_samples, len(target_0)))\n", + " df = pd.concat([target_1.iloc[smp_1,:], target_0.iloc[smp_0,:]], axis=0)\n", + " display(sns.swarmplot(x='TARGET', y=col, data=df, palette=['#1587E8', '#F02958'],))\n", + " plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def get_between(df, col, interval):\n", + " return df[(interval[0] <= df[col]) & (df[col] <= interval[1])]\n", + "\n", + "def col_without_nan(df, col):\n", + " val = df[col]\n", + " return val[~np.isnan(val)]\n", + "\n", + "@ipy.interact(\n", + " diff_1=ipy.FloatRangeSlider(\n", + " value=(0.5, 1.0),\n", + " min=0.0,\n", + " max=1.0,\n", + " step=0.01,\n", + " description='Difference_1:',\n", + " continuous_update=False,\n", + " layout=ipy.Layout(width='90%', height='30px')\n", + " ), \n", + " diff_2=ipy.FloatRangeSlider(\n", + " value=(0.0, 0.5),\n", + " min=0.0,\n", + " max=1.0,\n", + " step=0.01,\n", + " description='Difference_2:',\n", + " continuous_update=False,\n", + " layout=ipy.Layout(width='90%', height='30px')\n", + " ),\n", + " cols = ipy.SelectMultiple(\n", + " options=list(reversed(features_df.columns[inds])),\n", + " rows=10,\n", + " value=(list(reversed(features_df.columns[inds]))[0], ),\n", + " description='Columns',\n", + " layout=ipy.Layout(width='90%')\n", + " ))\n", + "def func(diff_1, diff_2, cols):\n", + " for col in cols:\n", + " vals_1 = col_without_nan(get_between(feature_analysis, 'DIFF_ABS', diff_1) , col)\n", + " vals_2 = col_without_nan(get_between(feature_analysis, 'DIFF_ABS', diff_2) , col)\n", + " display(sns.distplot(vals_1, label='Difference_1', color='#F02958', hist_kws={'alpha': 0.7}, bins=min(100, len(vals_1))))\n", + " display(sns.distplot(vals_2, label='Difference_2', color='#1587E8', hist_kws={'alpha': 0.7}, bins=min(100, len(vals_2))))\n", + " plt.legend()\n", + " plt.show()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}