diff --git a/feature_extraction.py b/feature_extraction.py deleted file mode 100644 index ddce1e9..0000000 --- a/feature_extraction.py +++ /dev/null @@ -1,142 +0,0 @@ -import os - -import category_encoders as ce -import numpy as np -import pandas as pd -from sklearn.externals import joblib -from steppy.base import BaseTransformer -from steppy.utils import get_logger - -logger = get_logger() - - -class DataFrameByTypeSplitter(BaseTransformer): - def __init__(self, numerical_columns, categorical_columns, timestamp_columns): - super().__init__() - self.numerical_columns = numerical_columns - self.categorical_columns = categorical_columns - self.timestamp_columns = timestamp_columns - - def transform(self, X, y=None, **kwargs): - outputs = {} - - if self.numerical_columns is not None: - outputs['numerical_features'] = X[self.numerical_columns] - - if self.categorical_columns is not None: - outputs['categorical_features'] = X[self.categorical_columns] - - if self.timestamp_columns is not None: - outputs['timestamp_features'] = X[self.timestamp_columns] - - return outputs - - -class FeatureJoiner(BaseTransformer): - def transform(self, numerical_feature_list, categorical_feature_list, **kwargs): - features = numerical_feature_list + categorical_feature_list - for feature in features: - feature.reset_index(drop=True, inplace=True) - outputs = dict() - outputs['features'] = pd.concat(features, axis=1).astype(np.float32) - outputs['feature_names'] = self._get_feature_names(features) - outputs['categorical_features'] = self._get_feature_names(categorical_feature_list) - return outputs - - def _get_feature_names(self, dataframes): - feature_names = [] - for dataframe in dataframes: - try: - feature_names.extend(list(dataframe.columns)) - except Exception as e: - print(e) - feature_names.append(dataframe.name) - - return feature_names - - -class CategoricalEncoder(BaseTransformer): - def __init__(self, **kwargs): - super().__init__() - self.params = kwargs - self.encoder_class = ce.OrdinalEncoder - self.categorical_encoder = None - - def fit(self, X, y, **kwargs): - categorical_columns = list(X.columns) - self.categorical_encoder = self.encoder_class(cols=categorical_columns, **self.params) - self.categorical_encoder.fit(X, y) - return self - - def transform(self, X, y=None, **kwargs): - X_ = self.categorical_encoder.transform(X) - return {'categorical_features': X_} - - def load(self, filepath): - self.categorical_encoder = joblib.load(filepath) - return self - - def persist(self, filepath): - joblib.dump(self.categorical_encoder, filepath) - - -class GroupbyAggregations(BaseTransformer): - def __init__(self, groupby_aggregations): - super().__init__() - self.groupby_aggregations = groupby_aggregations - - @property - def groupby_aggregations_names(self): - groupby_aggregations_names = ['{}_{}_{}'.format('_'.join(spec['groupby']), - spec['agg'], - spec['select']) - for spec in self.groupby_aggregations] - return groupby_aggregations_names - - def transform(self, categorical_features, numerical_features): - X = pd.concat([categorical_features, numerical_features], axis=1) - for spec, groupby_aggregations_name in zip(self.groupby_aggregations, self.groupby_aggregations_names): - group_object = X.groupby(spec['groupby']) - X = X.merge(group_object[spec['select']] - .agg(spec['agg']) - .reset_index() - .rename(index=str, - columns={spec['select']: groupby_aggregations_name}) - [spec['groupby'] + [groupby_aggregations_name]], - on=spec['groupby'], - how='left') - - return {'numerical_features': 
X[self.groupby_aggregations_names].astype(np.float32)} - - -class GroupbyAggregationFromFile(BaseTransformer): - def __init__(self, filepath, id_columns, groupby_aggregations): - super().__init__() - self.filename = os.path.basename(filepath).split('.')[0] - self.file = pd.read_csv(filepath) - self.id_columns = id_columns - self.groupby_aggregations = groupby_aggregations - - @ property - def groupby_aggregations_names(self): - groupby_aggregations_names = ['{}_{}_{}_{}'.format(self.filename, - '_'.join(spec['groupby']), - spec['agg'], - spec['select']) - for spec in self.groupby_aggregations] - return groupby_aggregations_names - - def transform(self, X): - for spec, groupby_aggregations_name in zip(self.groupby_aggregations, self.groupby_aggregations_names): - group_object = self.file.groupby(spec['groupby']) - X = X.merge(group_object[spec['select']] - .agg(spec['agg']) - .reset_index() - .rename(index=str, - columns={spec['select']: groupby_aggregations_name}) - [spec['groupby'] + [groupby_aggregations_name]], - left_on=self.id_columns[0], - right_on=self.id_columns[1], - how='left') - - return {'numerical_features': X[self.groupby_aggregations_names].astype(np.float32)} diff --git a/main.py b/main.py index e505239..c379c5b 100644 --- a/main.py +++ b/main.py @@ -1,196 +1,77 @@ -import os -import shutil - import click -import pandas as pd -from deepsense import neptune -from sklearn.metrics import roc_auc_score -from sklearn.model_selection import train_test_split - -import pipeline_config as cfg -from pipelines import PIPELINES -from utils import create_submission, init_logger, read_params, persist_evaluation_predictions, \ - set_seed, verify_submission +from src.pipeline_manager import PipelineManager -set_seed() -logger = init_logger() -ctx = neptune.Context() -params = read_params(ctx) +pipeline_manager = PipelineManager() @click.group() -def action(): +def main(): pass -@action.command() +@main.command() @click.option('-p', '--pipeline_name', help='pipeline to be trained', required=True) @click.option('-d', '--dev_mode', help='if true only a small sample of data will be used', is_flag=True, required=False) def train(pipeline_name, dev_mode): - _train(pipeline_name, dev_mode) + pipeline_manager.train(pipeline_name, dev_mode) -@action.command() +@main.command() @click.option('-p', '--pipeline_name', help='pipeline to be trained', required=True) @click.option('-d', '--dev_mode', help='if true only a small sample of data will be used', is_flag=True, required=False) def evaluate(pipeline_name, dev_mode): - _evaluate(pipeline_name, dev_mode) + pipeline_manager.evaluate(pipeline_name, dev_mode) -@action.command() +@main.command() @click.option('-p', '--pipeline_name', help='pipeline to be trained', required=True) @click.option('-d', '--dev_mode', help='if true only a small sample of data will be used', is_flag=True, required=False) -def predict(pipeline_name, dev_mode): - _predict(pipeline_name, dev_mode) +@click.option('-s', '--submit_predictions', help='submit predictions if true', is_flag=True, required=False) +def predict(pipeline_name, dev_mode, submit_predictions): + pipeline_manager.predict(pipeline_name, dev_mode, submit_predictions) -@action.command() +@main.command() @click.option('-p', '--pipeline_name', help='pipeline to be trained', required=True) +@click.option('-s', '--submit_predictions', help='submit predictions if true', is_flag=True, required=False) @click.option('-d', '--dev_mode', help='if true only a small sample of data will be used', is_flag=True, required=False) 
-def train_evaluate_predict(pipeline_name, dev_mode): - _train(pipeline_name, dev_mode) - _evaluate(pipeline_name, dev_mode) - _predict(pipeline_name, dev_mode) +def train_evaluate_predict(pipeline_name, submit_predictions, dev_mode): + pipeline_manager.train(pipeline_name, dev_mode) + pipeline_manager.evaluate(pipeline_name, dev_mode) + pipeline_manager.predict(pipeline_name, dev_mode, submit_predictions) -@action.command() +@main.command() @click.option('-p', '--pipeline_name', help='pipeline to be trained', required=True) @click.option('-d', '--dev_mode', help='if true only a small sample of data will be used', is_flag=True, required=False) -def evaluate_predict(pipeline_name, dev_mode): - _evaluate(pipeline_name, dev_mode) - _predict(pipeline_name, dev_mode) +def train_evaluate(pipeline_name, dev_mode): + pipeline_manager.train(pipeline_name, dev_mode) + pipeline_manager.evaluate(pipeline_name, dev_mode) -@action.command() +@main.command() @click.option('-p', '--pipeline_name', help='pipeline to be trained', required=True) +@click.option('-s', '--submit_predictions', help='submit predictions if true', is_flag=True, required=False) @click.option('-d', '--dev_mode', help='if true only a small sample of data will be used', is_flag=True, required=False) -def train_evaluate(pipeline_name, dev_mode): - _train(pipeline_name, dev_mode) - _evaluate(pipeline_name, dev_mode) - - -def _train(pipeline_name, dev_mode): - logger.info('TRAINING') - if bool(params.clean_experiment_directory_before_training) and os.path.isdir(params.experiment_directory): - logger.info('Cleaning experiment_directory...') - shutil.rmtree(params.experiment_directory) - - logger.info('Reading data...') - if dev_mode: - logger.info('running in "dev-mode". Sample size is: {}'.format(cfg.DEV_SAMPLE_SIZE)) - application_train = pd.read_csv(params.train_filepath, nrows=cfg.DEV_SAMPLE_SIZE) - else: - application_train = pd.read_csv(params.train_filepath) - - logger.info('Shuffling and splitting into train and test...') - train_data_split, valid_data_split = train_test_split(application_train, - test_size=params.validation_size, - random_state=cfg.RANDOM_SEED, - shuffle=params.shuffle) - - logger.info('Target mean in train: {}'.format(train_data_split[cfg.TARGET_COLUMN].mean())) - logger.info('Target mean in valid: {}'.format(valid_data_split[cfg.TARGET_COLUMN].mean())) - logger.info('Train shape: {}'.format(train_data_split.shape)) - logger.info('Valid shape: {}'.format(valid_data_split.shape)) - - data = {'input': {'X': train_data_split.drop(cfg.TARGET_COLUMN, axis=1), - 'y': train_data_split[cfg.TARGET_COLUMN], - 'X_valid': valid_data_split.drop(cfg.TARGET_COLUMN, axis=1), - 'y_valid': valid_data_split[cfg.TARGET_COLUMN], - }, - } - - pipeline = PIPELINES[pipeline_name]['train'](cfg.SOLUTION_CONFIG) - pipeline.clean_cache() - logger.info('Start pipeline fit and transform') - pipeline.fit_transform(data) - pipeline.clean_cache() - - -def _evaluate(pipeline_name, dev_mode): - logger.info('EVALUATION') - logger.info('reading data...') - if dev_mode: - logger.info('running in "dev-mode". 
Sample size is: {}'.format(cfg.DEV_SAMPLE_SIZE)) - application_train = pd.read_csv(params.train_filepath, nrows=cfg.DEV_SAMPLE_SIZE) - else: - application_train = pd.read_csv(params.train_filepath) - - logger.info('Shuffling and splitting to get validation split...') - _, valid_data_split = train_test_split(application_train, - test_size=params.validation_size, - random_state=cfg.RANDOM_SEED, - shuffle=params.shuffle) - - logger.info('Target mean in valid: {}'.format(valid_data_split[cfg.TARGET_COLUMN].mean())) - logger.info('Valid shape: {}'.format(valid_data_split.shape)) - - y_true = valid_data_split[cfg.TARGET_COLUMN].values - data = {'input': {'X': valid_data_split.drop(cfg.TARGET_COLUMN, axis=1), - 'y': valid_data_split[cfg.TARGET_COLUMN], - }, - } - - pipeline = PIPELINES[pipeline_name]['inference'](cfg.SOLUTION_CONFIG) - pipeline.clean_cache() - logger.info('Start pipeline transform') - output = pipeline.transform(data) - pipeline.clean_cache() - - y_pred = output['clipped_prediction'] - - logger.info('Saving evaluation predictions to the {}'.format(params.experiment_directory)) - persist_evaluation_predictions(params.experiment_directory, - y_pred, - valid_data_split, - cfg.ID_COLUMN, - cfg.TARGET_COLUMN) - - logger.info('Calculating ROC_AUC on validation set') - score = roc_auc_score(y_true, y_pred) - logger.info('ROC_AUC score on validation is {}'.format(score)) - ctx.channel_send('ROC_AUC', 0, score) - - -def _predict(pipeline_name, dev_mode): - logger.info('PREDICTION') - logger.info('reading data...') - if dev_mode: - logger.info('running in "dev-mode". Sample size is: {}'.format(cfg.DEV_SAMPLE_SIZE)) - application_test = pd.read_csv(params.test_filepath, nrows=cfg.DEV_SAMPLE_SIZE) - else: - application_test = pd.read_csv(params.test_filepath) - - data = {'input': {'X': application_test, - 'y': None, - }, - } - - pipeline = PIPELINES[pipeline_name]['inference'](cfg.SOLUTION_CONFIG) - pipeline.clean_cache() - logger.info('Start pipeline transform') - output = pipeline.transform(data) - pipeline.clean_cache() - y_pred = output['clipped_prediction'] - - if not dev_mode: - logger.info('creating submission file...') - submission = create_submission(application_test, y_pred) - - logger.info('verifying submission...') - sample_submission = pd.read_csv(params.sample_submission_filepath) - verify_submission(submission, sample_submission) - - submission_filepath = os.path.join(params.experiment_directory, 'submission.csv') - submission.to_csv(submission_filepath, index=None, encoding='utf-8') - logger.info('submission persisted to {}'.format(submission_filepath)) - logger.info('submission head \n\n{}'.format(submission.head())) - - if params.kaggle_api: - logger.info('making Kaggle submit...') - os.system('kaggle competitions submit -c home-credit-default-risk -f {} -m {}' - .format(submission_filepath, params.kaggle_message)) +def evaluate_predict(pipeline_name, submit_predictions, dev_mode): + pipeline_manager.evaluate(pipeline_name, dev_mode) + pipeline_manager.predict(pipeline_name, dev_mode, submit_predictions) + + +@main.command() +@click.option('-p', '--pipeline_name', help='pipeline to be trained', required=True) +@click.option('-d', '--dev_mode', help='if true only a small sample of data will be used', is_flag=True, required=False) +def train_evaluate_cv(pipeline_name, dev_mode): + pipeline_manager.train_evaluate_cv(pipeline_name, dev_mode) + + +@main.command() +@click.option('-p', '--pipeline_name', help='pipeline to be trained', required=True) +@click.option('-s', 
'--submit_predictions', help='submit predictions if true', is_flag=True, required=False) +@click.option('-d', '--dev_mode', help='if true only a small sample of data will be used', is_flag=True, required=False) +def train_evaluate_predict_cv(pipeline_name, submit_predictions, dev_mode): + pipeline_manager.train_evaluate_predict_cv(pipeline_name, dev_mode, submit_predictions) if __name__ == "__main__": - action() + main() \ No newline at end of file diff --git a/models.py b/models.py deleted file mode 100644 index 56dcb64..0000000 --- a/models.py +++ /dev/null @@ -1,84 +0,0 @@ -import xgboost as xgb -from attrdict import AttrDict -from sklearn.pipeline import Pipeline -from sklearn.preprocessing import StandardScaler -from steppy.base import BaseTransformer -from steppy.utils import get_logger -from toolkit.sklearn_transformers.models import SklearnClassifier - -logger = get_logger() - - -class XGBoost(BaseTransformer): - def __init__(self, **params): - super().__init__() - logger.info('initializing XGBoost...') - self.params = params - self.training_params = ['nrounds', 'early_stopping_rounds'] - self.evaluation_function = None - - @property - def model_config(self): - return AttrDict({param: value for param, value in self.params.items() - if param not in self.training_params}) - - @property - def training_config(self): - return AttrDict({param: value for param, value in self.params.items() - if param in self.training_params}) - - def fit(self, - X, y, - X_valid, y_valid, - feature_names=None, - feature_types=None, - **kwargs): - train = xgb.DMatrix(X, - label=y, - feature_names=feature_names, - feature_types=feature_types) - valid = xgb.DMatrix(X_valid, - label=y_valid, - feature_names=feature_names, - feature_types=feature_types) - - evaluation_results = {} - self.estimator = xgb.train(params=self.model_config, - dtrain=train, - evals=[(train, 'train'), (valid, 'valid')], - evals_result=evaluation_results, - num_boost_round=self.training_config.nrounds, - early_stopping_rounds=self.training_config.early_stopping_rounds, - verbose_eval=self.model_config.verbose, - feval=self.evaluation_function) - return self - - def transform(self, X, y=None, feature_names=None, feature_types=None, **kwargs): - X_DMatrix = xgb.DMatrix(X, - label=y, - feature_names=feature_names, - feature_types=feature_types) - prediction = self.estimator.predict(X_DMatrix) - return {'prediction': prediction} - - def load(self, filepath): - self.estimator = xgb.Booster(params=self.model_config) - self.estimator.load_model(filepath) - return self - - def persist(self, filepath): - self.estimator.save_model(filepath) - - -def get_sklearn_classifier(ClassifierClass, normalize=False, **kwargs): - - class SklearnBinaryClassifier(SklearnClassifier): - def transform(self, X, y=None, target=1, **kwargs): - prediction = self.estimator.predict_proba(X)[:, target] - return {SklearnClassifier.RESULT_KEY: prediction} - - if normalize: - return SklearnBinaryClassifier(Pipeline([('standarizer', StandardScaler()), - ('classifier', ClassifierClass(**kwargs))])) - - return SklearnBinaryClassifier(ClassifierClass(**kwargs)) diff --git a/neptune.yaml b/neptune.yaml index fec954c..34f0a3a 100644 --- a/neptune.yaml +++ b/neptune.yaml @@ -1,7 +1,7 @@ project: ORGANIZATION/home-credit name: home-credit-default-risk -tags: [solution-2] +tags: [solution-3, dev] metric: channel: 'ROC_AUC' @@ -9,20 +9,19 @@ metric: exclude: - output - - imgs + - notebooks - neptune.log - offline_job.log - .git - .github - .idea - .ipynb_checkpoints - - 
Untitled.ipynb parameters: # Data train_filepath: YOUR/PATH/TO/application_train.csv test_filepath: YOUR/PATH/TO/application_test.csv - bureau_balance_filepath: YOUR/PATH/TO/bureau_balance_filepath.csv + bureau_balance_filepath: YOUR/PATH/TO/bureau_balance.csv bureau_filepath: YOUR/PATH/TO/bureau.csv credit_card_balance_filepath: YOUR/PATH/TO/credit_card_balance.csv installments_payments_filepath: YOUR/PATH/TO/installments_payments.csv @@ -33,19 +32,22 @@ parameters: # Kaggle kaggle_api: 0 - kaggle_message: 'solution-2' + kaggle_message: 'solution-3' # Data preparation + n_cv_splits: 5 validation_size: 0.2 + stratified_cv: True shuffle: 1 # Execution clean_experiment_directory_before_training: 1 - num_workers: 16 + num_workers: 1 verbose: 1 # Preprocessing - fillna_value: -1 + fill_missing: False + fill_value: None # Light GBM lgbm_random_search_runs: 0 @@ -53,19 +55,19 @@ parameters: lgbm__boosting_type: gbdt lgbm__objective: binary lgbm__metric: auc - lgbm__number_boosting_rounds: 10000 - lgbm__early_stopping_rounds: 100 - lgbm__learning_rate: 0.005 - lgbm__num_leaves: 50 - lgbm__max_depth: 20 - lgbm__min_child_samples: 20 - lgbm__max_bin: 300 # at most 255 for device=gpu - lgbm__subsample: 0.6 - lgbm__subsample_freq: 0 - lgbm__colsample_bytree: 0.8 - lgbm__min_child_weight: 4 - lgbm__reg_lambda: 0.05 - lgbm__reg_alpha: 0.05 + lgbm__number_boosting_rounds: 500 + lgbm__early_stopping_rounds: 50 + lgbm__learning_rate: 0.1 + lgbm__max_bin: 300 + lgbm__max_depth: -1 + lgbm__num_leaves: 100 + lgbm__min_child_samples: 600 + lgbm__subsample: 1.0 + lgbm__subsample_freq: 1 + lgbm__colsample_bytree: 0.1 + lgbm__min_gain_to_split: 0.5 + lgbm__reg_lambda: 50.0 + lgbm__reg_alpha: 0.0 lgbm__scale_pos_weight: 1 # XGBoost @@ -117,3 +119,6 @@ parameters: svc__probability: True svc__tol: 0.00001 svc__max_iter: -1 + +# Postprocessing + aggregation_method: rank_mean \ No newline at end of file diff --git a/neptune_random_search.yaml b/neptune_random_search.yaml index e1e765e..3659488 100644 --- a/neptune_random_search.yaml +++ b/neptune_random_search.yaml @@ -1,7 +1,7 @@ project: ORGANIZATION/home-credit name: home-credit-default-risk -tags: [solution-2] +tags: [solution-3] metric: channel: 'ROC_AUC' @@ -9,20 +9,19 @@ metric: exclude: - output - - imgs + - notebooks - neptune.log - offline_job.log - .git - .github - .idea - .ipynb_checkpoints - - Untitled.ipynb parameters: # Data train_filepath: YOUR/PATH/TO/application_train.csv test_filepath: YOUR/PATH/TO/application_test.csv - bureau_balance_filepath: YOUR/PATH/TO/bureau_balance_filepath.csv + bureau_balance_filepath: YOUR/PATH/TO/bureau_balance.csv bureau_filepath: YOUR/PATH/TO/bureau.csv credit_card_balance_filepath: YOUR/PATH/TO/credit_card_balance.csv installments_payments_filepath: YOUR/PATH/TO/installments_payments.csv @@ -33,10 +32,12 @@ parameters: # Kaggle kaggle_api: 0 - kaggle_message: 'solution-2' + kaggle_message: 'solution-3' # Data preparation + n_cv_splits: 5 validation_size: 0.2 + stratified_cv: True shuffle: 1 # Execution @@ -45,7 +46,8 @@ parameters: verbose: 1 # Preprocessing - fillna_value: -1 + fill_missing: False + fill_value: None # Light GBM lgbm_random_search_runs: 50 @@ -58,7 +60,7 @@ parameters: lgbm__learning_rate: '[0.0005, 0.1, "log-uniform"]' lgbm__num_leaves: '[20, 50]' lgbm__max_depth: '[7, 30]' - lgbm__min_child_samples: '[20, 45]' + lgbm__min_child_samples: '[20, 50]' lgbm__max_bin: '[180, 500]' # at most 255 for device=gpu lgbm__subsample: '[0.8, 0.9, 0.99, 0.6, 0.7, "list"]' lgbm__subsample_freq: 0 @@ -117,3 +119,6 
@@ parameters: svc__probability: True svc__tol: '[0.00001, 0.01, "log-uniform"]' svc__max_iter: '[-1, 100, 1000, 10000, 50000, "list"]' + +# Postprocessing + aggregation_method: rank_mean \ No newline at end of file diff --git a/notebooks/eda-application.ipynb b/notebooks/eda-application.ipynb new file mode 100644 index 0000000..85e57ea --- /dev/null +++ b/notebooks/eda-application.ipynb @@ -0,0 +1,337 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "from tqdm import tqdm_notebook as tqdm\n", + "from sklearn.externals import joblib\n", + "%matplotlib inline\n", + "import seaborn as sns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X = pd.read_csv('/mnt/ml-team/minerva/open-solutions/home-credit/files/unzipped_data/application_train.csv')\n", + "X.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Preprocessing\n", + "## Solution 3" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "[Martin Kotek (Competition Host): \"Value 365243 denotes infinity in DAYS variables in the datasets, therefore you can consider them NA values. Also XNA/XAP denote NA values.\"](https://www.kaggle.com/c/home-credit-default-risk/discussion/57247)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X['CODE_GENDER'].unique()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "X.loc[X['DAYS_EMPLOYED'] > 0]['DAYS_EMPLOYED'].unique()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sum(X['ORGANIZATION_TYPE'] == 'XNA')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X['CODE_GENDER'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X['CODE_GENDER'].replace('XNA',np.nan, inplace=True)\n", + "X['CODE_GENDER'].value_counts()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Feature Engineering\n", + "## Solution 3\n", + "### Hand crafted features" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X['annuity_income_percentage'] = X['AMT_ANNUITY'] / X['AMT_INCOME_TOTAL']\n", + "X['car_to_birth_ratio'] = X['OWN_CAR_AGE'] / X['DAYS_BIRTH']\n", + "X['car_to_employ_ratio'] = X['OWN_CAR_AGE'] / X['DAYS_EMPLOYED']\n", + "X['children_ratio'] = X['CNT_CHILDREN'] / X['CNT_FAM_MEMBERS']\n", + "X['credit_to_annuity_ratio'] = X['AMT_CREDIT'] / X['AMT_ANNUITY']\n", + "X['credit_to_goods_ratio'] = X['AMT_CREDIT'] / X['AMT_GOODS_PRICE']\n", + "X['credit_to_income_ratio'] = X['AMT_CREDIT'] / X['AMT_INCOME_TOTAL']\n", + "X['days_employed_percentage'] = X['DAYS_EMPLOYED'] / X['DAYS_BIRTH']\n", + "X['income_credit_percentage'] = X['AMT_INCOME_TOTAL'] / X['AMT_CREDIT']\n", + "X['income_per_child'] = X['AMT_INCOME_TOTAL'] / (1 + X['CNT_CHILDREN'])\n", + "X['income_per_person'] = X['AMT_INCOME_TOTAL'] / X['CNT_FAM_MEMBERS']\n", + "X['payment_rate'] = X['AMT_ANNUITY'] / X['AMT_CREDIT']\n", + "X['phone_to_birth_ratio'] = X['DAYS_LAST_PHONE_CHANGE'] / X['DAYS_BIRTH']\n", + "X['phone_to_employ_ratio'] = X['DAYS_LAST_PHONE_CHANGE'] / 
X['DAYS_EMPLOYED']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# External sources\n", + "X['external_sources_weighted'] = X.EXT_SOURCE_1 * 2 + X.EXT_SOURCE_2 * 3 + X.EXT_SOURCE_3 * 4\n", + "for function_name in ['min', 'max', 'sum', 'mean', 'nanmedian']:\n", + " X['external_sources_{}'.format(function_name)] = eval('np.{}'.format(function_name))(\n", + " X[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']], axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "engineered_numerical_columns = ['annuity_income_percentage',\n", + " 'car_to_birth_ratio',\n", + " 'car_to_employ_ratio',\n", + " 'children_ratio',\n", + " 'credit_to_annuity_ratio',\n", + " 'credit_to_goods_ratio',\n", + " 'credit_to_income_ratio',\n", + " 'days_employed_percentage',\n", + " 'income_credit_percentage',\n", + " 'income_per_child',\n", + " 'income_per_person',\n", + " 'payment_rate',\n", + " 'phone_to_birth_ratio',\n", + " 'phone_to_employ_ratio',\n", + " 'external_sources_weighted',\n", + " 'external_sources_min',\n", + " 'external_sources_max',\n", + " 'external_sources_sum',\n", + " 'external_sources_mean',\n", + " 'external_sources_nanmedian']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X_eng = X[engineered_numerical_columns + ['TARGET']]\n", + "X_eng_corr = abs(X_eng.corr())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X_eng_corr.sort_values('TARGET', ascending=False)['TARGET']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sns.heatmap(X_eng_corr, \n", + " xticklabels=X_eng_corr.columns,\n", + " yticklabels=X_eng_corr.columns)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Aggregation features" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "AGGREGATION_RECIPIES = [\n", + " (['CODE_GENDER', 'NAME_EDUCATION_TYPE'], [('AMT_ANNUITY', 'max'),\n", + " ('AMT_CREDIT', 'max'),\n", + " ('EXT_SOURCE_1', 'mean'),\n", + " ('EXT_SOURCE_2', 'mean'),\n", + " ('OWN_CAR_AGE', 'max'),\n", + " ('OWN_CAR_AGE', 'sum')]),\n", + " (['CODE_GENDER', 'ORGANIZATION_TYPE'], [('AMT_ANNUITY', 'mean'),\n", + " ('AMT_INCOME_TOTAL', 'mean'),\n", + " ('DAYS_REGISTRATION', 'mean'),\n", + " ('EXT_SOURCE_1', 'mean')]),\n", + " (['CODE_GENDER', 'REG_CITY_NOT_WORK_CITY'], [('AMT_ANNUITY', 'mean'),\n", + " ('CNT_CHILDREN', 'mean'),\n", + " ('DAYS_ID_PUBLISH', 'mean')]),\n", + " (['CODE_GENDER', 'NAME_EDUCATION_TYPE', 'OCCUPATION_TYPE', 'REG_CITY_NOT_WORK_CITY'], [('EXT_SOURCE_1', 'mean'),\n", + " ('EXT_SOURCE_2', 'mean')]),\n", + " (['NAME_EDUCATION_TYPE', 'OCCUPATION_TYPE'], [('AMT_CREDIT', 'mean'),\n", + " ('AMT_REQ_CREDIT_BUREAU_YEAR', 'mean'),\n", + " ('APARTMENTS_AVG', 'mean'),\n", + " ('BASEMENTAREA_AVG', 'mean'),\n", + " ('EXT_SOURCE_1', 'mean'),\n", + " ('EXT_SOURCE_2', 'mean'),\n", + " ('EXT_SOURCE_3', 'mean'),\n", + " ('NONLIVINGAREA_AVG', 'mean'),\n", + " ('OWN_CAR_AGE', 'mean'),\n", + " ('YEARS_BUILD_AVG', 'mean')]),\n", + " (['NAME_EDUCATION_TYPE', 'OCCUPATION_TYPE', 'REG_CITY_NOT_WORK_CITY'], [('ELEVATORS_AVG', 'mean'),\n", + " ('EXT_SOURCE_1', 'mean')]),\n", + " (['OCCUPATION_TYPE'], [('AMT_ANNUITY', 'mean'),\n", + " ('CNT_CHILDREN', 'mean'),\n", + " ('CNT_FAM_MEMBERS', 'mean'),\n", + " 
('DAYS_BIRTH', 'mean'),\n", + " ('DAYS_EMPLOYED', 'mean'),\n", + " ('DAYS_ID_PUBLISH', 'mean'),\n", + " ('DAYS_REGISTRATION', 'mean'),\n", + " ('EXT_SOURCE_1', 'mean'),\n", + " ('EXT_SOURCE_2', 'mean'),\n", + " ('EXT_SOURCE_3', 'mean')]),\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "groupby_aggregate_names = []\n", + "for groupby_cols, specs in tqdm(AGGREGATION_RECIPIES):\n", + " group_object = X.groupby(groupby_cols)\n", + " for select, agg in tqdm(specs):\n", + " groupby_aggregate_name = '{}_{}_{}'.format('_'.join(groupby_cols), agg, select)\n", + " X = X.merge(group_object[select]\n", + " .agg(agg)\n", + " .reset_index()\n", + " .rename(index=str,\n", + " columns={select: groupby_aggregate_name})\n", + " [groupby_cols + [groupby_aggregate_name]],\n", + " on=groupby_cols,\n", + " how='left')\n", + " groupby_aggregate_names.append(groupby_aggregate_name)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X_agg = X[groupby_aggregate_names + ['TARGET']]\n", + "X_agg_corr = abs(X_agg.corr())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X_agg_corr.sort_values('TARGET', ascending=False)['TARGET']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sns.heatmap(X_agg_corr, \n", + " xticklabels=X_agg_corr.columns,\n", + " yticklabels=X_agg_corr.columns)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Solution 4 TODO\n", + "### Hand crafted features\n", + "* Explore other ext_sources features\n", + "* Explore unemployed feature" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/eda-bureau.ipynb b/notebooks/eda-bureau.ipynb new file mode 100644 index 0000000..38b6aca --- /dev/null +++ b/notebooks/eda-bureau.ipynb @@ -0,0 +1,459 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import pandas as pd\n", + "from tqdm import tqdm_notebook as tqdm\n", + "from sklearn.externals import joblib\n", + "%matplotlib inline\n", + "import seaborn as sns\n", + "\n", + "DIR = '/mnt/ml-team/minerva/open-solutions/home-credit'\n", + "description = pd.read_csv(os.path.join(DIR,'data/HomeCredit_columns_description.csv'),encoding = 'latin1')\n", + "application = pd.read_csv(os.path.join(DIR, 'files/unzipped_data/application_train.csv'))\n", + "bureau = pd.read_csv(os.path.join(DIR, 'files/unzipped_data/bureau.csv'))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "bureau.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Preprocessing\n", + "## Solution 3" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "(bureau['AMT_CREDIT_SUM'] == 
0).sum()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This shows that imputing with nan with 0 is probably a bad idea" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Feature Engineering\n", + "## Solution 3\n", + "### Hand crafted features" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "bureau[bureau['SK_ID_CURR']==215354]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### First build helper columns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "bureau['bureau_credit_active_binary'] = (bureau['CREDIT_ACTIVE'] != 'Closed').astype(int)\n", + "bureau['bureau_credit_enddate_binary'] = (bureau['DAYS_CREDIT_ENDDATE'] > 0).astype(int)\n", + "\n", + "groupby_SK_ID_CURR = bureau.groupby(by=['SK_ID_CURR'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "description[description['Row'] == 'DAYS_CREDIT'].Description.tolist()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "features = pd.DataFrame({'SK_ID_CURR':bureau['SK_ID_CURR'].unique()})\n", + "features.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "group_object = groupby_SK_ID_CURR['DAYS_CREDIT'].agg('count').reset_index()\n", + "group_object.rename(index=str, columns={'DAYS_CREDIT': 'bureau_number_of_past_loans'},inplace=True)\n", + "\n", + "features = features.merge(group_object, on=['SK_ID_CURR'], how='left')\n", + "features.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "group_object = groupby_SK_ID_CURR['CREDIT_TYPE'].agg('nunique').reset_index()\n", + "group_object.rename(index=str, columns={'CREDIT_TYPE': 'bureau_number_of_loan_types'},inplace=True)\n", + "\n", + "features = features.merge(group_object, on=['SK_ID_CURR'], how='left')\n", + "features.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "features['bureau_average_of_past_loans_per_type'] = \\\n", + " features['bureau_number_of_past_loans'] / features['bureau_number_of_loan_types']\n", + " \n", + "features.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "group_object = groupby_SK_ID_CURR['bureau_credit_active_binary'].agg('mean').reset_index()\n", + "\n", + "features = features.merge(group_object, on=['SK_ID_CURR'], how='left')\n", + "features.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "group_object = groupby_SK_ID_CURR['AMT_CREDIT_SUM_DEBT'].agg('sum').reset_index()\n", + "group_object.rename(index=str, columns={'AMT_CREDIT_SUM_DEBT': 'bureau_total_customer_debt'},inplace=True)\n", + "\n", + "features = features.merge(group_object, on=['SK_ID_CURR'], how='left')\n", + "features.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "group_object = groupby_SK_ID_CURR['AMT_CREDIT_SUM'].agg('sum').reset_index()\n", + "group_object.rename(index=str, columns={'AMT_CREDIT_SUM': 'bureau_total_customer_credit'},inplace=True)\n", + "\n", + "features = features.merge(group_object, on=['SK_ID_CURR'], 
how='left')\n", + "features.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "features['bureau_debt_credit_ratio'] = \\\n", + " features['bureau_total_customer_debt'] / features['bureau_total_customer_credit']\n", + " \n", + "features.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "group_object = groupby_SK_ID_CURR['AMT_CREDIT_SUM_OVERDUE'].agg('sum').reset_index()\n", + "group_object.rename(index=str, columns={'AMT_CREDIT_SUM_OVERDUE': 'bureau_total_customer_overdue'},inplace=True)\n", + "\n", + "features = features.merge(group_object, on=['SK_ID_CURR'], how='left')\n", + "features.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "features['bureau_overdue_debt_ratio'] = \\\n", + " features['bureau_total_customer_overdue'] / features['bureau_total_customer_debt']\n", + " \n", + "features.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "group_object = groupby_SK_ID_CURR['CNT_CREDIT_PROLONG'].agg('sum').reset_index()\n", + "group_object.rename(index=str, columns={'CNT_CREDIT_PROLONG': 'bureau_average_creditdays_prolonged'},inplace=True)\n", + "\n", + "features = features.merge(group_object, on=['SK_ID_CURR'], how='left')\n", + "features.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "group_object = groupby_SK_ID_CURR['bureau_credit_enddate_binary'].agg('mean').reset_index()\n", + "group_object.rename(index=str, columns={'bureau_credit_enddate_binary': 'bureau_credit_enddate_percentage'},inplace=True)\n", + "\n", + "features = features.merge(group_object, on=['SK_ID_CURR'], how='left')\n", + "features.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "bureau_ONE = features[features['SK_ID_CURR']==215354]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "bureau_ONE" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "application = application.merge(features,\n", + " left_on=['SK_ID_CURR'],\n", + " right_on=['SK_ID_CURR'],\n", + " how='left',\n", + " validate='one_to_one')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "engineered_numerical_columns = list(features.columns)\n", + "engineered_numerical_columns.remove('SK_ID_CURR')\n", + "bureau_eng = application[engineered_numerical_columns + ['TARGET']]\n", + "bureau_eng_corr = abs(bureau_eng.corr())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "bureau_eng_corr.sort_values('TARGET', ascending=False)['TARGET']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sns.heatmap(bureau_eng_corr, \n", + " xticklabels=bureau_eng_corr.columns,\n", + " yticklabels=bureau_eng_corr.columns)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Aggregations" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "BUREAU_AGGREGATION_RECIPIES = [('CREDIT_TYPE', 'count'),\n", + " ('CREDIT_ACTIVE', 'size')\n", + " ]\n", + 
"for agg in ['mean', 'min', 'max', 'sum', 'var']:\n", + " for select in ['AMT_ANNUITY',\n", + " 'AMT_CREDIT_SUM',\n", + " 'AMT_CREDIT_SUM_DEBT',\n", + " 'AMT_CREDIT_SUM_LIMIT',\n", + " 'AMT_CREDIT_SUM_OVERDUE',\n", + " 'AMT_CREDIT_MAX_OVERDUE',\n", + " 'CNT_CREDIT_PROLONG',\n", + " 'CREDIT_DAY_OVERDUE',\n", + " 'DAYS_CREDIT',\n", + " 'DAYS_CREDIT_ENDDATE',\n", + " 'DAYS_CREDIT_UPDATE'\n", + " ]:\n", + " BUREAU_AGGREGATION_RECIPIES.append((select, agg))\n", + "BUREAU_AGGREGATION_RECIPIES = [(['SK_ID_CURR'], BUREAU_AGGREGATION_RECIPIES)]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "groupby_aggregate_names = []\n", + "for groupby_cols, specs in tqdm(BUREAU_AGGREGATION_RECIPIES):\n", + " group_object = bureau.groupby(groupby_cols)\n", + " for select, agg in tqdm(specs):\n", + " groupby_aggregate_name = '{}_{}_{}'.format('_'.join(groupby_cols), agg, select)\n", + " application = application.merge(group_object[select]\n", + " .agg(agg)\n", + " .reset_index()\n", + " .rename(index=str,\n", + " columns={select: groupby_aggregate_name})\n", + " [groupby_cols + [groupby_aggregate_name]],\n", + " on=groupby_cols,\n", + " how='left')\n", + " groupby_aggregate_names.append(groupby_aggregate_name)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "application.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "application_agg = application[groupby_aggregate_names + ['TARGET']]\n", + "application_agg_corr = abs(application_agg.corr())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "application_agg_corr.sort_values('TARGET', ascending=False)['TARGET']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Solution 4\n", + "## Hand Crafted Features" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# group = bureau[bureau['bureau_credit_enddate_binary'] == 1].groupby(\n", + "# by=['SK_ID_CURR']).apply(\n", + "# lambda x: x.sort_values(['DAYS_CREDIT_ENDDATE'], ascending=True)).reset_index(drop=True)\n", + "# group['bureau_days_enddate_diff'] = group.groupby(by=['SK_ID_CURR'])['DAYS_CREDIT_ENDDATE'].diff()\n", + "# group['bureau_days_enddate_diff'] = group['bureau_days_enddate_diff'].fillna(0).astype('uint32')\n", + "\n", + "# bureau = bureau.merge(group[['bureau_days_enddate_diff', 'SK_ID_BUREAU']], on=['SK_ID_BUREAU'], how='left')\n", + "# bureau['bureau_average_enddate_future'] = bureau.groupby(\n", + "# by=['SK_ID_CURR'])['bureau_days_enddate_diff'].agg('mean').reset_index()['bureau_days_enddate_diff']\n", + "\n", + "# bureau['bureau_days_credit_diff'] = bureau.groupby(\n", + "# by=['SK_ID_CURR']).apply(\n", + "# lambda x: x.sort_values(['DAYS_CREDIT'], ascending=False)).reset_index(drop=True)['DAYS_CREDIT']\n", + "# bureau['bureau_days_credit_diff'] *= -1\n", + "# bureau['bureau_days_credit_diff'] = bureau.groupby(by=['SK_ID_CURR'])['bureau_days_credit_diff'].diff()\n", + "# bureau['bureau_days_credit_diff'] = bureau['bureau_days_credit_diff'].fillna(0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "cpu py3", + "language": "python", + "name": "cpu_py3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", 
+ "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/eda-credit_card.ipynb b/notebooks/eda-credit_card.ipynb new file mode 100644 index 0000000..2afb2b8 --- /dev/null +++ b/notebooks/eda-credit_card.ipynb @@ -0,0 +1,451 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import pandas as pd\n", + "from tqdm import tqdm_notebook as tqdm\n", + "from sklearn.externals import joblib\n", + "%matplotlib inline\n", + "import seaborn as sns\n", + "\n", + "DIR = '/mnt/ml-team/minerva/open-solutions/home-credit'\n", + "description = pd.read_csv(os.path.join(DIR,'data/HomeCredit_columns_description.csv'),encoding = 'latin1')\n", + "application = pd.read_csv(os.path.join(DIR, 'files/unzipped_data/application_train.csv'))\n", + "credit_card = pd.read_csv(os.path.join(DIR, 'files/unzipped_data/credit_card_balance.csv'))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "credit_card.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Preprocessing\n", + "## Solution 3" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Feature Engineering\n", + "## Solution 3\n", + "### Hand crafted features" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### First build helper columns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "credit_card['number_of_instalments'] = credit_card.groupby(\n", + " by=['SK_ID_CURR', 'SK_ID_PREV'])['CNT_INSTALMENT_MATURE_CUM'].agg('max').reset_index()[\n", + " 'CNT_INSTALMENT_MATURE_CUM']\n", + "\n", + "credit_card['credit_card_max_loading_of_credit_limit'] = credit_card.groupby(\n", + " by=['SK_ID_CURR', 'SK_ID_PREV', 'AMT_CREDIT_LIMIT_ACTUAL']).apply(\n", + " lambda x: x.AMT_BALANCE.max() / x.AMT_CREDIT_LIMIT_ACTUAL.max()).reset_index()[0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "credit_card.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "description[description['Row'] == 'DAYS_CREDIT'].Description.tolist()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "features = pd.DataFrame({'SK_ID_CURR':credit_card['SK_ID_CURR'].unique()})\n", + "features.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "group_object = credit_card.groupby(by=['SK_ID_CURR'])['SK_ID_PREV'].agg('nunique').reset_index()\n", + "group_object.rename(index=str, columns={'SK_ID_PREV': 'credit_card_number_of_loans'},inplace=True)\n", + "\n", + "features = features.merge(group_object, on=['SK_ID_CURR'], how='left')\n", + "features.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "features['credit_card_number_of_loans'].value_counts()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Note\n", + "It is worth exploring `credit_card_number_of_loans>1` binary version of this variable" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "group_object= credit_card.groupby(by=['SK_ID_CURR'])['number_of_instalments'].sum().reset_index()\n", + "group_object.rename(index=str, columns={'number_of_instalments': 'credit_card_total_instalments'},inplace=True)\n", + "\n", + "features = features.merge(group_object, on=['SK_ID_CURR'], how='left')\n", + "features.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "features['credit_card_total_instalments'].value_counts()[:10]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sns.distplot(features['credit_card_total_instalments'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Note\n", + "* Maybe adding a is zero variabl maxes sens" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "features['credit_card_installments_per_loan'] = (\n", + " features['credit_card_total_instalments'] / features['credit_card_number_of_loans'])\n", + " \n", + "features.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "group_object = credit_card.groupby(by=['SK_ID_CURR'])['credit_card_max_loading_of_credit_limit'].agg('mean').reset_index()\n", + "group_object.rename(index=str, columns={'credit_card_max_loading_of_credit_limit': 'credit_card_avg_loading_of_credit_limit'},inplace=True)\n", + "\n", + "features = features.merge(group_object, on=['SK_ID_CURR'], how='left')\n", + "features.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "group_object = credit_card.groupby(\n", + " by=['SK_ID_CURR'])['SK_DPD'].agg('mean').reset_index()\n", + "group_object.rename(index=str, columns={'SK_DPD': 'credit_card_average_of_days_past_due'},inplace=True)\n", + "\n", + "features = features.merge(group_object, on=['SK_ID_CURR'], how='left')\n", + "features.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "group_object = credit_card.groupby(by=['SK_ID_CURR'])['AMT_DRAWINGS_ATM_CURRENT'].agg('sum').reset_index()\n", + "group_object.rename(index=str, columns={'AMT_DRAWINGS_ATM_CURRENT': 'credit_card_drawings_atm'},inplace=True)\n", + "\n", + "features = features.merge(group_object, on=['SK_ID_CURR'], how='left')\n", + "features.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "group_object = credit_card.groupby(by=['SK_ID_CURR'])['AMT_DRAWINGS_CURRENT'].agg('sum').reset_index()\n", + "group_object.rename(index=str, columns={'AMT_DRAWINGS_CURRENT': 'credit_card_drawings_total'},inplace=True)\n", + "\n", + "features = features.merge(group_object, on=['SK_ID_CURR'], how='left')\n", + "features.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "features['credit_card_cash_card_ratio'] = features['credit_card_drawings_atm'] / features['credit_card_drawings_total']\n", + "\n", + "features.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "credit_ONE = features[features['SK_ID_CURR']==215354]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + 
"credit_ONE" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "application = application.merge(features,\n", + " left_on=['SK_ID_CURR'],\n", + " right_on=['SK_ID_CURR'],\n", + " how='left',\n", + " validate='one_to_one')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "engineered_numerical_columns = list(features.columns)\n", + "engineered_numerical_columns.remove('SK_ID_CURR')\n", + "credit_eng = application[engineered_numerical_columns + ['TARGET']]\n", + "credit_eng_corr = abs(credit_eng.corr())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "credit_eng_corr.sort_values('TARGET', ascending=False)['TARGET']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sns.heatmap(credit_eng_corr, \n", + " xticklabels=credit_eng_corr.columns,\n", + " yticklabels=credit_eng_corr.columns)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Aggregations" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "CREDIT_CARD_BALANCE_AGGREGATION_RECIPIES = []\n", + "for agg in ['mean', 'min', 'max', 'sum', 'var']:\n", + " for select in ['AMT_BALANCE',\n", + " 'AMT_CREDIT_LIMIT_ACTUAL',\n", + " 'AMT_DRAWINGS_ATM_CURRENT',\n", + " 'AMT_DRAWINGS_CURRENT',\n", + " 'AMT_DRAWINGS_OTHER_CURRENT',\n", + " 'AMT_DRAWINGS_POS_CURRENT',\n", + " 'AMT_PAYMENT_CURRENT',\n", + " 'CNT_DRAWINGS_ATM_CURRENT',\n", + " 'CNT_DRAWINGS_CURRENT',\n", + " 'CNT_DRAWINGS_OTHER_CURRENT',\n", + " 'CNT_INSTALMENT_MATURE_CUM',\n", + " 'MONTHS_BALANCE',\n", + " 'SK_DPD',\n", + " 'SK_DPD_DEF'\n", + " ]:\n", + " CREDIT_CARD_BALANCE_AGGREGATION_RECIPIES.append((select, agg))\n", + "CREDIT_CARD_BALANCE_AGGREGATION_RECIPIES = [(['SK_ID_CURR'], CREDIT_CARD_BALANCE_AGGREGATION_RECIPIES)]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "groupby_aggregate_names = []\n", + "for groupby_cols, specs in tqdm(CREDIT_CARD_BALANCE_AGGREGATION_RECIPIES):\n", + " group_object = credit.groupby(groupby_cols)\n", + " for select, agg in tqdm(specs):\n", + " groupby_aggregate_name = '{}_{}_{}'.format('_'.join(groupby_cols), agg, select)\n", + " application = application.merge(group_object[select]\n", + " .agg(agg)\n", + " .reset_index()\n", + " .rename(index=str,\n", + " columns={select: groupby_aggregate_name})\n", + " [groupby_cols + [groupby_aggregate_name]],\n", + " on=groupby_cols,\n", + " how='left')\n", + " groupby_aggregate_names.append(groupby_aggregate_name)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "application.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "application_agg = application[groupby_aggregate_names + ['TARGET']]\n", + "application_agg_corr = abs(application_agg.corr())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "application_agg_corr.sort_values('TARGET', ascending=False)['TARGET']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Solution 4\n", + "## Hand Crafted Features" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + 
"# group = bureau[bureau['bureau_credit_enddate_binary'] == 1].groupby(\n", + "# by=['SK_ID_CURR']).apply(\n", + "# lambda x: x.sort_values(['DAYS_CREDIT_ENDDATE'], ascending=True)).reset_index(drop=True)\n", + "# group['bureau_days_enddate_diff'] = group.groupby(by=['SK_ID_CURR'])['DAYS_CREDIT_ENDDATE'].diff()\n", + "# group['bureau_days_enddate_diff'] = group['bureau_days_enddate_diff'].fillna(0).astype('uint32')\n", + "\n", + "# bureau = bureau.merge(group[['bureau_days_enddate_diff', 'SK_ID_BUREAU']], on=['SK_ID_BUREAU'], how='left')\n", + "# bureau['bureau_average_enddate_future'] = bureau.groupby(\n", + "# by=['SK_ID_CURR'])['bureau_days_enddate_diff'].agg('mean').reset_index()['bureau_days_enddate_diff']\n", + "\n", + "# bureau['bureau_days_credit_diff'] = bureau.groupby(\n", + "# by=['SK_ID_CURR']).apply(\n", + "# lambda x: x.sort_values(['DAYS_CREDIT'], ascending=False)).reset_index(drop=True)['DAYS_CREDIT']\n", + "# bureau['bureau_days_credit_diff'] *= -1\n", + "# bureau['bureau_days_credit_diff'] = bureau.groupby(by=['SK_ID_CURR'])['bureau_days_credit_diff'].diff()\n", + "# bureau['bureau_days_credit_diff'] = bureau['bureau_days_credit_diff'].fillna(0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "cpu py3", + "language": "python", + "name": "cpu_py3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/eda-external_sources.ipynb b/notebooks/eda-external_sources.ipynb new file mode 100644 index 0000000..c85d613 --- /dev/null +++ b/notebooks/eda-external_sources.ipynb @@ -0,0 +1,228 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "from tqdm import tqdm_notebook as tqdm\n", + "from sklearn.externals import joblib\n", + "%matplotlib inline\n", + "import seaborn as sns\n", + "\n", + "from sklearn import tree\n", + "from sklearn.model_selection import train_test_split" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X = pd.read_csv('/mnt/ml-team/minerva/open-solutions/home-credit/files/unzipped_data/application_train.csv')\n", + "X.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X_ext = X[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'TARGET']]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# X_ext = X_ext.fillna(0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + " for function_name in ['nanmin', 'nanmax', 'sum', 'mean', 'var', 'median', 'std', 'nanmedian', 'nanmean', 'min', 'max']:\n", + " X_ext['external_sources_{}'.format(function_name)] = eval('np.{}'.format(function_name))(\n", + " X_ext[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']], axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X_ext.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + 
"source": [ + "X_ext['EXT_SRC_weighted3'] = (X.EXT_SOURCE_1*2+X.EXT_SOURCE_2*3+X.EXT_SOURCE_3*4)/9\n", + "X_ext['EXT_SRC_weighted2'] = (X.EXT_SOURCE_1*3+X.EXT_SOURCE_2*4+X.EXT_SOURCE_3*2)/9\n", + "X_ext['EXT_SRC_weighted1'] = (X.EXT_SOURCE_1*4+X.EXT_SOURCE_2*2+X.EXT_SOURCE_3*3)/9" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X_ext.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X_ext_corr = abs(X_ext.corr())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X_ext_corr.sort_values('TARGET', ascending=False)['TARGET']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sns.heatmap(X_ext_corr, \n", + " xticklabels=X_ext_corr.columns,\n", + " yticklabels=X_ext_corr.columns)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Tree" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X_train, X_test = train_test_split(X_ext)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "Y_train = X_train['TARGET']\n", + "Y_test = X_test['TARGET']\n", + "\n", + "X_train = X_train.drop(columns='TARGET')\n", + "X_test = X_test.drop(columns='TARGET')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X_train = X_train.fillna(0)\n", + "X_test = X_test.fillna(0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "clf = tree.DecisionTreeClassifier()\n", + "clf.fit(X_train, Y_train)\n", + "\n", + "print(\"R^2 on the train set:\")\n", + "print(clf.score(X_train, Y_train))\n", + "\n", + "print(\"\\nR^2 on the test set:\")\n", + "print(clf.score(X_test, Y_test))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X_train.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "feature_importances = pd.Series(clf.feature_importances_, index=X_train.columns.values)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "feature_importances.sort_values(ascending=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/eda-installments.ipynb b/notebooks/eda-installments.ipynb new file mode 100644 index 0000000..3b27d73 --- /dev/null +++ b/notebooks/eda-installments.ipynb @@ -0,0 +1,152 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import pandas as pd\n", + "from tqdm import tqdm_notebook as tqdm\n", + "from sklearn.externals import joblib\n", + "%matplotlib inline\n", + "import seaborn as sns\n", 
+ "\n", + "DIR = '/mnt/ml-team/minerva/open-solutions/home-credit'\n", + "description = pd.read_csv(os.path.join(DIR,'data/HomeCredit_columns_description.csv'),encoding = 'latin1')\n", + "application = pd.read_csv(os.path.join(DIR, 'files/unzipped_data/application_train.csv'))\n", + "installments = pd.read_csv(os.path.join(DIR, 'files/unzipped_data/installments_payments.csv'))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "installments.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Preprocessing\n", + "## Solution 3" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Feature Engineering\n", + "## Solution 3" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Aggregations" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "INSTALLMENTS_PAYMENTS_AGGREGATION_RECIPIES = []\n", + "for agg in ['mean', 'min', 'max', 'sum', 'var']:\n", + " for select in ['AMT_INSTALMENT',\n", + " 'AMT_PAYMENT',\n", + " 'DAYS_ENTRY_PAYMENT',\n", + " 'DAYS_INSTALMENT',\n", + " 'NUM_INSTALMENT_NUMBER',\n", + " 'NUM_INSTALMENT_VERSION'\n", + " ]:\n", + " INSTALLMENTS_PAYMENTS_AGGREGATION_RECIPIES.append((select, agg))\n", + "INSTALLMENTS_PAYMENTS_AGGREGATION_RECIPIES = [(['SK_ID_CURR'], INSTALLMENTS_PAYMENTS_AGGREGATION_RECIPIES)]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "groupby_aggregate_names = []\n", + "for groupby_cols, specs in tqdm(INSTALLMENTS_PAYMENTS_AGGREGATION_RECIPIES):\n", + " group_object = installments.groupby(groupby_cols)\n", + " for select, agg in tqdm(specs):\n", + " groupby_aggregate_name = '{}_{}_{}'.format('_'.join(groupby_cols), agg, select)\n", + " application = application.merge(group_object[select]\n", + " .agg(agg)\n", + " .reset_index()\n", + " .rename(index=str,\n", + " columns={select: groupby_aggregate_name})\n", + " [groupby_cols + [groupby_aggregate_name]],\n", + " on=groupby_cols,\n", + " how='left')\n", + " groupby_aggregate_names.append(groupby_aggregate_name)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "application.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "application_agg = application[groupby_aggregate_names + ['TARGET']]\n", + "application_agg_corr = abs(application_agg.corr())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "application_agg_corr.sort_values('TARGET', ascending=False)['TARGET']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "cpu py3", + "language": "python", + "name": "cpu_py3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/eda-pos_cash_balance.ipynb b/notebooks/eda-pos_cash_balance.ipynb new file mode 100644 index 0000000..94583bc --- /dev/null +++ b/notebooks/eda-pos_cash_balance.ipynb @@ -0,0 +1,149 @@ +{ + "cells": [ + { + "cell_type": "code", + 
"execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import pandas as pd\n", + "from tqdm import tqdm_notebook as tqdm\n", + "from sklearn.externals import joblib\n", + "%matplotlib inline\n", + "import seaborn as sns\n", + "\n", + "DIR = '/mnt/ml-team/minerva/open-solutions/home-credit'\n", + "description = pd.read_csv(os.path.join(DIR,'data/HomeCredit_columns_description.csv'),encoding = 'latin1')\n", + "application = pd.read_csv(os.path.join(DIR, 'files/unzipped_data/application_train.csv'))\n", + "pos_cash_balance = pd.read_csv(os.path.join(DIR, 'files/unzipped_data/POS_CASH_balance.csv'))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pos_cash_balance.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Preprocessing\n", + "## Solution 3" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Feature Engineering\n", + "## Solution 3" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Aggregations" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "POS_CASH_BALANCE_AGGREGATION_RECIPIES = []\n", + "for agg in ['mean', 'min', 'max', 'sum', 'var']:\n", + " for select in ['MONTHS_BALANCE',\n", + " 'SK_DPD',\n", + " 'SK_DPD_DEF'\n", + " ]:\n", + " POS_CASH_BALANCE_AGGREGATION_RECIPIES.append((select, agg))\n", + "POS_CASH_BALANCE_AGGREGATION_RECIPIES = [(['SK_ID_CURR'], POS_CASH_BALANCE_AGGREGATION_RECIPIES)]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "groupby_aggregate_names = []\n", + "for groupby_cols, specs in tqdm(POS_CASH_BALANCE_AGGREGATION_RECIPIES):\n", + " group_object = pos_cash_balance.groupby(groupby_cols)\n", + " for select, agg in tqdm(specs):\n", + " groupby_aggregate_name = '{}_{}_{}'.format('_'.join(groupby_cols), agg, select)\n", + " application = application.merge(group_object[select]\n", + " .agg(agg)\n", + " .reset_index()\n", + " .rename(index=str,\n", + " columns={select: groupby_aggregate_name})\n", + " [groupby_cols + [groupby_aggregate_name]],\n", + " on=groupby_cols,\n", + " how='left')\n", + " groupby_aggregate_names.append(groupby_aggregate_name)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "application.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "application_agg = application[groupby_aggregate_names + ['TARGET']]\n", + "application_agg_corr = abs(application_agg.corr())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "application_agg_corr.sort_values('TARGET', ascending=False)['TARGET']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "cpu py3", + "language": "python", + "name": "cpu_py3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/eda-previous_application.ipynb b/notebooks/eda-previous_application.ipynb new file mode 100644 
index 0000000..be6126b --- /dev/null +++ b/notebooks/eda-previous_application.ipynb @@ -0,0 +1,155 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import pandas as pd\n", + "from tqdm import tqdm_notebook as tqdm\n", + "from sklearn.externals import joblib\n", + "%matplotlib inline\n", + "import seaborn as sns\n", + "\n", + "DIR = '/mnt/ml-team/minerva/open-solutions/home-credit'\n", + "description = pd.read_csv(os.path.join(DIR,'data/HomeCredit_columns_description.csv'),encoding = 'latin1')\n", + "application = pd.read_csv(os.path.join(DIR, 'files/unzipped_data/application_train.csv'))\n", + "previous_application = pd.read_csv(os.path.join(DIR, 'files/unzipped_data/previous_application.csv'))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "previous_application.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Preprocessing\n", + "## Solution 3" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Feature Engineering\n", + "## Solution 3" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Aggregations" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "PREVIOUS_APPLICATION_AGGREGATION_RECIPIES = []\n", + "for agg in ['mean', 'min', 'max', 'sum', 'var']:\n", + " for select in ['AMT_ANNUITY',\n", + " 'AMT_APPLICATION',\n", + " 'AMT_CREDIT',\n", + " 'AMT_DOWN_PAYMENT',\n", + " 'AMT_GOODS_PRICE',\n", + " 'CNT_PAYMENT',\n", + " 'DAYS_DECISION',\n", + " 'HOUR_APPR_PROCESS_START',\n", + " 'RATE_DOWN_PAYMENT'\n", + " ]:\n", + " PREVIOUS_APPLICATION_AGGREGATION_RECIPIES.append((select, agg))\n", + "PREVIOUS_APPLICATION_AGGREGATION_RECIPIES = [(['SK_ID_CURR'], PREVIOUS_APPLICATION_AGGREGATION_RECIPIES)]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "groupby_aggregate_names = []\n", + "for groupby_cols, specs in tqdm(PREVIOUS_APPLICATION_AGGREGATION_RECIPIES):\n", + " group_object = previous_application.groupby(groupby_cols)\n", + " for select, agg in tqdm(specs):\n", + " groupby_aggregate_name = '{}_{}_{}'.format('_'.join(groupby_cols), agg, select)\n", + " application = application.merge(group_object[select]\n", + " .agg(agg)\n", + " .reset_index()\n", + " .rename(index=str,\n", + " columns={select: groupby_aggregate_name})\n", + " [groupby_cols + [groupby_aggregate_name]],\n", + " on=groupby_cols,\n", + " how='left')\n", + " groupby_aggregate_names.append(groupby_aggregate_name)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "application.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "application_agg = application[groupby_aggregate_names + ['TARGET']]\n", + "application_agg_corr = abs(application_agg.corr())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "application_agg_corr.sort_values('TARGET', ascending=False)['TARGET']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "cpu py3", + "language": "python", + "name": "cpu_py3" + }, + "language_info": { + "codemirror_mode": { + "name": 
"ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/model_exploration.ipynb b/notebooks/model_exploration.ipynb new file mode 100644 index 0000000..7ea57c9 --- /dev/null +++ b/notebooks/model_exploration.ipynb @@ -0,0 +1,94 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "%matplotlib inline\n", + "import os\n", + "import sys\n", + "\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import pandas as pd\n", + "import seaborn as sns\n", + "from sklearn.externals import joblib\n", + "import lightgbm as lgb\n", + "\n", + "EXPERIMENT_DIR = '/mnt/ml-team/minerva/open-solutions/home-credit/kuba/experiments'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model_filepath = os.path.join(EXPERIMENT_DIR, 'solution_3_all_new_externals_790', 'transformers','light_gbm_fold_0')\n", + "light_gbm_model = joblib.load(model_filepath)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fig, ax = plt.subplots(1,1,figsize=(16,10))\n", + "lgb.plot_importance(light_gbm_model, max_num_features=20, ax=ax)\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "TREE_INDEX = 201\n", + "digraph = lgb.create_tree_digraph(light_gbm_model, tree_index=TREE_INDEX, show_info=['split_gain'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "digraph" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "cpu py3", + "language": "python", + "name": "cpu_py3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/overview.ipynb b/notebooks/overview.ipynb new file mode 100644 index 0000000..4f762a9 --- /dev/null +++ b/notebooks/overview.ipynb @@ -0,0 +1,63 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "sys.path.append('../')\n", + "\n", + "from src import pipeline_config as cfg\n", + "from src.pipelines import PIPELINES" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "solution_3 = PIPELINES['lightGBM'](config=cfg.SOLUTION_CONFIG, train_mode=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "solution_3" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "cpu py3", + "language": "python", + "name": "cpu_py3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": 
"python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/test_prediction_distributions.ipynb b/notebooks/test_prediction_distributions.ipynb new file mode 100644 index 0000000..2babee4 --- /dev/null +++ b/notebooks/test_prediction_distributions.ipynb @@ -0,0 +1,142 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "from scipy.stats import gmean\n", + "%matplotlib inline\n", + "import seaborn as sns\n", + "\n", + "FILEPATH = '/mnt/ml-team/minerva/open-solutions/home-credit/kuba/experiments/solution_3/lightGBM_out_of_fold_test_predictions.csv'\n", + "test_predictions = pd.read_csv(FILEPATH)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "test_predictions.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "test_predictions[test_predictions['SK_ID_CURR']==100001]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Arithmetic mean" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "test_prediction_ar_mean = test_predictions.groupby('SK_ID_CURR')['lightGBM_prediction'].apply(np.mean).reset_index()\n", + "\n", + "test_prediction_ar_mean.columns = ['SK_ID_CURR','TARGET']\n", + "test_prediction_ar_mean[test_prediction_ar_mean['SK_ID_CURR']==100001]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Geometric mean" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "test_prediction_geom_mean = test_predictions.groupby('SK_ID_CURR')['lightGBM_prediction'].apply(gmean).reset_index()\n", + "\n", + "test_prediction_geom_mean.columns = ['SK_ID_CURR','TARGET']\n", + "test_prediction_geom_mean[test_prediction_ar_mean['SK_ID_CURR']==100001]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Rank Mean" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def calculate_rank(predictions):\n", + " rank = (1 + predictions.rank().values) / (predictions.shape[0] + 1)\n", + " return rank\n", + "\n", + "test_predictions_with_ranks = []\n", + "for fold_id, fold_df in test_predictions.groupby('fold_id'):\n", + " fold_df['lightGBM_rank'] = calculate_rank(fold_df['lightGBM_prediction'])\n", + " test_predictions_with_ranks.append(fold_df)\n", + "test_predictions_with_ranks = pd.concat(test_predictions_with_ranks, axis=0)\n", + "\n", + "test_predictions_with_ranks.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "test_prediction_rank_mean = test_predictions_with_ranks.groupby('SK_ID_CURR')['lightGBM_rank'].apply(np.mean).reset_index()\n", + "\n", + "test_prediction_rank_mean.columns = ['SK_ID_CURR','TARGET']\n", + "test_prediction_rank_mean[test_prediction_ar_mean['SK_ID_CURR']==100001]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "cpu py3", + "language": "python", + "name": "cpu_py3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + 
"file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/pipeline_blocks.py b/pipeline_blocks.py deleted file mode 100644 index c17af89..0000000 --- a/pipeline_blocks.py +++ /dev/null @@ -1,427 +0,0 @@ -from functools import partial - -from sklearn.metrics import roc_auc_score -from steppy.adapter import Adapter, E -from steppy.base import Step, make_transformer -from toolkit.misc import LightGBM - -import feature_extraction as fe -from hyperparameter_tuning import RandomSearchOptimizer, NeptuneMonitor, PersistResults -from models import get_sklearn_classifier, XGBoost -from utils import ToNumpyLabel - - -def classifier_light_gbm(features, config, train_mode, **kwargs): - if train_mode: - features_train, features_valid = features - if config.random_search.light_gbm.n_runs: - transformer = RandomSearchOptimizer(TransformerClass=LightGBM, - params=config.light_gbm, - train_input_keys=[], - valid_input_keys=['X_valid', 'y_valid'], - score_func=roc_auc_score, - maximize=True, - n_runs=config.random_search.light_gbm.n_runs, - callbacks=[ - NeptuneMonitor( - **config.random_search.light_gbm.callbacks.neptune_monitor), - PersistResults( - **config.random_search.light_gbm.callbacks.persist_results)] - ) - else: - transformer = LightGBM(**config.light_gbm) - - light_gbm = Step(name='light_gbm', - transformer=transformer, - input_data=['input'], - input_steps=[features_train, features_valid], - adapter=Adapter({'X': E(features_train.name, 'features'), - 'y': E('input', 'y'), - 'feature_names': E(features_train.name, 'feature_names'), - 'categorical_features': E(features_train.name, 'categorical_features'), - 'X_valid': E(features_valid.name, 'features'), - 'y_valid': E('input', 'y_valid'), - }), - experiment_directory=config.pipeline.experiment_directory, - **kwargs) - else: - light_gbm = Step(name='light_gbm', - transformer=LightGBM(**config.light_gbm), - input_steps=[features], - adapter=Adapter({'X': E(features.name, 'features')}), - experiment_directory=config.pipeline.experiment_directory, - **kwargs) - return light_gbm - - -def classifier_xgb(features, config, train_mode, **kwargs): - if train_mode: - features_train, features_valid = features - if config.random_search.xgboost.n_runs: - transformer = RandomSearchOptimizer(TransformerClass=XGBoost, - params=config.xgboost, - train_input_keys=[], - valid_input_keys=['X_valid', 'y_valid'], - score_func=roc_auc_score, - maximize=True, - n_runs=config.random_search.xgboost.n_runs, - callbacks=[ - NeptuneMonitor( - **config.random_search.xgboost.callbacks.neptune_monitor), - PersistResults( - **config.random_search.xgboost.callbacks.persist_results)] - ) - else: - transformer = XGBoost(**config.xgboost) - - xgboost = Step(name='xgboost', - transformer=transformer, - input_data=['input'], - input_steps=[features_train, features_valid], - adapter=Adapter({'X': E(features_train.name, 'features'), - 'y': E('input', 'y'), - 'feature_names': E(features_train.name, 'feature_names'), - 'X_valid': E(features_valid.name, 'features'), - 'y_valid': E('input', 'y_valid'), - }), - experiment_directory=config.pipeline.experiment_directory, - **kwargs) - else: - xgboost = Step(name='xgboost', - transformer=XGBoost(**config.xgboost), - input_steps=[features], - adapter=Adapter({'X': E(features.name, 'features')}), - experiment_directory=config.pipeline.experiment_directory, - **kwargs) - return 
xgboost - - -def classifier_sklearn(sklearn_features, ClassifierClass, full_config, clf_name, train_mode, normalize, **kwargs): - config, model_params, rs_config = full_config - if train_mode: - if config.random_search.random_forest.n_runs: - transformer = RandomSearchOptimizer( - partial(get_sklearn_classifier, - ClassifierClass=ClassifierClass, - normalize=normalize), - model_params, - train_input_keys=[], - valid_input_keys=['X_valid', 'y_valid'], - score_func=roc_auc_score, - maximize=True, - n_runs=rs_config.n_runs, - callbacks=[NeptuneMonitor(**rs_config.callbacks.neptune_monitor), - PersistResults(**rs_config.callbacks.persist_results)] - ) - else: - transformer = get_sklearn_classifier(ClassifierClass, normalize, **model_params) - - sklearn_clf = Step(name=clf_name, - transformer=transformer, - input_data=['input'], - input_steps=[sklearn_features], - adapter=Adapter({'X': E(sklearn_features.name, 'X'), - 'y': E('input', 'y'), - 'X_valid': E(sklearn_features.name, 'X_valid'), - 'y_valid': E('input', 'y_valid'), - }), - experiment_directory=config.pipeline.experiment_directory, - **kwargs) - else: - sklearn_clf = Step(name=clf_name, - transformer=get_sklearn_classifier(ClassifierClass, normalize, **model_params), - input_steps=[sklearn_features], - adapter=Adapter({'X': E(sklearn_features.name, 'X')}), - experiment_directory=config.pipeline.experiment_directory, - **kwargs) - return sklearn_clf - - -def feature_extraction(config, train_mode, **kwargs): - if train_mode: - feature_by_type_split, feature_by_type_split_valid = _feature_by_type_splits(config, train_mode) - bureau, bureau_valid = _bureau(config, train_mode, **kwargs) - - categorical_encoder, categorical_encoder_valid = _categorical_encoders( - (feature_by_type_split, feature_by_type_split_valid), - config, - train_mode, - **kwargs) - - groupby_aggregation, groupby_aggregation_valid = _groupby_aggregations( - (feature_by_type_split, feature_by_type_split_valid), - config, - train_mode, - **kwargs) - - feature_combiner, feature_combiner_valid = _join_features(numerical_features=[feature_by_type_split, - groupby_aggregation, - bureau], - numerical_features_valid=[feature_by_type_split_valid, - groupby_aggregation_valid, - bureau_valid], - categorical_features=[categorical_encoder], - categorical_features_valid=[ - categorical_encoder_valid], - config=config, - train_mode=train_mode, - **kwargs) - - return feature_combiner, feature_combiner_valid - else: - feature_by_type_split = _feature_by_type_splits(config, train_mode) - bureau = _bureau(config, train_mode, **kwargs) - categorical_encoder = _categorical_encoders(feature_by_type_split, config, train_mode, **kwargs) - groupby_aggregation = _groupby_aggregations(feature_by_type_split, config, train_mode, **kwargs) - feature_combiner = _join_features(numerical_features=[feature_by_type_split, groupby_aggregation, bureau], - numerical_features_valid=[], - categorical_features=[categorical_encoder], - categorical_features_valid=[], - config=config, - train_mode=train_mode, - **kwargs) - - return feature_combiner - - -def preprocessing_fillna(features, config, train_mode, **kwargs): - if train_mode: - features_train, features_valid = features - fillna = Step(name='fillna', - transformer=_fillna(**config.preprocessing), - input_data=['input'], - input_steps=[features_train, features_valid], - adapter=Adapter({'X': E(features_train.name, 'features'), - 'X_valid': E(features_valid.name, 'features'), - }), - experiment_directory=config.pipeline.experiment_directory, - **kwargs - ) - 
else: - fillna = Step(name='fillna', - transformer=_fillna(**config.preprocessing), - input_data=['input'], - input_steps=[features], - adapter=Adapter({'X': E(features.name, 'features')}), - experiment_directory=config.pipeline.experiment_directory, - **kwargs - ) - return fillna - - -def _feature_by_type_splits(config, train_mode): - if train_mode: - feature_by_type_split = Step(name='feature_by_type_split', - transformer=fe.DataFrameByTypeSplitter(**config.dataframe_by_type_splitter), - input_data=['input'], - adapter=Adapter({'X': E('input', 'X')}), - experiment_directory=config.pipeline.experiment_directory) - - feature_by_type_split_valid = Step(name='feature_by_type_split_valid', - transformer=feature_by_type_split, - input_data=['input'], - adapter=Adapter({'X': E('input', 'X_valid')}), - experiment_directory=config.pipeline.experiment_directory) - - return feature_by_type_split, feature_by_type_split_valid - - else: - feature_by_type_split = Step(name='feature_by_type_split', - transformer=fe.DataFrameByTypeSplitter(**config.dataframe_by_type_splitter), - input_data=['input'], - adapter=Adapter({'X': E('input', 'X')}), - experiment_directory=config.pipeline.experiment_directory) - - return feature_by_type_split - - -def _join_features(numerical_features, - numerical_features_valid, - categorical_features, - categorical_features_valid, - config, train_mode, - **kwargs): - if train_mode: - feature_joiner = Step(name='feature_joiner', - transformer=fe.FeatureJoiner(), - input_steps=numerical_features + categorical_features, - adapter=Adapter({ - 'numerical_feature_list': [ - E(feature.name, 'numerical_features') for feature in numerical_features], - 'categorical_feature_list': [ - E(feature.name, 'categorical_features') for feature in categorical_features], - }), - experiment_directory=config.pipeline.experiment_directory, - **kwargs) - - feature_joiner_valid = Step(name='feature_joiner_valid', - transformer=feature_joiner, - input_steps=numerical_features_valid + categorical_features_valid, - adapter=Adapter({ - 'numerical_feature_list': [ - E(feature.name, - 'numerical_features') for feature in numerical_features_valid], - 'categorical_feature_list': [ - E(feature.name, - 'categorical_features') for feature in categorical_features_valid], - }), - experiment_directory=config.pipeline.experiment_directory, - **kwargs) - - return feature_joiner, feature_joiner_valid - - else: - feature_joiner = Step(name='feature_joiner', - transformer=fe.FeatureJoiner(), - input_steps=numerical_features + categorical_features, - adapter=Adapter( - {'numerical_feature_list': - [E(feature.name, 'numerical_features') for feature in numerical_features], - 'categorical_feature_list': - [E(feature.name, 'categorical_features') for feature in categorical_features]} - ), - experiment_directory=config.pipeline.experiment_directory, - **kwargs) - - return feature_joiner - - -def _categorical_encoders(dispatchers, config, train_mode, **kwargs): - if train_mode: - feature_by_type_split, feature_by_type_split_valid = dispatchers - numpy_label, numpy_label_valid = _to_numpy_label(config, **kwargs) - categorical_encoder = Step(name='categorical_encoder', - transformer=fe.CategoricalEncoder(), - input_data=['input'], - input_steps=[feature_by_type_split, numpy_label], - adapter=Adapter({'X': E(feature_by_type_split.name, 'categorical_features'), - 'y': E(numpy_label.name, 'y')} - ), - experiment_directory=config.pipeline.experiment_directory, - **kwargs) - - categorical_encoder_valid = 
Step(name='categorical_encoder_valid', - transformer=categorical_encoder, - input_data=['input'], - input_steps=[feature_by_type_split_valid, numpy_label_valid], - adapter=Adapter( - {'X': E(feature_by_type_split_valid.name, 'categorical_features'), - 'y': E(numpy_label_valid.name, 'y')} - ), - experiment_directory=config.pipeline.experiment_directory, - **kwargs) - - return categorical_encoder, categorical_encoder_valid - else: - feature_by_type_split = dispatchers - categorical_encoder = Step(name='categorical_encoder', - transformer=fe.CategoricalEncoder(), - input_data=['input'], - input_steps=[feature_by_type_split], - adapter=Adapter({'X': E(feature_by_type_split.name, 'categorical_features')}), - experiment_directory=config.pipeline.experiment_directory, - **kwargs) - return categorical_encoder - - -def _groupby_aggregations(dispatchers, config, train_mode, **kwargs): - if train_mode: - feature_by_type_split, feature_by_type_split_valid = dispatchers - groupby_aggregations = Step(name='groupby_aggregations', - transformer=fe.GroupbyAggregations(**config.groupby_aggregation), - input_data=['input'], - input_steps=[feature_by_type_split], - adapter=Adapter({'categorical_features': E(feature_by_type_split.name, - 'categorical_features'), - 'numerical_features': E(feature_by_type_split.name, - 'numerical_features') - }), - experiment_directory=config.pipeline.experiment_directory, - **kwargs) - - groupby_aggregations_valid = Step(name='groupby_aggregations_valid', - transformer=groupby_aggregations, - input_data=['input'], - input_steps=[feature_by_type_split_valid], - adapter=Adapter({'categorical_features': E(feature_by_type_split_valid.name, - 'categorical_features'), - 'numerical_features': E(feature_by_type_split_valid.name, - 'numerical_features') - }), - experiment_directory=config.pipeline.experiment_directory, - **kwargs) - - return groupby_aggregations, groupby_aggregations_valid - - else: - feature_by_type_split = dispatchers - groupby_aggregations = Step(name='groupby_aggregations', - transformer=fe.GroupbyAggregations(**config.groupby_aggregation), - input_data=['input'], - input_steps=[feature_by_type_split], - adapter=Adapter({'categorical_features': E(feature_by_type_split.name, - 'categorical_features'), - 'numerical_features': E(feature_by_type_split.name, - 'numerical_features') - }), - experiment_directory=config.pipeline.experiment_directory, - **kwargs) - - return groupby_aggregations - - -def _bureau(config, train_mode, **kwargs): - if train_mode: - bureau = Step(name='bureau', - transformer=fe.GroupbyAggregationFromFile(**config.bureau), - input_data=['input'], - adapter=Adapter({'X': E('input', 'X')}), - experiment_directory=config.pipeline.experiment_directory, - **kwargs) - - bureau_valid = Step(name='bureau_valid', - transformer=bureau, - input_data=['input'], - adapter=Adapter({'X': E('input', 'X_valid')}), - experiment_directory=config.pipeline.experiment_directory, - **kwargs) - - return bureau, bureau_valid - - else: - bureau = Step(name='bureau', - transformer=fe.GroupbyAggregationFromFile(**config.bureau), - input_data=['input'], - adapter=Adapter({'X': E('input', 'X')}), - experiment_directory=config.pipeline.experiment_directory, - **kwargs) - - return bureau - - -def _fillna(fillna_value): - def _inner_fillna(X, X_valid=None): - if X_valid is None: - return {'X': X.fillna(fillna_value)} - else: - return {'X': X.fillna(fillna_value), - 'X_valid': X_valid.fillna(fillna_value)} - return make_transformer(_inner_fillna) - - -def _to_numpy_label(config, 
**kwargs): - to_numpy_label = Step(name='to_numpy_label', - transformer=ToNumpyLabel(), - input_data=['input'], - adapter=Adapter({'y': [E('input', 'y')]}), - experiment_directory=config.pipeline.experiment_directory, - **kwargs) - - to_numpy_label_valid = Step(name='to_numpy_label_valid', - transformer=to_numpy_label, - input_data=['input'], - adapter=Adapter({'y': [E('input', 'y_valid')]}), - experiment_directory=config.pipeline.experiment_directory, - **kwargs) - - return to_numpy_label, to_numpy_label_valid diff --git a/postprocessing.py b/postprocessing.py deleted file mode 100644 index a9edc86..0000000 --- a/postprocessing.py +++ /dev/null @@ -1,14 +0,0 @@ -import numpy as np - -from steppy.base import BaseTransformer - - -class Clipper(BaseTransformer): - def __init__(self, min_val=0, max_val=1): - super().__init__() - self.min_val = min_val - self.max_val = max_val - - def transform(self, prediction): - prediction_ = np.clip(prediction, self.min_val, self.max_val) - return {'clipped_prediction': prediction_} diff --git a/src/data_cleaning.py b/src/data_cleaning.py new file mode 100644 index 0000000..914efea --- /dev/null +++ b/src/data_cleaning.py @@ -0,0 +1,33 @@ +import numpy as np +from steppy.base import BaseTransformer +from steppy.utils import get_logger + +logger = get_logger() + + +class ApplicationCleaning(BaseTransformer): + def __init__(self, **kwargs): + super().__init__() + + def transform(self, X): + X['CODE_GENDER'].replace('XNA', np.nan, inplace=True) + X['DAYS_EMPLOYED'].replace(365243, np.nan, inplace=True) + X['NAME_FAMILY_STATUS'].replace('Unknown', np.nan, inplace=True) + X['ORGANIZATION_TYPE'].replace('XNA', np.nan, inplace=True) + + return {'X': X} + + +class BureauCleaning(BaseTransformer): + def __init__(self, fill_missing=False, fill_value=0, **kwargs): + self.fill_missing = fill_missing + self.fill_value = fill_value + + def transform(self, bureau): + if self.fill_missing: + bureau['AMT_CREDIT_SUM'].fillna(self.fill_value, inplace=True) + bureau['AMT_CREDIT_SUM_DEBT'].fillna(self.fill_value, inplace=True) + bureau['AMT_CREDIT_SUM_OVERDUE'].fillna(self.fill_value, inplace=True) + bureau['CNT_CREDIT_PROLONG'].fillna(self.fill_value, inplace=True) + + return {'bureau': bureau} diff --git a/src/feature_extraction.py b/src/feature_extraction.py new file mode 100644 index 0000000..a726ebb --- /dev/null +++ b/src/feature_extraction.py @@ -0,0 +1,370 @@ +from copy import deepcopy + +import category_encoders as ce +import numpy as np +import pandas as pd +from sklearn.externals import joblib +from steppy.base import BaseTransformer +from steppy.utils import get_logger + +logger = get_logger() + + +class FeatureJoiner(BaseTransformer): + def transform(self, numerical_feature_list, categorical_feature_list, **kwargs): + features = numerical_feature_list + categorical_feature_list + for feature in features: + feature.reset_index(drop=True, inplace=True) + outputs = dict() + outputs['features'] = pd.concat(features, axis=1).astype(np.float32) + outputs['feature_names'] = self._get_feature_names(features) + outputs['categorical_features'] = self._get_feature_names(categorical_feature_list) + return outputs + + def _get_feature_names(self, dataframes): + feature_names = [] + for dataframe in dataframes: + try: + feature_names.extend(list(dataframe.columns)) + except Exception as e: + print(e) + feature_names.append(dataframe.name) + + return feature_names + + +class CategoricalEncoder(BaseTransformer): + def __init__(self, **kwargs): + super().__init__() + 
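+        # keep the list of categorical columns separately; the remaining kwargs are passed through to the ordinal encoder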
self.categorical_columns = kwargs['categorical_columns'] + params = deepcopy(kwargs) + params.pop('categorical_columns', None) + self.params = params + self.encoder_class = ce.OrdinalEncoder + self.categorical_encoder = None + + def fit(self, X, y, **kwargs): + X_ = X[self.categorical_columns] + self.categorical_encoder = self.encoder_class(cols=self.categorical_columns, **self.params) + self.categorical_encoder.fit(X_, y) + return self + + def transform(self, X, **kwargs): + X_ = X[self.categorical_columns] + X_ = self.categorical_encoder.transform(X_) + return {'categorical_features': X_} + + def load(self, filepath): + self.categorical_encoder = joblib.load(filepath) + return self + + def persist(self, filepath): + joblib.dump(self.categorical_encoder, filepath) + + +class GroupbyAggregate(BaseTransformer): + def __init__(self, groupby_aggregations): + super().__init__() + self.groupby_aggregations = groupby_aggregations + self.features = [] + self.feature_names = [] + + def fit(self, main_table, **kwargs): + for groupby_cols, specs in self.groupby_aggregations: + group_object = main_table.groupby(groupby_cols) + for select, agg in specs: + groupby_aggregate_name = self._create_colname_from_specs(groupby_cols, select, agg) + + group_features = group_object[select].agg(agg).reset_index() \ + .rename(index=str, + columns={select: groupby_aggregate_name})[groupby_cols + [groupby_aggregate_name]] + + self.features.append((groupby_cols, group_features)) + self.feature_names.append(groupby_aggregate_name) + return self + + def transform(self, main_table, **kwargs): + for groupby_cols, groupby_features in self.features: + main_table = main_table.merge(groupby_features, + on=groupby_cols, + how='left') + + return {'numerical_features': main_table[self.feature_names].astype(np.float32)} + + def load(self, filepath): + params = joblib.load(filepath) + self.features = params['features'] + self.feature_names = params['feature_names'] + return self + + def persist(self, filepath): + params = {'features': self.features, + 'feature_names': self.feature_names} + joblib.dump(params, filepath) + + def _create_colname_from_specs(self, groupby_cols, agg, select): + return '{}_{}_{}'.format('_'.join(groupby_cols), agg, select) + + +class GroupbyAggregateMerge(BaseTransformer): + def __init__(self, table_name, id_columns, groupby_aggregations): + super().__init__() + self.table_name = table_name + self.id_columns = id_columns + self.groupby_aggregations = groupby_aggregations + + @property + def feature_names(self): + feature_names = list(self.features.columns) + feature_names.remove(self.id_columns[0]) + return feature_names + + def fit(self, main_table, side_table, **kwargs): + features = pd.DataFrame({self.id_columns[0]: side_table[self.id_columns[0]].unique()}) + + for groupby_cols, specs in self.groupby_aggregations: + group_object = side_table.groupby(groupby_cols) + for select, agg in specs: + groupby_aggregate_name = self._create_colname_from_specs(groupby_cols, select, agg) + features = features.merge(group_object[select] + .agg(agg) + .reset_index() + .rename(index=str, + columns={select: groupby_aggregate_name}) + [groupby_cols + [groupby_aggregate_name]], + on=groupby_cols, + how='left') + self.features = features + return self + + def transform(self, main_table, side_table, **kwargs): + main_table = main_table.merge(self.features, + left_on=[self.id_columns[0]], + right_on=[self.id_columns[1]], + how='left', + validate='one_to_one') + + return {'numerical_features': 
main_table[self.feature_names].astype(np.float32)} + + def load(self, filepath): + self.features = joblib.load(filepath) + return self + + def persist(self, filepath): + joblib.dump(self.features, filepath) + + def _create_colname_from_specs(self, groupby_cols, select, agg): + return '{}_{}_{}_{}'.format(self.table_name, '_'.join(groupby_cols), agg, select) + + +class ApplicationFeatures(BaseTransformer): + def __init__(self, categorical_columns, numerical_columns): + self.categorical_columns = categorical_columns + self.numerical_columns = numerical_columns + self.engineered_numerical_columns = ['annuity_income_percentage', + 'car_to_birth_ratio', + 'car_to_employ_ratio', + 'children_ratio', + 'credit_to_annuity_ratio', + 'credit_to_goods_ratio', + 'credit_to_income_ratio', + 'days_employed_percentage', + 'income_credit_percentage', + 'income_per_child', + 'income_per_person', + 'payment_rate', + 'phone_to_birth_ratio', + 'phone_to_employ_ratio', + 'external_sources_weighted', + 'external_sources_min', + 'external_sources_max', + 'external_sources_sum', + 'external_sources_mean', + 'external_sources_nanmedian'] + + def transform(self, X, **kwargs): + X['annuity_income_percentage'] = X['AMT_ANNUITY'] / X['AMT_INCOME_TOTAL'] + X['car_to_birth_ratio'] = X['OWN_CAR_AGE'] / X['DAYS_BIRTH'] + X['car_to_employ_ratio'] = X['OWN_CAR_AGE'] / X['DAYS_EMPLOYED'] + X['children_ratio'] = X['CNT_CHILDREN'] / X['CNT_FAM_MEMBERS'] + X['credit_to_annuity_ratio'] = X['AMT_CREDIT'] / X['AMT_ANNUITY'] + X['credit_to_goods_ratio'] = X['AMT_CREDIT'] / X['AMT_GOODS_PRICE'] + X['credit_to_income_ratio'] = X['AMT_CREDIT'] / X['AMT_INCOME_TOTAL'] + X['days_employed_percentage'] = X['DAYS_EMPLOYED'] / X['DAYS_BIRTH'] + X['income_credit_percentage'] = X['AMT_INCOME_TOTAL'] / X['AMT_CREDIT'] + X['income_per_child'] = X['AMT_INCOME_TOTAL'] / (1 + X['CNT_CHILDREN']) + X['income_per_person'] = X['AMT_INCOME_TOTAL'] / X['CNT_FAM_MEMBERS'] + X['payment_rate'] = X['AMT_ANNUITY'] / X['AMT_CREDIT'] + X['phone_to_birth_ratio'] = X['DAYS_LAST_PHONE_CHANGE'] / X['DAYS_BIRTH'] + X['phone_to_employ_ratio'] = X['DAYS_LAST_PHONE_CHANGE'] / X['DAYS_EMPLOYED'] + X['external_sources_weighted'] = X.EXT_SOURCE_1 * 2 + X.EXT_SOURCE_2 * 3 + X.EXT_SOURCE_3 * 4 + for function_name in ['min', 'max', 'sum', 'mean', 'nanmedian']: + X['external_sources_{}'.format(function_name)] = eval('np.{}'.format(function_name))( + X[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']], axis=1) + + return {'numerical_features': X[self.engineered_numerical_columns + self.numerical_columns], + 'categorical_features': X[self.categorical_columns] + } + + +class BureauFeatures(BaseTransformer): + def __init__(self, **kwargs): + self.features = None + + @property + def feature_names(self): + feature_names = list(self.features.columns) + feature_names.remove('SK_ID_CURR') + return feature_names + + def fit(self, X, bureau, **kwargs): + bureau['bureau_credit_active_binary'] = (bureau['CREDIT_ACTIVE'] != 'Closed').astype(int) + bureau['bureau_credit_enddate_binary'] = (bureau['DAYS_CREDIT_ENDDATE'] > 0).astype(int) + groupby_SK_ID_CURR = bureau.groupby(by=['SK_ID_CURR']) + features = pd.DataFrame({'SK_ID_CURR': bureau['SK_ID_CURR'].unique()}) + + group_object = groupby_SK_ID_CURR['DAYS_CREDIT'].agg('count').reset_index() + group_object.rename(index=str, columns={'DAYS_CREDIT': 'bureau_number_of_past_loans'}, inplace=True) + features = features.merge(group_object, on=['SK_ID_CURR'], how='left') + + group_object = 
groupby_SK_ID_CURR['CREDIT_TYPE'].agg('nunique').reset_index() + group_object.rename(index=str, columns={'CREDIT_TYPE': 'bureau_number_of_loan_types'}, inplace=True) + features = features.merge(group_object, on=['SK_ID_CURR'], how='left') + + features['bureau_average_of_past_loans_per_type'] = \ + features['bureau_number_of_past_loans'] / features['bureau_number_of_loan_types'] + + group_object = groupby_SK_ID_CURR['bureau_credit_active_binary'].agg('mean').reset_index() + features = features.merge(group_object, on=['SK_ID_CURR'], how='left') + + group_object = groupby_SK_ID_CURR['AMT_CREDIT_SUM_DEBT'].agg('sum').reset_index() + group_object.rename(index=str, columns={'AMT_CREDIT_SUM_DEBT': 'bureau_total_customer_debt'}, inplace=True) + features = features.merge(group_object, on=['SK_ID_CURR'], how='left') + + group_object = groupby_SK_ID_CURR['AMT_CREDIT_SUM'].agg('sum').reset_index() + group_object.rename(index=str, columns={'AMT_CREDIT_SUM': 'bureau_total_customer_credit'}, inplace=True) + features = features.merge(group_object, on=['SK_ID_CURR'], how='left') + + features['bureau_debt_credit_ratio'] = \ + features['bureau_total_customer_debt'] / features['bureau_total_customer_credit'] + + group_object = groupby_SK_ID_CURR['AMT_CREDIT_SUM_OVERDUE'].agg('sum').reset_index() + group_object.rename(index=str, columns={'AMT_CREDIT_SUM_OVERDUE': 'bureau_total_customer_overdue'}, + inplace=True) + features = features.merge(group_object, on=['SK_ID_CURR'], how='left') + + features['bureau_overdue_debt_ratio'] = \ + features['bureau_total_customer_overdue'] / features['bureau_total_customer_debt'] + + group_object = groupby_SK_ID_CURR['CNT_CREDIT_PROLONG'].agg('sum').reset_index() + group_object.rename(index=str, columns={'CNT_CREDIT_PROLONG': 'bureau_average_creditdays_prolonged'}, + inplace=True) + features = features.merge(group_object, on=['SK_ID_CURR'], how='left') + + group_object = groupby_SK_ID_CURR['bureau_credit_enddate_binary'].agg('mean').reset_index() + group_object.rename(index=str, columns={'bureau_credit_enddate_binary': 'bureau_credit_enddate_percentage'}, + inplace=True) + features = features.merge(group_object, on=['SK_ID_CURR'], how='left') + + self.features = features + return self + + def transform(self, X, **kwargs): + X = X.merge(self.features, + left_on=['SK_ID_CURR'], + right_on=['SK_ID_CURR'], + how='left', + validate='one_to_one') + + return {'numerical_features': X[self.feature_names]} + + def load(self, filepath): + self.features = joblib.load(filepath) + return self + + def persist(self, filepath): + joblib.dump(self.features, filepath) + + +class CreditCardBalanceFeatures(BaseTransformer): + def __init__(self, **kwargs): + self.features = None + + @property + def feature_names(self): + feature_names = list(self.features.columns) + feature_names.remove('SK_ID_CURR') + return feature_names + + def fit(self, X, credit_card, **kwargs): + credit_card['number_of_instalments'] = credit_card.groupby( + by=['SK_ID_CURR', 'SK_ID_PREV'])['CNT_INSTALMENT_MATURE_CUM'].agg('max').reset_index()[ + 'CNT_INSTALMENT_MATURE_CUM'] + + credit_card['credit_card_max_loading_of_credit_limit'] = credit_card.groupby( + by=['SK_ID_CURR', 'SK_ID_PREV', 'AMT_CREDIT_LIMIT_ACTUAL']).apply( + lambda x: x.AMT_BALANCE.max() / x.AMT_CREDIT_LIMIT_ACTUAL.max()).reset_index()[0] + + features = pd.DataFrame({'SK_ID_CURR': credit_card['SK_ID_CURR'].unique()}) + + group_object = credit_card.groupby(by=['SK_ID_CURR'])['SK_ID_PREV'].agg('nunique').reset_index() + group_object.rename(index=str, 
columns={'SK_ID_PREV': 'credit_card_number_of_loans'}, inplace=True) + features = features.merge(group_object, on=['SK_ID_CURR'], how='left') + + group_object = credit_card.groupby(by=['SK_ID_CURR'])['number_of_instalments'].sum().reset_index() + group_object.rename(index=str, columns={'number_of_instalments': 'credit_card_total_instalments'}, inplace=True) + features = features.merge(group_object, on=['SK_ID_CURR'], how='left') + + features['credit_card_installments_per_loan'] = ( + features['credit_card_total_instalments'] / features['credit_card_number_of_loans']) + + group_object = credit_card.groupby(by=['SK_ID_CURR'])['credit_card_max_loading_of_credit_limit'].agg( + 'mean').reset_index() + group_object.rename(index=str, columns={ + 'credit_card_max_loading_of_credit_limit': 'credit_card_avg_loading_of_credit_limit'}, inplace=True) + features = features.merge(group_object, on=['SK_ID_CURR'], how='left') + + group_object = credit_card.groupby( + by=['SK_ID_CURR'])['SK_DPD'].agg('mean').reset_index() + group_object.rename(index=str, columns={'SK_DPD': 'credit_card_average_of_days_past_due'}, inplace=True) + features = features.merge(group_object, on=['SK_ID_CURR'], how='left') + + group_object = credit_card.groupby(by=['SK_ID_CURR'])['AMT_DRAWINGS_ATM_CURRENT'].agg('sum').reset_index() + group_object.rename(index=str, columns={'AMT_DRAWINGS_ATM_CURRENT': 'credit_card_drawings_atm'}, inplace=True) + features = features.merge(group_object, on=['SK_ID_CURR'], how='left') + + group_object = credit_card.groupby(by=['SK_ID_CURR'])['AMT_DRAWINGS_CURRENT'].agg('sum').reset_index() + group_object.rename(index=str, columns={'AMT_DRAWINGS_CURRENT': 'credit_card_drawings_total'}, inplace=True) + features = features.merge(group_object, on=['SK_ID_CURR'], how='left') + + features['credit_card_cash_card_ratio'] = features['credit_card_drawings_atm'] / features[ + 'credit_card_drawings_total'] + + self.features = features + return self + + def transform(self, X, **kwargs): + X = X.merge(self.features, + left_on=['SK_ID_CURR'], + right_on=['SK_ID_CURR'], + how='left', + validate='one_to_one') + + return {'numerical_features': X[self.feature_names]} + + def load(self, filepath): + self.features = joblib.load(filepath) + return self + + def persist(self, filepath): + joblib.dump(self.features, filepath) + + +class ConcatFeatures(BaseTransformer): + def transform(self, **kwargs): + features_concat = [] + for _, feature in kwargs.items(): + feature.reset_index(drop=True, inplace=True) + features_concat.append(feature) + features_concat = pd.concat(features_concat, axis=1) + return {'concatenated_features': features_concat} diff --git a/hyperparameter_tuning.py b/src/hyperparameter_tuning.py similarity index 99% rename from hyperparameter_tuning.py rename to src/hyperparameter_tuning.py index 5bdab15..b5cd2cf 100644 --- a/hyperparameter_tuning.py +++ b/src/hyperparameter_tuning.py @@ -6,7 +6,7 @@ from steppy.base import BaseTransformer from steppy.utils import get_logger -from utils import set_seed +from .utils import set_seed logger = get_logger() diff --git a/src/models.py b/src/models.py new file mode 100644 index 0000000..3598eba --- /dev/null +++ b/src/models.py @@ -0,0 +1,208 @@ +from attrdict import AttrDict +from deepsense import neptune +import lightgbm as lgb +import numpy as np +import pandas as pd +from sklearn.externals import joblib +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import StandardScaler +from steppy.base import BaseTransformer +from 
toolkit.sklearn_transformers.models import SklearnClassifier +import xgboost as xgb + +from .utils import get_logger + +logger = get_logger() +ctx = neptune.Context() + + +class XGBoost(BaseTransformer): + def __init__(self, **params): + super().__init__() + logger.info('initializing XGBoost...') + self.params = params + self.training_params = ['nrounds', 'early_stopping_rounds'] + self.evaluation_function = None + + @property + def model_config(self): + return AttrDict({param: value for param, value in self.params.items() + if param not in self.training_params}) + + @property + def training_config(self): + return AttrDict({param: value for param, value in self.params.items() + if param in self.training_params}) + + def fit(self, + X, y, + X_valid, y_valid, + feature_names=None, + feature_types=None, + **kwargs): + train = xgb.DMatrix(X, + label=y, + feature_names=feature_names, + feature_types=feature_types) + valid = xgb.DMatrix(X_valid, + label=y_valid, + feature_names=feature_names, + feature_types=feature_types) + + evaluation_results = {} + self.estimator = xgb.train(params=self.model_config, + dtrain=train, + evals=[(train, 'train'), (valid, 'valid')], + evals_result=evaluation_results, + num_boost_round=self.training_config.nrounds, + early_stopping_rounds=self.training_config.early_stopping_rounds, + verbose_eval=self.model_config.verbose, + feval=self.evaluation_function) + return self + + def transform(self, X, y=None, feature_names=None, feature_types=None, **kwargs): + X_DMatrix = xgb.DMatrix(X, + label=y, + feature_names=feature_names, + feature_types=feature_types) + prediction = self.estimator.predict(X_DMatrix) + return {'prediction': prediction} + + def load(self, filepath): + self.estimator = xgb.Booster(params=self.model_config) + self.estimator.load_model(filepath) + return self + + def persist(self, filepath): + self.estimator.save_model(filepath) + + +class LightGBM(BaseTransformer): + def __init__(self, name=None, **params): + super().__init__() + logger.info('initializing LightGBM...') + self.params = params + self.training_params = ['number_boosting_rounds', 'early_stopping_rounds'] + self.evaluation_function = None + self.callbacks = callbacks(channel_prefix=name) + + @property + def model_config(self): + return AttrDict({param: value for param, value in self.params.items() + if param not in self.training_params}) + + @property + def training_config(self): + return AttrDict({param: value for param, value in self.params.items() + if param in self.training_params}) + + def fit(self, + X, + y, + X_valid, + y_valid, + feature_names='auto', + categorical_features='auto', + **kwargs): + evaluation_results = {} + + self._check_target_shape_and_type(y, 'y') + self._check_target_shape_and_type(y_valid, 'y_valid') + y = self._format_target(y) + y_valid = self._format_target(y_valid) + + logger.info('LightGBM, train data shape {}'.format(X.shape)) + logger.info('LightGBM, validation data shape {}'.format(X_valid.shape)) + logger.info('LightGBM, train labels shape {}'.format(y.shape)) + logger.info('LightGBM, validation labels shape {}'.format(y_valid.shape)) + + data_train = lgb.Dataset(data=X, + label=y, + feature_name=feature_names, + categorical_feature=categorical_features, + **kwargs) + data_valid = lgb.Dataset(X_valid, + label=y_valid, + feature_name=feature_names, + categorical_feature=categorical_features, + **kwargs) + + self.estimator = lgb.train(self.model_config, + data_train, + feature_name=feature_names, + categorical_feature=categorical_features, + 
valid_sets=[data_train, data_valid], + valid_names=['data_train', 'data_valid'], + evals_result=evaluation_results, + num_boost_round=self.training_config.number_boosting_rounds, + early_stopping_rounds=self.training_config.early_stopping_rounds, + verbose_eval=self.model_config.verbose, + feval=self.evaluation_function, + callbacks=self.callbacks, + **kwargs) + return self + + def transform(self, X, **kwargs): + prediction = self.estimator.predict(X) + return {'prediction': prediction} + + def load(self, filepath): + self.estimator = joblib.load(filepath) + return self + + def persist(self, filepath): + joblib.dump(self.estimator, filepath) + + def _check_target_shape_and_type(self, target, name): + if not any([isinstance(target, obj_type) for obj_type in [pd.Series, np.ndarray, list]]): + raise TypeError( + '"target" must be "numpy.ndarray" or "Pandas.Series" or "list", got {} instead.'.format(type(target))) + try: + assert len(target.shape) == 1, '"{}" must be 1-D. It is {}-D instead.'.format(name, + len(target.shape)) + except AttributeError: + print('Cannot determine shape of the {}. ' + 'Type must be "numpy.ndarray" or "Pandas.Series" or "list", got {} instead'.format(name, + type(target))) + + def _format_target(self, target): + + if isinstance(target, pd.Series): + return target.values + elif isinstance(target, np.ndarray): + return target + elif isinstance(target, list): + return np.array(target) + else: + raise TypeError( + '"target" must be "numpy.ndarray" or "Pandas.Series" or "list", got {} instead.'.format(type(target))) + + +def get_sklearn_classifier(ClassifierClass, normalize, **kwargs): + class SklearnBinaryClassifier(SklearnClassifier): + def transform(self, X, y=None, target=1, **kwargs): + prediction = self.estimator.predict_proba(X)[:, target] + return {SklearnClassifier.RESULT_KEY: prediction} + + if normalize: + return SklearnBinaryClassifier(Pipeline([('standarizer', StandardScaler()), + ('classifier', ClassifierClass(**kwargs))])) + + return SklearnBinaryClassifier(ClassifierClass(**kwargs)) + + +def callbacks(channel_prefix): + neptune_monitor = neptune_monitor_lgbm(channel_prefix) + return [neptune_monitor] + + +def neptune_monitor_lgbm(channel_prefix=''): + def callback(env): + for name, loss_name, loss_value, _ in env.evaluation_result_list: + if channel_prefix != '': + channel_name = '{}_{}_{}'.format(channel_prefix, name, loss_name) + else: + channel_name = '{}_{}'.format(name, loss_name) + ctx.channel_send(channel_name, x=env.iteration, y=loss_value) + + return callback diff --git a/src/pipeline_blocks.py b/src/pipeline_blocks.py new file mode 100644 index 0000000..d07fb4d --- /dev/null +++ b/src/pipeline_blocks.py @@ -0,0 +1,582 @@ +from functools import partial + +from sklearn.metrics import roc_auc_score +from steppy.adapter import Adapter, E +from steppy.base import Step, make_transformer + +from . import feature_extraction as fe +from . 
import data_cleaning as dc +from .hyperparameter_tuning import RandomSearchOptimizer, NeptuneMonitor, PersistResults +from .models import get_sklearn_classifier, XGBoost, LightGBM + + +def classifier_light_gbm(features, config, train_mode, suffix, **kwargs): + model_name = 'light_gbm{}'.format(suffix) + + if train_mode: + features_train, features_valid = features + if config.random_search.light_gbm.n_runs: + transformer = RandomSearchOptimizer(TransformerClass=LightGBM, + params=config.light_gbm, + train_input_keys=[], + valid_input_keys=['X_valid', 'y_valid'], + score_func=roc_auc_score, + maximize=True, + n_runs=config.random_search.light_gbm.n_runs, + callbacks=[ + NeptuneMonitor( + **config.random_search.light_gbm.callbacks.neptune_monitor), + PersistResults( + **config.random_search.light_gbm.callbacks.persist_results)] + ) + else: + transformer = LightGBM(name=model_name, **config.light_gbm) + + light_gbm = Step(name=model_name, + transformer=transformer, + input_data=['application'], + input_steps=[features_train, features_valid], + adapter=Adapter({'X': E(features_train.name, 'features'), + 'y': E('application', 'y'), + 'feature_names': E(features_train.name, 'feature_names'), + 'categorical_features': E(features_train.name, 'categorical_features'), + 'X_valid': E(features_valid.name, 'features'), + 'y_valid': E('application', 'y_valid'), + }), + experiment_directory=config.pipeline.experiment_directory, + **kwargs) + else: + light_gbm = Step(name=model_name, + transformer=LightGBM(name=model_name, **config.light_gbm), + input_steps=[features], + adapter=Adapter({'X': E(features.name, 'features')}), + experiment_directory=config.pipeline.experiment_directory, + **kwargs) + return light_gbm + + +def classifier_xgb(features, config, train_mode, suffix, **kwargs): + if train_mode: + features_train, features_valid = features + if config.random_search.xgboost.n_runs: + transformer = RandomSearchOptimizer(TransformerClass=XGBoost, + params=config.xgboost, + train_input_keys=[], + valid_input_keys=['X_valid', 'y_valid'], + score_func=roc_auc_score, + maximize=True, + n_runs=config.random_search.xgboost.n_runs, + callbacks=[ + NeptuneMonitor( + **config.random_search.xgboost.callbacks.neptune_monitor), + PersistResults( + **config.random_search.xgboost.callbacks.persist_results)] + ) + else: + transformer = XGBoost(**config.xgboost) + + xgboost = Step(name='xgboost{}'.format(suffix), + transformer=transformer, + input_data=['application'], + input_steps=[features_train, features_valid], + adapter=Adapter({'X': E(features_train.name, 'features'), + 'y': E('application', 'y'), + 'feature_names': E(features_train.name, 'feature_names'), + 'X_valid': E(features_valid.name, 'features'), + 'y_valid': E('application', 'y_valid'), + }), + experiment_directory=config.pipeline.experiment_directory, + **kwargs) + else: + xgboost = Step(name='xgboost{}'.format(suffix), + transformer=XGBoost(**config.xgboost), + input_steps=[features], + adapter=Adapter({'X': E(features.name, 'features')}), + experiment_directory=config.pipeline.experiment_directory, + **kwargs) + return xgboost + + +def classifier_sklearn(sklearn_features, + ClassifierClass, + full_config, + clf_name, + train_mode, + suffix, + normalize, + **kwargs): + config, model_params, rs_config = full_config + if train_mode: + if config.random_search.random_forest.n_runs: + transformer = RandomSearchOptimizer( + partial(get_sklearn_classifier, + ClassifierClass=ClassifierClass, + normalize=normalize), + model_params, + train_input_keys=[], + 
valid_input_keys=['X_valid', 'y_valid'], + score_func=roc_auc_score, + maximize=True, + n_runs=rs_config.n_runs, + callbacks=[NeptuneMonitor(**rs_config.callbacks.neptune_monitor), + PersistResults(**rs_config.callbacks.persist_results)] + ) + else: + transformer = get_sklearn_classifier(ClassifierClass, normalize, **model_params) + + sklearn_clf = Step(name='{}{}'.format(clf_name, suffix), + transformer=transformer, + input_data=['application'], + input_steps=[sklearn_features], + adapter=Adapter({'X': E(sklearn_features.name, 'X'), + 'y': E('application', 'y'), + 'X_valid': E(sklearn_features.name, 'X_valid'), + 'y_valid': E('application', 'y_valid'), + }), + experiment_directory=config.pipeline.experiment_directory, + **kwargs) + else: + sklearn_clf = Step(name='{}{}'.format(clf_name, suffix), + transformer=get_sklearn_classifier(ClassifierClass, normalize, **model_params), + input_steps=[sklearn_features], + adapter=Adapter({'X': E(sklearn_features.name, 'X')}), + experiment_directory=config.pipeline.experiment_directory, + **kwargs) + return sklearn_clf + + +def feature_extraction(config, train_mode, suffix, **kwargs): + if train_mode: + application, application_valid = _application(config, train_mode, suffix, **kwargs) + bureau, bureau_valid = _bureau(config, train_mode, suffix, **kwargs) + credit_card_balance, credit_card_balance_valid = _credit_card_balance(config, train_mode, suffix, **kwargs) + + application_agg, application_agg_valid = _application_groupby_agg(config, train_mode, suffix, **kwargs) + bureau_agg, bureau_agg_valid = _bureau_groupby_agg(config, train_mode, suffix, **kwargs) + credit_card_balance_agg, credit_card_balance_agg_valid = _credit_card_balance_groupby_agg( + config, + train_mode, suffix, + **kwargs) + installments_payments_agg, installments_payments_agg_valid = _installments_payments_groupby_agg( + config, + train_mode, suffix, + **kwargs) + pos_cash_balance_agg, pos_cash_balance_agg_valid = _pos_cash_balance_groupby_agg( + config, + train_mode, suffix, + **kwargs) + previous_applications_agg, previous_applications_agg_valid = _previous_applications_groupby_agg( + config, + train_mode, suffix, + **kwargs) + + categorical_encoder, categorical_encoder_valid = _categorical_encoders(config, train_mode, suffix, **kwargs) + + feature_combiner, feature_combiner_valid = _join_features( + numerical_features=[application, + application_agg, + previous_applications_agg, + bureau, + bureau_agg, + credit_card_balance, + credit_card_balance_agg, + installments_payments_agg, + pos_cash_balance_agg, + ], + numerical_features_valid=[application_valid, + application_agg_valid, + previous_applications_agg_valid, + bureau_valid, + bureau_agg_valid, + credit_card_balance_valid, + credit_card_balance_agg_valid, + installments_payments_agg_valid, + pos_cash_balance_agg_valid, + ], + categorical_features=[categorical_encoder + ], + categorical_features_valid=[categorical_encoder_valid + ], + config=config, + train_mode=train_mode, + suffix=suffix, + **kwargs) + + return feature_combiner, feature_combiner_valid + else: + application = _application(config, train_mode, suffix, **kwargs) + bureau = _bureau(config, train_mode, suffix, **kwargs) + credit_card_balance = _credit_card_balance(config, train_mode, suffix, **kwargs) + + application_agg = _application_groupby_agg(config, train_mode, suffix, **kwargs) + bureau_agg = _bureau_groupby_agg(config, train_mode, suffix, **kwargs) + credit_card_balance_agg = _credit_card_balance_groupby_agg(config, train_mode, suffix, **kwargs) + 
installments_payments_agg = _installments_payments_groupby_agg(config, train_mode, suffix, **kwargs) + pos_cash_balance_agg = _pos_cash_balance_groupby_agg(config, train_mode, suffix, **kwargs) + previous_applications_agg = _previous_applications_groupby_agg(config, train_mode, suffix, **kwargs) + categorical_encoder = _categorical_encoders(config, train_mode, suffix, **kwargs) + feature_combiner = _join_features(numerical_features=[application, + application_agg, + previous_applications_agg, + bureau, + bureau_agg, + credit_card_balance, + credit_card_balance_agg, + installments_payments_agg, + pos_cash_balance_agg, + ], + numerical_features_valid=[], + categorical_features=[categorical_encoder + ], + categorical_features_valid=[], + config=config, + train_mode=train_mode, + suffix=suffix, + **kwargs) + + return feature_combiner + + +def preprocessing_fillna(features, config, train_mode, suffix, **kwargs): + if train_mode: + features_train, features_valid = features + fillna = Step(name='fillna{}'.format(suffix), + transformer=_fillna(**config.preprocessing), + input_steps=[features_train, features_valid], + adapter=Adapter({'X': E(features_train.name, 'features'), + 'X_valid': E(features_valid.name, 'features'), + }), + experiment_directory=config.pipeline.experiment_directory, + **kwargs + ) + else: + fillna = Step(name='fillna{}'.format(suffix), + transformer=_fillna(**config.preprocessing), + input_steps=[features], + adapter=Adapter({'X': E(features.name, 'features')}), + experiment_directory=config.pipeline.experiment_directory, + **kwargs + ) + return fillna + + +def _join_features(numerical_features, + numerical_features_valid, + categorical_features, + categorical_features_valid, + config, train_mode, suffix, + **kwargs): + if train_mode: + persist_output = True + cache_output = True + load_persisted_output = True + else: + persist_output = False + cache_output = True + load_persisted_output = False + + feature_joiner = Step(name='feature_joiner{}'.format(suffix), + transformer=fe.FeatureJoiner(), + input_steps=numerical_features + categorical_features, + adapter=Adapter({ + 'numerical_feature_list': [ + E(feature.name, 'numerical_features') for feature in numerical_features], + 'categorical_feature_list': [ + E(feature.name, 'categorical_features') for feature in categorical_features], + }), + experiment_directory=config.pipeline.experiment_directory, + persist_output=persist_output, + cache_output=cache_output, + load_persisted_output=load_persisted_output) + if train_mode: + feature_joiner_valid = Step(name='feature_joiner_valid{}'.format(suffix), + transformer=feature_joiner, + input_steps=numerical_features_valid + categorical_features_valid, + adapter=Adapter({ + 'numerical_feature_list': [ + E(feature.name, + 'numerical_features') for feature in numerical_features_valid], + 'categorical_feature_list': [ + E(feature.name, + 'categorical_features') for feature in categorical_features_valid], + }), + experiment_directory=config.pipeline.experiment_directory, + persist_output=persist_output, + cache_output=cache_output, + load_persisted_output=load_persisted_output) + + return feature_joiner, feature_joiner_valid + + else: + return feature_joiner + + +def _categorical_encoders(config, train_mode, suffix, **kwargs): + categorical_encoder = Step(name='categorical_encoder{}'.format(suffix), + transformer=fe.CategoricalEncoder(**config.preprocessing.categorical_encoder), + input_data=['application'], + adapter=Adapter({'X': E('application', 'X'), + 'y': E('application', 'y')} + ), 
+ experiment_directory=config.pipeline.experiment_directory, + **kwargs) + if train_mode: + categorical_encoder_valid = Step(name='categorical_encoder_valid{}'.format(suffix), + transformer=categorical_encoder, + input_data=['application'], + adapter=Adapter( + {'X': E('application', 'X_valid'), + 'y': E('application', 'y_valid')} + ), + experiment_directory=config.pipeline.experiment_directory, + **kwargs) + return categorical_encoder, categorical_encoder_valid + else: + return categorical_encoder + + +def _application_groupby_agg(config, train_mode, suffix, **kwargs): + application_groupby_agg = Step(name='application_groupby_agg{}'.format(suffix), + transformer=fe.GroupbyAggregate(**config.applications.aggregations), + input_data=['application'], + adapter=Adapter( + {'main_table': E('application', 'X')}), + experiment_directory=config.pipeline.experiment_directory, + **kwargs) + + if train_mode: + + application_groupby_agg_valid = Step(name='application_groupby_agg_valid{}'.format(suffix), + transformer=application_groupby_agg, + input_data=['application'], + adapter=Adapter( + {'main_table': E('application', 'X_valid'), + }), + experiment_directory=config.pipeline.experiment_directory, + **kwargs) + + return application_groupby_agg, application_groupby_agg_valid + + else: + return application_groupby_agg + + +def _bureau_groupby_agg(config, train_mode, suffix, **kwargs): + bureau_groupby_agg = Step(name='bureau_groupby_agg{}'.format(suffix), + transformer=fe.GroupbyAggregateMerge(**config.bureau), + input_data=['application', 'bureau'], + adapter=Adapter({'main_table': E('application', 'X'), + 'side_table': E('bureau', 'X')}), + experiment_directory=config.pipeline.experiment_directory, + **kwargs) + + if train_mode: + bureau_groupby_agg_valid = Step(name='bureau_groupby_agg_valid{}'.format(suffix), + transformer=bureau_groupby_agg, + input_data=['application', 'bureau'], + adapter=Adapter({'main_table': E('application', 'X_valid'), + 'side_table': E('bureau', 'X')}), + experiment_directory=config.pipeline.experiment_directory, + **kwargs) + return bureau_groupby_agg, bureau_groupby_agg_valid + else: + return bureau_groupby_agg + + +def _credit_card_balance_groupby_agg(config, train_mode, suffix, **kwargs): + credit_card_balance_groupby_agg = Step(name='credit_card_balance_groupby_agg{}'.format(suffix), + transformer=fe.GroupbyAggregateMerge(**config.credit_card_balance), + input_data=['application', 'credit_card_balance'], + adapter=Adapter({'main_table': E('application', 'X'), + 'side_table': E('credit_card_balance', 'X')}), + experiment_directory=config.pipeline.experiment_directory, + **kwargs) + if train_mode: + credit_card_balance_groupby_agg_valid = Step(name='credit_card_balance_groupby_agg_valid{}'.format(suffix), + transformer=credit_card_balance_groupby_agg, + input_data=['application', 'credit_card_balance'], + adapter=Adapter({'main_table': E('application', 'X_valid'), + 'side_table': E('credit_card_balance', 'X')}), + experiment_directory=config.pipeline.experiment_directory, + **kwargs) + return credit_card_balance_groupby_agg, credit_card_balance_groupby_agg_valid + + else: + return credit_card_balance_groupby_agg + + +def _installments_payments_groupby_agg(config, train_mode, suffix, **kwargs): + installments_payments_groupby_agg = Step(name='installments_payments_groupby_agg{}'.format(suffix), + transformer=fe.GroupbyAggregateMerge(**config.installments_payments), + input_data=['application', 'installments_payments'], + adapter=Adapter({'main_table': E('application', 
'X'), + 'side_table': E('installments_payments', 'X')}), + experiment_directory=config.pipeline.experiment_directory, + **kwargs) + if train_mode: + installments_payments_groupby_agg_valid = Step(name='installments_payments_groupby_agg_valid{}'.format(suffix), + transformer=installments_payments_groupby_agg, + input_data=['application', 'installments_payments'], + adapter=Adapter({'main_table': E('application', 'X_valid'), + 'side_table': E('installments_payments', 'X')}), + experiment_directory=config.pipeline.experiment_directory, + **kwargs) + + return installments_payments_groupby_agg, installments_payments_groupby_agg_valid + + else: + return installments_payments_groupby_agg + + +def _pos_cash_balance_groupby_agg(config, train_mode, suffix, **kwargs): + pos_cash_balance_groupby_agg = Step(name='pos_cash_balance_groupby_agg{}'.format(suffix), + transformer=fe.GroupbyAggregateMerge(**config.pos_cash_balance), + input_data=['application', 'pos_cash_balance'], + adapter=Adapter({'main_table': E('application', 'X'), + 'side_table': E('pos_cash_balance', 'X')}), + experiment_directory=config.pipeline.experiment_directory, + **kwargs) + if train_mode: + pos_cash_balance_groupby_agg_valid = Step(name='pos_cash_balance_groupby_agg_valid{}'.format(suffix), + transformer=pos_cash_balance_groupby_agg, + input_data=['application', 'pos_cash_balance'], + adapter=Adapter({'main_table': E('application', 'X_valid'), + 'side_table': E('pos_cash_balance', 'X')}), + experiment_directory=config.pipeline.experiment_directory, + **kwargs) + + return pos_cash_balance_groupby_agg, pos_cash_balance_groupby_agg_valid + + else: + return pos_cash_balance_groupby_agg + + +def _previous_applications_groupby_agg(config, train_mode, suffix, **kwargs): + previous_applications_groupby_agg = Step(name='previous_applications_groupby_agg{}'.format(suffix), + transformer=fe.GroupbyAggregateMerge(**config.previous_applications), + input_data=['application', 'previous_application'], + adapter=Adapter({'main_table': E('application', 'X'), + 'side_table': E('previous_application', 'X')}), + experiment_directory=config.pipeline.experiment_directory, + **kwargs) + if train_mode: + previous_applications_groupby_agg_valid = Step(name='previous_applications_groupby_agg_valid{}'.format(suffix), + transformer=previous_applications_groupby_agg, + input_data=['application', 'previous_application'], + adapter=Adapter({'main_table': E('application', 'X_valid'), + 'side_table': E('previous_application', 'X')}), + experiment_directory=config.pipeline.experiment_directory, + **kwargs) + return previous_applications_groupby_agg, previous_applications_groupby_agg_valid + else: + return previous_applications_groupby_agg + + +def _application_cleaning(config, train_mode, suffix, **kwargs): + application_cleaning = Step(name='application_cleaning{}'.format(suffix), + transformer=dc.ApplicationCleaning(**config.preprocessing.impute_missing), + input_data=['application'], + adapter=Adapter({'X': E('application', 'X')}), + experiment_directory=config.pipeline.experiment_directory, + **kwargs) + if train_mode: + application_cleaning_valid = Step(name='application_cleaning_valid{}'.format(suffix), + transformer=dc.ApplicationCleaning(), + input_data=['application'], + adapter=Adapter({'X': E('application', 'X_valid')}), + experiment_directory=config.pipeline.experiment_directory, + **kwargs) + return application_cleaning, application_cleaning_valid + else: + return application_cleaning + + +def _application(config, train_mode, suffix, **kwargs): + 
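+    # Hand-crafted features from the main application table: the cleaning step runs
+    # first, then fe.ApplicationFeatures is applied; in train mode a twin '_valid'
+    # Step reuses the fitted transformer on the 'X_valid' split of the 'application' input.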
if train_mode: + application_cleaning, application_cleaning_valid = _application_cleaning(config, train_mode, suffix, **kwargs) + else: + application_cleaning = _application_cleaning(config, train_mode, suffix, **kwargs) + + application = Step(name='application_hand_crafted{}'.format(suffix), + transformer=fe.ApplicationFeatures(**config.applications.columns), + input_steps=[application_cleaning], + adapter=Adapter({'X': E(application_cleaning.name, 'X')}), + experiment_directory=config.pipeline.experiment_directory, + **kwargs) + if train_mode: + application_valid = Step(name='application_hand_crafted_valid{}'.format(suffix), + transformer=application, + input_steps=[application_cleaning_valid], + adapter=Adapter({'X': E(application_cleaning_valid.name, 'X')}), + experiment_directory=config.pipeline.experiment_directory, + **kwargs) + return application, application_valid + else: + return application + + +def _bureau_cleaning(config, suffix, **kwargs): + bureau_cleaning = Step(name='bureau_cleaning{}'.format(suffix), + transformer=dc.BureauCleaning(**config.preprocessing.impute_missing), + input_data=['bureau'], + adapter=Adapter({'bureau': E('bureau', 'X')}), + experiment_directory=config.pipeline.experiment_directory, + **kwargs) + + return bureau_cleaning + + +def _bureau(config, train_mode, suffix, **kwargs): + bureau_cleaned = _bureau_cleaning(config, suffix, **kwargs) + + bureau = Step(name='bureau_hand_crafted{}'.format(suffix), + transformer=fe.BureauFeatures(), + input_data=['application'], + input_steps=[bureau_cleaned], + adapter=Adapter({'X': E('application', 'X'), + 'bureau': E(bureau_cleaned.name, 'bureau')}), + experiment_directory=config.pipeline.experiment_directory, + **kwargs) + if train_mode: + bureau_valid = Step(name='bureau__hand_crafted_valid{}'.format(suffix), + transformer=bureau, + input_data=['application'], + adapter=Adapter({'X': E('application', 'X_valid')}), + experiment_directory=config.pipeline.experiment_directory, + **kwargs) + return bureau, bureau_valid + else: + return bureau + + +def _credit_card_balance(config, train_mode, suffix, **kwargs): + credit_card_balance = Step(name='credit_card_balance_hand_crafted{}'.format(suffix), + transformer=fe.CreditCardBalanceFeatures(**config.credit_card_balance), + input_data=['application', 'credit_card_balance'], + adapter=Adapter({'X': E('application', 'X'), + 'credit_card': E('credit_card_balance', 'X')}), + experiment_directory=config.pipeline.experiment_directory, + **kwargs) + if train_mode: + credit_card_balance_valid = Step(name='credit_card_balance__hand_crafted_valid{}'.format(suffix), + transformer=credit_card_balance, + input_data=['application'], + adapter=Adapter({'X': E('application', 'X_valid')}), + experiment_directory=config.pipeline.experiment_directory, + **kwargs) + + return credit_card_balance, credit_card_balance_valid + + else: + return credit_card_balance + + +def _fillna(fillna_value): + def _inner_fillna(X, X_valid=None): + if X_valid is None: + return {'X': X.fillna(fillna_value)} + else: + return {'X': X.fillna(fillna_value), + 'X_valid': X_valid.fillna(fillna_value)} + + return make_transformer(_inner_fillna) diff --git a/pipeline_config.py b/src/pipeline_config.py similarity index 53% rename from pipeline_config.py rename to src/pipeline_config.py index bf4312f..2f8f298 100644 --- a/pipeline_config.py +++ b/src/pipeline_config.py @@ -3,25 +3,17 @@ from attrdict import AttrDict from deepsense import neptune -from utils import read_params, parameter_eval +from .utils import 
read_params, parameter_eval ctx = neptune.Context() -params = read_params(ctx) +params = read_params(ctx, fallback_file='../neptune.yaml') RANDOM_SEED = 90210 DEV_SAMPLE_SIZE = 1000 -BUREAU_BALANCE = params.bureau_balance_filepath -BUREAU = params.bureau_filepath -CREDIT_CARD_BALANCE = params.credit_card_balance_filepath -INSTALLMENTS_PAYMENTS = params.installments_payments_filepath -POS_CASH_BALANCE = params.POS_CASH_balance_filepath -PREVIOUS_APPLICATION = params.previous_application_filepath +ID_COLUMNS = ['SK_ID_CURR'] +TARGET_COLUMNS = ['TARGET'] -ID_COLUMN = 'SK_ID_CURR' -TARGET_COLUMN = 'TARGET' - -TIMESTAMP_COLUMNS = [] CATEGORICAL_COLUMNS = ['CODE_GENDER', 'EMERGENCYSTATE_MODE', 'FLAG_CONT_MOBILE', @@ -60,9 +52,9 @@ 'REG_REGION_NOT_WORK_REGION', 'WALLSMATERIAL_MODE', 'WEEKDAY_APPR_PROCESS_START'] + NUMERICAL_COLUMNS = ['AMT_ANNUITY', 'AMT_CREDIT', - 'AMT_GOODS_PRICE', 'AMT_INCOME_TOTAL', 'AMT_REQ_CREDIT_BUREAU_HOUR', 'AMT_REQ_CREDIT_BUREAU_DAY', @@ -71,14 +63,8 @@ 'AMT_REQ_CREDIT_BUREAU_QRT', 'AMT_REQ_CREDIT_BUREAU_YEAR', 'APARTMENTS_AVG', - 'APARTMENTS_MEDI', - 'APARTMENTS_MODE', 'BASEMENTAREA_AVG', - 'BASEMENTAREA_MEDI', - 'BASEMENTAREA_MODE', 'COMMONAREA_AVG', - 'COMMONAREA_MEDI', - 'COMMONAREA_MODE', 'CNT_CHILDREN', 'CNT_FAM_MEMBERS', 'DAYS_BIRTH', @@ -89,48 +75,25 @@ 'DEF_30_CNT_SOCIAL_CIRCLE', 'DEF_60_CNT_SOCIAL_CIRCLE', 'ELEVATORS_AVG', - 'ELEVATORS_MEDI', - 'ELEVATORS_MODE', 'ENTRANCES_AVG', - 'ENTRANCES_MEDI', - 'ENTRANCES_MODE', 'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'FLOORSMAX_AVG', - 'FLOORSMAX_MEDI', - 'FLOORSMAX_MODE', 'FLOORSMIN_AVG', - 'FLOORSMIN_MEDI', - 'FLOORSMIN_MODE', 'LANDAREA_AVG', - 'LANDAREA_MEDI', - 'LANDAREA_MODE', 'LIVINGAPARTMENTS_AVG', - 'LIVINGAPARTMENTS_MEDI', - 'LIVINGAPARTMENTS_MODE', 'LIVINGAREA_AVG', - 'LIVINGAREA_MEDI', - 'LIVINGAREA_MODE', 'NONLIVINGAPARTMENTS_AVG', - 'NONLIVINGAPARTMENTS_MEDI', - 'NONLIVINGAPARTMENTS_MODE', 'NONLIVINGAREA_AVG', - 'NONLIVINGAREA_MEDI', - 'NONLIVINGAREA_MODE', 'OBS_30_CNT_SOCIAL_CIRCLE', - 'OBS_60_CNT_SOCIAL_CIRCLE', 'OWN_CAR_AGE', 'REGION_POPULATION_RELATIVE', 'REGION_RATING_CLIENT', - 'REGION_RATING_CLIENT_W_CITY', 'TOTALAREA_MODE', 'YEARS_BEGINEXPLUATATION_AVG', - 'YEARS_BEGINEXPLUATATION_MEDI', - 'YEARS_BEGINEXPLUATATION_MODE', - 'YEARS_BUILD_AVG', - 'YEARS_BUILD_MEDI', - 'YEARS_BUILD_MODE'] + 'YEARS_BUILD_AVG'] + USELESS_COLUMNS = ['FLAG_DOCUMENT_10', 'FLAG_DOCUMENT_12', 'FLAG_DOCUMENT_13', @@ -143,46 +106,212 @@ 'FLAG_DOCUMENT_20', 'FLAG_DOCUMENT_21'] -AGGREGATION_RECIPIES = [] -for agg in ['mean', 'size', 'var', 'min', 'max']: - for select in NUMERICAL_COLUMNS: - for group in [['CODE_GENDER'], - ['CODE_GENDER', 'OCCUPATION_TYPE'], - ['CODE_GENDER', 'FLAG_OWN_REALTY'], - ['CODE_GENDER', 'ORGANIZATION_TYPE'], - ['CODE_GENDER', 'OCCUPATION_TYPE', 'ORGANIZATION_TYPE'], - ['FLAG_OWN_REALTY', 'NAME_HOUSING_TYPE'], - ['FLAG_OWN_REALTY', 'OCCUPATION_TYPE', 'ORGANIZATION_TYPE'], - ['OCCUPATION_TYPE', 'ORGANIZATION_TYPE'], - ]: - AGGREGATION_RECIPIES.append({'groupby': group, 'select': select, 'agg': agg}) +HIGHLY_CORRELATED_NUMERICAL_COLUMNS = ['AMT_GOODS_PRICE', + 'APARTMENTS_MEDI', + 'APARTMENTS_MODE', + 'BASEMENTAREA_MEDI', + 'BASEMENTAREA_MODE', + 'COMMONAREA_MEDI', + 'COMMONAREA_MODE', + 'ELEVATORS_MEDI', + 'ELEVATORS_MODE', + 'ENTRANCES_MEDI', + 'ENTRANCES_MODE', + 'FLAG_EMP_PHONE', + 'FLOORSMAX_MEDI', + 'FLOORSMAX_MODE', + 'FLOORSMIN_MEDI', + 'FLOORSMIN_MODE', + 'LANDAREA_MEDI', + 'LANDAREA_MODE', + 'LIVINGAPARTMENTS_MEDI', + 'LIVINGAPARTMENTS_MODE', + 'LIVINGAREA_MEDI', + 'LIVINGAREA_MODE', + 
'NONLIVINGAPARTMENTS_MEDI', + 'NONLIVINGAPARTMENTS_MODE', + 'NONLIVINGAREA_MEDI', + 'NONLIVINGAREA_MODE', + 'OBS_60_CNT_SOCIAL_CIRCLE', + 'REGION_RATING_CLIENT_W_CITY', + 'YEARS_BEGINEXPLUATATION_MEDI', + 'YEARS_BEGINEXPLUATATION_MODE', + 'YEARS_BUILD_MEDI', + 'YEARS_BUILD_MODE'] + +APPLICATION_AGGREGATION_RECIPIES = [ + (['CODE_GENDER', 'NAME_EDUCATION_TYPE'], [('AMT_ANNUITY', 'max'), + ('AMT_CREDIT', 'max'), + ('EXT_SOURCE_1', 'mean'), + ('EXT_SOURCE_2', 'mean'), + ('OWN_CAR_AGE', 'max'), + ('OWN_CAR_AGE', 'sum')]), + (['CODE_GENDER', 'ORGANIZATION_TYPE'], [('AMT_ANNUITY', 'mean'), + ('AMT_INCOME_TOTAL', 'mean'), + ('DAYS_REGISTRATION', 'mean'), + ('EXT_SOURCE_1', 'mean')]), + (['CODE_GENDER', 'REG_CITY_NOT_WORK_CITY'], [('AMT_ANNUITY', 'mean'), + ('CNT_CHILDREN', 'mean'), + ('DAYS_ID_PUBLISH', 'mean')]), + (['CODE_GENDER', 'NAME_EDUCATION_TYPE', 'OCCUPATION_TYPE', 'REG_CITY_NOT_WORK_CITY'], [('EXT_SOURCE_1', 'mean'), + ('EXT_SOURCE_2', 'mean')]), + (['NAME_EDUCATION_TYPE', 'OCCUPATION_TYPE'], [('AMT_CREDIT', 'mean'), + ('AMT_REQ_CREDIT_BUREAU_YEAR', 'mean'), + ('APARTMENTS_AVG', 'mean'), + ('BASEMENTAREA_AVG', 'mean'), + ('EXT_SOURCE_1', 'mean'), + ('EXT_SOURCE_2', 'mean'), + ('EXT_SOURCE_3', 'mean'), + ('NONLIVINGAREA_AVG', 'mean'), + ('OWN_CAR_AGE', 'mean'), + ('YEARS_BUILD_AVG', 'mean')]), + (['NAME_EDUCATION_TYPE', 'OCCUPATION_TYPE', 'REG_CITY_NOT_WORK_CITY'], [('ELEVATORS_AVG', 'mean'), + ('EXT_SOURCE_1', 'mean')]), + (['OCCUPATION_TYPE'], [('AMT_ANNUITY', 'mean'), + ('CNT_CHILDREN', 'mean'), + ('CNT_FAM_MEMBERS', 'mean'), + ('DAYS_BIRTH', 'mean'), + ('DAYS_EMPLOYED', 'mean'), + ('DAYS_ID_PUBLISH', 'mean'), + ('DAYS_REGISTRATION', 'mean'), + ('EXT_SOURCE_1', 'mean'), + ('EXT_SOURCE_2', 'mean'), + ('EXT_SOURCE_3', 'mean')]), +] + +BUREAU_AGGREGATION_RECIPIES = [('CREDIT_TYPE', 'count'), + ('CREDIT_ACTIVE', 'size') + ] +for agg in ['mean', 'min', 'max', 'sum', 'var']: + for select in ['AMT_ANNUITY', + 'AMT_CREDIT_SUM', + 'AMT_CREDIT_SUM_DEBT', + 'AMT_CREDIT_SUM_LIMIT', + 'AMT_CREDIT_SUM_OVERDUE', + 'AMT_CREDIT_MAX_OVERDUE', + 'CNT_CREDIT_PROLONG', + 'CREDIT_DAY_OVERDUE', + 'DAYS_CREDIT', + 'DAYS_CREDIT_ENDDATE', + 'DAYS_CREDIT_UPDATE' + ]: + BUREAU_AGGREGATION_RECIPIES.append((select, agg)) +BUREAU_AGGREGATION_RECIPIES = [(['SK_ID_CURR'], BUREAU_AGGREGATION_RECIPIES)] + +CREDIT_CARD_BALANCE_AGGREGATION_RECIPIES = [] +for agg in ['mean', 'min', 'max', 'sum', 'var']: + for select in ['AMT_BALANCE', + 'AMT_CREDIT_LIMIT_ACTUAL', + 'AMT_DRAWINGS_ATM_CURRENT', + 'AMT_DRAWINGS_CURRENT', + 'AMT_DRAWINGS_OTHER_CURRENT', + 'AMT_DRAWINGS_POS_CURRENT', + 'AMT_PAYMENT_CURRENT', + 'CNT_DRAWINGS_ATM_CURRENT', + 'CNT_DRAWINGS_CURRENT', + 'CNT_DRAWINGS_OTHER_CURRENT', + 'CNT_INSTALMENT_MATURE_CUM', + 'MONTHS_BALANCE', + 'SK_DPD', + 'SK_DPD_DEF' + ]: + CREDIT_CARD_BALANCE_AGGREGATION_RECIPIES.append((select, agg)) +CREDIT_CARD_BALANCE_AGGREGATION_RECIPIES = [(['SK_ID_CURR'], CREDIT_CARD_BALANCE_AGGREGATION_RECIPIES)] + +INSTALLMENTS_PAYMENTS_AGGREGATION_RECIPIES = [] +for agg in ['mean', 'min', 'max', 'sum', 'var']: + for select in ['AMT_INSTALMENT', + 'AMT_PAYMENT', + 'DAYS_ENTRY_PAYMENT', + 'DAYS_INSTALMENT', + 'NUM_INSTALMENT_NUMBER', + 'NUM_INSTALMENT_VERSION' + ]: + INSTALLMENTS_PAYMENTS_AGGREGATION_RECIPIES.append((select, agg)) +INSTALLMENTS_PAYMENTS_AGGREGATION_RECIPIES = [(['SK_ID_CURR'], INSTALLMENTS_PAYMENTS_AGGREGATION_RECIPIES)] + +POS_CASH_BALANCE_AGGREGATION_RECIPIES = [] +for agg in ['mean', 'min', 'max', 'sum', 'var']: + for select in ['MONTHS_BALANCE', + 'SK_DPD', + 'SK_DPD_DEF' + ]: 
+ POS_CASH_BALANCE_AGGREGATION_RECIPIES.append((select, agg)) +POS_CASH_BALANCE_AGGREGATION_RECIPIES = [(['SK_ID_CURR'], POS_CASH_BALANCE_AGGREGATION_RECIPIES)] + +PREVIOUS_APPLICATION_AGGREGATION_RECIPIES = [] +for agg in ['mean', 'min', 'max', 'sum', 'var']: + for select in ['AMT_ANNUITY', + 'AMT_APPLICATION', + 'AMT_CREDIT', + 'AMT_DOWN_PAYMENT', + 'AMT_GOODS_PRICE', + 'CNT_PAYMENT', + 'DAYS_DECISION', + 'HOUR_APPR_PROCESS_START', + 'RATE_DOWN_PAYMENT' + ]: + PREVIOUS_APPLICATION_AGGREGATION_RECIPIES.append((select, agg)) +PREVIOUS_APPLICATION_AGGREGATION_RECIPIES = [(['SK_ID_CURR'], PREVIOUS_APPLICATION_AGGREGATION_RECIPIES)] SOLUTION_CONFIG = AttrDict({ 'pipeline': {'experiment_directory': params.experiment_directory }, - 'preprocessing': {'fillna_value': params.fillna_value}, + 'preprocessing': {'impute_missing': {'fill_missing': params.fill_missing, + 'fill_value': params.fill_value}, + 'categorical_encoder': {'categorical_columns': CATEGORICAL_COLUMNS + }, + }, + + 'applications': {'columns': {'categorical_columns': CATEGORICAL_COLUMNS, + 'numerical_columns': NUMERICAL_COLUMNS + }, + 'aggregations': {'groupby_aggregations': APPLICATION_AGGREGATION_RECIPIES + } + }, + + 'bureau': {'table_name': 'bureau', + 'id_columns': ('SK_ID_CURR', 'SK_ID_CURR'), + 'groupby_aggregations': BUREAU_AGGREGATION_RECIPIES + }, + + 'credit_card_balance': {'table_name': 'credit_card_balance', + 'id_columns': ('SK_ID_CURR', 'SK_ID_CURR'), + 'groupby_aggregations': CREDIT_CARD_BALANCE_AGGREGATION_RECIPIES + }, - 'dataframe_by_type_splitter': {'numerical_columns': NUMERICAL_COLUMNS, - 'categorical_columns': CATEGORICAL_COLUMNS, - 'timestamp_columns': TIMESTAMP_COLUMNS, - }, + 'installments_payments': {'table_name': 'installments_payments', + 'id_columns': ('SK_ID_CURR', 'SK_ID_CURR'), + 'groupby_aggregations': INSTALLMENTS_PAYMENTS_AGGREGATION_RECIPIES + }, + + 'pos_cash_balance': {'table_name': 'POS_CASH_balance', + 'id_columns': ('SK_ID_CURR', 'SK_ID_CURR'), + 'groupby_aggregations': POS_CASH_BALANCE_AGGREGATION_RECIPIES + }, + + 'previous_applications': {'table_name': 'previous_application', + 'id_columns': ('SK_ID_CURR', 'SK_ID_CURR'), + 'groupby_aggregations': PREVIOUS_APPLICATION_AGGREGATION_RECIPIES + }, 'light_gbm': {'device': parameter_eval(params.lgbm__device), 'boosting_type': parameter_eval(params.lgbm__boosting_type), 'objective': parameter_eval(params.lgbm__objective), 'metric': parameter_eval(params.lgbm__metric), + 'scale_pos_weight': parameter_eval(params.lgbm__scale_pos_weight), 'learning_rate': parameter_eval(params.lgbm__learning_rate), + 'max_bin': parameter_eval(params.lgbm__max_bin), 'max_depth': parameter_eval(params.lgbm__max_depth), + 'num_leaves': parameter_eval(params.lgbm__num_leaves), + 'min_child_samples': parameter_eval(params.lgbm__min_child_samples), 'subsample': parameter_eval(params.lgbm__subsample), 'colsample_bytree': parameter_eval(params.lgbm__colsample_bytree), - 'min_child_weight': parameter_eval(params.lgbm__min_child_weight), + 'subsample_freq': parameter_eval(params.lgbm__subsample_freq), + 'min_gain_to_split': parameter_eval(params.lgbm__min_gain_to_split), 'reg_lambda': parameter_eval(params.lgbm__reg_lambda), 'reg_alpha': parameter_eval(params.lgbm__reg_alpha), - 'subsample_freq': parameter_eval(params.lgbm__subsample_freq), - 'max_bin': parameter_eval(params.lgbm__max_bin), - 'min_child_samples': parameter_eval(params.lgbm__min_child_samples), - 'num_leaves': parameter_eval(params.lgbm__num_leaves), 'nthread': parameter_eval(params.num_workers), 
'number_boosting_rounds': parameter_eval(params.lgbm__number_boosting_rounds), 'early_stopping_rounds': parameter_eval(params.lgbm__early_stopping_rounds), @@ -282,21 +411,4 @@ }, }, - 'bureau': {'filepath': BUREAU, - 'id_columns': ('SK_ID_CURR', 'SK_ID_CURR'), - 'groupby_aggregations': [ - {'groupby': ['SK_ID_CURR'], 'select': 'DAYS_CREDIT', 'agg': 'count'}, # 1 - {'groupby': ['SK_ID_CURR'], 'select': 'CREDIT_TYPE', 'agg': 'nunique'}, # 2 - {'groupby': ['SK_ID_CURR'], 'select': 'CNT_CREDIT_PROLONG', 'agg': 'mean'}, # 10 - {'groupby': ['SK_ID_CURR'], 'select': 'CREDIT_DAY_OVERDUE', 'agg': 'count'}, - {'groupby': ['SK_ID_CURR'], 'select': 'CREDIT_ACTIVE', 'agg': 'size'}, - {'groupby': ['SK_ID_CURR'], 'select': 'AMT_CREDIT_SUM', 'agg': 'count'}, - ]}, - - 'clipper': {'min_val': 0, - 'max_val': 1 - }, - - 'groupby_aggregation': {'groupby_aggregations': AGGREGATION_RECIPIES - }, }) diff --git a/src/pipeline_manager.py b/src/pipeline_manager.py new file mode 100644 index 0000000..7e405a4 --- /dev/null +++ b/src/pipeline_manager.py @@ -0,0 +1,402 @@ +import os +import shutil + +from attrdict import AttrDict +import numpy as np +import pandas as pd +from scipy.stats import gmean +from deepsense import neptune +from sklearn.metrics import roc_auc_score +from sklearn.model_selection import train_test_split, KFold, StratifiedKFold + +from . import pipeline_config as cfg +from .pipelines import PIPELINES +from .utils import init_logger, read_params, set_seed, create_submission, verify_submission, calculate_rank + +set_seed(cfg.RANDOM_SEED) +logger = init_logger() +ctx = neptune.Context() +params = read_params(ctx, fallback_file='neptune.yaml') + + +class PipelineManager(): + def train(self, pipeline_name, dev_mode): + train(pipeline_name, dev_mode) + + def evaluate(self, pipeline_name, dev_mode, ): + evaluate(pipeline_name, dev_mode) + + def predict(self, pipeline_name, dev_mode, submit_predictions): + predict(pipeline_name, dev_mode, submit_predictions) + + def train_evaluate_cv(self, pipeline_name, dev_mode): + train_evaluate_cv(pipeline_name, dev_mode) + + def train_evaluate_predict_cv(self, pipeline_name, dev_mode, submit_predictions): + train_evaluate_predict_cv(pipeline_name, dev_mode, submit_predictions) + + +def train(pipeline_name, dev_mode): + logger.info('TRAINING') + if bool(params.clean_experiment_directory_before_training) and os.path.isdir(params.experiment_directory): + logger.info('Cleaning experiment_directory...') + shutil.rmtree(params.experiment_directory) + + tables = _read_data(dev_mode, read_train=True, read_test=False) + + logger.info('Shuffling and splitting into train and test...') + train_data_split, valid_data_split = train_test_split(tables.application_train, + test_size=params.validation_size, + random_state=cfg.RANDOM_SEED, + shuffle=params.shuffle) + + logger.info('Target mean in train: {}'.format(train_data_split[cfg.TARGET_COLUMNS].mean())) + logger.info('Target mean in valid: {}'.format(valid_data_split[cfg.TARGET_COLUMNS].mean())) + logger.info('Train shape: {}'.format(train_data_split.shape)) + logger.info('Valid shape: {}'.format(valid_data_split.shape)) + + train_data = {'application': {'X': train_data_split.drop(cfg.TARGET_COLUMNS, axis=1), + 'y': train_data_split[cfg.TARGET_COLUMNS].values.reshape(-1), + 'X_valid': valid_data_split.drop(cfg.TARGET_COLUMNS, axis=1), + 'y_valid': valid_data_split[cfg.TARGET_COLUMNS].values.reshape(-1) + }, + 'bureau_balance': {'X': tables.bureau_balance}, + 'bureau': {'X': tables.bureau}, + 'credit_card_balance': {'X': 
tables.credit_card_balance}, + 'installments_payments': {'X': tables.installments_payments}, + 'pos_cash_balance': {'X': tables.pos_cash_balance}, + 'previous_application': {'X': tables.previous_application}, + } + + pipeline = PIPELINES[pipeline_name](config=cfg.SOLUTION_CONFIG, train_mode=True) + pipeline.clean_cache() + logger.info('Start pipeline fit and transform') + pipeline.fit_transform(train_data) + pipeline.clean_cache() + + +def evaluate(pipeline_name, dev_mode): + logger.info('EVALUATION') + logger.info('Reading data...') + + tables = _read_data(dev_mode, read_train=True, read_test=False) + + logger.info('Shuffling and splitting to get validation split...') + _, valid_data_split = train_test_split(tables.application_train, + test_size=params.validation_size, + random_state=cfg.RANDOM_SEED, + shuffle=params.shuffle) + + logger.info('Target mean in valid: {}'.format(valid_data_split[cfg.TARGET_COLUMNS].mean())) + logger.info('Valid shape: {}'.format(valid_data_split.shape)) + + y_true = valid_data_split[cfg.TARGET_COLUMNS].values + + eval_data = {'application': {'X': valid_data_split.drop(cfg.TARGET_COLUMNS, axis=1), + 'y': None, + }, + 'bureau_balance': {'X': tables.bureau_balance}, + 'bureau': {'X': tables.bureau}, + 'credit_card_balance': {'X': tables.credit_card_balance}, + 'installments_payments': {'X': tables.installments_payments}, + 'pos_cash_balance': {'X': tables.pos_cash_balance}, + 'previous_application': {'X': tables.previous_application}, + } + + pipeline = PIPELINES[pipeline_name](config=cfg.SOLUTION_CONFIG, train_mode=False) + pipeline.clean_cache() + logger.info('Start pipeline transform') + output = pipeline.transform(eval_data) + pipeline.clean_cache() + + y_pred = output['prediction'] + + logger.info('Calculating ROC_AUC on validation set') + score = roc_auc_score(y_true, y_pred) + logger.info('ROC_AUC score on validation is {}'.format(score)) + ctx.channel_send('ROC_AUC', 0, score) + + +def predict(pipeline_name, dev_mode, submit_predictions): + logger.info('PREDICTION') + + tables = _read_data(dev_mode, read_train=False, read_test=True) + + test_data = {'application': {'X': tables.application_test, + 'y': None, + }, + 'bureau_balance': {'X': tables.bureau_balance}, + 'bureau': {'X': tables.bureau}, + 'credit_card_balance': {'X': tables.credit_card_balance}, + 'installments_payments': {'X': tables.installments_payments}, + 'pos_cash_balance': {'X': tables.pos_cash_balance}, + 'previous_application': {'X': tables.previous_application}, + } + + pipeline = PIPELINES[pipeline_name](config=cfg.SOLUTION_CONFIG, train_mode=False) + + pipeline.clean_cache() + logger.info('Start pipeline transform') + output = pipeline.transform(test_data) + pipeline.clean_cache() + y_pred = output['prediction'] + + if not dev_mode: + logger.info('creating submission file...') + submission = create_submission(tables.application_test, y_pred) + + logger.info('verifying submission...') + sample_submission = pd.read_csv(params.sample_submission_filepath) + verify_submission(submission, sample_submission) + + submission_filepath = os.path.join(params.experiment_directory, 'submission.csv') + submission.to_csv(submission_filepath, index=None, encoding='utf-8') + logger.info('submission persisted to {}'.format(submission_filepath)) + logger.info('submission head \n\n{}'.format(submission.head())) + + if submit_predictions and params.kaggle_api: + make_submission(submission_filepath) + + +def train_evaluate_cv(pipeline_name, dev_mode): + if 
bool(params.clean_experiment_directory_before_training) and os.path.isdir(params.experiment_directory): + logger.info('Cleaning experiment_directory...') + shutil.rmtree(params.experiment_directory) + + tables = _read_data(dev_mode, read_train=True, read_test=False) + + target_values = tables.application_train[cfg.TARGET_COLUMNS].values.reshape(-1) + fold_generator = _get_fold_generator(target_values) + + fold_scores = [] + for fold_id, (train_idx, valid_idx) in enumerate(fold_generator): + (train_data_split, + valid_data_split) = tables.application_train.iloc[train_idx], tables.application_train.iloc[valid_idx] + + logger.info('Started fold {}'.format(fold_id)) + logger.info('Target mean in train: {}'.format(train_data_split[cfg.TARGET_COLUMNS].mean())) + logger.info('Target mean in valid: {}'.format(valid_data_split[cfg.TARGET_COLUMNS].mean())) + logger.info('Train shape: {}'.format(train_data_split.shape)) + logger.info('Valid shape: {}'.format(valid_data_split.shape)) + + score, _, _ = _fold_fit_evaluate_loop(train_data_split, valid_data_split, tables, fold_id, pipeline_name) + + logger.info('Fold {} ROC_AUC {}'.format(fold_id, score)) + ctx.channel_send('Fold {} ROC_AUC'.format(fold_id), 0, score) + + fold_scores.append(score) + + score_mean, score_std = np.mean(fold_scores), np.std(fold_scores) + + logger.info('ROC_AUC mean {}, ROC_AUC std {}'.format(score_mean, score_std)) + ctx.channel_send('ROC_AUC', 0, score_mean) + ctx.channel_send('ROC_AUC STD', 0, score_std) + + +def train_evaluate_predict_cv(pipeline_name, dev_mode, submit_predictions): + if bool(params.clean_experiment_directory_before_training) and os.path.isdir(params.experiment_directory): + logger.info('Cleaning experiment_directory...') + shutil.rmtree(params.experiment_directory) + + tables = _read_data(dev_mode, read_train=True, read_test=True) + + target_values = tables.application_train[cfg.TARGET_COLUMNS].values.reshape(-1) + fold_generator = _get_fold_generator(target_values) + + fold_scores, out_of_fold_train_predictions, out_of_fold_test_predictions = [], [], [] + for fold_id, (train_idx, valid_idx) in enumerate(fold_generator): + (train_data_split, + valid_data_split) = tables.application_train.iloc[train_idx], tables.application_train.iloc[valid_idx] + + logger.info('Started fold {}'.format(fold_id)) + logger.info('Target mean in train: {}'.format(train_data_split[cfg.TARGET_COLUMNS].mean())) + logger.info('Target mean in valid: {}'.format(valid_data_split[cfg.TARGET_COLUMNS].mean())) + logger.info('Train shape: {}'.format(train_data_split.shape)) + logger.info('Valid shape: {}'.format(valid_data_split.shape)) + + score, out_of_fold_prediction, test_prediction = _fold_fit_evaluate_predict_loop(train_data_split, + valid_data_split, + tables, + fold_id, pipeline_name) + + logger.info('Fold {} ROC_AUC {}'.format(fold_id, score)) + ctx.channel_send('Fold {} ROC_AUC'.format(fold_id), 0, score) + + out_of_fold_train_predictions.append(out_of_fold_prediction) + out_of_fold_test_predictions.append(test_prediction) + fold_scores.append(score) + + out_of_fold_train_predictions = pd.concat(out_of_fold_train_predictions, axis=0) + out_of_fold_test_predictions = pd.concat(out_of_fold_test_predictions, axis=0) + + test_prediction_aggregated = _aggregate_test_prediction(out_of_fold_test_predictions) + score_mean, score_std = np.mean(fold_scores), np.std(fold_scores) + + logger.info('ROC_AUC mean {}, ROC_AUC std {}'.format(score_mean, score_std)) + ctx.channel_send('ROC_AUC', 0, score_mean) + ctx.channel_send('ROC_AUC STD', 
0, score_std) + + logger.info('Saving predictions') + out_of_fold_train_predictions.to_csv(os.path.join(params.experiment_directory, + '{}_out_of_fold_train_predictions.csv'.format(pipeline_name)), + index=None) + out_of_fold_test_predictions.to_csv(os.path.join(params.experiment_directory, + '{}_out_of_fold_test_predictions.csv'.format(pipeline_name)), + index=None) + test_aggregated_file_path = os.path.join(params.experiment_directory, + '{}_test_predictions_{}.csv'.format(pipeline_name, + params.aggregation_method)) + test_prediction_aggregated.to_csv(test_aggregated_file_path, index=None) + + if not dev_mode: + logger.info('verifying submission...') + sample_submission = pd.read_csv(params.sample_submission_filepath) + verify_submission(test_prediction_aggregated, sample_submission) + + if submit_predictions and params.kaggle_api: + make_submission(test_aggregated_file_path) + + +def make_submission(submission_filepath): + logger.info('making Kaggle submit...') + os.system('kaggle competitions submit -c home-credit-default-risk -f {} -m {}' + .format(submission_filepath, params.kaggle_message)) + + +def _read_data(dev_mode, read_train=True, read_test=False): + logger.info('Reading data...') + if dev_mode: + nrows = cfg.DEV_SAMPLE_SIZE + logger.info('running in "dev-mode". Sample size is: {}'.format(cfg.DEV_SAMPLE_SIZE)) + else: + nrows = None + + raw_data = {} + + if read_train: + raw_data['application_train'] = pd.read_csv(params.train_filepath, nrows=nrows) + + if read_test: + raw_data['application_test'] = pd.read_csv(params.test_filepath, nrows=nrows) + + raw_data['bureau'] = pd.read_csv(params.bureau_filepath, nrows=nrows) + raw_data['credit_card_balance'] = pd.read_csv(params.credit_card_balance_filepath, nrows=nrows) + raw_data['installments_payments'] = pd.read_csv(params.installments_payments_filepath, nrows=nrows) + raw_data['pos_cash_balance'] = pd.read_csv(params.POS_CASH_balance_filepath, nrows=nrows) + raw_data['previous_application'] = pd.read_csv(params.previous_application_filepath, nrows=nrows) + raw_data['bureau_balance'] = pd.read_csv(params.bureau_balance_filepath, nrows=nrows) + + return AttrDict(raw_data) + + +def _get_fold_generator(target_values): + if params.stratified_cv: + cv = StratifiedKFold(n_splits=params.n_cv_splits, shuffle=True, random_state=cfg.RANDOM_SEED) + cv.get_n_splits(target_values) + fold_generator = cv.split(target_values, target_values) + else: + cv = KFold(n_splits=params.n_cv_splits, shuffle=True, random_state=cfg.RANDOM_SEED) + fold_generator = cv.split(target_values) + return fold_generator + + +def _fold_fit_evaluate_predict_loop(train_data_split, valid_data_split, tables, fold_id, pipeline_name): + score, y_valid_pred, pipeline = _fold_fit_evaluate_loop(train_data_split, valid_data_split, tables, + fold_id, pipeline_name) + + test_data = {'application': {'X': tables.application_test, + 'y': None, + }, + 'bureau_balance': {'X': tables.bureau_balance}, + 'bureau': {'X': tables.bureau}, + 'credit_card_balance': {'X': tables.credit_card_balance}, + 'installments_payments': {'X': tables.installments_payments}, + 'pos_cash_balance': {'X': tables.pos_cash_balance}, + 'previous_application': {'X': tables.previous_application}, + } + + logger.info('Start pipeline transform on test') + pipeline.clean_cache() + output_test = pipeline.transform(test_data) + pipeline.clean_cache() + y_test_pred = output_test['prediction'] + + train_out_of_fold_prediction_chunk = valid_data_split[cfg.ID_COLUMNS] + train_out_of_fold_prediction_chunk['fold_id'] = 
fold_id + train_out_of_fold_prediction_chunk['{}_prediction'.format(pipeline_name)] = y_valid_pred + + test_out_of_fold_prediction_chunk = tables.application_test[cfg.ID_COLUMNS] + test_out_of_fold_prediction_chunk['fold_id'] = fold_id + test_out_of_fold_prediction_chunk['{}_prediction'.format(pipeline_name)] = y_test_pred + + return score, train_out_of_fold_prediction_chunk, test_out_of_fold_prediction_chunk + + +def _fold_fit_evaluate_loop(train_data_split, valid_data_split, tables, fold_id, pipeline_name): + train_data = {'application': {'X': train_data_split.drop(cfg.TARGET_COLUMNS, axis=1), + 'y': train_data_split[cfg.TARGET_COLUMNS].values.reshape(-1), + 'X_valid': valid_data_split.drop(cfg.TARGET_COLUMNS, axis=1), + 'y_valid': valid_data_split[cfg.TARGET_COLUMNS].values.reshape(-1), + }, + 'bureau_balance': {'X': tables.bureau_balance}, + 'bureau': {'X': tables.bureau}, + 'credit_card_balance': {'X': tables.credit_card_balance}, + 'installments_payments': {'X': tables.installments_payments}, + 'pos_cash_balance': {'X': tables.pos_cash_balance}, + 'previous_application': {'X': tables.previous_application}, + } + + valid_data = {'application': {'X': valid_data_split.drop(cfg.TARGET_COLUMNS, axis=1), + 'y': None, + }, + 'bureau_balance': {'X': tables.bureau_balance}, + 'bureau': {'X': tables.bureau}, + 'credit_card_balance': {'X': tables.credit_card_balance}, + 'installments_payments': {'X': tables.installments_payments}, + 'pos_cash_balance': {'X': tables.pos_cash_balance}, + 'previous_application': {'X': tables.previous_application}, + } + + pipeline = PIPELINES[pipeline_name](config=cfg.SOLUTION_CONFIG, train_mode=True, + suffix='_fold_{}'.format(fold_id)) + + logger.info('Start pipeline fit and transform on train') + pipeline.clean_cache() + pipeline.fit_transform(train_data) + pipeline.clean_cache() + + pipeline = PIPELINES[pipeline_name](config=cfg.SOLUTION_CONFIG, train_mode=False, + suffix='_fold_{}'.format(fold_id)) + logger.info('Start pipeline transform on valid') + pipeline.clean_cache() + output_valid = pipeline.transform(valid_data) + pipeline.clean_cache() + + y_valid_pred = output_valid['prediction'] + y_valid_true = valid_data_split[cfg.TARGET_COLUMNS].values + score = roc_auc_score(y_valid_true, y_valid_pred) + + return score, y_valid_pred, pipeline + + +def _aggregate_test_prediction(out_of_fold_test_predictions): + agg_methods = {'mean': np.mean, + 'gmean': gmean} + prediction_column = [col for col in out_of_fold_test_predictions.columns if '_prediction' in col][0] + if params.aggregation_method == 'rank_mean': + rank_column = prediction_column.replace('_prediction', '_rank') + test_predictions_with_ranks = [] + for fold_id, fold_df in out_of_fold_test_predictions.groupby('fold_id'): + fold_df[rank_column] = calculate_rank(fold_df[prediction_column]) + test_predictions_with_ranks.append(fold_df) + test_predictions_with_ranks = pd.concat(test_predictions_with_ranks, axis=0) + + test_prediction_aggregated = test_predictions_with_ranks.groupby(cfg.ID_COLUMNS)[rank_column].apply( + np.mean).reset_index() + else: + test_prediction_aggregated = out_of_fold_test_predictions.groupby(cfg.ID_COLUMNS)[prediction_column].apply( + agg_methods[params.aggregation_method]).reset_index() + + test_prediction_aggregated.columns = [cfg.ID_COLUMNS + cfg.TARGET_COLUMNS] + + return test_prediction_aggregated diff --git a/pipelines.py b/src/pipelines.py similarity index 71% rename from pipelines.py rename to src/pipelines.py index d2df898..2fc94e3 100644 --- a/pipelines.py +++ 
b/src/pipelines.py @@ -3,107 +3,91 @@ from sklearn.ensemble import RandomForestClassifier from sklearn.linear_model import LogisticRegression from sklearn.svm import SVC -from steppy.adapter import Adapter, E -from steppy.base import Step -from pipeline_blocks import feature_extraction, classifier_light_gbm, preprocessing_fillna, classifier_sklearn, \ +from .pipeline_blocks import feature_extraction, classifier_light_gbm, preprocessing_fillna, classifier_sklearn, \ classifier_xgb -from postprocessing import Clipper -def lightGBM(config, train_mode): +def lightGBM(config, train_mode, suffix=''): if train_mode: features, features_valid = feature_extraction(config, train_mode, - persist_output=True, - cache_output=True, - load_persisted_output=True) + suffix, + persist_output=False, + cache_output=False, + load_persisted_output=False) light_gbm = classifier_light_gbm((features, features_valid), config, - train_mode) + train_mode, suffix) else: features = feature_extraction(config, train_mode, - cache_output=True) + suffix, + cache_output=False) light_gbm = classifier_light_gbm(features, config, - train_mode) + train_mode, suffix) - clipper = Step(name='clipper', - transformer=Clipper(**config.clipper), - input_steps=[light_gbm], - adapter=Adapter({'prediction': E(light_gbm.name, 'prediction')}), - experiment_directory=config.pipeline.experiment_directory) + return light_gbm - return clipper - -def xgboost(config, train_mode): +def xgboost(config, train_mode, suffix=''): if train_mode: features, features_valid = feature_extraction(config, train_mode, + suffix, persist_output=True, cache_output=True, load_persisted_output=True) xgb = classifier_xgb((features, features_valid), config, - train_mode) + train_mode, + suffix) else: features = feature_extraction(config, train_mode, + suffix, cache_output=True) xgb = classifier_xgb(features, config, - train_mode) - - clipper = Step(name='clipper', - transformer=Clipper(**config.clipper), - input_steps=[xgb], - adapter=Adapter({'prediction': E(xgb.name, 'prediction')}), - experiment_directory=config.pipeline.experiment_directory) + train_mode, + suffix) - return clipper + return xgb -def sklearn_main(config, ClassifierClass, clf_name, train_mode, normalize=False): +def sklearn_main(config, ClassifierClass, clf_name, train_mode, suffix='', normalize=False): model_params = getattr(config, clf_name) random_search_config = getattr(config.random_search, clf_name) full_config = (config, model_params, random_search_config) if train_mode: features, features_valid = feature_extraction(config, train_mode, + suffix, persist_output=True, cache_output=True, load_persisted_output=True) - sklearn_preproc = preprocessing_fillna((features, features_valid), config, train_mode) + sklearn_preproc = preprocessing_fillna((features, features_valid), config, train_mode, suffix) else: features = feature_extraction(config, train_mode, + suffix, cache_output=True) - sklearn_preproc = preprocessing_fillna(features, config, train_mode) + sklearn_preproc = preprocessing_fillna(features, config, train_mode, suffix) sklearn_clf = classifier_sklearn(sklearn_preproc, ClassifierClass, full_config, clf_name, train_mode, + suffix, normalize) + return sklearn_clf - clipper = Step(name='clipper', - transformer=Clipper(**config.clipper), - input_steps=[sklearn_clf], - adapter=Adapter({'prediction': E(sklearn_clf.name, 'predicted')}), - experiment_directory=config.pipeline.experiment_directory) - return clipper - -PIPELINES = {'lightGBM': {'train': partial(lightGBM, train_mode=True), - 
'inference': partial(lightGBM, train_mode=False) - }, - 'XGBoost': {'train': partial(xgboost, train_mode=True), - 'inference': partial(xgboost, train_mode=False) - }, +PIPELINES = {'lightGBM': lightGBM, + 'XGBoost': xgboost, 'random_forest': {'train': partial(sklearn_main, ClassifierClass=RandomForestClassifier, clf_name='random_forest', diff --git a/utils.py b/src/utils.py similarity index 80% rename from utils.py rename to src/utils.py index b1eb89c..bdbbbab 100644 --- a/utils.py +++ b/src/utils.py @@ -7,7 +7,6 @@ import pandas as pd import yaml from attrdict import AttrDict -from steppy.base import BaseTransformer def create_submission(meta, predictions): @@ -18,7 +17,6 @@ def create_submission(meta, predictions): def verify_submission(submission, sample_submission): - assert submission.shape == sample_submission.shape, \ 'Expected submission to have shape {} but got {}'.format(sample_submission.shape, submission.shape) @@ -49,12 +47,9 @@ def init_logger(): return logger -def read_params(ctx): +def read_params(ctx, fallback_file): if ctx.params.__class__.__name__ == 'OfflineContextParams': - try: - neptune_config = read_yaml('neptune.yaml') - except FileNotFoundError: - neptune_config = read_yaml('../neptune.yaml') + neptune_config = read_yaml(fallback_file) params = neptune_config.parameters else: params = ctx.params @@ -86,17 +81,6 @@ def set_seed(seed=90210): random.seed(seed) np.random.seed(seed) - -class ToNumpyLabel(BaseTransformer): - def __init__(self, **kwargs): - super().__init__() - self.y = None - - def fit(self, y, **kwargs): - self.y = y[0].values.reshape(-1) - return self - - def transform(self, **kwargs): - if self.y.any(): - return {'y': self.y} - return {} +def calculate_rank(predictions): + rank = (1 + predictions.rank().values) / (predictions.shape[0] + 1) + return rank \ No newline at end of file
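
For reference, below is a minimal, self-contained sketch (not part of the diff above) of the rank-mean aggregation implemented by calculate_rank in src/utils.py and _aggregate_test_prediction in src/pipeline_manager.py: each fold's test predictions are converted to ranks in (0, 1) and the ranks are then averaged per SK_ID_CURR. The toy values and the 'lightGBM_prediction' column name are illustrative only.

import pandas as pd

def calculate_rank(predictions):
    # Same formula as src/utils.py: average rank scaled into the open interval (0, 1).
    return (1 + predictions.rank().values) / (predictions.shape[0] + 1)

# Toy out-of-fold test predictions: two folds, three applications each.
out_of_fold = pd.DataFrame({
    'SK_ID_CURR': [1, 2, 3, 1, 2, 3],
    'fold_id': [0, 0, 0, 1, 1, 1],
    'lightGBM_prediction': [0.10, 0.80, 0.40, 0.15, 0.70, 0.50],
})

# Rank within each fold, then average the ranks per application id.
ranked_folds = []
for fold_id, fold_df in out_of_fold.groupby('fold_id'):
    fold_df = fold_df.copy()
    fold_df['lightGBM_rank'] = calculate_rank(fold_df['lightGBM_prediction'])
    ranked_folds.append(fold_df)
ranked = pd.concat(ranked_folds, axis=0)

aggregated = ranked.groupby('SK_ID_CURR')['lightGBM_rank'].mean().reset_index()
aggregated.columns = ['SK_ID_CURR', 'TARGET']
print(aggregated)

The 'mean' and 'gmean' aggregation methods in _aggregate_test_prediction skip the ranking step and apply np.mean or scipy.stats.gmean directly to the prediction column.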