diff --git a/feature_extraction.py b/feature_extraction.py deleted file mode 100644 index ddce1e9..0000000 --- a/feature_extraction.py +++ /dev/null @@ -1,142 +0,0 @@ -import os - -import category_encoders as ce -import numpy as np -import pandas as pd -from sklearn.externals import joblib -from steppy.base import BaseTransformer -from steppy.utils import get_logger - -logger = get_logger() - - -class DataFrameByTypeSplitter(BaseTransformer): - def __init__(self, numerical_columns, categorical_columns, timestamp_columns): - super().__init__() - self.numerical_columns = numerical_columns - self.categorical_columns = categorical_columns - self.timestamp_columns = timestamp_columns - - def transform(self, X, y=None, **kwargs): - outputs = {} - - if self.numerical_columns is not None: - outputs['numerical_features'] = X[self.numerical_columns] - - if self.categorical_columns is not None: - outputs['categorical_features'] = X[self.categorical_columns] - - if self.timestamp_columns is not None: - outputs['timestamp_features'] = X[self.timestamp_columns] - - return outputs - - -class FeatureJoiner(BaseTransformer): - def transform(self, numerical_feature_list, categorical_feature_list, **kwargs): - features = numerical_feature_list + categorical_feature_list - for feature in features: - feature.reset_index(drop=True, inplace=True) - outputs = dict() - outputs['features'] = pd.concat(features, axis=1).astype(np.float32) - outputs['feature_names'] = self._get_feature_names(features) - outputs['categorical_features'] = self._get_feature_names(categorical_feature_list) - return outputs - - def _get_feature_names(self, dataframes): - feature_names = [] - for dataframe in dataframes: - try: - feature_names.extend(list(dataframe.columns)) - except Exception as e: - print(e) - feature_names.append(dataframe.name) - - return feature_names - - -class CategoricalEncoder(BaseTransformer): - def __init__(self, **kwargs): - super().__init__() - self.params = kwargs - self.encoder_class = ce.OrdinalEncoder - self.categorical_encoder = None - - def fit(self, X, y, **kwargs): - categorical_columns = list(X.columns) - self.categorical_encoder = self.encoder_class(cols=categorical_columns, **self.params) - self.categorical_encoder.fit(X, y) - return self - - def transform(self, X, y=None, **kwargs): - X_ = self.categorical_encoder.transform(X) - return {'categorical_features': X_} - - def load(self, filepath): - self.categorical_encoder = joblib.load(filepath) - return self - - def persist(self, filepath): - joblib.dump(self.categorical_encoder, filepath) - - -class GroupbyAggregations(BaseTransformer): - def __init__(self, groupby_aggregations): - super().__init__() - self.groupby_aggregations = groupby_aggregations - - @property - def groupby_aggregations_names(self): - groupby_aggregations_names = ['{}_{}_{}'.format('_'.join(spec['groupby']), - spec['agg'], - spec['select']) - for spec in self.groupby_aggregations] - return groupby_aggregations_names - - def transform(self, categorical_features, numerical_features): - X = pd.concat([categorical_features, numerical_features], axis=1) - for spec, groupby_aggregations_name in zip(self.groupby_aggregations, self.groupby_aggregations_names): - group_object = X.groupby(spec['groupby']) - X = X.merge(group_object[spec['select']] - .agg(spec['agg']) - .reset_index() - .rename(index=str, - columns={spec['select']: groupby_aggregations_name}) - [spec['groupby'] + [groupby_aggregations_name]], - on=spec['groupby'], - how='left') - - return {'numerical_features': 
X[self.groupby_aggregations_names].astype(np.float32)} - - -class GroupbyAggregationFromFile(BaseTransformer): - def __init__(self, filepath, id_columns, groupby_aggregations): - super().__init__() - self.filename = os.path.basename(filepath).split('.')[0] - self.file = pd.read_csv(filepath) - self.id_columns = id_columns - self.groupby_aggregations = groupby_aggregations - - @ property - def groupby_aggregations_names(self): - groupby_aggregations_names = ['{}_{}_{}_{}'.format(self.filename, - '_'.join(spec['groupby']), - spec['agg'], - spec['select']) - for spec in self.groupby_aggregations] - return groupby_aggregations_names - - def transform(self, X): - for spec, groupby_aggregations_name in zip(self.groupby_aggregations, self.groupby_aggregations_names): - group_object = self.file.groupby(spec['groupby']) - X = X.merge(group_object[spec['select']] - .agg(spec['agg']) - .reset_index() - .rename(index=str, - columns={spec['select']: groupby_aggregations_name}) - [spec['groupby'] + [groupby_aggregations_name]], - left_on=self.id_columns[0], - right_on=self.id_columns[1], - how='left') - - return {'numerical_features': X[self.groupby_aggregations_names].astype(np.float32)} diff --git a/main.py b/main.py index e505239..c379c5b 100644 --- a/main.py +++ b/main.py @@ -1,196 +1,77 @@ -import os -import shutil - import click -import pandas as pd -from deepsense import neptune -from sklearn.metrics import roc_auc_score -from sklearn.model_selection import train_test_split - -import pipeline_config as cfg -from pipelines import PIPELINES -from utils import create_submission, init_logger, read_params, persist_evaluation_predictions, \ - set_seed, verify_submission +from src.pipeline_manager import PipelineManager -set_seed() -logger = init_logger() -ctx = neptune.Context() -params = read_params(ctx) +pipeline_manager = PipelineManager() @click.group() -def action(): +def main(): pass -@action.command() +@main.command() @click.option('-p', '--pipeline_name', help='pipeline to be trained', required=True) @click.option('-d', '--dev_mode', help='if true only a small sample of data will be used', is_flag=True, required=False) def train(pipeline_name, dev_mode): - _train(pipeline_name, dev_mode) + pipeline_manager.train(pipeline_name, dev_mode) -@action.command() +@main.command() @click.option('-p', '--pipeline_name', help='pipeline to be trained', required=True) @click.option('-d', '--dev_mode', help='if true only a small sample of data will be used', is_flag=True, required=False) def evaluate(pipeline_name, dev_mode): - _evaluate(pipeline_name, dev_mode) + pipeline_manager.evaluate(pipeline_name, dev_mode) -@action.command() +@main.command() @click.option('-p', '--pipeline_name', help='pipeline to be trained', required=True) @click.option('-d', '--dev_mode', help='if true only a small sample of data will be used', is_flag=True, required=False) -def predict(pipeline_name, dev_mode): - _predict(pipeline_name, dev_mode) +@click.option('-s', '--submit_predictions', help='submit predictions if true', is_flag=True, required=False) +def predict(pipeline_name, dev_mode, submit_predictions): + pipeline_manager.predict(pipeline_name, dev_mode, submit_predictions) -@action.command() +@main.command() @click.option('-p', '--pipeline_name', help='pipeline to be trained', required=True) +@click.option('-s', '--submit_predictions', help='submit predictions if true', is_flag=True, required=False) @click.option('-d', '--dev_mode', help='if true only a small sample of data will be used', is_flag=True, required=False) 
-def train_evaluate_predict(pipeline_name, dev_mode): - _train(pipeline_name, dev_mode) - _evaluate(pipeline_name, dev_mode) - _predict(pipeline_name, dev_mode) +def train_evaluate_predict(pipeline_name, submit_predictions, dev_mode): + pipeline_manager.train(pipeline_name, dev_mode) + pipeline_manager.evaluate(pipeline_name, dev_mode) + pipeline_manager.predict(pipeline_name, dev_mode, submit_predictions) -@action.command() +@main.command() @click.option('-p', '--pipeline_name', help='pipeline to be trained', required=True) @click.option('-d', '--dev_mode', help='if true only a small sample of data will be used', is_flag=True, required=False) -def evaluate_predict(pipeline_name, dev_mode): - _evaluate(pipeline_name, dev_mode) - _predict(pipeline_name, dev_mode) +def train_evaluate(pipeline_name, dev_mode): + pipeline_manager.train(pipeline_name, dev_mode) + pipeline_manager.evaluate(pipeline_name, dev_mode) -@action.command() +@main.command() @click.option('-p', '--pipeline_name', help='pipeline to be trained', required=True) +@click.option('-s', '--submit_predictions', help='submit predictions if true', is_flag=True, required=False) @click.option('-d', '--dev_mode', help='if true only a small sample of data will be used', is_flag=True, required=False) -def train_evaluate(pipeline_name, dev_mode): - _train(pipeline_name, dev_mode) - _evaluate(pipeline_name, dev_mode) - - -def _train(pipeline_name, dev_mode): - logger.info('TRAINING') - if bool(params.clean_experiment_directory_before_training) and os.path.isdir(params.experiment_directory): - logger.info('Cleaning experiment_directory...') - shutil.rmtree(params.experiment_directory) - - logger.info('Reading data...') - if dev_mode: - logger.info('running in "dev-mode". Sample size is: {}'.format(cfg.DEV_SAMPLE_SIZE)) - application_train = pd.read_csv(params.train_filepath, nrows=cfg.DEV_SAMPLE_SIZE) - else: - application_train = pd.read_csv(params.train_filepath) - - logger.info('Shuffling and splitting into train and test...') - train_data_split, valid_data_split = train_test_split(application_train, - test_size=params.validation_size, - random_state=cfg.RANDOM_SEED, - shuffle=params.shuffle) - - logger.info('Target mean in train: {}'.format(train_data_split[cfg.TARGET_COLUMN].mean())) - logger.info('Target mean in valid: {}'.format(valid_data_split[cfg.TARGET_COLUMN].mean())) - logger.info('Train shape: {}'.format(train_data_split.shape)) - logger.info('Valid shape: {}'.format(valid_data_split.shape)) - - data = {'input': {'X': train_data_split.drop(cfg.TARGET_COLUMN, axis=1), - 'y': train_data_split[cfg.TARGET_COLUMN], - 'X_valid': valid_data_split.drop(cfg.TARGET_COLUMN, axis=1), - 'y_valid': valid_data_split[cfg.TARGET_COLUMN], - }, - } - - pipeline = PIPELINES[pipeline_name]['train'](cfg.SOLUTION_CONFIG) - pipeline.clean_cache() - logger.info('Start pipeline fit and transform') - pipeline.fit_transform(data) - pipeline.clean_cache() - - -def _evaluate(pipeline_name, dev_mode): - logger.info('EVALUATION') - logger.info('reading data...') - if dev_mode: - logger.info('running in "dev-mode". 
Sample size is: {}'.format(cfg.DEV_SAMPLE_SIZE)) - application_train = pd.read_csv(params.train_filepath, nrows=cfg.DEV_SAMPLE_SIZE) - else: - application_train = pd.read_csv(params.train_filepath) - - logger.info('Shuffling and splitting to get validation split...') - _, valid_data_split = train_test_split(application_train, - test_size=params.validation_size, - random_state=cfg.RANDOM_SEED, - shuffle=params.shuffle) - - logger.info('Target mean in valid: {}'.format(valid_data_split[cfg.TARGET_COLUMN].mean())) - logger.info('Valid shape: {}'.format(valid_data_split.shape)) - - y_true = valid_data_split[cfg.TARGET_COLUMN].values - data = {'input': {'X': valid_data_split.drop(cfg.TARGET_COLUMN, axis=1), - 'y': valid_data_split[cfg.TARGET_COLUMN], - }, - } - - pipeline = PIPELINES[pipeline_name]['inference'](cfg.SOLUTION_CONFIG) - pipeline.clean_cache() - logger.info('Start pipeline transform') - output = pipeline.transform(data) - pipeline.clean_cache() - - y_pred = output['clipped_prediction'] - - logger.info('Saving evaluation predictions to the {}'.format(params.experiment_directory)) - persist_evaluation_predictions(params.experiment_directory, - y_pred, - valid_data_split, - cfg.ID_COLUMN, - cfg.TARGET_COLUMN) - - logger.info('Calculating ROC_AUC on validation set') - score = roc_auc_score(y_true, y_pred) - logger.info('ROC_AUC score on validation is {}'.format(score)) - ctx.channel_send('ROC_AUC', 0, score) - - -def _predict(pipeline_name, dev_mode): - logger.info('PREDICTION') - logger.info('reading data...') - if dev_mode: - logger.info('running in "dev-mode". Sample size is: {}'.format(cfg.DEV_SAMPLE_SIZE)) - application_test = pd.read_csv(params.test_filepath, nrows=cfg.DEV_SAMPLE_SIZE) - else: - application_test = pd.read_csv(params.test_filepath) - - data = {'input': {'X': application_test, - 'y': None, - }, - } - - pipeline = PIPELINES[pipeline_name]['inference'](cfg.SOLUTION_CONFIG) - pipeline.clean_cache() - logger.info('Start pipeline transform') - output = pipeline.transform(data) - pipeline.clean_cache() - y_pred = output['clipped_prediction'] - - if not dev_mode: - logger.info('creating submission file...') - submission = create_submission(application_test, y_pred) - - logger.info('verifying submission...') - sample_submission = pd.read_csv(params.sample_submission_filepath) - verify_submission(submission, sample_submission) - - submission_filepath = os.path.join(params.experiment_directory, 'submission.csv') - submission.to_csv(submission_filepath, index=None, encoding='utf-8') - logger.info('submission persisted to {}'.format(submission_filepath)) - logger.info('submission head \n\n{}'.format(submission.head())) - - if params.kaggle_api: - logger.info('making Kaggle submit...') - os.system('kaggle competitions submit -c home-credit-default-risk -f {} -m {}' - .format(submission_filepath, params.kaggle_message)) +def evaluate_predict(pipeline_name, submit_predictions, dev_mode): + pipeline_manager.evaluate(pipeline_name, dev_mode) + pipeline_manager.predict(pipeline_name, dev_mode, submit_predictions) + + +@main.command() +@click.option('-p', '--pipeline_name', help='pipeline to be trained', required=True) +@click.option('-d', '--dev_mode', help='if true only a small sample of data will be used', is_flag=True, required=False) +def train_evaluate_cv(pipeline_name, dev_mode): + pipeline_manager.train_evaluate_cv(pipeline_name, dev_mode) + + +@main.command() +@click.option('-p', '--pipeline_name', help='pipeline to be trained', required=True) +@click.option('-s', 
'--submit_predictions', help='submit predictions if true', is_flag=True, required=False) +@click.option('-d', '--dev_mode', help='if true only a small sample of data will be used', is_flag=True, required=False) +def train_evaluate_predict_cv(pipeline_name, submit_predictions, dev_mode): + pipeline_manager.train_evaluate_predict_cv(pipeline_name, dev_mode, submit_predictions) if __name__ == "__main__": - action() + main() \ No newline at end of file diff --git a/models.py b/models.py deleted file mode 100644 index 56dcb64..0000000 --- a/models.py +++ /dev/null @@ -1,84 +0,0 @@ -import xgboost as xgb -from attrdict import AttrDict -from sklearn.pipeline import Pipeline -from sklearn.preprocessing import StandardScaler -from steppy.base import BaseTransformer -from steppy.utils import get_logger -from toolkit.sklearn_transformers.models import SklearnClassifier - -logger = get_logger() - - -class XGBoost(BaseTransformer): - def __init__(self, **params): - super().__init__() - logger.info('initializing XGBoost...') - self.params = params - self.training_params = ['nrounds', 'early_stopping_rounds'] - self.evaluation_function = None - - @property - def model_config(self): - return AttrDict({param: value for param, value in self.params.items() - if param not in self.training_params}) - - @property - def training_config(self): - return AttrDict({param: value for param, value in self.params.items() - if param in self.training_params}) - - def fit(self, - X, y, - X_valid, y_valid, - feature_names=None, - feature_types=None, - **kwargs): - train = xgb.DMatrix(X, - label=y, - feature_names=feature_names, - feature_types=feature_types) - valid = xgb.DMatrix(X_valid, - label=y_valid, - feature_names=feature_names, - feature_types=feature_types) - - evaluation_results = {} - self.estimator = xgb.train(params=self.model_config, - dtrain=train, - evals=[(train, 'train'), (valid, 'valid')], - evals_result=evaluation_results, - num_boost_round=self.training_config.nrounds, - early_stopping_rounds=self.training_config.early_stopping_rounds, - verbose_eval=self.model_config.verbose, - feval=self.evaluation_function) - return self - - def transform(self, X, y=None, feature_names=None, feature_types=None, **kwargs): - X_DMatrix = xgb.DMatrix(X, - label=y, - feature_names=feature_names, - feature_types=feature_types) - prediction = self.estimator.predict(X_DMatrix) - return {'prediction': prediction} - - def load(self, filepath): - self.estimator = xgb.Booster(params=self.model_config) - self.estimator.load_model(filepath) - return self - - def persist(self, filepath): - self.estimator.save_model(filepath) - - -def get_sklearn_classifier(ClassifierClass, normalize=False, **kwargs): - - class SklearnBinaryClassifier(SklearnClassifier): - def transform(self, X, y=None, target=1, **kwargs): - prediction = self.estimator.predict_proba(X)[:, target] - return {SklearnClassifier.RESULT_KEY: prediction} - - if normalize: - return SklearnBinaryClassifier(Pipeline([('standarizer', StandardScaler()), - ('classifier', ClassifierClass(**kwargs))])) - - return SklearnBinaryClassifier(ClassifierClass(**kwargs)) diff --git a/neptune.yaml b/neptune.yaml index fec954c..34f0a3a 100644 --- a/neptune.yaml +++ b/neptune.yaml @@ -1,7 +1,7 @@ project: ORGANIZATION/home-credit name: home-credit-default-risk -tags: [solution-2] +tags: [solution-3, dev] metric: channel: 'ROC_AUC' @@ -9,20 +9,19 @@ metric: exclude: - output - - imgs + - notebooks - neptune.log - offline_job.log - .git - .github - .idea - .ipynb_checkpoints - - 
Untitled.ipynb parameters: # Data train_filepath: YOUR/PATH/TO/application_train.csv test_filepath: YOUR/PATH/TO/application_test.csv - bureau_balance_filepath: YOUR/PATH/TO/bureau_balance_filepath.csv + bureau_balance_filepath: YOUR/PATH/TO/bureau_balance.csv bureau_filepath: YOUR/PATH/TO/bureau.csv credit_card_balance_filepath: YOUR/PATH/TO/credit_card_balance.csv installments_payments_filepath: YOUR/PATH/TO/installments_payments.csv @@ -33,19 +32,22 @@ parameters: # Kaggle kaggle_api: 0 - kaggle_message: 'solution-2' + kaggle_message: 'solution-3' # Data preparation + n_cv_splits: 5 validation_size: 0.2 + stratified_cv: True shuffle: 1 # Execution clean_experiment_directory_before_training: 1 - num_workers: 16 + num_workers: 1 verbose: 1 # Preprocessing - fillna_value: -1 + fill_missing: False + fill_value: None # Light GBM lgbm_random_search_runs: 0 @@ -53,19 +55,19 @@ parameters: lgbm__boosting_type: gbdt lgbm__objective: binary lgbm__metric: auc - lgbm__number_boosting_rounds: 10000 - lgbm__early_stopping_rounds: 100 - lgbm__learning_rate: 0.005 - lgbm__num_leaves: 50 - lgbm__max_depth: 20 - lgbm__min_child_samples: 20 - lgbm__max_bin: 300 # at most 255 for device=gpu - lgbm__subsample: 0.6 - lgbm__subsample_freq: 0 - lgbm__colsample_bytree: 0.8 - lgbm__min_child_weight: 4 - lgbm__reg_lambda: 0.05 - lgbm__reg_alpha: 0.05 + lgbm__number_boosting_rounds: 500 + lgbm__early_stopping_rounds: 50 + lgbm__learning_rate: 0.1 + lgbm__max_bin: 300 + lgbm__max_depth: -1 + lgbm__num_leaves: 100 + lgbm__min_child_samples: 600 + lgbm__subsample: 1.0 + lgbm__subsample_freq: 1 + lgbm__colsample_bytree: 0.1 + lgbm__min_gain_to_split: 0.5 + lgbm__reg_lambda: 50.0 + lgbm__reg_alpha: 0.0 lgbm__scale_pos_weight: 1 # XGBoost @@ -117,3 +119,6 @@ parameters: svc__probability: True svc__tol: 0.00001 svc__max_iter: -1 + +# Postprocessing + aggregation_method: rank_mean \ No newline at end of file diff --git a/neptune_random_search.yaml b/neptune_random_search.yaml index e1e765e..3659488 100644 --- a/neptune_random_search.yaml +++ b/neptune_random_search.yaml @@ -1,7 +1,7 @@ project: ORGANIZATION/home-credit name: home-credit-default-risk -tags: [solution-2] +tags: [solution-3] metric: channel: 'ROC_AUC' @@ -9,20 +9,19 @@ metric: exclude: - output - - imgs + - notebooks - neptune.log - offline_job.log - .git - .github - .idea - .ipynb_checkpoints - - Untitled.ipynb parameters: # Data train_filepath: YOUR/PATH/TO/application_train.csv test_filepath: YOUR/PATH/TO/application_test.csv - bureau_balance_filepath: YOUR/PATH/TO/bureau_balance_filepath.csv + bureau_balance_filepath: YOUR/PATH/TO/bureau_balance.csv bureau_filepath: YOUR/PATH/TO/bureau.csv credit_card_balance_filepath: YOUR/PATH/TO/credit_card_balance.csv installments_payments_filepath: YOUR/PATH/TO/installments_payments.csv @@ -33,10 +32,12 @@ parameters: # Kaggle kaggle_api: 0 - kaggle_message: 'solution-2' + kaggle_message: 'solution-3' # Data preparation + n_cv_splits: 5 validation_size: 0.2 + stratified_cv: True shuffle: 1 # Execution @@ -45,7 +46,8 @@ parameters: verbose: 1 # Preprocessing - fillna_value: -1 + fill_missing: False + fill_value: None # Light GBM lgbm_random_search_runs: 50 @@ -58,7 +60,7 @@ parameters: lgbm__learning_rate: '[0.0005, 0.1, "log-uniform"]' lgbm__num_leaves: '[20, 50]' lgbm__max_depth: '[7, 30]' - lgbm__min_child_samples: '[20, 45]' + lgbm__min_child_samples: '[20, 50]' lgbm__max_bin: '[180, 500]' # at most 255 for device=gpu lgbm__subsample: '[0.8, 0.9, 0.99, 0.6, 0.7, "list"]' lgbm__subsample_freq: 0 @@ -117,3 +119,6 
@@ parameters: svc__probability: True svc__tol: '[0.00001, 0.01, "log-uniform"]' svc__max_iter: '[-1, 100, 1000, 10000, 50000, "list"]' + +# Postprocessing + aggregation_method: rank_mean \ No newline at end of file diff --git a/notebooks/eda-application.ipynb b/notebooks/eda-application.ipynb new file mode 100644 index 0000000..85e57ea --- /dev/null +++ b/notebooks/eda-application.ipynb @@ -0,0 +1,337 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "from tqdm import tqdm_notebook as tqdm\n", + "from sklearn.externals import joblib\n", + "%matplotlib inline\n", + "import seaborn as sns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X = pd.read_csv('/mnt/ml-team/minerva/open-solutions/home-credit/files/unzipped_data/application_train.csv')\n", + "X.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Preprocessing\n", + "## Solution 3" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "[Martin Kotek (Competition Host): \"Value 365243 denotes infinity in DAYS variables in the datasets, therefore you can consider them NA values. Also XNA/XAP denote NA values.\"](https://www.kaggle.com/c/home-credit-default-risk/discussion/57247)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X['CODE_GENDER'].unique()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "X.loc[X['DAYS_EMPLOYED'] > 0]['DAYS_EMPLOYED'].unique()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sum(X['ORGANIZATION_TYPE'] == 'XNA')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X['CODE_GENDER'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X['CODE_GENDER'].replace('XNA',np.nan, inplace=True)\n", + "X['CODE_GENDER'].value_counts()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Feature Engineering\n", + "## Solution 3\n", + "### Hand crafted features" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X['annuity_income_percentage'] = X['AMT_ANNUITY'] / X['AMT_INCOME_TOTAL']\n", + "X['car_to_birth_ratio'] = X['OWN_CAR_AGE'] / X['DAYS_BIRTH']\n", + "X['car_to_employ_ratio'] = X['OWN_CAR_AGE'] / X['DAYS_EMPLOYED']\n", + "X['children_ratio'] = X['CNT_CHILDREN'] / X['CNT_FAM_MEMBERS']\n", + "X['credit_to_annuity_ratio'] = X['AMT_CREDIT'] / X['AMT_ANNUITY']\n", + "X['credit_to_goods_ratio'] = X['AMT_CREDIT'] / X['AMT_GOODS_PRICE']\n", + "X['credit_to_income_ratio'] = X['AMT_CREDIT'] / X['AMT_INCOME_TOTAL']\n", + "X['days_employed_percentage'] = X['DAYS_EMPLOYED'] / X['DAYS_BIRTH']\n", + "X['income_credit_percentage'] = X['AMT_INCOME_TOTAL'] / X['AMT_CREDIT']\n", + "X['income_per_child'] = X['AMT_INCOME_TOTAL'] / (1 + X['CNT_CHILDREN'])\n", + "X['income_per_person'] = X['AMT_INCOME_TOTAL'] / X['CNT_FAM_MEMBERS']\n", + "X['payment_rate'] = X['AMT_ANNUITY'] / X['AMT_CREDIT']\n", + "X['phone_to_birth_ratio'] = X['DAYS_LAST_PHONE_CHANGE'] / X['DAYS_BIRTH']\n", + "X['phone_to_employ_ratio'] = X['DAYS_LAST_PHONE_CHANGE'] / 
X['DAYS_EMPLOYED']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# External sources\n", + "X['external_sources_weighted'] = X.EXT_SOURCE_1 * 2 + X.EXT_SOURCE_2 * 3 + X.EXT_SOURCE_3 * 4\n", + "for function_name in ['min', 'max', 'sum', 'mean', 'nanmedian']:\n", + " X['external_sources_{}'.format(function_name)] = eval('np.{}'.format(function_name))(\n", + " X[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']], axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "engineered_numerical_columns = ['annuity_income_percentage',\n", + " 'car_to_birth_ratio',\n", + " 'car_to_employ_ratio',\n", + " 'children_ratio',\n", + " 'credit_to_annuity_ratio',\n", + " 'credit_to_goods_ratio',\n", + " 'credit_to_income_ratio',\n", + " 'days_employed_percentage',\n", + " 'income_credit_percentage',\n", + " 'income_per_child',\n", + " 'income_per_person',\n", + " 'payment_rate',\n", + " 'phone_to_birth_ratio',\n", + " 'phone_to_employ_ratio',\n", + " 'external_sources_weighted',\n", + " 'external_sources_min',\n", + " 'external_sources_max',\n", + " 'external_sources_sum',\n", + " 'external_sources_mean',\n", + " 'external_sources_nanmedian']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X_eng = X[engineered_numerical_columns + ['TARGET']]\n", + "X_eng_corr = abs(X_eng.corr())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X_eng_corr.sort_values('TARGET', ascending=False)['TARGET']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sns.heatmap(X_eng_corr, \n", + " xticklabels=X_eng_corr.columns,\n", + " yticklabels=X_eng_corr.columns)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Aggregation features" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "AGGREGATION_RECIPIES = [\n", + " (['CODE_GENDER', 'NAME_EDUCATION_TYPE'], [('AMT_ANNUITY', 'max'),\n", + " ('AMT_CREDIT', 'max'),\n", + " ('EXT_SOURCE_1', 'mean'),\n", + " ('EXT_SOURCE_2', 'mean'),\n", + " ('OWN_CAR_AGE', 'max'),\n", + " ('OWN_CAR_AGE', 'sum')]),\n", + " (['CODE_GENDER', 'ORGANIZATION_TYPE'], [('AMT_ANNUITY', 'mean'),\n", + " ('AMT_INCOME_TOTAL', 'mean'),\n", + " ('DAYS_REGISTRATION', 'mean'),\n", + " ('EXT_SOURCE_1', 'mean')]),\n", + " (['CODE_GENDER', 'REG_CITY_NOT_WORK_CITY'], [('AMT_ANNUITY', 'mean'),\n", + " ('CNT_CHILDREN', 'mean'),\n", + " ('DAYS_ID_PUBLISH', 'mean')]),\n", + " (['CODE_GENDER', 'NAME_EDUCATION_TYPE', 'OCCUPATION_TYPE', 'REG_CITY_NOT_WORK_CITY'], [('EXT_SOURCE_1', 'mean'),\n", + " ('EXT_SOURCE_2', 'mean')]),\n", + " (['NAME_EDUCATION_TYPE', 'OCCUPATION_TYPE'], [('AMT_CREDIT', 'mean'),\n", + " ('AMT_REQ_CREDIT_BUREAU_YEAR', 'mean'),\n", + " ('APARTMENTS_AVG', 'mean'),\n", + " ('BASEMENTAREA_AVG', 'mean'),\n", + " ('EXT_SOURCE_1', 'mean'),\n", + " ('EXT_SOURCE_2', 'mean'),\n", + " ('EXT_SOURCE_3', 'mean'),\n", + " ('NONLIVINGAREA_AVG', 'mean'),\n", + " ('OWN_CAR_AGE', 'mean'),\n", + " ('YEARS_BUILD_AVG', 'mean')]),\n", + " (['NAME_EDUCATION_TYPE', 'OCCUPATION_TYPE', 'REG_CITY_NOT_WORK_CITY'], [('ELEVATORS_AVG', 'mean'),\n", + " ('EXT_SOURCE_1', 'mean')]),\n", + " (['OCCUPATION_TYPE'], [('AMT_ANNUITY', 'mean'),\n", + " ('CNT_CHILDREN', 'mean'),\n", + " ('CNT_FAM_MEMBERS', 'mean'),\n", + " 
('DAYS_BIRTH', 'mean'),\n", + " ('DAYS_EMPLOYED', 'mean'),\n", + " ('DAYS_ID_PUBLISH', 'mean'),\n", + " ('DAYS_REGISTRATION', 'mean'),\n", + " ('EXT_SOURCE_1', 'mean'),\n", + " ('EXT_SOURCE_2', 'mean'),\n", + " ('EXT_SOURCE_3', 'mean')]),\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "groupby_aggregate_names = []\n", + "for groupby_cols, specs in tqdm(AGGREGATION_RECIPIES):\n", + " group_object = X.groupby(groupby_cols)\n", + " for select, agg in tqdm(specs):\n", + " groupby_aggregate_name = '{}_{}_{}'.format('_'.join(groupby_cols), agg, select)\n", + " X = X.merge(group_object[select]\n", + " .agg(agg)\n", + " .reset_index()\n", + " .rename(index=str,\n", + " columns={select: groupby_aggregate_name})\n", + " [groupby_cols + [groupby_aggregate_name]],\n", + " on=groupby_cols,\n", + " how='left')\n", + " groupby_aggregate_names.append(groupby_aggregate_name)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X_agg = X[groupby_aggregate_names + ['TARGET']]\n", + "X_agg_corr = abs(X_agg.corr())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X_agg_corr.sort_values('TARGET', ascending=False)['TARGET']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sns.heatmap(X_agg_corr, \n", + " xticklabels=X_agg_corr.columns,\n", + " yticklabels=X_agg_corr.columns)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Solution 4 TODO\n", + "### Hand crafted features\n", + "* Explore other ext_sources features\n", + "* Explore unemployed feature" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/eda-bureau.ipynb b/notebooks/eda-bureau.ipynb new file mode 100644 index 0000000..38b6aca --- /dev/null +++ b/notebooks/eda-bureau.ipynb @@ -0,0 +1,459 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import pandas as pd\n", + "from tqdm import tqdm_notebook as tqdm\n", + "from sklearn.externals import joblib\n", + "%matplotlib inline\n", + "import seaborn as sns\n", + "\n", + "DIR = '/mnt/ml-team/minerva/open-solutions/home-credit'\n", + "description = pd.read_csv(os.path.join(DIR,'data/HomeCredit_columns_description.csv'),encoding = 'latin1')\n", + "application = pd.read_csv(os.path.join(DIR, 'files/unzipped_data/application_train.csv'))\n", + "bureau = pd.read_csv(os.path.join(DIR, 'files/unzipped_data/bureau.csv'))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "bureau.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Preprocessing\n", + "## Solution 3" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "(bureau['AMT_CREDIT_SUM'] == 
0).sum()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This shows that imputing with nan with 0 is probably a bad idea" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Feature Engineering\n", + "## Solution 3\n", + "### Hand crafted features" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "bureau[bureau['SK_ID_CURR']==215354]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### First build helper columns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "bureau['bureau_credit_active_binary'] = (bureau['CREDIT_ACTIVE'] != 'Closed').astype(int)\n", + "bureau['bureau_credit_enddate_binary'] = (bureau['DAYS_CREDIT_ENDDATE'] > 0).astype(int)\n", + "\n", + "groupby_SK_ID_CURR = bureau.groupby(by=['SK_ID_CURR'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "description[description['Row'] == 'DAYS_CREDIT'].Description.tolist()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "features = pd.DataFrame({'SK_ID_CURR':bureau['SK_ID_CURR'].unique()})\n", + "features.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "group_object = groupby_SK_ID_CURR['DAYS_CREDIT'].agg('count').reset_index()\n", + "group_object.rename(index=str, columns={'DAYS_CREDIT': 'bureau_number_of_past_loans'},inplace=True)\n", + "\n", + "features = features.merge(group_object, on=['SK_ID_CURR'], how='left')\n", + "features.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "group_object = groupby_SK_ID_CURR['CREDIT_TYPE'].agg('nunique').reset_index()\n", + "group_object.rename(index=str, columns={'CREDIT_TYPE': 'bureau_number_of_loan_types'},inplace=True)\n", + "\n", + "features = features.merge(group_object, on=['SK_ID_CURR'], how='left')\n", + "features.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "features['bureau_average_of_past_loans_per_type'] = \\\n", + " features['bureau_number_of_past_loans'] / features['bureau_number_of_loan_types']\n", + " \n", + "features.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "group_object = groupby_SK_ID_CURR['bureau_credit_active_binary'].agg('mean').reset_index()\n", + "\n", + "features = features.merge(group_object, on=['SK_ID_CURR'], how='left')\n", + "features.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "group_object = groupby_SK_ID_CURR['AMT_CREDIT_SUM_DEBT'].agg('sum').reset_index()\n", + "group_object.rename(index=str, columns={'AMT_CREDIT_SUM_DEBT': 'bureau_total_customer_debt'},inplace=True)\n", + "\n", + "features = features.merge(group_object, on=['SK_ID_CURR'], how='left')\n", + "features.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "group_object = groupby_SK_ID_CURR['AMT_CREDIT_SUM'].agg('sum').reset_index()\n", + "group_object.rename(index=str, columns={'AMT_CREDIT_SUM': 'bureau_total_customer_credit'},inplace=True)\n", + "\n", + "features = features.merge(group_object, on=['SK_ID_CURR'], 
how='left')\n", + "features.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "features['bureau_debt_credit_ratio'] = \\\n", + " features['bureau_total_customer_debt'] / features['bureau_total_customer_credit']\n", + " \n", + "features.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "group_object = groupby_SK_ID_CURR['AMT_CREDIT_SUM_OVERDUE'].agg('sum').reset_index()\n", + "group_object.rename(index=str, columns={'AMT_CREDIT_SUM_OVERDUE': 'bureau_total_customer_overdue'},inplace=True)\n", + "\n", + "features = features.merge(group_object, on=['SK_ID_CURR'], how='left')\n", + "features.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "features['bureau_overdue_debt_ratio'] = \\\n", + " features['bureau_total_customer_overdue'] / features['bureau_total_customer_debt']\n", + " \n", + "features.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "group_object = groupby_SK_ID_CURR['CNT_CREDIT_PROLONG'].agg('sum').reset_index()\n", + "group_object.rename(index=str, columns={'CNT_CREDIT_PROLONG': 'bureau_average_creditdays_prolonged'},inplace=True)\n", + "\n", + "features = features.merge(group_object, on=['SK_ID_CURR'], how='left')\n", + "features.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "group_object = groupby_SK_ID_CURR['bureau_credit_enddate_binary'].agg('mean').reset_index()\n", + "group_object.rename(index=str, columns={'bureau_credit_enddate_binary': 'bureau_credit_enddate_percentage'},inplace=True)\n", + "\n", + "features = features.merge(group_object, on=['SK_ID_CURR'], how='left')\n", + "features.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "bureau_ONE = features[features['SK_ID_CURR']==215354]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "bureau_ONE" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "application = application.merge(features,\n", + " left_on=['SK_ID_CURR'],\n", + " right_on=['SK_ID_CURR'],\n", + " how='left',\n", + " validate='one_to_one')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "engineered_numerical_columns = list(features.columns)\n", + "engineered_numerical_columns.remove('SK_ID_CURR')\n", + "bureau_eng = application[engineered_numerical_columns + ['TARGET']]\n", + "bureau_eng_corr = abs(bureau_eng.corr())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "bureau_eng_corr.sort_values('TARGET', ascending=False)['TARGET']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sns.heatmap(bureau_eng_corr, \n", + " xticklabels=bureau_eng_corr.columns,\n", + " yticklabels=bureau_eng_corr.columns)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Aggregations" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "BUREAU_AGGREGATION_RECIPIES = [('CREDIT_TYPE', 'count'),\n", + " ('CREDIT_ACTIVE', 'size')\n", + " ]\n", + 
"for agg in ['mean', 'min', 'max', 'sum', 'var']:\n", + " for select in ['AMT_ANNUITY',\n", + " 'AMT_CREDIT_SUM',\n", + " 'AMT_CREDIT_SUM_DEBT',\n", + " 'AMT_CREDIT_SUM_LIMIT',\n", + " 'AMT_CREDIT_SUM_OVERDUE',\n", + " 'AMT_CREDIT_MAX_OVERDUE',\n", + " 'CNT_CREDIT_PROLONG',\n", + " 'CREDIT_DAY_OVERDUE',\n", + " 'DAYS_CREDIT',\n", + " 'DAYS_CREDIT_ENDDATE',\n", + " 'DAYS_CREDIT_UPDATE'\n", + " ]:\n", + " BUREAU_AGGREGATION_RECIPIES.append((select, agg))\n", + "BUREAU_AGGREGATION_RECIPIES = [(['SK_ID_CURR'], BUREAU_AGGREGATION_RECIPIES)]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "groupby_aggregate_names = []\n", + "for groupby_cols, specs in tqdm(BUREAU_AGGREGATION_RECIPIES):\n", + " group_object = bureau.groupby(groupby_cols)\n", + " for select, agg in tqdm(specs):\n", + " groupby_aggregate_name = '{}_{}_{}'.format('_'.join(groupby_cols), agg, select)\n", + " application = application.merge(group_object[select]\n", + " .agg(agg)\n", + " .reset_index()\n", + " .rename(index=str,\n", + " columns={select: groupby_aggregate_name})\n", + " [groupby_cols + [groupby_aggregate_name]],\n", + " on=groupby_cols,\n", + " how='left')\n", + " groupby_aggregate_names.append(groupby_aggregate_name)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "application.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "application_agg = application[groupby_aggregate_names + ['TARGET']]\n", + "application_agg_corr = abs(application_agg.corr())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "application_agg_corr.sort_values('TARGET', ascending=False)['TARGET']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Solution 4\n", + "## Hand Crafted Features" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# group = bureau[bureau['bureau_credit_enddate_binary'] == 1].groupby(\n", + "# by=['SK_ID_CURR']).apply(\n", + "# lambda x: x.sort_values(['DAYS_CREDIT_ENDDATE'], ascending=True)).reset_index(drop=True)\n", + "# group['bureau_days_enddate_diff'] = group.groupby(by=['SK_ID_CURR'])['DAYS_CREDIT_ENDDATE'].diff()\n", + "# group['bureau_days_enddate_diff'] = group['bureau_days_enddate_diff'].fillna(0).astype('uint32')\n", + "\n", + "# bureau = bureau.merge(group[['bureau_days_enddate_diff', 'SK_ID_BUREAU']], on=['SK_ID_BUREAU'], how='left')\n", + "# bureau['bureau_average_enddate_future'] = bureau.groupby(\n", + "# by=['SK_ID_CURR'])['bureau_days_enddate_diff'].agg('mean').reset_index()['bureau_days_enddate_diff']\n", + "\n", + "# bureau['bureau_days_credit_diff'] = bureau.groupby(\n", + "# by=['SK_ID_CURR']).apply(\n", + "# lambda x: x.sort_values(['DAYS_CREDIT'], ascending=False)).reset_index(drop=True)['DAYS_CREDIT']\n", + "# bureau['bureau_days_credit_diff'] *= -1\n", + "# bureau['bureau_days_credit_diff'] = bureau.groupby(by=['SK_ID_CURR'])['bureau_days_credit_diff'].diff()\n", + "# bureau['bureau_days_credit_diff'] = bureau['bureau_days_credit_diff'].fillna(0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "cpu py3", + "language": "python", + "name": "cpu_py3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", 
+ "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/eda-credit_card.ipynb b/notebooks/eda-credit_card.ipynb new file mode 100644 index 0000000..2afb2b8 --- /dev/null +++ b/notebooks/eda-credit_card.ipynb @@ -0,0 +1,451 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import pandas as pd\n", + "from tqdm import tqdm_notebook as tqdm\n", + "from sklearn.externals import joblib\n", + "%matplotlib inline\n", + "import seaborn as sns\n", + "\n", + "DIR = '/mnt/ml-team/minerva/open-solutions/home-credit'\n", + "description = pd.read_csv(os.path.join(DIR,'data/HomeCredit_columns_description.csv'),encoding = 'latin1')\n", + "application = pd.read_csv(os.path.join(DIR, 'files/unzipped_data/application_train.csv'))\n", + "credit_card = pd.read_csv(os.path.join(DIR, 'files/unzipped_data/credit_card_balance.csv'))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "credit_card.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Preprocessing\n", + "## Solution 3" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Feature Engineering\n", + "## Solution 3\n", + "### Hand crafted features" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### First build helper columns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "credit_card['number_of_instalments'] = credit_card.groupby(\n", + " by=['SK_ID_CURR', 'SK_ID_PREV'])['CNT_INSTALMENT_MATURE_CUM'].agg('max').reset_index()[\n", + " 'CNT_INSTALMENT_MATURE_CUM']\n", + "\n", + "credit_card['credit_card_max_loading_of_credit_limit'] = credit_card.groupby(\n", + " by=['SK_ID_CURR', 'SK_ID_PREV', 'AMT_CREDIT_LIMIT_ACTUAL']).apply(\n", + " lambda x: x.AMT_BALANCE.max() / x.AMT_CREDIT_LIMIT_ACTUAL.max()).reset_index()[0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "credit_card.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "description[description['Row'] == 'DAYS_CREDIT'].Description.tolist()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "features = pd.DataFrame({'SK_ID_CURR':credit_card['SK_ID_CURR'].unique()})\n", + "features.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "group_object = credit_card.groupby(by=['SK_ID_CURR'])['SK_ID_PREV'].agg('nunique').reset_index()\n", + "group_object.rename(index=str, columns={'SK_ID_PREV': 'credit_card_number_of_loans'},inplace=True)\n", + "\n", + "features = features.merge(group_object, on=['SK_ID_CURR'], how='left')\n", + "features.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "features['credit_card_number_of_loans'].value_counts()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Note\n", + "It is worth exploring `credit_card_number_of_loans>1` binary version of this variable" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "group_object= credit_card.groupby(by=['SK_ID_CURR'])['number_of_instalments'].sum().reset_index()\n", + "group_object.rename(index=str, columns={'number_of_instalments': 'credit_card_total_instalments'},inplace=True)\n", + "\n", + "features = features.merge(group_object, on=['SK_ID_CURR'], how='left')\n", + "features.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "features['credit_card_total_instalments'].value_counts()[:10]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sns.distplot(features['credit_card_total_instalments'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Note\n", + "* Maybe adding a is zero variabl maxes sens" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "features['credit_card_installments_per_loan'] = (\n", + " features['credit_card_total_instalments'] / features['credit_card_number_of_loans'])\n", + " \n", + "features.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "group_object = credit_card.groupby(by=['SK_ID_CURR'])['credit_card_max_loading_of_credit_limit'].agg('mean').reset_index()\n", + "group_object.rename(index=str, columns={'credit_card_max_loading_of_credit_limit': 'credit_card_avg_loading_of_credit_limit'},inplace=True)\n", + "\n", + "features = features.merge(group_object, on=['SK_ID_CURR'], how='left')\n", + "features.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "group_object = credit_card.groupby(\n", + " by=['SK_ID_CURR'])['SK_DPD'].agg('mean').reset_index()\n", + "group_object.rename(index=str, columns={'SK_DPD': 'credit_card_average_of_days_past_due'},inplace=True)\n", + "\n", + "features = features.merge(group_object, on=['SK_ID_CURR'], how='left')\n", + "features.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "group_object = credit_card.groupby(by=['SK_ID_CURR'])['AMT_DRAWINGS_ATM_CURRENT'].agg('sum').reset_index()\n", + "group_object.rename(index=str, columns={'AMT_DRAWINGS_ATM_CURRENT': 'credit_card_drawings_atm'},inplace=True)\n", + "\n", + "features = features.merge(group_object, on=['SK_ID_CURR'], how='left')\n", + "features.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "group_object = credit_card.groupby(by=['SK_ID_CURR'])['AMT_DRAWINGS_CURRENT'].agg('sum').reset_index()\n", + "group_object.rename(index=str, columns={'AMT_DRAWINGS_CURRENT': 'credit_card_drawings_total'},inplace=True)\n", + "\n", + "features = features.merge(group_object, on=['SK_ID_CURR'], how='left')\n", + "features.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "features['credit_card_cash_card_ratio'] = features['credit_card_drawings_atm'] / features['credit_card_drawings_total']\n", + "\n", + "features.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "credit_ONE = features[features['SK_ID_CURR']==215354]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + 
"credit_ONE" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "application = application.merge(features,\n", + " left_on=['SK_ID_CURR'],\n", + " right_on=['SK_ID_CURR'],\n", + " how='left',\n", + " validate='one_to_one')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "engineered_numerical_columns = list(features.columns)\n", + "engineered_numerical_columns.remove('SK_ID_CURR')\n", + "credit_eng = application[engineered_numerical_columns + ['TARGET']]\n", + "credit_eng_corr = abs(credit_eng.corr())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "credit_eng_corr.sort_values('TARGET', ascending=False)['TARGET']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sns.heatmap(credit_eng_corr, \n", + " xticklabels=credit_eng_corr.columns,\n", + " yticklabels=credit_eng_corr.columns)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Aggregations" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "CREDIT_CARD_BALANCE_AGGREGATION_RECIPIES = []\n", + "for agg in ['mean', 'min', 'max', 'sum', 'var']:\n", + " for select in ['AMT_BALANCE',\n", + " 'AMT_CREDIT_LIMIT_ACTUAL',\n", + " 'AMT_DRAWINGS_ATM_CURRENT',\n", + " 'AMT_DRAWINGS_CURRENT',\n", + " 'AMT_DRAWINGS_OTHER_CURRENT',\n", + " 'AMT_DRAWINGS_POS_CURRENT',\n", + " 'AMT_PAYMENT_CURRENT',\n", + " 'CNT_DRAWINGS_ATM_CURRENT',\n", + " 'CNT_DRAWINGS_CURRENT',\n", + " 'CNT_DRAWINGS_OTHER_CURRENT',\n", + " 'CNT_INSTALMENT_MATURE_CUM',\n", + " 'MONTHS_BALANCE',\n", + " 'SK_DPD',\n", + " 'SK_DPD_DEF'\n", + " ]:\n", + " CREDIT_CARD_BALANCE_AGGREGATION_RECIPIES.append((select, agg))\n", + "CREDIT_CARD_BALANCE_AGGREGATION_RECIPIES = [(['SK_ID_CURR'], CREDIT_CARD_BALANCE_AGGREGATION_RECIPIES)]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "groupby_aggregate_names = []\n", + "for groupby_cols, specs in tqdm(CREDIT_CARD_BALANCE_AGGREGATION_RECIPIES):\n", + " group_object = credit.groupby(groupby_cols)\n", + " for select, agg in tqdm(specs):\n", + " groupby_aggregate_name = '{}_{}_{}'.format('_'.join(groupby_cols), agg, select)\n", + " application = application.merge(group_object[select]\n", + " .agg(agg)\n", + " .reset_index()\n", + " .rename(index=str,\n", + " columns={select: groupby_aggregate_name})\n", + " [groupby_cols + [groupby_aggregate_name]],\n", + " on=groupby_cols,\n", + " how='left')\n", + " groupby_aggregate_names.append(groupby_aggregate_name)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "application.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "application_agg = application[groupby_aggregate_names + ['TARGET']]\n", + "application_agg_corr = abs(application_agg.corr())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "application_agg_corr.sort_values('TARGET', ascending=False)['TARGET']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Solution 4\n", + "## Hand Crafted Features" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + 
"# group = bureau[bureau['bureau_credit_enddate_binary'] == 1].groupby(\n", + "# by=['SK_ID_CURR']).apply(\n", + "# lambda x: x.sort_values(['DAYS_CREDIT_ENDDATE'], ascending=True)).reset_index(drop=True)\n", + "# group['bureau_days_enddate_diff'] = group.groupby(by=['SK_ID_CURR'])['DAYS_CREDIT_ENDDATE'].diff()\n", + "# group['bureau_days_enddate_diff'] = group['bureau_days_enddate_diff'].fillna(0).astype('uint32')\n", + "\n", + "# bureau = bureau.merge(group[['bureau_days_enddate_diff', 'SK_ID_BUREAU']], on=['SK_ID_BUREAU'], how='left')\n", + "# bureau['bureau_average_enddate_future'] = bureau.groupby(\n", + "# by=['SK_ID_CURR'])['bureau_days_enddate_diff'].agg('mean').reset_index()['bureau_days_enddate_diff']\n", + "\n", + "# bureau['bureau_days_credit_diff'] = bureau.groupby(\n", + "# by=['SK_ID_CURR']).apply(\n", + "# lambda x: x.sort_values(['DAYS_CREDIT'], ascending=False)).reset_index(drop=True)['DAYS_CREDIT']\n", + "# bureau['bureau_days_credit_diff'] *= -1\n", + "# bureau['bureau_days_credit_diff'] = bureau.groupby(by=['SK_ID_CURR'])['bureau_days_credit_diff'].diff()\n", + "# bureau['bureau_days_credit_diff'] = bureau['bureau_days_credit_diff'].fillna(0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "cpu py3", + "language": "python", + "name": "cpu_py3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/eda-external_sources.ipynb b/notebooks/eda-external_sources.ipynb new file mode 100644 index 0000000..c85d613 --- /dev/null +++ b/notebooks/eda-external_sources.ipynb @@ -0,0 +1,228 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "from tqdm import tqdm_notebook as tqdm\n", + "from sklearn.externals import joblib\n", + "%matplotlib inline\n", + "import seaborn as sns\n", + "\n", + "from sklearn import tree\n", + "from sklearn.model_selection import train_test_split" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X = pd.read_csv('/mnt/ml-team/minerva/open-solutions/home-credit/files/unzipped_data/application_train.csv')\n", + "X.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X_ext = X[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'TARGET']]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# X_ext = X_ext.fillna(0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + " for function_name in ['nanmin', 'nanmax', 'sum', 'mean', 'var', 'median', 'std', 'nanmedian', 'nanmean', 'min', 'max']:\n", + " X_ext['external_sources_{}'.format(function_name)] = eval('np.{}'.format(function_name))(\n", + " X_ext[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']], axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X_ext.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + 
"source": [ + "X_ext['EXT_SRC_weighted3'] = (X.EXT_SOURCE_1*2+X.EXT_SOURCE_2*3+X.EXT_SOURCE_3*4)/9\n", + "X_ext['EXT_SRC_weighted2'] = (X.EXT_SOURCE_1*3+X.EXT_SOURCE_2*4+X.EXT_SOURCE_3*2)/9\n", + "X_ext['EXT_SRC_weighted1'] = (X.EXT_SOURCE_1*4+X.EXT_SOURCE_2*2+X.EXT_SOURCE_3*3)/9" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X_ext.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X_ext_corr = abs(X_ext.corr())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X_ext_corr.sort_values('TARGET', ascending=False)['TARGET']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sns.heatmap(X_ext_corr, \n", + " xticklabels=X_ext_corr.columns,\n", + " yticklabels=X_ext_corr.columns)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Tree" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X_train, X_test = train_test_split(X_ext)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "Y_train = X_train['TARGET']\n", + "Y_test = X_test['TARGET']\n", + "\n", + "X_train = X_train.drop(columns='TARGET')\n", + "X_test = X_test.drop(columns='TARGET')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X_train = X_train.fillna(0)\n", + "X_test = X_test.fillna(0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "clf = tree.DecisionTreeClassifier()\n", + "clf.fit(X_train, Y_train)\n", + "\n", + "print(\"R^2 on the train set:\")\n", + "print(clf.score(X_train, Y_train))\n", + "\n", + "print(\"\\nR^2 on the test set:\")\n", + "print(clf.score(X_test, Y_test))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X_train.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "feature_importances = pd.Series(clf.feature_importances_, index=X_train.columns.values)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "feature_importances.sort_values(ascending=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/eda-installments.ipynb b/notebooks/eda-installments.ipynb new file mode 100644 index 0000000..3b27d73 --- /dev/null +++ b/notebooks/eda-installments.ipynb @@ -0,0 +1,152 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import pandas as pd\n", + "from tqdm import tqdm_notebook as tqdm\n", + "from sklearn.externals import joblib\n", + "%matplotlib inline\n", + "import seaborn as sns\n", 
+ "\n", + "DIR = '/mnt/ml-team/minerva/open-solutions/home-credit'\n", + "description = pd.read_csv(os.path.join(DIR,'data/HomeCredit_columns_description.csv'),encoding = 'latin1')\n", + "application = pd.read_csv(os.path.join(DIR, 'files/unzipped_data/application_train.csv'))\n", + "installments = pd.read_csv(os.path.join(DIR, 'files/unzipped_data/installments_payments.csv'))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "installments.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Preprocessing\n", + "## Solution 3" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Feature Engineering\n", + "## Solution 3" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Aggregations" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "INSTALLMENTS_PAYMENTS_AGGREGATION_RECIPIES = []\n", + "for agg in ['mean', 'min', 'max', 'sum', 'var']:\n", + " for select in ['AMT_INSTALMENT',\n", + " 'AMT_PAYMENT',\n", + " 'DAYS_ENTRY_PAYMENT',\n", + " 'DAYS_INSTALMENT',\n", + " 'NUM_INSTALMENT_NUMBER',\n", + " 'NUM_INSTALMENT_VERSION'\n", + " ]:\n", + " INSTALLMENTS_PAYMENTS_AGGREGATION_RECIPIES.append((select, agg))\n", + "INSTALLMENTS_PAYMENTS_AGGREGATION_RECIPIES = [(['SK_ID_CURR'], INSTALLMENTS_PAYMENTS_AGGREGATION_RECIPIES)]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "groupby_aggregate_names = []\n", + "for groupby_cols, specs in tqdm(INSTALLMENTS_PAYMENTS_AGGREGATION_RECIPIES):\n", + " group_object = installments.groupby(groupby_cols)\n", + " for select, agg in tqdm(specs):\n", + " groupby_aggregate_name = '{}_{}_{}'.format('_'.join(groupby_cols), agg, select)\n", + " application = application.merge(group_object[select]\n", + " .agg(agg)\n", + " .reset_index()\n", + " .rename(index=str,\n", + " columns={select: groupby_aggregate_name})\n", + " [groupby_cols + [groupby_aggregate_name]],\n", + " on=groupby_cols,\n", + " how='left')\n", + " groupby_aggregate_names.append(groupby_aggregate_name)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "application.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "application_agg = application[groupby_aggregate_names + ['TARGET']]\n", + "application_agg_corr = abs(application_agg.corr())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "application_agg_corr.sort_values('TARGET', ascending=False)['TARGET']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "cpu py3", + "language": "python", + "name": "cpu_py3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/eda-pos_cash_balance.ipynb b/notebooks/eda-pos_cash_balance.ipynb new file mode 100644 index 0000000..94583bc --- /dev/null +++ b/notebooks/eda-pos_cash_balance.ipynb @@ -0,0 +1,149 @@ +{ + "cells": [ + { + "cell_type": "code", + 
"execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import pandas as pd\n", + "from tqdm import tqdm_notebook as tqdm\n", + "from sklearn.externals import joblib\n", + "%matplotlib inline\n", + "import seaborn as sns\n", + "\n", + "DIR = '/mnt/ml-team/minerva/open-solutions/home-credit'\n", + "description = pd.read_csv(os.path.join(DIR,'data/HomeCredit_columns_description.csv'),encoding = 'latin1')\n", + "application = pd.read_csv(os.path.join(DIR, 'files/unzipped_data/application_train.csv'))\n", + "pos_cash_balance = pd.read_csv(os.path.join(DIR, 'files/unzipped_data/POS_CASH_balance.csv'))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pos_cash_balance.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Preprocessing\n", + "## Solution 3" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Feature Engineering\n", + "## Solution 3" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Aggregations" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "POS_CASH_BALANCE_AGGREGATION_RECIPIES = []\n", + "for agg in ['mean', 'min', 'max', 'sum', 'var']:\n", + " for select in ['MONTHS_BALANCE',\n", + " 'SK_DPD',\n", + " 'SK_DPD_DEF'\n", + " ]:\n", + " POS_CASH_BALANCE_AGGREGATION_RECIPIES.append((select, agg))\n", + "POS_CASH_BALANCE_AGGREGATION_RECIPIES = [(['SK_ID_CURR'], POS_CASH_BALANCE_AGGREGATION_RECIPIES)]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "groupby_aggregate_names = []\n", + "for groupby_cols, specs in tqdm(POS_CASH_BALANCE_AGGREGATION_RECIPIES):\n", + " group_object = pos_cash_balance.groupby(groupby_cols)\n", + " for select, agg in tqdm(specs):\n", + " groupby_aggregate_name = '{}_{}_{}'.format('_'.join(groupby_cols), agg, select)\n", + " application = application.merge(group_object[select]\n", + " .agg(agg)\n", + " .reset_index()\n", + " .rename(index=str,\n", + " columns={select: groupby_aggregate_name})\n", + " [groupby_cols + [groupby_aggregate_name]],\n", + " on=groupby_cols,\n", + " how='left')\n", + " groupby_aggregate_names.append(groupby_aggregate_name)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "application.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "application_agg = application[groupby_aggregate_names + ['TARGET']]\n", + "application_agg_corr = abs(application_agg.corr())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "application_agg_corr.sort_values('TARGET', ascending=False)['TARGET']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "cpu py3", + "language": "python", + "name": "cpu_py3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/eda-previous_application.ipynb b/notebooks/eda-previous_application.ipynb new file mode 100644 
index 0000000..be6126b --- /dev/null +++ b/notebooks/eda-previous_application.ipynb @@ -0,0 +1,155 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import pandas as pd\n", + "from tqdm import tqdm_notebook as tqdm\n", + "from sklearn.externals import joblib\n", + "%matplotlib inline\n", + "import seaborn as sns\n", + "\n", + "DIR = '/mnt/ml-team/minerva/open-solutions/home-credit'\n", + "description = pd.read_csv(os.path.join(DIR,'data/HomeCredit_columns_description.csv'),encoding = 'latin1')\n", + "application = pd.read_csv(os.path.join(DIR, 'files/unzipped_data/application_train.csv'))\n", + "previous_application = pd.read_csv(os.path.join(DIR, 'files/unzipped_data/previous_application.csv'))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "previous_application.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Preprocessing\n", + "## Solution 3" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Feature Engineering\n", + "## Solution 3" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Aggregations" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "PREVIOUS_APPLICATION_AGGREGATION_RECIPIES = []\n", + "for agg in ['mean', 'min', 'max', 'sum', 'var']:\n", + " for select in ['AMT_ANNUITY',\n", + " 'AMT_APPLICATION',\n", + " 'AMT_CREDIT',\n", + " 'AMT_DOWN_PAYMENT',\n", + " 'AMT_GOODS_PRICE',\n", + " 'CNT_PAYMENT',\n", + " 'DAYS_DECISION',\n", + " 'HOUR_APPR_PROCESS_START',\n", + " 'RATE_DOWN_PAYMENT'\n", + " ]:\n", + " PREVIOUS_APPLICATION_AGGREGATION_RECIPIES.append((select, agg))\n", + "PREVIOUS_APPLICATION_AGGREGATION_RECIPIES = [(['SK_ID_CURR'], PREVIOUS_APPLICATION_AGGREGATION_RECIPIES)]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "groupby_aggregate_names = []\n", + "for groupby_cols, specs in tqdm(PREVIOUS_APPLICATION_AGGREGATION_RECIPIES):\n", + " group_object = previous_application.groupby(groupby_cols)\n", + " for select, agg in tqdm(specs):\n", + " groupby_aggregate_name = '{}_{}_{}'.format('_'.join(groupby_cols), agg, select)\n", + " application = application.merge(group_object[select]\n", + " .agg(agg)\n", + " .reset_index()\n", + " .rename(index=str,\n", + " columns={select: groupby_aggregate_name})\n", + " [groupby_cols + [groupby_aggregate_name]],\n", + " on=groupby_cols,\n", + " how='left')\n", + " groupby_aggregate_names.append(groupby_aggregate_name)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "application.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "application_agg = application[groupby_aggregate_names + ['TARGET']]\n", + "application_agg_corr = abs(application_agg.corr())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "application_agg_corr.sort_values('TARGET', ascending=False)['TARGET']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "cpu py3", + "language": "python", + "name": "cpu_py3" + }, + "language_info": { + "codemirror_mode": { + "name": 
"ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/model_exploration.ipynb b/notebooks/model_exploration.ipynb new file mode 100644 index 0000000..7ea57c9 --- /dev/null +++ b/notebooks/model_exploration.ipynb @@ -0,0 +1,94 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "%matplotlib inline\n", + "import os\n", + "import sys\n", + "\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import pandas as pd\n", + "import seaborn as sns\n", + "from sklearn.externals import joblib\n", + "import lightgbm as lgb\n", + "\n", + "EXPERIMENT_DIR = '/mnt/ml-team/minerva/open-solutions/home-credit/kuba/experiments'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model_filepath = os.path.join(EXPERIMENT_DIR, 'solution_3_all_new_externals_790', 'transformers','light_gbm_fold_0')\n", + "light_gbm_model = joblib.load(model_filepath)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fig, ax = plt.subplots(1,1,figsize=(16,10))\n", + "lgb.plot_importance(light_gbm_model, max_num_features=20, ax=ax)\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "TREE_INDEX = 201\n", + "digraph = lgb.create_tree_digraph(light_gbm_model, tree_index=TREE_INDEX, show_info=['split_gain'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "digraph" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "cpu py3", + "language": "python", + "name": "cpu_py3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/overview.ipynb b/notebooks/overview.ipynb new file mode 100644 index 0000000..4f762a9 --- /dev/null +++ b/notebooks/overview.ipynb @@ -0,0 +1,63 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "sys.path.append('../')\n", + "\n", + "from src import pipeline_config as cfg\n", + "from src.pipelines import PIPELINES" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "solution_3 = PIPELINES['lightGBM'](config=cfg.SOLUTION_CONFIG, train_mode=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "solution_3" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "cpu py3", + "language": "python", + "name": "cpu_py3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": 
"python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/test_prediction_distributions.ipynb b/notebooks/test_prediction_distributions.ipynb new file mode 100644 index 0000000..2babee4 --- /dev/null +++ b/notebooks/test_prediction_distributions.ipynb @@ -0,0 +1,142 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "from scipy.stats import gmean\n", + "%matplotlib inline\n", + "import seaborn as sns\n", + "\n", + "FILEPATH = '/mnt/ml-team/minerva/open-solutions/home-credit/kuba/experiments/solution_3/lightGBM_out_of_fold_test_predictions.csv'\n", + "test_predictions = pd.read_csv(FILEPATH)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "test_predictions.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "test_predictions[test_predictions['SK_ID_CURR']==100001]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Arithmetic mean" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "test_prediction_ar_mean = test_predictions.groupby('SK_ID_CURR')['lightGBM_prediction'].apply(np.mean).reset_index()\n", + "\n", + "test_prediction_ar_mean.columns = ['SK_ID_CURR','TARGET']\n", + "test_prediction_ar_mean[test_prediction_ar_mean['SK_ID_CURR']==100001]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Geometric mean" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "test_prediction_geom_mean = test_predictions.groupby('SK_ID_CURR')['lightGBM_prediction'].apply(gmean).reset_index()\n", + "\n", + "test_prediction_geom_mean.columns = ['SK_ID_CURR','TARGET']\n", + "test_prediction_geom_mean[test_prediction_ar_mean['SK_ID_CURR']==100001]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Rank Mean" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def calculate_rank(predictions):\n", + " rank = (1 + predictions.rank().values) / (predictions.shape[0] + 1)\n", + " return rank\n", + "\n", + "test_predictions_with_ranks = []\n", + "for fold_id, fold_df in test_predictions.groupby('fold_id'):\n", + " fold_df['lightGBM_rank'] = calculate_rank(fold_df['lightGBM_prediction'])\n", + " test_predictions_with_ranks.append(fold_df)\n", + "test_predictions_with_ranks = pd.concat(test_predictions_with_ranks, axis=0)\n", + "\n", + "test_predictions_with_ranks.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "test_prediction_rank_mean = test_predictions_with_ranks.groupby('SK_ID_CURR')['lightGBM_rank'].apply(np.mean).reset_index()\n", + "\n", + "test_prediction_rank_mean.columns = ['SK_ID_CURR','TARGET']\n", + "test_prediction_rank_mean[test_prediction_ar_mean['SK_ID_CURR']==100001]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "cpu py3", + "language": "python", + "name": "cpu_py3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + 
"file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/pipeline_blocks.py b/pipeline_blocks.py deleted file mode 100644 index c17af89..0000000 --- a/pipeline_blocks.py +++ /dev/null @@ -1,427 +0,0 @@ -from functools import partial - -from sklearn.metrics import roc_auc_score -from steppy.adapter import Adapter, E -from steppy.base import Step, make_transformer -from toolkit.misc import LightGBM - -import feature_extraction as fe -from hyperparameter_tuning import RandomSearchOptimizer, NeptuneMonitor, PersistResults -from models import get_sklearn_classifier, XGBoost -from utils import ToNumpyLabel - - -def classifier_light_gbm(features, config, train_mode, **kwargs): - if train_mode: - features_train, features_valid = features - if config.random_search.light_gbm.n_runs: - transformer = RandomSearchOptimizer(TransformerClass=LightGBM, - params=config.light_gbm, - train_input_keys=[], - valid_input_keys=['X_valid', 'y_valid'], - score_func=roc_auc_score, - maximize=True, - n_runs=config.random_search.light_gbm.n_runs, - callbacks=[ - NeptuneMonitor( - **config.random_search.light_gbm.callbacks.neptune_monitor), - PersistResults( - **config.random_search.light_gbm.callbacks.persist_results)] - ) - else: - transformer = LightGBM(**config.light_gbm) - - light_gbm = Step(name='light_gbm', - transformer=transformer, - input_data=['input'], - input_steps=[features_train, features_valid], - adapter=Adapter({'X': E(features_train.name, 'features'), - 'y': E('input', 'y'), - 'feature_names': E(features_train.name, 'feature_names'), - 'categorical_features': E(features_train.name, 'categorical_features'), - 'X_valid': E(features_valid.name, 'features'), - 'y_valid': E('input', 'y_valid'), - }), - experiment_directory=config.pipeline.experiment_directory, - **kwargs) - else: - light_gbm = Step(name='light_gbm', - transformer=LightGBM(**config.light_gbm), - input_steps=[features], - adapter=Adapter({'X': E(features.name, 'features')}), - experiment_directory=config.pipeline.experiment_directory, - **kwargs) - return light_gbm - - -def classifier_xgb(features, config, train_mode, **kwargs): - if train_mode: - features_train, features_valid = features - if config.random_search.xgboost.n_runs: - transformer = RandomSearchOptimizer(TransformerClass=XGBoost, - params=config.xgboost, - train_input_keys=[], - valid_input_keys=['X_valid', 'y_valid'], - score_func=roc_auc_score, - maximize=True, - n_runs=config.random_search.xgboost.n_runs, - callbacks=[ - NeptuneMonitor( - **config.random_search.xgboost.callbacks.neptune_monitor), - PersistResults( - **config.random_search.xgboost.callbacks.persist_results)] - ) - else: - transformer = XGBoost(**config.xgboost) - - xgboost = Step(name='xgboost', - transformer=transformer, - input_data=['input'], - input_steps=[features_train, features_valid], - adapter=Adapter({'X': E(features_train.name, 'features'), - 'y': E('input', 'y'), - 'feature_names': E(features_train.name, 'feature_names'), - 'X_valid': E(features_valid.name, 'features'), - 'y_valid': E('input', 'y_valid'), - }), - experiment_directory=config.pipeline.experiment_directory, - **kwargs) - else: - xgboost = Step(name='xgboost', - transformer=XGBoost(**config.xgboost), - input_steps=[features], - adapter=Adapter({'X': E(features.name, 'features')}), - experiment_directory=config.pipeline.experiment_directory, - **kwargs) - return 
xgboost - - -def classifier_sklearn(sklearn_features, ClassifierClass, full_config, clf_name, train_mode, normalize, **kwargs): - config, model_params, rs_config = full_config - if train_mode: - if config.random_search.random_forest.n_runs: - transformer = RandomSearchOptimizer( - partial(get_sklearn_classifier, - ClassifierClass=ClassifierClass, - normalize=normalize), - model_params, - train_input_keys=[], - valid_input_keys=['X_valid', 'y_valid'], - score_func=roc_auc_score, - maximize=True, - n_runs=rs_config.n_runs, - callbacks=[NeptuneMonitor(**rs_config.callbacks.neptune_monitor), - PersistResults(**rs_config.callbacks.persist_results)] - ) - else: - transformer = get_sklearn_classifier(ClassifierClass, normalize, **model_params) - - sklearn_clf = Step(name=clf_name, - transformer=transformer, - input_data=['input'], - input_steps=[sklearn_features], - adapter=Adapter({'X': E(sklearn_features.name, 'X'), - 'y': E('input', 'y'), - 'X_valid': E(sklearn_features.name, 'X_valid'), - 'y_valid': E('input', 'y_valid'), - }), - experiment_directory=config.pipeline.experiment_directory, - **kwargs) - else: - sklearn_clf = Step(name=clf_name, - transformer=get_sklearn_classifier(ClassifierClass, normalize, **model_params), - input_steps=[sklearn_features], - adapter=Adapter({'X': E(sklearn_features.name, 'X')}), - experiment_directory=config.pipeline.experiment_directory, - **kwargs) - return sklearn_clf - - -def feature_extraction(config, train_mode, **kwargs): - if train_mode: - feature_by_type_split, feature_by_type_split_valid = _feature_by_type_splits(config, train_mode) - bureau, bureau_valid = _bureau(config, train_mode, **kwargs) - - categorical_encoder, categorical_encoder_valid = _categorical_encoders( - (feature_by_type_split, feature_by_type_split_valid), - config, - train_mode, - **kwargs) - - groupby_aggregation, groupby_aggregation_valid = _groupby_aggregations( - (feature_by_type_split, feature_by_type_split_valid), - config, - train_mode, - **kwargs) - - feature_combiner, feature_combiner_valid = _join_features(numerical_features=[feature_by_type_split, - groupby_aggregation, - bureau], - numerical_features_valid=[feature_by_type_split_valid, - groupby_aggregation_valid, - bureau_valid], - categorical_features=[categorical_encoder], - categorical_features_valid=[ - categorical_encoder_valid], - config=config, - train_mode=train_mode, - **kwargs) - - return feature_combiner, feature_combiner_valid - else: - feature_by_type_split = _feature_by_type_splits(config, train_mode) - bureau = _bureau(config, train_mode, **kwargs) - categorical_encoder = _categorical_encoders(feature_by_type_split, config, train_mode, **kwargs) - groupby_aggregation = _groupby_aggregations(feature_by_type_split, config, train_mode, **kwargs) - feature_combiner = _join_features(numerical_features=[feature_by_type_split, groupby_aggregation, bureau], - numerical_features_valid=[], - categorical_features=[categorical_encoder], - categorical_features_valid=[], - config=config, - train_mode=train_mode, - **kwargs) - - return feature_combiner - - -def preprocessing_fillna(features, config, train_mode, **kwargs): - if train_mode: - features_train, features_valid = features - fillna = Step(name='fillna', - transformer=_fillna(**config.preprocessing), - input_data=['input'], - input_steps=[features_train, features_valid], - adapter=Adapter({'X': E(features_train.name, 'features'), - 'X_valid': E(features_valid.name, 'features'), - }), - experiment_directory=config.pipeline.experiment_directory, - **kwargs - ) - 
else: - fillna = Step(name='fillna', - transformer=_fillna(**config.preprocessing), - input_data=['input'], - input_steps=[features], - adapter=Adapter({'X': E(features.name, 'features')}), - experiment_directory=config.pipeline.experiment_directory, - **kwargs - ) - return fillna - - -def _feature_by_type_splits(config, train_mode): - if train_mode: - feature_by_type_split = Step(name='feature_by_type_split', - transformer=fe.DataFrameByTypeSplitter(**config.dataframe_by_type_splitter), - input_data=['input'], - adapter=Adapter({'X': E('input', 'X')}), - experiment_directory=config.pipeline.experiment_directory) - - feature_by_type_split_valid = Step(name='feature_by_type_split_valid', - transformer=feature_by_type_split, - input_data=['input'], - adapter=Adapter({'X': E('input', 'X_valid')}), - experiment_directory=config.pipeline.experiment_directory) - - return feature_by_type_split, feature_by_type_split_valid - - else: - feature_by_type_split = Step(name='feature_by_type_split', - transformer=fe.DataFrameByTypeSplitter(**config.dataframe_by_type_splitter), - input_data=['input'], - adapter=Adapter({'X': E('input', 'X')}), - experiment_directory=config.pipeline.experiment_directory) - - return feature_by_type_split - - -def _join_features(numerical_features, - numerical_features_valid, - categorical_features, - categorical_features_valid, - config, train_mode, - **kwargs): - if train_mode: - feature_joiner = Step(name='feature_joiner', - transformer=fe.FeatureJoiner(), - input_steps=numerical_features + categorical_features, - adapter=Adapter({ - 'numerical_feature_list': [ - E(feature.name, 'numerical_features') for feature in numerical_features], - 'categorical_feature_list': [ - E(feature.name, 'categorical_features') for feature in categorical_features], - }), - experiment_directory=config.pipeline.experiment_directory, - **kwargs) - - feature_joiner_valid = Step(name='feature_joiner_valid', - transformer=feature_joiner, - input_steps=numerical_features_valid + categorical_features_valid, - adapter=Adapter({ - 'numerical_feature_list': [ - E(feature.name, - 'numerical_features') for feature in numerical_features_valid], - 'categorical_feature_list': [ - E(feature.name, - 'categorical_features') for feature in categorical_features_valid], - }), - experiment_directory=config.pipeline.experiment_directory, - **kwargs) - - return feature_joiner, feature_joiner_valid - - else: - feature_joiner = Step(name='feature_joiner', - transformer=fe.FeatureJoiner(), - input_steps=numerical_features + categorical_features, - adapter=Adapter( - {'numerical_feature_list': - [E(feature.name, 'numerical_features') for feature in numerical_features], - 'categorical_feature_list': - [E(feature.name, 'categorical_features') for feature in categorical_features]} - ), - experiment_directory=config.pipeline.experiment_directory, - **kwargs) - - return feature_joiner - - -def _categorical_encoders(dispatchers, config, train_mode, **kwargs): - if train_mode: - feature_by_type_split, feature_by_type_split_valid = dispatchers - numpy_label, numpy_label_valid = _to_numpy_label(config, **kwargs) - categorical_encoder = Step(name='categorical_encoder', - transformer=fe.CategoricalEncoder(), - input_data=['input'], - input_steps=[feature_by_type_split, numpy_label], - adapter=Adapter({'X': E(feature_by_type_split.name, 'categorical_features'), - 'y': E(numpy_label.name, 'y')} - ), - experiment_directory=config.pipeline.experiment_directory, - **kwargs) - - categorical_encoder_valid = 
Step(name='categorical_encoder_valid', - transformer=categorical_encoder, - input_data=['input'], - input_steps=[feature_by_type_split_valid, numpy_label_valid], - adapter=Adapter( - {'X': E(feature_by_type_split_valid.name, 'categorical_features'), - 'y': E(numpy_label_valid.name, 'y')} - ), - experiment_directory=config.pipeline.experiment_directory, - **kwargs) - - return categorical_encoder, categorical_encoder_valid - else: - feature_by_type_split = dispatchers - categorical_encoder = Step(name='categorical_encoder', - transformer=fe.CategoricalEncoder(), - input_data=['input'], - input_steps=[feature_by_type_split], - adapter=Adapter({'X': E(feature_by_type_split.name, 'categorical_features')}), - experiment_directory=config.pipeline.experiment_directory, - **kwargs) - return categorical_encoder - - -def _groupby_aggregations(dispatchers, config, train_mode, **kwargs): - if train_mode: - feature_by_type_split, feature_by_type_split_valid = dispatchers - groupby_aggregations = Step(name='groupby_aggregations', - transformer=fe.GroupbyAggregations(**config.groupby_aggregation), - input_data=['input'], - input_steps=[feature_by_type_split], - adapter=Adapter({'categorical_features': E(feature_by_type_split.name, - 'categorical_features'), - 'numerical_features': E(feature_by_type_split.name, - 'numerical_features') - }), - experiment_directory=config.pipeline.experiment_directory, - **kwargs) - - groupby_aggregations_valid = Step(name='groupby_aggregations_valid', - transformer=groupby_aggregations, - input_data=['input'], - input_steps=[feature_by_type_split_valid], - adapter=Adapter({'categorical_features': E(feature_by_type_split_valid.name, - 'categorical_features'), - 'numerical_features': E(feature_by_type_split_valid.name, - 'numerical_features') - }), - experiment_directory=config.pipeline.experiment_directory, - **kwargs) - - return groupby_aggregations, groupby_aggregations_valid - - else: - feature_by_type_split = dispatchers - groupby_aggregations = Step(name='groupby_aggregations', - transformer=fe.GroupbyAggregations(**config.groupby_aggregation), - input_data=['input'], - input_steps=[feature_by_type_split], - adapter=Adapter({'categorical_features': E(feature_by_type_split.name, - 'categorical_features'), - 'numerical_features': E(feature_by_type_split.name, - 'numerical_features') - }), - experiment_directory=config.pipeline.experiment_directory, - **kwargs) - - return groupby_aggregations - - -def _bureau(config, train_mode, **kwargs): - if train_mode: - bureau = Step(name='bureau', - transformer=fe.GroupbyAggregationFromFile(**config.bureau), - input_data=['input'], - adapter=Adapter({'X': E('input', 'X')}), - experiment_directory=config.pipeline.experiment_directory, - **kwargs) - - bureau_valid = Step(name='bureau_valid', - transformer=bureau, - input_data=['input'], - adapter=Adapter({'X': E('input', 'X_valid')}), - experiment_directory=config.pipeline.experiment_directory, - **kwargs) - - return bureau, bureau_valid - - else: - bureau = Step(name='bureau', - transformer=fe.GroupbyAggregationFromFile(**config.bureau), - input_data=['input'], - adapter=Adapter({'X': E('input', 'X')}), - experiment_directory=config.pipeline.experiment_directory, - **kwargs) - - return bureau - - -def _fillna(fillna_value): - def _inner_fillna(X, X_valid=None): - if X_valid is None: - return {'X': X.fillna(fillna_value)} - else: - return {'X': X.fillna(fillna_value), - 'X_valid': X_valid.fillna(fillna_value)} - return make_transformer(_inner_fillna) - - -def _to_numpy_label(config, 
**kwargs): - to_numpy_label = Step(name='to_numpy_label', - transformer=ToNumpyLabel(), - input_data=['input'], - adapter=Adapter({'y': [E('input', 'y')]}), - experiment_directory=config.pipeline.experiment_directory, - **kwargs) - - to_numpy_label_valid = Step(name='to_numpy_label_valid', - transformer=to_numpy_label, - input_data=['input'], - adapter=Adapter({'y': [E('input', 'y_valid')]}), - experiment_directory=config.pipeline.experiment_directory, - **kwargs) - - return to_numpy_label, to_numpy_label_valid diff --git a/postprocessing.py b/postprocessing.py deleted file mode 100644 index a9edc86..0000000 --- a/postprocessing.py +++ /dev/null @@ -1,14 +0,0 @@ -import numpy as np - -from steppy.base import BaseTransformer - - -class Clipper(BaseTransformer): - def __init__(self, min_val=0, max_val=1): - super().__init__() - self.min_val = min_val - self.max_val = max_val - - def transform(self, prediction): - prediction_ = np.clip(prediction, self.min_val, self.max_val) - return {'clipped_prediction': prediction_} diff --git a/src/data_cleaning.py b/src/data_cleaning.py new file mode 100644 index 0000000..914efea --- /dev/null +++ b/src/data_cleaning.py @@ -0,0 +1,33 @@ +import numpy as np +from steppy.base import BaseTransformer +from steppy.utils import get_logger + +logger = get_logger() + + +class ApplicationCleaning(BaseTransformer): + def __init__(self, **kwargs): + super().__init__() + + def transform(self, X): + X['CODE_GENDER'].replace('XNA', np.nan, inplace=True) + X['DAYS_EMPLOYED'].replace(365243, np.nan, inplace=True) + X['NAME_FAMILY_STATUS'].replace('Unknown', np.nan, inplace=True) + X['ORGANIZATION_TYPE'].replace('XNA', np.nan, inplace=True) + + return {'X': X} + + +class BureauCleaning(BaseTransformer): + def __init__(self, fill_missing=False, fill_value=0, **kwargs): + self.fill_missing = fill_missing + self.fill_value = fill_value + + def transform(self, bureau): + if self.fill_missing: + bureau['AMT_CREDIT_SUM'].fillna(self.fill_value, inplace=True) + bureau['AMT_CREDIT_SUM_DEBT'].fillna(self.fill_value, inplace=True) + bureau['AMT_CREDIT_SUM_OVERDUE'].fillna(self.fill_value, inplace=True) + bureau['CNT_CREDIT_PROLONG'].fillna(self.fill_value, inplace=True) + + return {'bureau': bureau} diff --git a/src/feature_extraction.py b/src/feature_extraction.py new file mode 100644 index 0000000..a726ebb --- /dev/null +++ b/src/feature_extraction.py @@ -0,0 +1,370 @@ +from copy import deepcopy + +import category_encoders as ce +import numpy as np +import pandas as pd +from sklearn.externals import joblib +from steppy.base import BaseTransformer +from steppy.utils import get_logger + +logger = get_logger() + + +class FeatureJoiner(BaseTransformer): + def transform(self, numerical_feature_list, categorical_feature_list, **kwargs): + features = numerical_feature_list + categorical_feature_list + for feature in features: + feature.reset_index(drop=True, inplace=True) + outputs = dict() + outputs['features'] = pd.concat(features, axis=1).astype(np.float32) + outputs['feature_names'] = self._get_feature_names(features) + outputs['categorical_features'] = self._get_feature_names(categorical_feature_list) + return outputs + + def _get_feature_names(self, dataframes): + feature_names = [] + for dataframe in dataframes: + try: + feature_names.extend(list(dataframe.columns)) + except Exception as e: + print(e) + feature_names.append(dataframe.name) + + return feature_names + + +class CategoricalEncoder(BaseTransformer): + def __init__(self, **kwargs): + super().__init__() + 
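+        # keep the list of categorical columns separately; the remaining kwargs are passed through to the ordinal encoder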
self.categorical_columns = kwargs['categorical_columns'] + params = deepcopy(kwargs) + params.pop('categorical_columns', None) + self.params = params + self.encoder_class = ce.OrdinalEncoder + self.categorical_encoder = None + + def fit(self, X, y, **kwargs): + X_ = X[self.categorical_columns] + self.categorical_encoder = self.encoder_class(cols=self.categorical_columns, **self.params) + self.categorical_encoder.fit(X_, y) + return self + + def transform(self, X, **kwargs): + X_ = X[self.categorical_columns] + X_ = self.categorical_encoder.transform(X_) + return {'categorical_features': X_} + + def load(self, filepath): + self.categorical_encoder = joblib.load(filepath) + return self + + def persist(self, filepath): + joblib.dump(self.categorical_encoder, filepath) + + +class GroupbyAggregate(BaseTransformer): + def __init__(self, groupby_aggregations): + super().__init__() + self.groupby_aggregations = groupby_aggregations + self.features = [] + self.feature_names = [] + + def fit(self, main_table, **kwargs): + for groupby_cols, specs in self.groupby_aggregations: + group_object = main_table.groupby(groupby_cols) + for select, agg in specs: + groupby_aggregate_name = self._create_colname_from_specs(groupby_cols, select, agg) + + group_features = group_object[select].agg(agg).reset_index() \ + .rename(index=str, + columns={select: groupby_aggregate_name})[groupby_cols + [groupby_aggregate_name]] + + self.features.append((groupby_cols, group_features)) + self.feature_names.append(groupby_aggregate_name) + return self + + def transform(self, main_table, **kwargs): + for groupby_cols, groupby_features in self.features: + main_table = main_table.merge(groupby_features, + on=groupby_cols, + how='left') + + return {'numerical_features': main_table[self.feature_names].astype(np.float32)} + + def load(self, filepath): + params = joblib.load(filepath) + self.features = params['features'] + self.feature_names = params['feature_names'] + return self + + def persist(self, filepath): + params = {'features': self.features, + 'feature_names': self.feature_names} + joblib.dump(params, filepath) + + def _create_colname_from_specs(self, groupby_cols, agg, select): + return '{}_{}_{}'.format('_'.join(groupby_cols), agg, select) + + +class GroupbyAggregateMerge(BaseTransformer): + def __init__(self, table_name, id_columns, groupby_aggregations): + super().__init__() + self.table_name = table_name + self.id_columns = id_columns + self.groupby_aggregations = groupby_aggregations + + @property + def feature_names(self): + feature_names = list(self.features.columns) + feature_names.remove(self.id_columns[0]) + return feature_names + + def fit(self, main_table, side_table, **kwargs): + features = pd.DataFrame({self.id_columns[0]: side_table[self.id_columns[0]].unique()}) + + for groupby_cols, specs in self.groupby_aggregations: + group_object = side_table.groupby(groupby_cols) + for select, agg in specs: + groupby_aggregate_name = self._create_colname_from_specs(groupby_cols, select, agg) + features = features.merge(group_object[select] + .agg(agg) + .reset_index() + .rename(index=str, + columns={select: groupby_aggregate_name}) + [groupby_cols + [groupby_aggregate_name]], + on=groupby_cols, + how='left') + self.features = features + return self + + def transform(self, main_table, side_table, **kwargs): + main_table = main_table.merge(self.features, + left_on=[self.id_columns[0]], + right_on=[self.id_columns[1]], + how='left', + validate='one_to_one') + + return {'numerical_features': 
main_table[self.feature_names].astype(np.float32)} + + def load(self, filepath): + self.features = joblib.load(filepath) + return self + + def persist(self, filepath): + joblib.dump(self.features, filepath) + + def _create_colname_from_specs(self, groupby_cols, select, agg): + return '{}_{}_{}_{}'.format(self.table_name, '_'.join(groupby_cols), agg, select) + + +class ApplicationFeatures(BaseTransformer): + def __init__(self, categorical_columns, numerical_columns): + self.categorical_columns = categorical_columns + self.numerical_columns = numerical_columns + self.engineered_numerical_columns = ['annuity_income_percentage', + 'car_to_birth_ratio', + 'car_to_employ_ratio', + 'children_ratio', + 'credit_to_annuity_ratio', + 'credit_to_goods_ratio', + 'credit_to_income_ratio', + 'days_employed_percentage', + 'income_credit_percentage', + 'income_per_child', + 'income_per_person', + 'payment_rate', + 'phone_to_birth_ratio', + 'phone_to_employ_ratio', + 'external_sources_weighted', + 'external_sources_min', + 'external_sources_max', + 'external_sources_sum', + 'external_sources_mean', + 'external_sources_nanmedian'] + + def transform(self, X, **kwargs): + X['annuity_income_percentage'] = X['AMT_ANNUITY'] / X['AMT_INCOME_TOTAL'] + X['car_to_birth_ratio'] = X['OWN_CAR_AGE'] / X['DAYS_BIRTH'] + X['car_to_employ_ratio'] = X['OWN_CAR_AGE'] / X['DAYS_EMPLOYED'] + X['children_ratio'] = X['CNT_CHILDREN'] / X['CNT_FAM_MEMBERS'] + X['credit_to_annuity_ratio'] = X['AMT_CREDIT'] / X['AMT_ANNUITY'] + X['credit_to_goods_ratio'] = X['AMT_CREDIT'] / X['AMT_GOODS_PRICE'] + X['credit_to_income_ratio'] = X['AMT_CREDIT'] / X['AMT_INCOME_TOTAL'] + X['days_employed_percentage'] = X['DAYS_EMPLOYED'] / X['DAYS_BIRTH'] + X['income_credit_percentage'] = X['AMT_INCOME_TOTAL'] / X['AMT_CREDIT'] + X['income_per_child'] = X['AMT_INCOME_TOTAL'] / (1 + X['CNT_CHILDREN']) + X['income_per_person'] = X['AMT_INCOME_TOTAL'] / X['CNT_FAM_MEMBERS'] + X['payment_rate'] = X['AMT_ANNUITY'] / X['AMT_CREDIT'] + X['phone_to_birth_ratio'] = X['DAYS_LAST_PHONE_CHANGE'] / X['DAYS_BIRTH'] + X['phone_to_employ_ratio'] = X['DAYS_LAST_PHONE_CHANGE'] / X['DAYS_EMPLOYED'] + X['external_sources_weighted'] = X.EXT_SOURCE_1 * 2 + X.EXT_SOURCE_2 * 3 + X.EXT_SOURCE_3 * 4 + for function_name in ['min', 'max', 'sum', 'mean', 'nanmedian']: + X['external_sources_{}'.format(function_name)] = eval('np.{}'.format(function_name))( + X[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']], axis=1) + + return {'numerical_features': X[self.engineered_numerical_columns + self.numerical_columns], + 'categorical_features': X[self.categorical_columns] + } + + +class BureauFeatures(BaseTransformer): + def __init__(self, **kwargs): + self.features = None + + @property + def feature_names(self): + feature_names = list(self.features.columns) + feature_names.remove('SK_ID_CURR') + return feature_names + + def fit(self, X, bureau, **kwargs): + bureau['bureau_credit_active_binary'] = (bureau['CREDIT_ACTIVE'] != 'Closed').astype(int) + bureau['bureau_credit_enddate_binary'] = (bureau['DAYS_CREDIT_ENDDATE'] > 0).astype(int) + groupby_SK_ID_CURR = bureau.groupby(by=['SK_ID_CURR']) + features = pd.DataFrame({'SK_ID_CURR': bureau['SK_ID_CURR'].unique()}) + + group_object = groupby_SK_ID_CURR['DAYS_CREDIT'].agg('count').reset_index() + group_object.rename(index=str, columns={'DAYS_CREDIT': 'bureau_number_of_past_loans'}, inplace=True) + features = features.merge(group_object, on=['SK_ID_CURR'], how='left') + + group_object = 
groupby_SK_ID_CURR['CREDIT_TYPE'].agg('nunique').reset_index() + group_object.rename(index=str, columns={'CREDIT_TYPE': 'bureau_number_of_loan_types'}, inplace=True) + features = features.merge(group_object, on=['SK_ID_CURR'], how='left') + + features['bureau_average_of_past_loans_per_type'] = \ + features['bureau_number_of_past_loans'] / features['bureau_number_of_loan_types'] + + group_object = groupby_SK_ID_CURR['bureau_credit_active_binary'].agg('mean').reset_index() + features = features.merge(group_object, on=['SK_ID_CURR'], how='left') + + group_object = groupby_SK_ID_CURR['AMT_CREDIT_SUM_DEBT'].agg('sum').reset_index() + group_object.rename(index=str, columns={'AMT_CREDIT_SUM_DEBT': 'bureau_total_customer_debt'}, inplace=True) + features = features.merge(group_object, on=['SK_ID_CURR'], how='left') + + group_object = groupby_SK_ID_CURR['AMT_CREDIT_SUM'].agg('sum').reset_index() + group_object.rename(index=str, columns={'AMT_CREDIT_SUM': 'bureau_total_customer_credit'}, inplace=True) + features = features.merge(group_object, on=['SK_ID_CURR'], how='left') + + features['bureau_debt_credit_ratio'] = \ + features['bureau_total_customer_debt'] / features['bureau_total_customer_credit'] + + group_object = groupby_SK_ID_CURR['AMT_CREDIT_SUM_OVERDUE'].agg('sum').reset_index() + group_object.rename(index=str, columns={'AMT_CREDIT_SUM_OVERDUE': 'bureau_total_customer_overdue'}, + inplace=True) + features = features.merge(group_object, on=['SK_ID_CURR'], how='left') + + features['bureau_overdue_debt_ratio'] = \ + features['bureau_total_customer_overdue'] / features['bureau_total_customer_debt'] + + group_object = groupby_SK_ID_CURR['CNT_CREDIT_PROLONG'].agg('sum').reset_index() + group_object.rename(index=str, columns={'CNT_CREDIT_PROLONG': 'bureau_average_creditdays_prolonged'}, + inplace=True) + features = features.merge(group_object, on=['SK_ID_CURR'], how='left') + + group_object = groupby_SK_ID_CURR['bureau_credit_enddate_binary'].agg('mean').reset_index() + group_object.rename(index=str, columns={'bureau_credit_enddate_binary': 'bureau_credit_enddate_percentage'}, + inplace=True) + features = features.merge(group_object, on=['SK_ID_CURR'], how='left') + + self.features = features + return self + + def transform(self, X, **kwargs): + X = X.merge(self.features, + left_on=['SK_ID_CURR'], + right_on=['SK_ID_CURR'], + how='left', + validate='one_to_one') + + return {'numerical_features': X[self.feature_names]} + + def load(self, filepath): + self.features = joblib.load(filepath) + return self + + def persist(self, filepath): + joblib.dump(self.features, filepath) + + +class CreditCardBalanceFeatures(BaseTransformer): + def __init__(self, **kwargs): + self.features = None + + @property + def feature_names(self): + feature_names = list(self.features.columns) + feature_names.remove('SK_ID_CURR') + return feature_names + + def fit(self, X, credit_card, **kwargs): + credit_card['number_of_instalments'] = credit_card.groupby( + by=['SK_ID_CURR', 'SK_ID_PREV'])['CNT_INSTALMENT_MATURE_CUM'].agg('max').reset_index()[ + 'CNT_INSTALMENT_MATURE_CUM'] + + credit_card['credit_card_max_loading_of_credit_limit'] = credit_card.groupby( + by=['SK_ID_CURR', 'SK_ID_PREV', 'AMT_CREDIT_LIMIT_ACTUAL']).apply( + lambda x: x.AMT_BALANCE.max() / x.AMT_CREDIT_LIMIT_ACTUAL.max()).reset_index()[0] + + features = pd.DataFrame({'SK_ID_CURR': credit_card['SK_ID_CURR'].unique()}) + + group_object = credit_card.groupby(by=['SK_ID_CURR'])['SK_ID_PREV'].agg('nunique').reset_index() + group_object.rename(index=str, 
columns={'SK_ID_PREV': 'credit_card_number_of_loans'}, inplace=True) + features = features.merge(group_object, on=['SK_ID_CURR'], how='left') + + group_object = credit_card.groupby(by=['SK_ID_CURR'])['number_of_instalments'].sum().reset_index() + group_object.rename(index=str, columns={'number_of_instalments': 'credit_card_total_instalments'}, inplace=True) + features = features.merge(group_object, on=['SK_ID_CURR'], how='left') + + features['credit_card_installments_per_loan'] = ( + features['credit_card_total_instalments'] / features['credit_card_number_of_loans']) + + group_object = credit_card.groupby(by=['SK_ID_CURR'])['credit_card_max_loading_of_credit_limit'].agg( + 'mean').reset_index() + group_object.rename(index=str, columns={ + 'credit_card_max_loading_of_credit_limit': 'credit_card_avg_loading_of_credit_limit'}, inplace=True) + features = features.merge(group_object, on=['SK_ID_CURR'], how='left') + + group_object = credit_card.groupby( + by=['SK_ID_CURR'])['SK_DPD'].agg('mean').reset_index() + group_object.rename(index=str, columns={'SK_DPD': 'credit_card_average_of_days_past_due'}, inplace=True) + features = features.merge(group_object, on=['SK_ID_CURR'], how='left') + + group_object = credit_card.groupby(by=['SK_ID_CURR'])['AMT_DRAWINGS_ATM_CURRENT'].agg('sum').reset_index() + group_object.rename(index=str, columns={'AMT_DRAWINGS_ATM_CURRENT': 'credit_card_drawings_atm'}, inplace=True) + features = features.merge(group_object, on=['SK_ID_CURR'], how='left') + + group_object = credit_card.groupby(by=['SK_ID_CURR'])['AMT_DRAWINGS_CURRENT'].agg('sum').reset_index() + group_object.rename(index=str, columns={'AMT_DRAWINGS_CURRENT': 'credit_card_drawings_total'}, inplace=True) + features = features.merge(group_object, on=['SK_ID_CURR'], how='left') + + features['credit_card_cash_card_ratio'] = features['credit_card_drawings_atm'] / features[ + 'credit_card_drawings_total'] + + self.features = features + return self + + def transform(self, X, **kwargs): + X = X.merge(self.features, + left_on=['SK_ID_CURR'], + right_on=['SK_ID_CURR'], + how='left', + validate='one_to_one') + + return {'numerical_features': X[self.feature_names]} + + def load(self, filepath): + self.features = joblib.load(filepath) + return self + + def persist(self, filepath): + joblib.dump(self.features, filepath) + + +class ConcatFeatures(BaseTransformer): + def transform(self, **kwargs): + features_concat = [] + for _, feature in kwargs.items(): + feature.reset_index(drop=True, inplace=True) + features_concat.append(feature) + features_concat = pd.concat(features_concat, axis=1) + return {'concatenated_features': features_concat} diff --git a/hyperparameter_tuning.py b/src/hyperparameter_tuning.py similarity index 99% rename from hyperparameter_tuning.py rename to src/hyperparameter_tuning.py index 5bdab15..b5cd2cf 100644 --- a/hyperparameter_tuning.py +++ b/src/hyperparameter_tuning.py @@ -6,7 +6,7 @@ from steppy.base import BaseTransformer from steppy.utils import get_logger -from utils import set_seed +from .utils import set_seed logger = get_logger() diff --git a/src/models.py b/src/models.py new file mode 100644 index 0000000..3598eba --- /dev/null +++ b/src/models.py @@ -0,0 +1,208 @@ +from attrdict import AttrDict +from deepsense import neptune +import lightgbm as lgb +import numpy as np +import pandas as pd +from sklearn.externals import joblib +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import StandardScaler +from steppy.base import BaseTransformer +from 
toolkit.sklearn_transformers.models import SklearnClassifier +import xgboost as xgb + +from .utils import get_logger + +logger = get_logger() +ctx = neptune.Context() + + +class XGBoost(BaseTransformer): + def __init__(self, **params): + super().__init__() + logger.info('initializing XGBoost...') + self.params = params + self.training_params = ['nrounds', 'early_stopping_rounds'] + self.evaluation_function = None + + @property + def model_config(self): + return AttrDict({param: value for param, value in self.params.items() + if param not in self.training_params}) + + @property + def training_config(self): + return AttrDict({param: value for param, value in self.params.items() + if param in self.training_params}) + + def fit(self, + X, y, + X_valid, y_valid, + feature_names=None, + feature_types=None, + **kwargs): + train = xgb.DMatrix(X, + label=y, + feature_names=feature_names, + feature_types=feature_types) + valid = xgb.DMatrix(X_valid, + label=y_valid, + feature_names=feature_names, + feature_types=feature_types) + + evaluation_results = {} + self.estimator = xgb.train(params=self.model_config, + dtrain=train, + evals=[(train, 'train'), (valid, 'valid')], + evals_result=evaluation_results, + num_boost_round=self.training_config.nrounds, + early_stopping_rounds=self.training_config.early_stopping_rounds, + verbose_eval=self.model_config.verbose, + feval=self.evaluation_function) + return self + + def transform(self, X, y=None, feature_names=None, feature_types=None, **kwargs): + X_DMatrix = xgb.DMatrix(X, + label=y, + feature_names=feature_names, + feature_types=feature_types) + prediction = self.estimator.predict(X_DMatrix) + return {'prediction': prediction} + + def load(self, filepath): + self.estimator = xgb.Booster(params=self.model_config) + self.estimator.load_model(filepath) + return self + + def persist(self, filepath): + self.estimator.save_model(filepath) + + +class LightGBM(BaseTransformer): + def __init__(self, name=None, **params): + super().__init__() + logger.info('initializing LightGBM...') + self.params = params + self.training_params = ['number_boosting_rounds', 'early_stopping_rounds'] + self.evaluation_function = None + self.callbacks = callbacks(channel_prefix=name) + + @property + def model_config(self): + return AttrDict({param: value for param, value in self.params.items() + if param not in self.training_params}) + + @property + def training_config(self): + return AttrDict({param: value for param, value in self.params.items() + if param in self.training_params}) + + def fit(self, + X, + y, + X_valid, + y_valid, + feature_names='auto', + categorical_features='auto', + **kwargs): + evaluation_results = {} + + self._check_target_shape_and_type(y, 'y') + self._check_target_shape_and_type(y_valid, 'y_valid') + y = self._format_target(y) + y_valid = self._format_target(y_valid) + + logger.info('LightGBM, train data shape {}'.format(X.shape)) + logger.info('LightGBM, validation data shape {}'.format(X_valid.shape)) + logger.info('LightGBM, train labels shape {}'.format(y.shape)) + logger.info('LightGBM, validation labels shape {}'.format(y_valid.shape)) + + data_train = lgb.Dataset(data=X, + label=y, + feature_name=feature_names, + categorical_feature=categorical_features, + **kwargs) + data_valid = lgb.Dataset(X_valid, + label=y_valid, + feature_name=feature_names, + categorical_feature=categorical_features, + **kwargs) + + self.estimator = lgb.train(self.model_config, + data_train, + feature_name=feature_names, + categorical_feature=categorical_features, + 
valid_sets=[data_train, data_valid], + valid_names=['data_train', 'data_valid'], + evals_result=evaluation_results, + num_boost_round=self.training_config.number_boosting_rounds, + early_stopping_rounds=self.training_config.early_stopping_rounds, + verbose_eval=self.model_config.verbose, + feval=self.evaluation_function, + callbacks=self.callbacks, + **kwargs) + return self + + def transform(self, X, **kwargs): + prediction = self.estimator.predict(X) + return {'prediction': prediction} + + def load(self, filepath): + self.estimator = joblib.load(filepath) + return self + + def persist(self, filepath): + joblib.dump(self.estimator, filepath) + + def _check_target_shape_and_type(self, target, name): + if not any([isinstance(target, obj_type) for obj_type in [pd.Series, np.ndarray, list]]): + raise TypeError( + '"target" must be "numpy.ndarray" or "Pandas.Series" or "list", got {} instead.'.format(type(target))) + try: + assert len(target.shape) == 1, '"{}" must be 1-D. It is {}-D instead.'.format(name, + len(target.shape)) + except AttributeError: + print('Cannot determine shape of the {}. ' + 'Type must be "numpy.ndarray" or "Pandas.Series" or "list", got {} instead'.format(name, + type(target))) + + def _format_target(self, target): + + if isinstance(target, pd.Series): + return target.values + elif isinstance(target, np.ndarray): + return target + elif isinstance(target, list): + return np.array(target) + else: + raise TypeError( + '"target" must be "numpy.ndarray" or "Pandas.Series" or "list", got {} instead.'.format(type(target))) + + +def get_sklearn_classifier(ClassifierClass, normalize, **kwargs): + class SklearnBinaryClassifier(SklearnClassifier): + def transform(self, X, y=None, target=1, **kwargs): + prediction = self.estimator.predict_proba(X)[:, target] + return {SklearnClassifier.RESULT_KEY: prediction} + + if normalize: + return SklearnBinaryClassifier(Pipeline([('standarizer', StandardScaler()), + ('classifier', ClassifierClass(**kwargs))])) + + return SklearnBinaryClassifier(ClassifierClass(**kwargs)) + + +def callbacks(channel_prefix): + neptune_monitor = neptune_monitor_lgbm(channel_prefix) + return [neptune_monitor] + + +def neptune_monitor_lgbm(channel_prefix=''): + def callback(env): + for name, loss_name, loss_value, _ in env.evaluation_result_list: + if channel_prefix != '': + channel_name = '{}_{}_{}'.format(channel_prefix, name, loss_name) + else: + channel_name = '{}_{}'.format(name, loss_name) + ctx.channel_send(channel_name, x=env.iteration, y=loss_value) + + return callback diff --git a/src/pipeline_blocks.py b/src/pipeline_blocks.py new file mode 100644 index 0000000..d07fb4d --- /dev/null +++ b/src/pipeline_blocks.py @@ -0,0 +1,582 @@ +from functools import partial + +from sklearn.metrics import roc_auc_score +from steppy.adapter import Adapter, E +from steppy.base import Step, make_transformer + +from . import feature_extraction as fe +from . 
import data_cleaning as dc +from .hyperparameter_tuning import RandomSearchOptimizer, NeptuneMonitor, PersistResults +from .models import get_sklearn_classifier, XGBoost, LightGBM + + +def classifier_light_gbm(features, config, train_mode, suffix, **kwargs): + model_name = 'light_gbm{}'.format(suffix) + + if train_mode: + features_train, features_valid = features + if config.random_search.light_gbm.n_runs: + transformer = RandomSearchOptimizer(TransformerClass=LightGBM, + params=config.light_gbm, + train_input_keys=[], + valid_input_keys=['X_valid', 'y_valid'], + score_func=roc_auc_score, + maximize=True, + n_runs=config.random_search.light_gbm.n_runs, + callbacks=[ + NeptuneMonitor( + **config.random_search.light_gbm.callbacks.neptune_monitor), + PersistResults( + **config.random_search.light_gbm.callbacks.persist_results)] + ) + else: + transformer = LightGBM(name=model_name, **config.light_gbm) + + light_gbm = Step(name=model_name, + transformer=transformer, + input_data=['application'], + input_steps=[features_train, features_valid], + adapter=Adapter({'X': E(features_train.name, 'features'), + 'y': E('application', 'y'), + 'feature_names': E(features_train.name, 'feature_names'), + 'categorical_features': E(features_train.name, 'categorical_features'), + 'X_valid': E(features_valid.name, 'features'), + 'y_valid': E('application', 'y_valid'), + }), + experiment_directory=config.pipeline.experiment_directory, + **kwargs) + else: + light_gbm = Step(name=model_name, + transformer=LightGBM(name=model_name, **config.light_gbm), + input_steps=[features], + adapter=Adapter({'X': E(features.name, 'features')}), + experiment_directory=config.pipeline.experiment_directory, + **kwargs) + return light_gbm + + +def classifier_xgb(features, config, train_mode, suffix, **kwargs): + if train_mode: + features_train, features_valid = features + if config.random_search.xgboost.n_runs: + transformer = RandomSearchOptimizer(TransformerClass=XGBoost, + params=config.xgboost, + train_input_keys=[], + valid_input_keys=['X_valid', 'y_valid'], + score_func=roc_auc_score, + maximize=True, + n_runs=config.random_search.xgboost.n_runs, + callbacks=[ + NeptuneMonitor( + **config.random_search.xgboost.callbacks.neptune_monitor), + PersistResults( + **config.random_search.xgboost.callbacks.persist_results)] + ) + else: + transformer = XGBoost(**config.xgboost) + + xgboost = Step(name='xgboost{}'.format(suffix), + transformer=transformer, + input_data=['application'], + input_steps=[features_train, features_valid], + adapter=Adapter({'X': E(features_train.name, 'features'), + 'y': E('application', 'y'), + 'feature_names': E(features_train.name, 'feature_names'), + 'X_valid': E(features_valid.name, 'features'), + 'y_valid': E('application', 'y_valid'), + }), + experiment_directory=config.pipeline.experiment_directory, + **kwargs) + else: + xgboost = Step(name='xgboost{}'.format(suffix), + transformer=XGBoost(**config.xgboost), + input_steps=[features], + adapter=Adapter({'X': E(features.name, 'features')}), + experiment_directory=config.pipeline.experiment_directory, + **kwargs) + return xgboost + + +def classifier_sklearn(sklearn_features, + ClassifierClass, + full_config, + clf_name, + train_mode, + suffix, + normalize, + **kwargs): + config, model_params, rs_config = full_config + if train_mode: + if config.random_search.random_forest.n_runs: + transformer = RandomSearchOptimizer( + partial(get_sklearn_classifier, + ClassifierClass=ClassifierClass, + normalize=normalize), + model_params, + train_input_keys=[], + 
valid_input_keys=['X_valid', 'y_valid'], + score_func=roc_auc_score, + maximize=True, + n_runs=rs_config.n_runs, + callbacks=[NeptuneMonitor(**rs_config.callbacks.neptune_monitor), + PersistResults(**rs_config.callbacks.persist_results)] + ) + else: + transformer = get_sklearn_classifier(ClassifierClass, normalize, **model_params) + + sklearn_clf = Step(name='{}{}'.format(clf_name, suffix), + transformer=transformer, + input_data=['application'], + input_steps=[sklearn_features], + adapter=Adapter({'X': E(sklearn_features.name, 'X'), + 'y': E('application', 'y'), + 'X_valid': E(sklearn_features.name, 'X_valid'), + 'y_valid': E('application', 'y_valid'), + }), + experiment_directory=config.pipeline.experiment_directory, + **kwargs) + else: + sklearn_clf = Step(name='{}{}'.format(clf_name, suffix), + transformer=get_sklearn_classifier(ClassifierClass, normalize, **model_params), + input_steps=[sklearn_features], + adapter=Adapter({'X': E(sklearn_features.name, 'X')}), + experiment_directory=config.pipeline.experiment_directory, + **kwargs) + return sklearn_clf + + +def feature_extraction(config, train_mode, suffix, **kwargs): + if train_mode: + application, application_valid = _application(config, train_mode, suffix, **kwargs) + bureau, bureau_valid = _bureau(config, train_mode, suffix, **kwargs) + credit_card_balance, credit_card_balance_valid = _credit_card_balance(config, train_mode, suffix, **kwargs) + + application_agg, application_agg_valid = _application_groupby_agg(config, train_mode, suffix, **kwargs) + bureau_agg, bureau_agg_valid = _bureau_groupby_agg(config, train_mode, suffix, **kwargs) + credit_card_balance_agg, credit_card_balance_agg_valid = _credit_card_balance_groupby_agg( + config, + train_mode, suffix, + **kwargs) + installments_payments_agg, installments_payments_agg_valid = _installments_payments_groupby_agg( + config, + train_mode, suffix, + **kwargs) + pos_cash_balance_agg, pos_cash_balance_agg_valid = _pos_cash_balance_groupby_agg( + config, + train_mode, suffix, + **kwargs) + previous_applications_agg, previous_applications_agg_valid = _previous_applications_groupby_agg( + config, + train_mode, suffix, + **kwargs) + + categorical_encoder, categorical_encoder_valid = _categorical_encoders(config, train_mode, suffix, **kwargs) + + feature_combiner, feature_combiner_valid = _join_features( + numerical_features=[application, + application_agg, + previous_applications_agg, + bureau, + bureau_agg, + credit_card_balance, + credit_card_balance_agg, + installments_payments_agg, + pos_cash_balance_agg, + ], + numerical_features_valid=[application_valid, + application_agg_valid, + previous_applications_agg_valid, + bureau_valid, + bureau_agg_valid, + credit_card_balance_valid, + credit_card_balance_agg_valid, + installments_payments_agg_valid, + pos_cash_balance_agg_valid, + ], + categorical_features=[categorical_encoder + ], + categorical_features_valid=[categorical_encoder_valid + ], + config=config, + train_mode=train_mode, + suffix=suffix, + **kwargs) + + return feature_combiner, feature_combiner_valid + else: + application = _application(config, train_mode, suffix, **kwargs) + bureau = _bureau(config, train_mode, suffix, **kwargs) + credit_card_balance = _credit_card_balance(config, train_mode, suffix, **kwargs) + + application_agg = _application_groupby_agg(config, train_mode, suffix, **kwargs) + bureau_agg = _bureau_groupby_agg(config, train_mode, suffix, **kwargs) + credit_card_balance_agg = _credit_card_balance_groupby_agg(config, train_mode, suffix, **kwargs) + 
installments_payments_agg = _installments_payments_groupby_agg(config, train_mode, suffix, **kwargs) + pos_cash_balance_agg = _pos_cash_balance_groupby_agg(config, train_mode, suffix, **kwargs) + previous_applications_agg = _previous_applications_groupby_agg(config, train_mode, suffix, **kwargs) + categorical_encoder = _categorical_encoders(config, train_mode, suffix, **kwargs) + feature_combiner = _join_features(numerical_features=[application, + application_agg, + previous_applications_agg, + bureau, + bureau_agg, + credit_card_balance, + credit_card_balance_agg, + installments_payments_agg, + pos_cash_balance_agg, + ], + numerical_features_valid=[], + categorical_features=[categorical_encoder + ], + categorical_features_valid=[], + config=config, + train_mode=train_mode, + suffix=suffix, + **kwargs) + + return feature_combiner + + +def preprocessing_fillna(features, config, train_mode, suffix, **kwargs): + if train_mode: + features_train, features_valid = features + fillna = Step(name='fillna{}'.format(suffix), + transformer=_fillna(**config.preprocessing), + input_steps=[features_train, features_valid], + adapter=Adapter({'X': E(features_train.name, 'features'), + 'X_valid': E(features_valid.name, 'features'), + }), + experiment_directory=config.pipeline.experiment_directory, + **kwargs + ) + else: + fillna = Step(name='fillna{}'.format(suffix), + transformer=_fillna(**config.preprocessing), + input_steps=[features], + adapter=Adapter({'X': E(features.name, 'features')}), + experiment_directory=config.pipeline.experiment_directory, + **kwargs + ) + return fillna + + +def _join_features(numerical_features, + numerical_features_valid, + categorical_features, + categorical_features_valid, + config, train_mode, suffix, + **kwargs): + if train_mode: + persist_output = True + cache_output = True + load_persisted_output = True + else: + persist_output = False + cache_output = True + load_persisted_output = False + + feature_joiner = Step(name='feature_joiner{}'.format(suffix), + transformer=fe.FeatureJoiner(), + input_steps=numerical_features + categorical_features, + adapter=Adapter({ + 'numerical_feature_list': [ + E(feature.name, 'numerical_features') for feature in numerical_features], + 'categorical_feature_list': [ + E(feature.name, 'categorical_features') for feature in categorical_features], + }), + experiment_directory=config.pipeline.experiment_directory, + persist_output=persist_output, + cache_output=cache_output, + load_persisted_output=load_persisted_output) + if train_mode: + feature_joiner_valid = Step(name='feature_joiner_valid{}'.format(suffix), + transformer=feature_joiner, + input_steps=numerical_features_valid + categorical_features_valid, + adapter=Adapter({ + 'numerical_feature_list': [ + E(feature.name, + 'numerical_features') for feature in numerical_features_valid], + 'categorical_feature_list': [ + E(feature.name, + 'categorical_features') for feature in categorical_features_valid], + }), + experiment_directory=config.pipeline.experiment_directory, + persist_output=persist_output, + cache_output=cache_output, + load_persisted_output=load_persisted_output) + + return feature_joiner, feature_joiner_valid + + else: + return feature_joiner + + +def _categorical_encoders(config, train_mode, suffix, **kwargs): + categorical_encoder = Step(name='categorical_encoder{}'.format(suffix), + transformer=fe.CategoricalEncoder(**config.preprocessing.categorical_encoder), + input_data=['application'], + adapter=Adapter({'X': E('application', 'X'), + 'y': E('application', 'y')} + ), 
+ experiment_directory=config.pipeline.experiment_directory, + **kwargs) + if train_mode: + categorical_encoder_valid = Step(name='categorical_encoder_valid{}'.format(suffix), + transformer=categorical_encoder, + input_data=['application'], + adapter=Adapter( + {'X': E('application', 'X_valid'), + 'y': E('application', 'y_valid')} + ), + experiment_directory=config.pipeline.experiment_directory, + **kwargs) + return categorical_encoder, categorical_encoder_valid + else: + return categorical_encoder + + +def _application_groupby_agg(config, train_mode, suffix, **kwargs): + application_groupby_agg = Step(name='application_groupby_agg{}'.format(suffix), + transformer=fe.GroupbyAggregate(**config.applications.aggregations), + input_data=['application'], + adapter=Adapter( + {'main_table': E('application', 'X')}), + experiment_directory=config.pipeline.experiment_directory, + **kwargs) + + if train_mode: + + application_groupby_agg_valid = Step(name='application_groupby_agg_valid{}'.format(suffix), + transformer=application_groupby_agg, + input_data=['application'], + adapter=Adapter( + {'main_table': E('application', 'X_valid'), + }), + experiment_directory=config.pipeline.experiment_directory, + **kwargs) + + return application_groupby_agg, application_groupby_agg_valid + + else: + return application_groupby_agg + + +def _bureau_groupby_agg(config, train_mode, suffix, **kwargs): + bureau_groupby_agg = Step(name='bureau_groupby_agg{}'.format(suffix), + transformer=fe.GroupbyAggregateMerge(**config.bureau), + input_data=['application', 'bureau'], + adapter=Adapter({'main_table': E('application', 'X'), + 'side_table': E('bureau', 'X')}), + experiment_directory=config.pipeline.experiment_directory, + **kwargs) + + if train_mode: + bureau_groupby_agg_valid = Step(name='bureau_groupby_agg_valid{}'.format(suffix), + transformer=bureau_groupby_agg, + input_data=['application', 'bureau'], + adapter=Adapter({'main_table': E('application', 'X_valid'), + 'side_table': E('bureau', 'X')}), + experiment_directory=config.pipeline.experiment_directory, + **kwargs) + return bureau_groupby_agg, bureau_groupby_agg_valid + else: + return bureau_groupby_agg + + +def _credit_card_balance_groupby_agg(config, train_mode, suffix, **kwargs): + credit_card_balance_groupby_agg = Step(name='credit_card_balance_groupby_agg{}'.format(suffix), + transformer=fe.GroupbyAggregateMerge(**config.credit_card_balance), + input_data=['application', 'credit_card_balance'], + adapter=Adapter({'main_table': E('application', 'X'), + 'side_table': E('credit_card_balance', 'X')}), + experiment_directory=config.pipeline.experiment_directory, + **kwargs) + if train_mode: + credit_card_balance_groupby_agg_valid = Step(name='credit_card_balance_groupby_agg_valid{}'.format(suffix), + transformer=credit_card_balance_groupby_agg, + input_data=['application', 'credit_card_balance'], + adapter=Adapter({'main_table': E('application', 'X_valid'), + 'side_table': E('credit_card_balance', 'X')}), + experiment_directory=config.pipeline.experiment_directory, + **kwargs) + return credit_card_balance_groupby_agg, credit_card_balance_groupby_agg_valid + + else: + return credit_card_balance_groupby_agg + + +def _installments_payments_groupby_agg(config, train_mode, suffix, **kwargs): + installments_payments_groupby_agg = Step(name='installments_payments_groupby_agg{}'.format(suffix), + transformer=fe.GroupbyAggregateMerge(**config.installments_payments), + input_data=['application', 'installments_payments'], + adapter=Adapter({'main_table': E('application', 
'X'), + 'side_table': E('installments_payments', 'X')}), + experiment_directory=config.pipeline.experiment_directory, + **kwargs) + if train_mode: + installments_payments_groupby_agg_valid = Step(name='installments_payments_groupby_agg_valid{}'.format(suffix), + transformer=installments_payments_groupby_agg, + input_data=['application', 'installments_payments'], + adapter=Adapter({'main_table': E('application', 'X_valid'), + 'side_table': E('installments_payments', 'X')}), + experiment_directory=config.pipeline.experiment_directory, + **kwargs) + + return installments_payments_groupby_agg, installments_payments_groupby_agg_valid + + else: + return installments_payments_groupby_agg + + +def _pos_cash_balance_groupby_agg(config, train_mode, suffix, **kwargs): + pos_cash_balance_groupby_agg = Step(name='pos_cash_balance_groupby_agg{}'.format(suffix), + transformer=fe.GroupbyAggregateMerge(**config.pos_cash_balance), + input_data=['application', 'pos_cash_balance'], + adapter=Adapter({'main_table': E('application', 'X'), + 'side_table': E('pos_cash_balance', 'X')}), + experiment_directory=config.pipeline.experiment_directory, + **kwargs) + if train_mode: + pos_cash_balance_groupby_agg_valid = Step(name='pos_cash_balance_groupby_agg_valid{}'.format(suffix), + transformer=pos_cash_balance_groupby_agg, + input_data=['application', 'pos_cash_balance'], + adapter=Adapter({'main_table': E('application', 'X_valid'), + 'side_table': E('pos_cash_balance', 'X')}), + experiment_directory=config.pipeline.experiment_directory, + **kwargs) + + return pos_cash_balance_groupby_agg, pos_cash_balance_groupby_agg_valid + + else: + return pos_cash_balance_groupby_agg + + +def _previous_applications_groupby_agg(config, train_mode, suffix, **kwargs): + previous_applications_groupby_agg = Step(name='previous_applications_groupby_agg{}'.format(suffix), + transformer=fe.GroupbyAggregateMerge(**config.previous_applications), + input_data=['application', 'previous_application'], + adapter=Adapter({'main_table': E('application', 'X'), + 'side_table': E('previous_application', 'X')}), + experiment_directory=config.pipeline.experiment_directory, + **kwargs) + if train_mode: + previous_applications_groupby_agg_valid = Step(name='previous_applications_groupby_agg_valid{}'.format(suffix), + transformer=previous_applications_groupby_agg, + input_data=['application', 'previous_application'], + adapter=Adapter({'main_table': E('application', 'X_valid'), + 'side_table': E('previous_application', 'X')}), + experiment_directory=config.pipeline.experiment_directory, + **kwargs) + return previous_applications_groupby_agg, previous_applications_groupby_agg_valid + else: + return previous_applications_groupby_agg + + +def _application_cleaning(config, train_mode, suffix, **kwargs): + application_cleaning = Step(name='application_cleaning{}'.format(suffix), + transformer=dc.ApplicationCleaning(**config.preprocessing.impute_missing), + input_data=['application'], + adapter=Adapter({'X': E('application', 'X')}), + experiment_directory=config.pipeline.experiment_directory, + **kwargs) + if train_mode: + application_cleaning_valid = Step(name='application_cleaning_valid{}'.format(suffix), + transformer=dc.ApplicationCleaning(), + input_data=['application'], + adapter=Adapter({'X': E('application', 'X_valid')}), + experiment_directory=config.pipeline.experiment_directory, + **kwargs) + return application_cleaning, application_cleaning_valid + else: + return application_cleaning + + +def _application(config, train_mode, suffix, **kwargs): + 
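+    # Hand-crafted features from the main application table: the cleaning step runs
+    # first, then fe.ApplicationFeatures is applied; in train mode a twin '_valid'
+    # Step reuses the fitted transformer on the 'X_valid' split of the 'application' input.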
if train_mode: + application_cleaning, application_cleaning_valid = _application_cleaning(config, train_mode, suffix, **kwargs) + else: + application_cleaning = _application_cleaning(config, train_mode, suffix, **kwargs) + + application = Step(name='application_hand_crafted{}'.format(suffix), + transformer=fe.ApplicationFeatures(**config.applications.columns), + input_steps=[application_cleaning], + adapter=Adapter({'X': E(application_cleaning.name, 'X')}), + experiment_directory=config.pipeline.experiment_directory, + **kwargs) + if train_mode: + application_valid = Step(name='application_hand_crafted_valid{}'.format(suffix), + transformer=application, + input_steps=[application_cleaning_valid], + adapter=Adapter({'X': E(application_cleaning_valid.name, 'X')}), + experiment_directory=config.pipeline.experiment_directory, + **kwargs) + return application, application_valid + else: + return application + + +def _bureau_cleaning(config, suffix, **kwargs): + bureau_cleaning = Step(name='bureau_cleaning{}'.format(suffix), + transformer=dc.BureauCleaning(**config.preprocessing.impute_missing), + input_data=['bureau'], + adapter=Adapter({'bureau': E('bureau', 'X')}), + experiment_directory=config.pipeline.experiment_directory, + **kwargs) + + return bureau_cleaning + + +def _bureau(config, train_mode, suffix, **kwargs): + bureau_cleaned = _bureau_cleaning(config, suffix, **kwargs) + + bureau = Step(name='bureau_hand_crafted{}'.format(suffix), + transformer=fe.BureauFeatures(), + input_data=['application'], + input_steps=[bureau_cleaned], + adapter=Adapter({'X': E('application', 'X'), + 'bureau': E(bureau_cleaned.name, 'bureau')}), + experiment_directory=config.pipeline.experiment_directory, + **kwargs) + if train_mode: + bureau_valid = Step(name='bureau__hand_crafted_valid{}'.format(suffix), + transformer=bureau, + input_data=['application'], + adapter=Adapter({'X': E('application', 'X_valid')}), + experiment_directory=config.pipeline.experiment_directory, + **kwargs) + return bureau, bureau_valid + else: + return bureau + + +def _credit_card_balance(config, train_mode, suffix, **kwargs): + credit_card_balance = Step(name='credit_card_balance_hand_crafted{}'.format(suffix), + transformer=fe.CreditCardBalanceFeatures(**config.credit_card_balance), + input_data=['application', 'credit_card_balance'], + adapter=Adapter({'X': E('application', 'X'), + 'credit_card': E('credit_card_balance', 'X')}), + experiment_directory=config.pipeline.experiment_directory, + **kwargs) + if train_mode: + credit_card_balance_valid = Step(name='credit_card_balance__hand_crafted_valid{}'.format(suffix), + transformer=credit_card_balance, + input_data=['application'], + adapter=Adapter({'X': E('application', 'X_valid')}), + experiment_directory=config.pipeline.experiment_directory, + **kwargs) + + return credit_card_balance, credit_card_balance_valid + + else: + return credit_card_balance + + +def _fillna(fillna_value): + def _inner_fillna(X, X_valid=None): + if X_valid is None: + return {'X': X.fillna(fillna_value)} + else: + return {'X': X.fillna(fillna_value), + 'X_valid': X_valid.fillna(fillna_value)} + + return make_transformer(_inner_fillna) diff --git a/pipeline_config.py b/src/pipeline_config.py similarity index 53% rename from pipeline_config.py rename to src/pipeline_config.py index bf4312f..2f8f298 100644 --- a/pipeline_config.py +++ b/src/pipeline_config.py @@ -3,25 +3,17 @@ from attrdict import AttrDict from deepsense import neptune -from utils import read_params, parameter_eval +from .utils import 
read_params, parameter_eval ctx = neptune.Context() -params = read_params(ctx) +params = read_params(ctx, fallback_file='../neptune.yaml') RANDOM_SEED = 90210 DEV_SAMPLE_SIZE = 1000 -BUREAU_BALANCE = params.bureau_balance_filepath -BUREAU = params.bureau_filepath -CREDIT_CARD_BALANCE = params.credit_card_balance_filepath -INSTALLMENTS_PAYMENTS = params.installments_payments_filepath -POS_CASH_BALANCE = params.POS_CASH_balance_filepath -PREVIOUS_APPLICATION = params.previous_application_filepath +ID_COLUMNS = ['SK_ID_CURR'] +TARGET_COLUMNS = ['TARGET'] -ID_COLUMN = 'SK_ID_CURR' -TARGET_COLUMN = 'TARGET' - -TIMESTAMP_COLUMNS = [] CATEGORICAL_COLUMNS = ['CODE_GENDER', 'EMERGENCYSTATE_MODE', 'FLAG_CONT_MOBILE', @@ -60,9 +52,9 @@ 'REG_REGION_NOT_WORK_REGION', 'WALLSMATERIAL_MODE', 'WEEKDAY_APPR_PROCESS_START'] + NUMERICAL_COLUMNS = ['AMT_ANNUITY', 'AMT_CREDIT', - 'AMT_GOODS_PRICE', 'AMT_INCOME_TOTAL', 'AMT_REQ_CREDIT_BUREAU_HOUR', 'AMT_REQ_CREDIT_BUREAU_DAY', @@ -71,14 +63,8 @@ 'AMT_REQ_CREDIT_BUREAU_QRT', 'AMT_REQ_CREDIT_BUREAU_YEAR', 'APARTMENTS_AVG', - 'APARTMENTS_MEDI', - 'APARTMENTS_MODE', 'BASEMENTAREA_AVG', - 'BASEMENTAREA_MEDI', - 'BASEMENTAREA_MODE', 'COMMONAREA_AVG', - 'COMMONAREA_MEDI', - 'COMMONAREA_MODE', 'CNT_CHILDREN', 'CNT_FAM_MEMBERS', 'DAYS_BIRTH', @@ -89,48 +75,25 @@ 'DEF_30_CNT_SOCIAL_CIRCLE', 'DEF_60_CNT_SOCIAL_CIRCLE', 'ELEVATORS_AVG', - 'ELEVATORS_MEDI', - 'ELEVATORS_MODE', 'ENTRANCES_AVG', - 'ENTRANCES_MEDI', - 'ENTRANCES_MODE', 'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'FLOORSMAX_AVG', - 'FLOORSMAX_MEDI', - 'FLOORSMAX_MODE', 'FLOORSMIN_AVG', - 'FLOORSMIN_MEDI', - 'FLOORSMIN_MODE', 'LANDAREA_AVG', - 'LANDAREA_MEDI', - 'LANDAREA_MODE', 'LIVINGAPARTMENTS_AVG', - 'LIVINGAPARTMENTS_MEDI', - 'LIVINGAPARTMENTS_MODE', 'LIVINGAREA_AVG', - 'LIVINGAREA_MEDI', - 'LIVINGAREA_MODE', 'NONLIVINGAPARTMENTS_AVG', - 'NONLIVINGAPARTMENTS_MEDI', - 'NONLIVINGAPARTMENTS_MODE', 'NONLIVINGAREA_AVG', - 'NONLIVINGAREA_MEDI', - 'NONLIVINGAREA_MODE', 'OBS_30_CNT_SOCIAL_CIRCLE', - 'OBS_60_CNT_SOCIAL_CIRCLE', 'OWN_CAR_AGE', 'REGION_POPULATION_RELATIVE', 'REGION_RATING_CLIENT', - 'REGION_RATING_CLIENT_W_CITY', 'TOTALAREA_MODE', 'YEARS_BEGINEXPLUATATION_AVG', - 'YEARS_BEGINEXPLUATATION_MEDI', - 'YEARS_BEGINEXPLUATATION_MODE', - 'YEARS_BUILD_AVG', - 'YEARS_BUILD_MEDI', - 'YEARS_BUILD_MODE'] + 'YEARS_BUILD_AVG'] + USELESS_COLUMNS = ['FLAG_DOCUMENT_10', 'FLAG_DOCUMENT_12', 'FLAG_DOCUMENT_13', @@ -143,46 +106,212 @@ 'FLAG_DOCUMENT_20', 'FLAG_DOCUMENT_21'] -AGGREGATION_RECIPIES = [] -for agg in ['mean', 'size', 'var', 'min', 'max']: - for select in NUMERICAL_COLUMNS: - for group in [['CODE_GENDER'], - ['CODE_GENDER', 'OCCUPATION_TYPE'], - ['CODE_GENDER', 'FLAG_OWN_REALTY'], - ['CODE_GENDER', 'ORGANIZATION_TYPE'], - ['CODE_GENDER', 'OCCUPATION_TYPE', 'ORGANIZATION_TYPE'], - ['FLAG_OWN_REALTY', 'NAME_HOUSING_TYPE'], - ['FLAG_OWN_REALTY', 'OCCUPATION_TYPE', 'ORGANIZATION_TYPE'], - ['OCCUPATION_TYPE', 'ORGANIZATION_TYPE'], - ]: - AGGREGATION_RECIPIES.append({'groupby': group, 'select': select, 'agg': agg}) +HIGHLY_CORRELATED_NUMERICAL_COLUMNS = ['AMT_GOODS_PRICE', + 'APARTMENTS_MEDI', + 'APARTMENTS_MODE', + 'BASEMENTAREA_MEDI', + 'BASEMENTAREA_MODE', + 'COMMONAREA_MEDI', + 'COMMONAREA_MODE', + 'ELEVATORS_MEDI', + 'ELEVATORS_MODE', + 'ENTRANCES_MEDI', + 'ENTRANCES_MODE', + 'FLAG_EMP_PHONE', + 'FLOORSMAX_MEDI', + 'FLOORSMAX_MODE', + 'FLOORSMIN_MEDI', + 'FLOORSMIN_MODE', + 'LANDAREA_MEDI', + 'LANDAREA_MODE', + 'LIVINGAPARTMENTS_MEDI', + 'LIVINGAPARTMENTS_MODE', + 'LIVINGAREA_MEDI', + 'LIVINGAREA_MODE', + 
'NONLIVINGAPARTMENTS_MEDI', + 'NONLIVINGAPARTMENTS_MODE', + 'NONLIVINGAREA_MEDI', + 'NONLIVINGAREA_MODE', + 'OBS_60_CNT_SOCIAL_CIRCLE', + 'REGION_RATING_CLIENT_W_CITY', + 'YEARS_BEGINEXPLUATATION_MEDI', + 'YEARS_BEGINEXPLUATATION_MODE', + 'YEARS_BUILD_MEDI', + 'YEARS_BUILD_MODE'] + +APPLICATION_AGGREGATION_RECIPIES = [ + (['CODE_GENDER', 'NAME_EDUCATION_TYPE'], [('AMT_ANNUITY', 'max'), + ('AMT_CREDIT', 'max'), + ('EXT_SOURCE_1', 'mean'), + ('EXT_SOURCE_2', 'mean'), + ('OWN_CAR_AGE', 'max'), + ('OWN_CAR_AGE', 'sum')]), + (['CODE_GENDER', 'ORGANIZATION_TYPE'], [('AMT_ANNUITY', 'mean'), + ('AMT_INCOME_TOTAL', 'mean'), + ('DAYS_REGISTRATION', 'mean'), + ('EXT_SOURCE_1', 'mean')]), + (['CODE_GENDER', 'REG_CITY_NOT_WORK_CITY'], [('AMT_ANNUITY', 'mean'), + ('CNT_CHILDREN', 'mean'), + ('DAYS_ID_PUBLISH', 'mean')]), + (['CODE_GENDER', 'NAME_EDUCATION_TYPE', 'OCCUPATION_TYPE', 'REG_CITY_NOT_WORK_CITY'], [('EXT_SOURCE_1', 'mean'), + ('EXT_SOURCE_2', 'mean')]), + (['NAME_EDUCATION_TYPE', 'OCCUPATION_TYPE'], [('AMT_CREDIT', 'mean'), + ('AMT_REQ_CREDIT_BUREAU_YEAR', 'mean'), + ('APARTMENTS_AVG', 'mean'), + ('BASEMENTAREA_AVG', 'mean'), + ('EXT_SOURCE_1', 'mean'), + ('EXT_SOURCE_2', 'mean'), + ('EXT_SOURCE_3', 'mean'), + ('NONLIVINGAREA_AVG', 'mean'), + ('OWN_CAR_AGE', 'mean'), + ('YEARS_BUILD_AVG', 'mean')]), + (['NAME_EDUCATION_TYPE', 'OCCUPATION_TYPE', 'REG_CITY_NOT_WORK_CITY'], [('ELEVATORS_AVG', 'mean'), + ('EXT_SOURCE_1', 'mean')]), + (['OCCUPATION_TYPE'], [('AMT_ANNUITY', 'mean'), + ('CNT_CHILDREN', 'mean'), + ('CNT_FAM_MEMBERS', 'mean'), + ('DAYS_BIRTH', 'mean'), + ('DAYS_EMPLOYED', 'mean'), + ('DAYS_ID_PUBLISH', 'mean'), + ('DAYS_REGISTRATION', 'mean'), + ('EXT_SOURCE_1', 'mean'), + ('EXT_SOURCE_2', 'mean'), + ('EXT_SOURCE_3', 'mean')]), +] + +BUREAU_AGGREGATION_RECIPIES = [('CREDIT_TYPE', 'count'), + ('CREDIT_ACTIVE', 'size') + ] +for agg in ['mean', 'min', 'max', 'sum', 'var']: + for select in ['AMT_ANNUITY', + 'AMT_CREDIT_SUM', + 'AMT_CREDIT_SUM_DEBT', + 'AMT_CREDIT_SUM_LIMIT', + 'AMT_CREDIT_SUM_OVERDUE', + 'AMT_CREDIT_MAX_OVERDUE', + 'CNT_CREDIT_PROLONG', + 'CREDIT_DAY_OVERDUE', + 'DAYS_CREDIT', + 'DAYS_CREDIT_ENDDATE', + 'DAYS_CREDIT_UPDATE' + ]: + BUREAU_AGGREGATION_RECIPIES.append((select, agg)) +BUREAU_AGGREGATION_RECIPIES = [(['SK_ID_CURR'], BUREAU_AGGREGATION_RECIPIES)] + +CREDIT_CARD_BALANCE_AGGREGATION_RECIPIES = [] +for agg in ['mean', 'min', 'max', 'sum', 'var']: + for select in ['AMT_BALANCE', + 'AMT_CREDIT_LIMIT_ACTUAL', + 'AMT_DRAWINGS_ATM_CURRENT', + 'AMT_DRAWINGS_CURRENT', + 'AMT_DRAWINGS_OTHER_CURRENT', + 'AMT_DRAWINGS_POS_CURRENT', + 'AMT_PAYMENT_CURRENT', + 'CNT_DRAWINGS_ATM_CURRENT', + 'CNT_DRAWINGS_CURRENT', + 'CNT_DRAWINGS_OTHER_CURRENT', + 'CNT_INSTALMENT_MATURE_CUM', + 'MONTHS_BALANCE', + 'SK_DPD', + 'SK_DPD_DEF' + ]: + CREDIT_CARD_BALANCE_AGGREGATION_RECIPIES.append((select, agg)) +CREDIT_CARD_BALANCE_AGGREGATION_RECIPIES = [(['SK_ID_CURR'], CREDIT_CARD_BALANCE_AGGREGATION_RECIPIES)] + +INSTALLMENTS_PAYMENTS_AGGREGATION_RECIPIES = [] +for agg in ['mean', 'min', 'max', 'sum', 'var']: + for select in ['AMT_INSTALMENT', + 'AMT_PAYMENT', + 'DAYS_ENTRY_PAYMENT', + 'DAYS_INSTALMENT', + 'NUM_INSTALMENT_NUMBER', + 'NUM_INSTALMENT_VERSION' + ]: + INSTALLMENTS_PAYMENTS_AGGREGATION_RECIPIES.append((select, agg)) +INSTALLMENTS_PAYMENTS_AGGREGATION_RECIPIES = [(['SK_ID_CURR'], INSTALLMENTS_PAYMENTS_AGGREGATION_RECIPIES)] + +POS_CASH_BALANCE_AGGREGATION_RECIPIES = [] +for agg in ['mean', 'min', 'max', 'sum', 'var']: + for select in ['MONTHS_BALANCE', + 'SK_DPD', + 'SK_DPD_DEF' + ]: 
+ POS_CASH_BALANCE_AGGREGATION_RECIPIES.append((select, agg)) +POS_CASH_BALANCE_AGGREGATION_RECIPIES = [(['SK_ID_CURR'], POS_CASH_BALANCE_AGGREGATION_RECIPIES)] + +PREVIOUS_APPLICATION_AGGREGATION_RECIPIES = [] +for agg in ['mean', 'min', 'max', 'sum', 'var']: + for select in ['AMT_ANNUITY', + 'AMT_APPLICATION', + 'AMT_CREDIT', + 'AMT_DOWN_PAYMENT', + 'AMT_GOODS_PRICE', + 'CNT_PAYMENT', + 'DAYS_DECISION', + 'HOUR_APPR_PROCESS_START', + 'RATE_DOWN_PAYMENT' + ]: + PREVIOUS_APPLICATION_AGGREGATION_RECIPIES.append((select, agg)) +PREVIOUS_APPLICATION_AGGREGATION_RECIPIES = [(['SK_ID_CURR'], PREVIOUS_APPLICATION_AGGREGATION_RECIPIES)] SOLUTION_CONFIG = AttrDict({ 'pipeline': {'experiment_directory': params.experiment_directory }, - 'preprocessing': {'fillna_value': params.fillna_value}, + 'preprocessing': {'impute_missing': {'fill_missing': params.fill_missing, + 'fill_value': params.fill_value}, + 'categorical_encoder': {'categorical_columns': CATEGORICAL_COLUMNS + }, + }, + + 'applications': {'columns': {'categorical_columns': CATEGORICAL_COLUMNS, + 'numerical_columns': NUMERICAL_COLUMNS + }, + 'aggregations': {'groupby_aggregations': APPLICATION_AGGREGATION_RECIPIES + } + }, + + 'bureau': {'table_name': 'bureau', + 'id_columns': ('SK_ID_CURR', 'SK_ID_CURR'), + 'groupby_aggregations': BUREAU_AGGREGATION_RECIPIES + }, + + 'credit_card_balance': {'table_name': 'credit_card_balance', + 'id_columns': ('SK_ID_CURR', 'SK_ID_CURR'), + 'groupby_aggregations': CREDIT_CARD_BALANCE_AGGREGATION_RECIPIES + }, - 'dataframe_by_type_splitter': {'numerical_columns': NUMERICAL_COLUMNS, - 'categorical_columns': CATEGORICAL_COLUMNS, - 'timestamp_columns': TIMESTAMP_COLUMNS, - }, + 'installments_payments': {'table_name': 'installments_payments', + 'id_columns': ('SK_ID_CURR', 'SK_ID_CURR'), + 'groupby_aggregations': INSTALLMENTS_PAYMENTS_AGGREGATION_RECIPIES + }, + + 'pos_cash_balance': {'table_name': 'POS_CASH_balance', + 'id_columns': ('SK_ID_CURR', 'SK_ID_CURR'), + 'groupby_aggregations': POS_CASH_BALANCE_AGGREGATION_RECIPIES + }, + + 'previous_applications': {'table_name': 'previous_application', + 'id_columns': ('SK_ID_CURR', 'SK_ID_CURR'), + 'groupby_aggregations': PREVIOUS_APPLICATION_AGGREGATION_RECIPIES + }, 'light_gbm': {'device': parameter_eval(params.lgbm__device), 'boosting_type': parameter_eval(params.lgbm__boosting_type), 'objective': parameter_eval(params.lgbm__objective), 'metric': parameter_eval(params.lgbm__metric), + 'scale_pos_weight': parameter_eval(params.lgbm__scale_pos_weight), 'learning_rate': parameter_eval(params.lgbm__learning_rate), + 'max_bin': parameter_eval(params.lgbm__max_bin), 'max_depth': parameter_eval(params.lgbm__max_depth), + 'num_leaves': parameter_eval(params.lgbm__num_leaves), + 'min_child_samples': parameter_eval(params.lgbm__min_child_samples), 'subsample': parameter_eval(params.lgbm__subsample), 'colsample_bytree': parameter_eval(params.lgbm__colsample_bytree), - 'min_child_weight': parameter_eval(params.lgbm__min_child_weight), + 'subsample_freq': parameter_eval(params.lgbm__subsample_freq), + 'min_gain_to_split': parameter_eval(params.lgbm__min_gain_to_split), 'reg_lambda': parameter_eval(params.lgbm__reg_lambda), 'reg_alpha': parameter_eval(params.lgbm__reg_alpha), - 'subsample_freq': parameter_eval(params.lgbm__subsample_freq), - 'max_bin': parameter_eval(params.lgbm__max_bin), - 'min_child_samples': parameter_eval(params.lgbm__min_child_samples), - 'num_leaves': parameter_eval(params.lgbm__num_leaves), 'nthread': parameter_eval(params.num_workers), 
'number_boosting_rounds': parameter_eval(params.lgbm__number_boosting_rounds), 'early_stopping_rounds': parameter_eval(params.lgbm__early_stopping_rounds), @@ -282,21 +411,4 @@ }, }, - 'bureau': {'filepath': BUREAU, - 'id_columns': ('SK_ID_CURR', 'SK_ID_CURR'), - 'groupby_aggregations': [ - {'groupby': ['SK_ID_CURR'], 'select': 'DAYS_CREDIT', 'agg': 'count'}, # 1 - {'groupby': ['SK_ID_CURR'], 'select': 'CREDIT_TYPE', 'agg': 'nunique'}, # 2 - {'groupby': ['SK_ID_CURR'], 'select': 'CNT_CREDIT_PROLONG', 'agg': 'mean'}, # 10 - {'groupby': ['SK_ID_CURR'], 'select': 'CREDIT_DAY_OVERDUE', 'agg': 'count'}, - {'groupby': ['SK_ID_CURR'], 'select': 'CREDIT_ACTIVE', 'agg': 'size'}, - {'groupby': ['SK_ID_CURR'], 'select': 'AMT_CREDIT_SUM', 'agg': 'count'}, - ]}, - - 'clipper': {'min_val': 0, - 'max_val': 1 - }, - - 'groupby_aggregation': {'groupby_aggregations': AGGREGATION_RECIPIES - }, }) diff --git a/src/pipeline_manager.py b/src/pipeline_manager.py new file mode 100644 index 0000000..7e405a4 --- /dev/null +++ b/src/pipeline_manager.py @@ -0,0 +1,402 @@ +import os +import shutil + +from attrdict import AttrDict +import numpy as np +import pandas as pd +from scipy.stats import gmean +from deepsense import neptune +from sklearn.metrics import roc_auc_score +from sklearn.model_selection import train_test_split, KFold, StratifiedKFold + +from . import pipeline_config as cfg +from .pipelines import PIPELINES +from .utils import init_logger, read_params, set_seed, create_submission, verify_submission, calculate_rank + +set_seed(cfg.RANDOM_SEED) +logger = init_logger() +ctx = neptune.Context() +params = read_params(ctx, fallback_file='neptune.yaml') + + +class PipelineManager(): + def train(self, pipeline_name, dev_mode): + train(pipeline_name, dev_mode) + + def evaluate(self, pipeline_name, dev_mode, ): + evaluate(pipeline_name, dev_mode) + + def predict(self, pipeline_name, dev_mode, submit_predictions): + predict(pipeline_name, dev_mode, submit_predictions) + + def train_evaluate_cv(self, pipeline_name, dev_mode): + train_evaluate_cv(pipeline_name, dev_mode) + + def train_evaluate_predict_cv(self, pipeline_name, dev_mode, submit_predictions): + train_evaluate_predict_cv(pipeline_name, dev_mode, submit_predictions) + + +def train(pipeline_name, dev_mode): + logger.info('TRAINING') + if bool(params.clean_experiment_directory_before_training) and os.path.isdir(params.experiment_directory): + logger.info('Cleaning experiment_directory...') + shutil.rmtree(params.experiment_directory) + + tables = _read_data(dev_mode, read_train=True, read_test=False) + + logger.info('Shuffling and splitting into train and test...') + train_data_split, valid_data_split = train_test_split(tables.application_train, + test_size=params.validation_size, + random_state=cfg.RANDOM_SEED, + shuffle=params.shuffle) + + logger.info('Target mean in train: {}'.format(train_data_split[cfg.TARGET_COLUMNS].mean())) + logger.info('Target mean in valid: {}'.format(valid_data_split[cfg.TARGET_COLUMNS].mean())) + logger.info('Train shape: {}'.format(train_data_split.shape)) + logger.info('Valid shape: {}'.format(valid_data_split.shape)) + + train_data = {'application': {'X': train_data_split.drop(cfg.TARGET_COLUMNS, axis=1), + 'y': train_data_split[cfg.TARGET_COLUMNS].values.reshape(-1), + 'X_valid': valid_data_split.drop(cfg.TARGET_COLUMNS, axis=1), + 'y_valid': valid_data_split[cfg.TARGET_COLUMNS].values.reshape(-1) + }, + 'bureau_balance': {'X': tables.bureau_balance}, + 'bureau': {'X': tables.bureau}, + 'credit_card_balance': {'X': 
tables.credit_card_balance}, + 'installments_payments': {'X': tables.installments_payments}, + 'pos_cash_balance': {'X': tables.pos_cash_balance}, + 'previous_application': {'X': tables.previous_application}, + } + + pipeline = PIPELINES[pipeline_name](config=cfg.SOLUTION_CONFIG, train_mode=True) + pipeline.clean_cache() + logger.info('Start pipeline fit and transform') + pipeline.fit_transform(train_data) + pipeline.clean_cache() + + +def evaluate(pipeline_name, dev_mode): + logger.info('EVALUATION') + logger.info('Reading data...') + + tables = _read_data(dev_mode, read_train=True, read_test=False) + + logger.info('Shuffling and splitting to get validation split...') + _, valid_data_split = train_test_split(tables.application_train, + test_size=params.validation_size, + random_state=cfg.RANDOM_SEED, + shuffle=params.shuffle) + + logger.info('Target mean in valid: {}'.format(valid_data_split[cfg.TARGET_COLUMNS].mean())) + logger.info('Valid shape: {}'.format(valid_data_split.shape)) + + y_true = valid_data_split[cfg.TARGET_COLUMNS].values + + eval_data = {'application': {'X': valid_data_split.drop(cfg.TARGET_COLUMNS, axis=1), + 'y': None, + }, + 'bureau_balance': {'X': tables.bureau_balance}, + 'bureau': {'X': tables.bureau}, + 'credit_card_balance': {'X': tables.credit_card_balance}, + 'installments_payments': {'X': tables.installments_payments}, + 'pos_cash_balance': {'X': tables.pos_cash_balance}, + 'previous_application': {'X': tables.previous_application}, + } + + pipeline = PIPELINES[pipeline_name](config=cfg.SOLUTION_CONFIG, train_mode=False) + pipeline.clean_cache() + logger.info('Start pipeline transform') + output = pipeline.transform(eval_data) + pipeline.clean_cache() + + y_pred = output['prediction'] + + logger.info('Calculating ROC_AUC on validation set') + score = roc_auc_score(y_true, y_pred) + logger.info('ROC_AUC score on validation is {}'.format(score)) + ctx.channel_send('ROC_AUC', 0, score) + + +def predict(pipeline_name, dev_mode, submit_predictions): + logger.info('PREDICTION') + + tables = _read_data(dev_mode, read_train=False, read_test=True) + + test_data = {'application': {'X': tables.application_test, + 'y': None, + }, + 'bureau_balance': {'X': tables.bureau_balance}, + 'bureau': {'X': tables.bureau}, + 'credit_card_balance': {'X': tables.credit_card_balance}, + 'installments_payments': {'X': tables.installments_payments}, + 'pos_cash_balance': {'X': tables.pos_cash_balance}, + 'previous_application': {'X': tables.previous_application}, + } + + pipeline = PIPELINES[pipeline_name](config=cfg.SOLUTION_CONFIG, train_mode=False) + + pipeline.clean_cache() + logger.info('Start pipeline transform') + output = pipeline.transform(test_data) + pipeline.clean_cache() + y_pred = output['prediction'] + + if not dev_mode: + logger.info('creating submission file...') + submission = create_submission(tables.application_test, y_pred) + + logger.info('verifying submission...') + sample_submission = pd.read_csv(params.sample_submission_filepath) + verify_submission(submission, sample_submission) + + submission_filepath = os.path.join(params.experiment_directory, 'submission.csv') + submission.to_csv(submission_filepath, index=None, encoding='utf-8') + logger.info('submission persisted to {}'.format(submission_filepath)) + logger.info('submission head \n\n{}'.format(submission.head())) + + if submit_predictions and params.kaggle_api: + make_submission(submission_filepath) + + +def train_evaluate_cv(pipeline_name, dev_mode): + if 
bool(params.clean_experiment_directory_before_training) and os.path.isdir(params.experiment_directory): + logger.info('Cleaning experiment_directory...') + shutil.rmtree(params.experiment_directory) + + tables = _read_data(dev_mode, read_train=True, read_test=False) + + target_values = tables.application_train[cfg.TARGET_COLUMNS].values.reshape(-1) + fold_generator = _get_fold_generator(target_values) + + fold_scores = [] + for fold_id, (train_idx, valid_idx) in enumerate(fold_generator): + (train_data_split, + valid_data_split) = tables.application_train.iloc[train_idx], tables.application_train.iloc[valid_idx] + + logger.info('Started fold {}'.format(fold_id)) + logger.info('Target mean in train: {}'.format(train_data_split[cfg.TARGET_COLUMNS].mean())) + logger.info('Target mean in valid: {}'.format(valid_data_split[cfg.TARGET_COLUMNS].mean())) + logger.info('Train shape: {}'.format(train_data_split.shape)) + logger.info('Valid shape: {}'.format(valid_data_split.shape)) + + score, _, _ = _fold_fit_evaluate_loop(train_data_split, valid_data_split, tables, fold_id, pipeline_name) + + logger.info('Fold {} ROC_AUC {}'.format(fold_id, score)) + ctx.channel_send('Fold {} ROC_AUC'.format(fold_id), 0, score) + + fold_scores.append(score) + + score_mean, score_std = np.mean(fold_scores), np.std(fold_scores) + + logger.info('ROC_AUC mean {}, ROC_AUC std {}'.format(score_mean, score_std)) + ctx.channel_send('ROC_AUC', 0, score_mean) + ctx.channel_send('ROC_AUC STD', 0, score_std) + + +def train_evaluate_predict_cv(pipeline_name, dev_mode, submit_predictions): + if bool(params.clean_experiment_directory_before_training) and os.path.isdir(params.experiment_directory): + logger.info('Cleaning experiment_directory...') + shutil.rmtree(params.experiment_directory) + + tables = _read_data(dev_mode, read_train=True, read_test=True) + + target_values = tables.application_train[cfg.TARGET_COLUMNS].values.reshape(-1) + fold_generator = _get_fold_generator(target_values) + + fold_scores, out_of_fold_train_predictions, out_of_fold_test_predictions = [], [], [] + for fold_id, (train_idx, valid_idx) in enumerate(fold_generator): + (train_data_split, + valid_data_split) = tables.application_train.iloc[train_idx], tables.application_train.iloc[valid_idx] + + logger.info('Started fold {}'.format(fold_id)) + logger.info('Target mean in train: {}'.format(train_data_split[cfg.TARGET_COLUMNS].mean())) + logger.info('Target mean in valid: {}'.format(valid_data_split[cfg.TARGET_COLUMNS].mean())) + logger.info('Train shape: {}'.format(train_data_split.shape)) + logger.info('Valid shape: {}'.format(valid_data_split.shape)) + + score, out_of_fold_prediction, test_prediction = _fold_fit_evaluate_predict_loop(train_data_split, + valid_data_split, + tables, + fold_id, pipeline_name) + + logger.info('Fold {} ROC_AUC {}'.format(fold_id, score)) + ctx.channel_send('Fold {} ROC_AUC'.format(fold_id), 0, score) + + out_of_fold_train_predictions.append(out_of_fold_prediction) + out_of_fold_test_predictions.append(test_prediction) + fold_scores.append(score) + + out_of_fold_train_predictions = pd.concat(out_of_fold_train_predictions, axis=0) + out_of_fold_test_predictions = pd.concat(out_of_fold_test_predictions, axis=0) + + test_prediction_aggregated = _aggregate_test_prediction(out_of_fold_test_predictions) + score_mean, score_std = np.mean(fold_scores), np.std(fold_scores) + + logger.info('ROC_AUC mean {}, ROC_AUC std {}'.format(score_mean, score_std)) + ctx.channel_send('ROC_AUC', 0, score_mean) + ctx.channel_send('ROC_AUC STD', 
0, score_std) + + logger.info('Saving predictions') + out_of_fold_train_predictions.to_csv(os.path.join(params.experiment_directory, + '{}_out_of_fold_train_predictions.csv'.format(pipeline_name)), + index=None) + out_of_fold_test_predictions.to_csv(os.path.join(params.experiment_directory, + '{}_out_of_fold_test_predictions.csv'.format(pipeline_name)), + index=None) + test_aggregated_file_path = os.path.join(params.experiment_directory, + '{}_test_predictions_{}.csv'.format(pipeline_name, + params.aggregation_method)) + test_prediction_aggregated.to_csv(test_aggregated_file_path, index=None) + + if not dev_mode: + logger.info('verifying submission...') + sample_submission = pd.read_csv(params.sample_submission_filepath) + verify_submission(test_prediction_aggregated, sample_submission) + + if submit_predictions and params.kaggle_api: + make_submission(test_aggregated_file_path) + + +def make_submission(submission_filepath): + logger.info('making Kaggle submit...') + os.system('kaggle competitions submit -c home-credit-default-risk -f {} -m {}' + .format(submission_filepath, params.kaggle_message)) + + +def _read_data(dev_mode, read_train=True, read_test=False): + logger.info('Reading data...') + if dev_mode: + nrows = cfg.DEV_SAMPLE_SIZE + logger.info('running in "dev-mode". Sample size is: {}'.format(cfg.DEV_SAMPLE_SIZE)) + else: + nrows = None + + raw_data = {} + + if read_train: + raw_data['application_train'] = pd.read_csv(params.train_filepath, nrows=nrows) + + if read_test: + raw_data['application_test'] = pd.read_csv(params.test_filepath, nrows=nrows) + + raw_data['bureau'] = pd.read_csv(params.bureau_filepath, nrows=nrows) + raw_data['credit_card_balance'] = pd.read_csv(params.credit_card_balance_filepath, nrows=nrows) + raw_data['installments_payments'] = pd.read_csv(params.installments_payments_filepath, nrows=nrows) + raw_data['pos_cash_balance'] = pd.read_csv(params.POS_CASH_balance_filepath, nrows=nrows) + raw_data['previous_application'] = pd.read_csv(params.previous_application_filepath, nrows=nrows) + raw_data['bureau_balance'] = pd.read_csv(params.bureau_balance_filepath, nrows=nrows) + + return AttrDict(raw_data) + + +def _get_fold_generator(target_values): + if params.stratified_cv: + cv = StratifiedKFold(n_splits=params.n_cv_splits, shuffle=True, random_state=cfg.RANDOM_SEED) + cv.get_n_splits(target_values) + fold_generator = cv.split(target_values, target_values) + else: + cv = KFold(n_splits=params.n_cv_splits, shuffle=True, random_state=cfg.RANDOM_SEED) + fold_generator = cv.split(target_values) + return fold_generator + + +def _fold_fit_evaluate_predict_loop(train_data_split, valid_data_split, tables, fold_id, pipeline_name): + score, y_valid_pred, pipeline = _fold_fit_evaluate_loop(train_data_split, valid_data_split, tables, + fold_id, pipeline_name) + + test_data = {'application': {'X': tables.application_test, + 'y': None, + }, + 'bureau_balance': {'X': tables.bureau_balance}, + 'bureau': {'X': tables.bureau}, + 'credit_card_balance': {'X': tables.credit_card_balance}, + 'installments_payments': {'X': tables.installments_payments}, + 'pos_cash_balance': {'X': tables.pos_cash_balance}, + 'previous_application': {'X': tables.previous_application}, + } + + logger.info('Start pipeline transform on test') + pipeline.clean_cache() + output_test = pipeline.transform(test_data) + pipeline.clean_cache() + y_test_pred = output_test['prediction'] + + train_out_of_fold_prediction_chunk = valid_data_split[cfg.ID_COLUMNS] + train_out_of_fold_prediction_chunk['fold_id'] = 
fold_id + train_out_of_fold_prediction_chunk['{}_prediction'.format(pipeline_name)] = y_valid_pred + + test_out_of_fold_prediction_chunk = tables.application_test[cfg.ID_COLUMNS] + test_out_of_fold_prediction_chunk['fold_id'] = fold_id + test_out_of_fold_prediction_chunk['{}_prediction'.format(pipeline_name)] = y_test_pred + + return score, train_out_of_fold_prediction_chunk, test_out_of_fold_prediction_chunk + + +def _fold_fit_evaluate_loop(train_data_split, valid_data_split, tables, fold_id, pipeline_name): + train_data = {'application': {'X': train_data_split.drop(cfg.TARGET_COLUMNS, axis=1), + 'y': train_data_split[cfg.TARGET_COLUMNS].values.reshape(-1), + 'X_valid': valid_data_split.drop(cfg.TARGET_COLUMNS, axis=1), + 'y_valid': valid_data_split[cfg.TARGET_COLUMNS].values.reshape(-1), + }, + 'bureau_balance': {'X': tables.bureau_balance}, + 'bureau': {'X': tables.bureau}, + 'credit_card_balance': {'X': tables.credit_card_balance}, + 'installments_payments': {'X': tables.installments_payments}, + 'pos_cash_balance': {'X': tables.pos_cash_balance}, + 'previous_application': {'X': tables.previous_application}, + } + + valid_data = {'application': {'X': valid_data_split.drop(cfg.TARGET_COLUMNS, axis=1), + 'y': None, + }, + 'bureau_balance': {'X': tables.bureau_balance}, + 'bureau': {'X': tables.bureau}, + 'credit_card_balance': {'X': tables.credit_card_balance}, + 'installments_payments': {'X': tables.installments_payments}, + 'pos_cash_balance': {'X': tables.pos_cash_balance}, + 'previous_application': {'X': tables.previous_application}, + } + + pipeline = PIPELINES[pipeline_name](config=cfg.SOLUTION_CONFIG, train_mode=True, + suffix='_fold_{}'.format(fold_id)) + + logger.info('Start pipeline fit and transform on train') + pipeline.clean_cache() + pipeline.fit_transform(train_data) + pipeline.clean_cache() + + pipeline = PIPELINES[pipeline_name](config=cfg.SOLUTION_CONFIG, train_mode=False, + suffix='_fold_{}'.format(fold_id)) + logger.info('Start pipeline transform on valid') + pipeline.clean_cache() + output_valid = pipeline.transform(valid_data) + pipeline.clean_cache() + + y_valid_pred = output_valid['prediction'] + y_valid_true = valid_data_split[cfg.TARGET_COLUMNS].values + score = roc_auc_score(y_valid_true, y_valid_pred) + + return score, y_valid_pred, pipeline + + +def _aggregate_test_prediction(out_of_fold_test_predictions): + agg_methods = {'mean': np.mean, + 'gmean': gmean} + prediction_column = [col for col in out_of_fold_test_predictions.columns if '_prediction' in col][0] + if params.aggregation_method == 'rank_mean': + rank_column = prediction_column.replace('_prediction', '_rank') + test_predictions_with_ranks = [] + for fold_id, fold_df in out_of_fold_test_predictions.groupby('fold_id'): + fold_df[rank_column] = calculate_rank(fold_df[prediction_column]) + test_predictions_with_ranks.append(fold_df) + test_predictions_with_ranks = pd.concat(test_predictions_with_ranks, axis=0) + + test_prediction_aggregated = test_predictions_with_ranks.groupby(cfg.ID_COLUMNS)[rank_column].apply( + np.mean).reset_index() + else: + test_prediction_aggregated = out_of_fold_test_predictions.groupby(cfg.ID_COLUMNS)[prediction_column].apply( + agg_methods[params.aggregation_method]).reset_index() + + test_prediction_aggregated.columns = [cfg.ID_COLUMNS + cfg.TARGET_COLUMNS] + + return test_prediction_aggregated diff --git a/pipelines.py b/src/pipelines.py similarity index 71% rename from pipelines.py rename to src/pipelines.py index d2df898..2fc94e3 100644 --- a/pipelines.py +++ 
b/src/pipelines.py @@ -3,107 +3,91 @@ from sklearn.ensemble import RandomForestClassifier from sklearn.linear_model import LogisticRegression from sklearn.svm import SVC -from steppy.adapter import Adapter, E -from steppy.base import Step -from pipeline_blocks import feature_extraction, classifier_light_gbm, preprocessing_fillna, classifier_sklearn, \ +from .pipeline_blocks import feature_extraction, classifier_light_gbm, preprocessing_fillna, classifier_sklearn, \ classifier_xgb -from postprocessing import Clipper -def lightGBM(config, train_mode): +def lightGBM(config, train_mode, suffix=''): if train_mode: features, features_valid = feature_extraction(config, train_mode, - persist_output=True, - cache_output=True, - load_persisted_output=True) + suffix, + persist_output=False, + cache_output=False, + load_persisted_output=False) light_gbm = classifier_light_gbm((features, features_valid), config, - train_mode) + train_mode, suffix) else: features = feature_extraction(config, train_mode, - cache_output=True) + suffix, + cache_output=False) light_gbm = classifier_light_gbm(features, config, - train_mode) + train_mode, suffix) - clipper = Step(name='clipper', - transformer=Clipper(**config.clipper), - input_steps=[light_gbm], - adapter=Adapter({'prediction': E(light_gbm.name, 'prediction')}), - experiment_directory=config.pipeline.experiment_directory) + return light_gbm - return clipper - -def xgboost(config, train_mode): +def xgboost(config, train_mode, suffix=''): if train_mode: features, features_valid = feature_extraction(config, train_mode, + suffix, persist_output=True, cache_output=True, load_persisted_output=True) xgb = classifier_xgb((features, features_valid), config, - train_mode) + train_mode, + suffix) else: features = feature_extraction(config, train_mode, + suffix, cache_output=True) xgb = classifier_xgb(features, config, - train_mode) - - clipper = Step(name='clipper', - transformer=Clipper(**config.clipper), - input_steps=[xgb], - adapter=Adapter({'prediction': E(xgb.name, 'prediction')}), - experiment_directory=config.pipeline.experiment_directory) + train_mode, + suffix) - return clipper + return xgb -def sklearn_main(config, ClassifierClass, clf_name, train_mode, normalize=False): +def sklearn_main(config, ClassifierClass, clf_name, train_mode, suffix='', normalize=False): model_params = getattr(config, clf_name) random_search_config = getattr(config.random_search, clf_name) full_config = (config, model_params, random_search_config) if train_mode: features, features_valid = feature_extraction(config, train_mode, + suffix, persist_output=True, cache_output=True, load_persisted_output=True) - sklearn_preproc = preprocessing_fillna((features, features_valid), config, train_mode) + sklearn_preproc = preprocessing_fillna((features, features_valid), config, train_mode, suffix) else: features = feature_extraction(config, train_mode, + suffix, cache_output=True) - sklearn_preproc = preprocessing_fillna(features, config, train_mode) + sklearn_preproc = preprocessing_fillna(features, config, train_mode, suffix) sklearn_clf = classifier_sklearn(sklearn_preproc, ClassifierClass, full_config, clf_name, train_mode, + suffix, normalize) + return sklearn_clf - clipper = Step(name='clipper', - transformer=Clipper(**config.clipper), - input_steps=[sklearn_clf], - adapter=Adapter({'prediction': E(sklearn_clf.name, 'predicted')}), - experiment_directory=config.pipeline.experiment_directory) - return clipper - -PIPELINES = {'lightGBM': {'train': partial(lightGBM, train_mode=True), - 
'inference': partial(lightGBM, train_mode=False) - }, - 'XGBoost': {'train': partial(xgboost, train_mode=True), - 'inference': partial(xgboost, train_mode=False) - }, +PIPELINES = {'lightGBM': lightGBM, + 'XGBoost': xgboost, 'random_forest': {'train': partial(sklearn_main, ClassifierClass=RandomForestClassifier, clf_name='random_forest', diff --git a/utils.py b/src/utils.py similarity index 80% rename from utils.py rename to src/utils.py index b1eb89c..bdbbbab 100644 --- a/utils.py +++ b/src/utils.py @@ -7,7 +7,6 @@ import pandas as pd import yaml from attrdict import AttrDict -from steppy.base import BaseTransformer def create_submission(meta, predictions): @@ -18,7 +17,6 @@ def create_submission(meta, predictions): def verify_submission(submission, sample_submission): - assert submission.shape == sample_submission.shape, \ 'Expected submission to have shape {} but got {}'.format(sample_submission.shape, submission.shape) @@ -49,12 +47,9 @@ def init_logger(): return logger -def read_params(ctx): +def read_params(ctx, fallback_file): if ctx.params.__class__.__name__ == 'OfflineContextParams': - try: - neptune_config = read_yaml('neptune.yaml') - except FileNotFoundError: - neptune_config = read_yaml('../neptune.yaml') + neptune_config = read_yaml(fallback_file) params = neptune_config.parameters else: params = ctx.params @@ -86,17 +81,6 @@ def set_seed(seed=90210): random.seed(seed) np.random.seed(seed) - -class ToNumpyLabel(BaseTransformer): - def __init__(self, **kwargs): - super().__init__() - self.y = None - - def fit(self, y, **kwargs): - self.y = y[0].values.reshape(-1) - return self - - def transform(self, **kwargs): - if self.y.any(): - return {'y': self.y} - return {} +def calculate_rank(predictions): + rank = (1 + predictions.rank().values) / (predictions.shape[0] + 1) + return rank \ No newline at end of file
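
For reference, below is a minimal, self-contained sketch (not part of the diff above) of the rank-mean aggregation implemented by calculate_rank in src/utils.py and _aggregate_test_prediction in src/pipeline_manager.py: each fold's test predictions are converted to ranks in (0, 1) and the ranks are then averaged per SK_ID_CURR. The toy values and the 'lightGBM_prediction' column name are illustrative only.

import pandas as pd

def calculate_rank(predictions):
    # Same formula as src/utils.py: average rank scaled into the open interval (0, 1).
    return (1 + predictions.rank().values) / (predictions.shape[0] + 1)

# Toy out-of-fold test predictions: two folds, three applications each.
out_of_fold = pd.DataFrame({
    'SK_ID_CURR': [1, 2, 3, 1, 2, 3],
    'fold_id': [0, 0, 0, 1, 1, 1],
    'lightGBM_prediction': [0.10, 0.80, 0.40, 0.15, 0.70, 0.50],
})

# Rank within each fold, then average the ranks per application id.
ranked_folds = []
for fold_id, fold_df in out_of_fold.groupby('fold_id'):
    fold_df = fold_df.copy()
    fold_df['lightGBM_rank'] = calculate_rank(fold_df['lightGBM_prediction'])
    ranked_folds.append(fold_df)
ranked = pd.concat(ranked_folds, axis=0)

aggregated = ranked.groupby('SK_ID_CURR')['lightGBM_rank'].mean().reset_index()
aggregated.columns = ['SK_ID_CURR', 'TARGET']
print(aggregated)

The 'mean' and 'gmean' aggregation methods in _aggregate_test_prediction skip the ranking step and apply np.mean or scipy.stats.gmean directly to the prediction column.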