This repository has been archived by the owner on Jun 22, 2022. It is now read-only.

Dev (#96)
* Smart features (#61)

* Update README.md

* Update README.md

* Update

* Smart features update

* More descriptive transformer name

* Reading all data in main

* More application features

* Transformer for cleaning

* Multi-input data dictionary

* Fix (#63)

* fixed configs

* dropped redundant steps, moved stuff to cleaning, refactored groupby (#64)

* dropped redundant steps, moved stuff to cleaning, refactored groupby

* restructured, added stacking + CV

* Fix format string

* Update pipeline_manager.py

clipped prediction -> prediction

* added stratified kfold option (#77)

* Update config (#79)

* dropped redundant steps, moved stuff to cleaning, refactored groupby

* restructured, added stacking + CV

* Update pipeline_config.py

* Dev review (#81)

* dropped feature by type split, refactored pipeline_config

* dropped feature by type split method

* explored application features

* trash

* reverted refactor of aggs

* fixed/updated bureau features

* cleared notebooks

* agg features added to notebook bureau

* credit card cleaned

* added other feature notebooks

* added rank mean (see the sketch after this list)

* updated model arch

* reverted to old params

* fixed rank mean calculations

* ApplicationCleaning update (#84)

* Cleaning - application

* Clear output in notebook

* cleaned names in steps, refactored merge-aggregate transformer, changed caching/saving specs (#85)

* local trash

* External sources notebook (#86)

* Update

* External sources notebook

* Dev lgbm params (#88)

* local trash

* updated configs

* dropped comment

* updated lgb params

* Dev app agg fix (#90)

* dropped app_aggs

* app agg features fixed

* cleaned leftovers

* dropped fast read-in for debug

* External_sources statistics (#89)

* Speed-up ext_src notebook

* external_sources statistics

* Weighted mean and notebook fix

* application notebook update

* clear notebook output

* Fix auto submission (#95)

* updated best model name

* changed best model path

* corrections
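
The "added rank mean" and "fixed rank mean calculations" entries above presumably refer to rank averaging when blending predictions from several models or folds. A minimal sketch of that technique, assuming a hypothetical rank_mean helper over a DataFrame with one prediction column per model (the commit's actual implementation is not shown on this page):

import pandas as pd


def rank_mean(predictions: pd.DataFrame) -> pd.Series:
    # Replace each model's scores with their fractional ranks, then average.
    # Ranking is order-preserving per model, so each column's ROC AUC is
    # unchanged, while differently scaled outputs land on a common [0, 1]
    # scale before blending.
    return predictions.rank(pct=True).mean(axis=1)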
jakubczakon authored and Kamil A. Kaczmarek committed Jul 3, 2018
1 parent b488060 commit 428d89e
Showing 26 changed files with 4,134 additions and 1,005 deletions.
142 changes: 0 additions & 142 deletions feature_extraction.py

This file was deleted.

203 changes: 42 additions & 161 deletions main.py
@@ -1,196 +1,77 @@
-import os
-import shutil
-
 import click
-import pandas as pd
-from deepsense import neptune
-from sklearn.metrics import roc_auc_score
-from sklearn.model_selection import train_test_split
 
-import pipeline_config as cfg
-from pipelines import PIPELINES
-from utils import create_submission, init_logger, read_params, persist_evaluation_predictions, \
-    set_seed, verify_submission
+from src.pipeline_manager import PipelineManager
 
-set_seed()
-logger = init_logger()
-ctx = neptune.Context()
-params = read_params(ctx)
+pipeline_manager = PipelineManager()
 
 
 @click.group()
-def action():
+def main():
     pass
 
 
-@action.command()
+@main.command()
 @click.option('-p', '--pipeline_name', help='pipeline to be trained', required=True)
 @click.option('-d', '--dev_mode', help='if true only a small sample of data will be used', is_flag=True, required=False)
 def train(pipeline_name, dev_mode):
-    _train(pipeline_name, dev_mode)
+    pipeline_manager.train(pipeline_name, dev_mode)
 
 
-@action.command()
+@main.command()
 @click.option('-p', '--pipeline_name', help='pipeline to be trained', required=True)
 @click.option('-d', '--dev_mode', help='if true only a small sample of data will be used', is_flag=True, required=False)
 def evaluate(pipeline_name, dev_mode):
-    _evaluate(pipeline_name, dev_mode)
+    pipeline_manager.evaluate(pipeline_name, dev_mode)
 
 
-@action.command()
+@main.command()
 @click.option('-p', '--pipeline_name', help='pipeline to be trained', required=True)
 @click.option('-d', '--dev_mode', help='if true only a small sample of data will be used', is_flag=True, required=False)
-def predict(pipeline_name, dev_mode):
-    _predict(pipeline_name, dev_mode)
+@click.option('-s', '--submit_predictions', help='submit predictions if true', is_flag=True, required=False)
+def predict(pipeline_name, dev_mode, submit_predictions):
+    pipeline_manager.predict(pipeline_name, dev_mode, submit_predictions)
 
 
-@action.command()
+@main.command()
 @click.option('-p', '--pipeline_name', help='pipeline to be trained', required=True)
+@click.option('-s', '--submit_predictions', help='submit predictions if true', is_flag=True, required=False)
 @click.option('-d', '--dev_mode', help='if true only a small sample of data will be used', is_flag=True, required=False)
-def train_evaluate_predict(pipeline_name, dev_mode):
-    _train(pipeline_name, dev_mode)
-    _evaluate(pipeline_name, dev_mode)
-    _predict(pipeline_name, dev_mode)
+def train_evaluate_predict(pipeline_name, submit_predictions, dev_mode):
+    pipeline_manager.train(pipeline_name, dev_mode)
+    pipeline_manager.evaluate(pipeline_name, dev_mode)
+    pipeline_manager.predict(pipeline_name, dev_mode, submit_predictions)
 
 
-@action.command()
+@main.command()
 @click.option('-p', '--pipeline_name', help='pipeline to be trained', required=True)
 @click.option('-d', '--dev_mode', help='if true only a small sample of data will be used', is_flag=True, required=False)
-def evaluate_predict(pipeline_name, dev_mode):
-    _evaluate(pipeline_name, dev_mode)
-    _predict(pipeline_name, dev_mode)
+def train_evaluate(pipeline_name, dev_mode):
+    pipeline_manager.train(pipeline_name, dev_mode)
+    pipeline_manager.evaluate(pipeline_name, dev_mode)
 
 
-@action.command()
+@main.command()
 @click.option('-p', '--pipeline_name', help='pipeline to be trained', required=True)
+@click.option('-s', '--submit_predictions', help='submit predictions if true', is_flag=True, required=False)
 @click.option('-d', '--dev_mode', help='if true only a small sample of data will be used', is_flag=True, required=False)
-def train_evaluate(pipeline_name, dev_mode):
-    _train(pipeline_name, dev_mode)
-    _evaluate(pipeline_name, dev_mode)
-
-
-def _train(pipeline_name, dev_mode):
-    logger.info('TRAINING')
-    if bool(params.clean_experiment_directory_before_training) and os.path.isdir(params.experiment_directory):
-        logger.info('Cleaning experiment_directory...')
-        shutil.rmtree(params.experiment_directory)
-
-    logger.info('Reading data...')
-    if dev_mode:
-        logger.info('running in "dev-mode". Sample size is: {}'.format(cfg.DEV_SAMPLE_SIZE))
-        application_train = pd.read_csv(params.train_filepath, nrows=cfg.DEV_SAMPLE_SIZE)
-    else:
-        application_train = pd.read_csv(params.train_filepath)
-
-    logger.info('Shuffling and splitting into train and test...')
-    train_data_split, valid_data_split = train_test_split(application_train,
-                                                          test_size=params.validation_size,
-                                                          random_state=cfg.RANDOM_SEED,
-                                                          shuffle=params.shuffle)
-
-    logger.info('Target mean in train: {}'.format(train_data_split[cfg.TARGET_COLUMN].mean()))
-    logger.info('Target mean in valid: {}'.format(valid_data_split[cfg.TARGET_COLUMN].mean()))
-    logger.info('Train shape: {}'.format(train_data_split.shape))
-    logger.info('Valid shape: {}'.format(valid_data_split.shape))
-
-    data = {'input': {'X': train_data_split.drop(cfg.TARGET_COLUMN, axis=1),
-                      'y': train_data_split[cfg.TARGET_COLUMN],
-                      'X_valid': valid_data_split.drop(cfg.TARGET_COLUMN, axis=1),
-                      'y_valid': valid_data_split[cfg.TARGET_COLUMN],
-                      },
-            }
-
-    pipeline = PIPELINES[pipeline_name]['train'](cfg.SOLUTION_CONFIG)
-    pipeline.clean_cache()
-    logger.info('Start pipeline fit and transform')
-    pipeline.fit_transform(data)
-    pipeline.clean_cache()
-
-
-def _evaluate(pipeline_name, dev_mode):
-    logger.info('EVALUATION')
-    logger.info('reading data...')
-    if dev_mode:
-        logger.info('running in "dev-mode". Sample size is: {}'.format(cfg.DEV_SAMPLE_SIZE))
-        application_train = pd.read_csv(params.train_filepath, nrows=cfg.DEV_SAMPLE_SIZE)
-    else:
-        application_train = pd.read_csv(params.train_filepath)
-
-    logger.info('Shuffling and splitting to get validation split...')
-    _, valid_data_split = train_test_split(application_train,
-                                           test_size=params.validation_size,
-                                           random_state=cfg.RANDOM_SEED,
-                                           shuffle=params.shuffle)
-
-    logger.info('Target mean in valid: {}'.format(valid_data_split[cfg.TARGET_COLUMN].mean()))
-    logger.info('Valid shape: {}'.format(valid_data_split.shape))
-
-    y_true = valid_data_split[cfg.TARGET_COLUMN].values
-    data = {'input': {'X': valid_data_split.drop(cfg.TARGET_COLUMN, axis=1),
-                      'y': valid_data_split[cfg.TARGET_COLUMN],
-                      },
-            }
-
-    pipeline = PIPELINES[pipeline_name]['inference'](cfg.SOLUTION_CONFIG)
-    pipeline.clean_cache()
-    logger.info('Start pipeline transform')
-    output = pipeline.transform(data)
-    pipeline.clean_cache()
-
-    y_pred = output['clipped_prediction']
-
-    logger.info('Saving evaluation predictions to the {}'.format(params.experiment_directory))
-    persist_evaluation_predictions(params.experiment_directory,
-                                   y_pred,
-                                   valid_data_split,
-                                   cfg.ID_COLUMN,
-                                   cfg.TARGET_COLUMN)
-
-    logger.info('Calculating ROC_AUC on validation set')
-    score = roc_auc_score(y_true, y_pred)
-    logger.info('ROC_AUC score on validation is {}'.format(score))
-    ctx.channel_send('ROC_AUC', 0, score)
-
-
-def _predict(pipeline_name, dev_mode):
-    logger.info('PREDICTION')
-    logger.info('reading data...')
-    if dev_mode:
-        logger.info('running in "dev-mode". Sample size is: {}'.format(cfg.DEV_SAMPLE_SIZE))
-        application_test = pd.read_csv(params.test_filepath, nrows=cfg.DEV_SAMPLE_SIZE)
-    else:
-        application_test = pd.read_csv(params.test_filepath)
-
-    data = {'input': {'X': application_test,
-                      'y': None,
-                      },
-            }
-
-    pipeline = PIPELINES[pipeline_name]['inference'](cfg.SOLUTION_CONFIG)
-    pipeline.clean_cache()
-    logger.info('Start pipeline transform')
-    output = pipeline.transform(data)
-    pipeline.clean_cache()
-    y_pred = output['clipped_prediction']
-
-    if not dev_mode:
-        logger.info('creating submission file...')
-        submission = create_submission(application_test, y_pred)
-
-        logger.info('verifying submission...')
-        sample_submission = pd.read_csv(params.sample_submission_filepath)
-        verify_submission(submission, sample_submission)
-
-        submission_filepath = os.path.join(params.experiment_directory, 'submission.csv')
-        submission.to_csv(submission_filepath, index=None, encoding='utf-8')
-        logger.info('submission persisted to {}'.format(submission_filepath))
-        logger.info('submission head \n\n{}'.format(submission.head()))
-
-        if params.kaggle_api:
-            logger.info('making Kaggle submit...')
-            os.system('kaggle competitions submit -c home-credit-default-risk -f {} -m {}'
-                      .format(submission_filepath, params.kaggle_message))
+def evaluate_predict(pipeline_name, submit_predictions, dev_mode):
+    pipeline_manager.evaluate(pipeline_name, dev_mode)
+    pipeline_manager.predict(pipeline_name, dev_mode, submit_predictions)
+
+
++@main.command()
+@click.option('-p', '--pipeline_name', help='pipeline to be trained', required=True)
+@click.option('-d', '--dev_mode', help='if true only a small sample of data will be used', is_flag=True, required=False)
+def train_evaluate_cv(pipeline_name, dev_mode):
+    pipeline_manager.train_evaluate_cv(pipeline_name, dev_mode)
+
+
++@main.command()
+@click.option('-p', '--pipeline_name', help='pipeline to be trained', required=True)
+@click.option('-s', '--submit_predictions', help='submit predictions if true', is_flag=True, required=False)
+@click.option('-d', '--dev_mode', help='if true only a small sample of data will be used', is_flag=True, required=False)
+def train_evaluate_predict_cv(pipeline_name, submit_predictions, dev_mode):
+    pipeline_manager.train_evaluate_predict_cv(pipeline_name, dev_mode, submit_predictions)
 
 
 if __name__ == "__main__":
-    action()
+    main()
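
The commands above are now thin click wrappers: the training, evaluation, prediction, and submission logic that used to live in _train, _evaluate, and _predict has moved into PipelineManager in src/pipeline_manager.py, which is not part of this diff. The sketch below shows only the interface the call sites imply; the method names and argument order come from the calls above, and everything else is an illustrative assumption:

# Hypothetical sketch of the interface implied by the call sites in main.py;
# the real src/pipeline_manager.py is not shown in this diff.
class PipelineManager:
    def train(self, pipeline_name, dev_mode):
        """Fit the named pipeline, on a small data sample when dev_mode is set."""

    def evaluate(self, pipeline_name, dev_mode):
        """Score the fitted pipeline on the validation split."""

    def predict(self, pipeline_name, dev_mode, submit_predictions):
        """Predict on the test set and optionally submit the result."""

    def train_evaluate_cv(self, pipeline_name, dev_mode):
        """Train and evaluate the pipeline with cross-validation."""

    def train_evaluate_predict_cv(self, pipeline_name, dev_mode, submit_predictions):
        """Cross-validated train/evaluate, then predict on the test set."""

Since main is a click.group, each decorated function becomes a subcommand, e.g. python main.py train_evaluate_cv -p <pipeline_name> -d for a cross-validated dev run (the available pipeline names are defined in the pipelines module, not in this file).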