This repository has been archived by the owner on Jun 22, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 170
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* Smart features (#61) * Update README.md * Update README.md * Update * Smart features update * More descriptive transformer name * Reading all data in main * More application features * Transformer for cleaning * Multiinput data dictionary * Fix (#63) * fixed configs * dropped redundant steps, moved stuff to cleaning, refactored groupby (#64) * dropped redundant steps, moved stuff to cleaning, refactored groupby * restructured, added stacking + CV * Fix format string * Update pipeline_manager.py clipped prediction -> prediction * added stratified kfold option (#77) * Update config (#79) * dropped redundant steps, moved stuff to cleaning, refactored groupby * restructured, added stacking + CV * Update pipeline_config.py * Dev review (#81) * dropped feature by type split, refactored pipeline_config * dropped feature by type split method * explored application features * trash * reverted refactor of aggs * fixed/updated bureau features * cleared notebooks * agg features added to notebook bureau * credit card cleaned * added other feature notebooks * added rank mean * updated model arch * reverted to old params * fixed rank mean calculations * ApplicationCleaning update (#84) * Cleaning - application * Clear output in notebook * cleaned names in steps, refactored mergeaggregate transformer, changed caching/saving specs (#85) * local trash * External sources notebook (#86) * Update * External sources notebook * Dev lgbm params (#88) * local trash * updated configs * dropped comment * updated lgb params * Dev app agg fix (#90) * dropped app_aggs * app agg features fixed * cleaned leftovers * dropped fast read-in for debug * External_sources statistics (#89) * Speed-up ext_src notebook * external_sources statistics * Weighted mean and notebook fix * application notebook update * clear notebook output * Fix auto submission (#95) * updated best model name * changed best model path * corrections
- Loading branch information
1 parent
b488060
commit 428d89e
Showing
26 changed files
with
4,134 additions
and
1,005 deletions.
There are no files selected for viewing
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,196 +1,77 @@ | ||
import os | ||
import shutil | ||
|
||
import click | ||
import pandas as pd | ||
from deepsense import neptune | ||
from sklearn.metrics import roc_auc_score | ||
from sklearn.model_selection import train_test_split | ||
|
||
import pipeline_config as cfg | ||
from pipelines import PIPELINES | ||
from utils import create_submission, init_logger, read_params, persist_evaluation_predictions, \ | ||
set_seed, verify_submission | ||
from src.pipeline_manager import PipelineManager | ||
|
||
set_seed() | ||
logger = init_logger() | ||
ctx = neptune.Context() | ||
params = read_params(ctx) | ||
pipeline_manager = PipelineManager() | ||
|
||
|
||
@click.group() | ||
def action(): | ||
def main(): | ||
pass | ||
|
||
|
||
@action.command() | ||
@main.command() | ||
@click.option('-p', '--pipeline_name', help='pipeline to be trained', required=True) | ||
@click.option('-d', '--dev_mode', help='if true only a small sample of data will be used', is_flag=True, required=False) | ||
def train(pipeline_name, dev_mode): | ||
_train(pipeline_name, dev_mode) | ||
pipeline_manager.train(pipeline_name, dev_mode) | ||
|
||
|
||
@action.command() | ||
@main.command() | ||
@click.option('-p', '--pipeline_name', help='pipeline to be trained', required=True) | ||
@click.option('-d', '--dev_mode', help='if true only a small sample of data will be used', is_flag=True, required=False) | ||
def evaluate(pipeline_name, dev_mode): | ||
_evaluate(pipeline_name, dev_mode) | ||
pipeline_manager.evaluate(pipeline_name, dev_mode) | ||
|
||
|
||
@action.command() | ||
@main.command() | ||
@click.option('-p', '--pipeline_name', help='pipeline to be trained', required=True) | ||
@click.option('-d', '--dev_mode', help='if true only a small sample of data will be used', is_flag=True, required=False) | ||
def predict(pipeline_name, dev_mode): | ||
_predict(pipeline_name, dev_mode) | ||
@click.option('-s', '--submit_predictions', help='submit predictions if true', is_flag=True, required=False) | ||
def predict(pipeline_name, dev_mode, submit_predictions): | ||
pipeline_manager.predict(pipeline_name, dev_mode, submit_predictions) | ||
|
||
|
||
@action.command() | ||
@main.command() | ||
@click.option('-p', '--pipeline_name', help='pipeline to be trained', required=True) | ||
@click.option('-s', '--submit_predictions', help='submit predictions if true', is_flag=True, required=False) | ||
@click.option('-d', '--dev_mode', help='if true only a small sample of data will be used', is_flag=True, required=False) | ||
def train_evaluate_predict(pipeline_name, dev_mode): | ||
_train(pipeline_name, dev_mode) | ||
_evaluate(pipeline_name, dev_mode) | ||
_predict(pipeline_name, dev_mode) | ||
def train_evaluate_predict(pipeline_name, submit_predictions, dev_mode): | ||
pipeline_manager.train(pipeline_name, dev_mode) | ||
pipeline_manager.evaluate(pipeline_name, dev_mode) | ||
pipeline_manager.predict(pipeline_name, dev_mode, submit_predictions) | ||
|
||
|
||
@action.command() | ||
@main.command() | ||
@click.option('-p', '--pipeline_name', help='pipeline to be trained', required=True) | ||
@click.option('-d', '--dev_mode', help='if true only a small sample of data will be used', is_flag=True, required=False) | ||
def evaluate_predict(pipeline_name, dev_mode): | ||
_evaluate(pipeline_name, dev_mode) | ||
_predict(pipeline_name, dev_mode) | ||
def train_evaluate(pipeline_name, dev_mode): | ||
pipeline_manager.train(pipeline_name, dev_mode) | ||
pipeline_manager.evaluate(pipeline_name, dev_mode) | ||
|
||
|
||
@action.command() | ||
@main.command() | ||
@click.option('-p', '--pipeline_name', help='pipeline to be trained', required=True) | ||
@click.option('-s', '--submit_predictions', help='submit predictions if true', is_flag=True, required=False) | ||
@click.option('-d', '--dev_mode', help='if true only a small sample of data will be used', is_flag=True, required=False) | ||
def train_evaluate(pipeline_name, dev_mode): | ||
_train(pipeline_name, dev_mode) | ||
_evaluate(pipeline_name, dev_mode) | ||
|
||
|
||
def _train(pipeline_name, dev_mode): | ||
logger.info('TRAINING') | ||
if bool(params.clean_experiment_directory_before_training) and os.path.isdir(params.experiment_directory): | ||
logger.info('Cleaning experiment_directory...') | ||
shutil.rmtree(params.experiment_directory) | ||
|
||
logger.info('Reading data...') | ||
if dev_mode: | ||
logger.info('running in "dev-mode". Sample size is: {}'.format(cfg.DEV_SAMPLE_SIZE)) | ||
application_train = pd.read_csv(params.train_filepath, nrows=cfg.DEV_SAMPLE_SIZE) | ||
else: | ||
application_train = pd.read_csv(params.train_filepath) | ||
|
||
logger.info('Shuffling and splitting into train and test...') | ||
train_data_split, valid_data_split = train_test_split(application_train, | ||
test_size=params.validation_size, | ||
random_state=cfg.RANDOM_SEED, | ||
shuffle=params.shuffle) | ||
|
||
logger.info('Target mean in train: {}'.format(train_data_split[cfg.TARGET_COLUMN].mean())) | ||
logger.info('Target mean in valid: {}'.format(valid_data_split[cfg.TARGET_COLUMN].mean())) | ||
logger.info('Train shape: {}'.format(train_data_split.shape)) | ||
logger.info('Valid shape: {}'.format(valid_data_split.shape)) | ||
|
||
data = {'input': {'X': train_data_split.drop(cfg.TARGET_COLUMN, axis=1), | ||
'y': train_data_split[cfg.TARGET_COLUMN], | ||
'X_valid': valid_data_split.drop(cfg.TARGET_COLUMN, axis=1), | ||
'y_valid': valid_data_split[cfg.TARGET_COLUMN], | ||
}, | ||
} | ||
|
||
pipeline = PIPELINES[pipeline_name]['train'](cfg.SOLUTION_CONFIG) | ||
pipeline.clean_cache() | ||
logger.info('Start pipeline fit and transform') | ||
pipeline.fit_transform(data) | ||
pipeline.clean_cache() | ||
|
||
|
||
def _evaluate(pipeline_name, dev_mode): | ||
logger.info('EVALUATION') | ||
logger.info('reading data...') | ||
if dev_mode: | ||
logger.info('running in "dev-mode". Sample size is: {}'.format(cfg.DEV_SAMPLE_SIZE)) | ||
application_train = pd.read_csv(params.train_filepath, nrows=cfg.DEV_SAMPLE_SIZE) | ||
else: | ||
application_train = pd.read_csv(params.train_filepath) | ||
|
||
logger.info('Shuffling and splitting to get validation split...') | ||
_, valid_data_split = train_test_split(application_train, | ||
test_size=params.validation_size, | ||
random_state=cfg.RANDOM_SEED, | ||
shuffle=params.shuffle) | ||
|
||
logger.info('Target mean in valid: {}'.format(valid_data_split[cfg.TARGET_COLUMN].mean())) | ||
logger.info('Valid shape: {}'.format(valid_data_split.shape)) | ||
|
||
y_true = valid_data_split[cfg.TARGET_COLUMN].values | ||
data = {'input': {'X': valid_data_split.drop(cfg.TARGET_COLUMN, axis=1), | ||
'y': valid_data_split[cfg.TARGET_COLUMN], | ||
}, | ||
} | ||
|
||
pipeline = PIPELINES[pipeline_name]['inference'](cfg.SOLUTION_CONFIG) | ||
pipeline.clean_cache() | ||
logger.info('Start pipeline transform') | ||
output = pipeline.transform(data) | ||
pipeline.clean_cache() | ||
|
||
y_pred = output['clipped_prediction'] | ||
|
||
logger.info('Saving evaluation predictions to the {}'.format(params.experiment_directory)) | ||
persist_evaluation_predictions(params.experiment_directory, | ||
y_pred, | ||
valid_data_split, | ||
cfg.ID_COLUMN, | ||
cfg.TARGET_COLUMN) | ||
|
||
logger.info('Calculating ROC_AUC on validation set') | ||
score = roc_auc_score(y_true, y_pred) | ||
logger.info('ROC_AUC score on validation is {}'.format(score)) | ||
ctx.channel_send('ROC_AUC', 0, score) | ||
|
||
|
||
def _predict(pipeline_name, dev_mode): | ||
logger.info('PREDICTION') | ||
logger.info('reading data...') | ||
if dev_mode: | ||
logger.info('running in "dev-mode". Sample size is: {}'.format(cfg.DEV_SAMPLE_SIZE)) | ||
application_test = pd.read_csv(params.test_filepath, nrows=cfg.DEV_SAMPLE_SIZE) | ||
else: | ||
application_test = pd.read_csv(params.test_filepath) | ||
|
||
data = {'input': {'X': application_test, | ||
'y': None, | ||
}, | ||
} | ||
|
||
pipeline = PIPELINES[pipeline_name]['inference'](cfg.SOLUTION_CONFIG) | ||
pipeline.clean_cache() | ||
logger.info('Start pipeline transform') | ||
output = pipeline.transform(data) | ||
pipeline.clean_cache() | ||
y_pred = output['clipped_prediction'] | ||
|
||
if not dev_mode: | ||
logger.info('creating submission file...') | ||
submission = create_submission(application_test, y_pred) | ||
|
||
logger.info('verifying submission...') | ||
sample_submission = pd.read_csv(params.sample_submission_filepath) | ||
verify_submission(submission, sample_submission) | ||
|
||
submission_filepath = os.path.join(params.experiment_directory, 'submission.csv') | ||
submission.to_csv(submission_filepath, index=None, encoding='utf-8') | ||
logger.info('submission persisted to {}'.format(submission_filepath)) | ||
logger.info('submission head \n\n{}'.format(submission.head())) | ||
|
||
if params.kaggle_api: | ||
logger.info('making Kaggle submit...') | ||
os.system('kaggle competitions submit -c home-credit-default-risk -f {} -m {}' | ||
.format(submission_filepath, params.kaggle_message)) | ||
def evaluate_predict(pipeline_name, submit_predictions, dev_mode): | ||
pipeline_manager.evaluate(pipeline_name, dev_mode) | ||
pipeline_manager.predict(pipeline_name, dev_mode, submit_predictions) | ||
|
||
|
||
@main.command() | ||
@click.option('-p', '--pipeline_name', help='pipeline to be trained', required=True) | ||
@click.option('-d', '--dev_mode', help='if true only a small sample of data will be used', is_flag=True, required=False) | ||
def train_evaluate_cv(pipeline_name, dev_mode): | ||
pipeline_manager.train_evaluate_cv(pipeline_name, dev_mode) | ||
|
||
|
||
@main.command() | ||
@click.option('-p', '--pipeline_name', help='pipeline to be trained', required=True) | ||
@click.option('-s', '--submit_predictions', help='submit predictions if true', is_flag=True, required=False) | ||
@click.option('-d', '--dev_mode', help='if true only a small sample of data will be used', is_flag=True, required=False) | ||
def train_evaluate_predict_cv(pipeline_name, submit_predictions, dev_mode): | ||
pipeline_manager.train_evaluate_predict_cv(pipeline_name, dev_mode, submit_predictions) | ||
|
||
|
||
if __name__ == "__main__": | ||
action() | ||
main() |
Oops, something went wrong.