class Application(BaseTransformer):
    """Derive simple ratio features from columns of the application table."""

    def __init__(self):
        super().__init__()
        # Names of the columns that transform() creates and returns.
        self.application_names = ['ANNUITY_INCOME_PERCENTAGE',
                                  'CREDIT_TO_GOODS_RATIO',
                                  'DAYS_EMPLOYED_PERCENTAGE',
                                  'EXT_SOURCES_MEAN',
                                  'INCOME_CREDIT_PERCENTAGE',
                                  'INCOME_PER_PERSON',
                                  'PAYMENT_RATE']

    def transform(self, X, y=None):
        """Add the engineered ratio columns to ``X`` and return them.

        Args:
            X: application DataFrame. It is modified in place: the value
                365243 in ``DAYS_EMPLOYED`` is replaced by NaN and the
                columns listed in ``self.application_names`` are added.
            y: unused.

        Returns:
            dict with a single key ``'numerical_features'`` holding the
            engineered columns.
        """
        # Assign the result back instead of calling replace(inplace=True) on
        # the selected column: the chained in-place form does not reach X when
        # pandas Copy-on-Write is active.
        # NOTE(review): 365243 looks like a "no value" placeholder - confirm
        # against the data dictionary.
        X['DAYS_EMPLOYED'] = X['DAYS_EMPLOYED'].replace(365243, np.nan)

        X['ANNUITY_INCOME_PERCENTAGE'] = X['AMT_ANNUITY'] / X['AMT_INCOME_TOTAL']
        X['CREDIT_TO_GOODS_RATIO'] = X['AMT_CREDIT'] / X['AMT_GOODS_PRICE']
        X['DAYS_EMPLOYED_PERCENTAGE'] = X['DAYS_EMPLOYED'] / X['DAYS_BIRTH']
        X['EXT_SOURCES_MEAN'] = X[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].mean(axis=1)
        X['INCOME_CREDIT_PERCENTAGE'] = X['AMT_INCOME_TOTAL'] / X['AMT_CREDIT']
        X['INCOME_PER_PERSON'] = X['AMT_INCOME_TOTAL'] / X['CNT_FAM_MEMBERS']
        X['PAYMENT_RATE'] = X['AMT_ANNUITY'] / X['AMT_CREDIT']

        return {'numerical_features': X[self.application_names]}


class Bureau(BaseTransformer):
    """Per-customer features aggregated from the bureau CSV file."""

    def __init__(self, filepath, id_columns, **kwargs):
        """
        Args:
            filepath: path of the bureau CSV read by fit().
            id_columns: pair ``(key in X, key in the bureau table)``.
            **kwargs: accepted and ignored, so a config dict carrying extra
                keys can be splatted into the constructor.
        """
        super().__init__()
        self.filepath = filepath
        self.id_columns = id_columns
        self.bureau_names = ['bureau_active_loans_percentage',
                             'bureau_average_creditdays_prolonged',
                             'bureau_average_enddate_future',
                             'bureau_average_loan_type',
                             'bureau_credit_enddate_percentage',
                             'bureau_days_credit_diff',
                             'bureau_debt_credit_ratio',
                             'bureau_loan_count',
                             'bureau_loan_types',
                             'bureau_overdue_debt_ratio'
                             ]

    def fit(self, X):
        """Read the bureau file and build one feature row per customer.

        Every aggregate is computed on a frame indexed by the customer key.
        Writing ``groupby(...).agg(...).reset_index()[col]`` back onto the
        per-loan table would align on the positional index instead and attach
        values to the wrong customers.

        Args:
            X: unused; the features come from ``self.filepath`` only.

        Returns:
            self, with ``self.bureau_features`` set.
        """
        key = self.id_columns[1]
        bureau = pd.read_csv(self.filepath)

        amount_columns = ['AMT_CREDIT_SUM',
                          'AMT_CREDIT_SUM_DEBT',
                          'AMT_CREDIT_SUM_OVERDUE',
                          'CNT_CREDIT_PROLONG']
        bureau[amount_columns] = bureau[amount_columns].fillna(0)

        # Row-level helper columns (vectorised; NaN end dates compare False).
        bureau['bureau_credit_active_binary'] = (bureau['CREDIT_ACTIVE'] != 'Closed').astype(int)
        bureau['bureau_credit_enddate_binary'] = (bureau['DAYS_CREDIT_ENDDATE'] > 0).astype(int)

        # Gap in days between successive applications of the same customer,
        # most recent first. The first loan of each customer has no
        # predecessor and counts as 0.
        # NOTE(review): the feature is the per-customer mean of these gaps,
        # including that leading 0 - confirm this is the intended definition.
        by_recency = bureau.sort_values([key, 'DAYS_CREDIT'], ascending=[True, False])
        bureau['bureau_days_credit_diff'] = (
            (-by_recency['DAYS_CREDIT']).groupby(by_recency[key]).diff().fillna(0))

        # Same idea for loans whose end date lies in the future; loans that
        # already ended keep NaN here and are skipped by the mean below.
        future = bureau.loc[bureau['bureau_credit_enddate_binary'] == 1].sort_values(
            [key, 'DAYS_CREDIT_ENDDATE'])
        bureau['bureau_days_enddate_diff'] = (
            future.groupby(key)['DAYS_CREDIT_ENDDATE'].diff().fillna(0))

        per_customer = bureau.groupby(key)
        total_debt = per_customer['AMT_CREDIT_SUM_DEBT'].sum()
        total_credit = per_customer['AMT_CREDIT_SUM'].sum()
        total_overdue = per_customer['AMT_CREDIT_SUM_OVERDUE'].sum()
        loan_count = per_customer['DAYS_CREDIT'].count()
        loan_types = per_customer['CREDIT_TYPE'].nunique()

        features = pd.DataFrame({
            'bureau_loan_count': loan_count,
            'bureau_loan_types': loan_types,
            'bureau_average_loan_type': loan_count / loan_types,
            # "percentage" columns are fractions in [0, 1] (mean of a 0/1 flag).
            'bureau_active_loans_percentage': per_customer['bureau_credit_active_binary'].mean(),
            'bureau_credit_enddate_percentage': per_customer['bureau_credit_enddate_binary'].mean(),
            'bureau_days_credit_diff': per_customer['bureau_days_credit_diff'].mean(),
            'bureau_average_enddate_future': per_customer['bureau_days_enddate_diff'].mean(),
            # A zero denominator yields inf/NaN, as in a plain pandas division.
            'bureau_debt_credit_ratio': total_debt / total_credit,
            'bureau_overdue_debt_ratio': total_overdue / total_debt,
            'bureau_average_creditdays_prolonged': per_customer['CNT_CREDIT_PROLONG'].mean(),
        })

        self.bureau_features = features.rename_axis(key).reset_index()[self.bureau_names + [key]]

        return self

    def transform(self, X, **kwargs):
        """Left-join the fitted bureau features onto ``X``.

        Returns:
            dict with key ``'numerical_features'``; rows of ``X`` without a
            bureau record get NaN in every feature.

        Raises:
            pandas.errors.MergeError: if the join keys are not unique on both
                sides (``validate='one_to_one'``).
        """
        X = X.merge(self.bureau_features,
                    left_on=self.id_columns[0],
                    right_on=self.id_columns[1],
                    how='left',
                    validate='one_to_one')

        return {'numerical_features': X[self.bureau_names]}

    def load(self, filepath):
        """Restore the fitted feature table from ``filepath``."""
        self.bureau_features = joblib.load(filepath)
        return self

    def save(self, filepath):
        """Persist the fitted feature table to ``filepath``."""
        joblib.dump(self.bureau_features, filepath)


class CreditCardBalance(BaseTransformer):
    """Per-customer features aggregated from the credit card balance CSV file."""

    def __init__(self, filepath, id_columns, **kwargs):
        """
        Args:
            filepath: path of the credit card balance CSV read by fit().
            id_columns: pair ``(key in X, key in the credit card table)``.
            **kwargs: accepted and ignored, so a config dict carrying extra
                keys can be splatted into the constructor.
        """
        super().__init__()
        self.filepath = filepath
        self.id_columns = id_columns
        self.credit_card_names = ['CreditCard_AVG_DPD',
                                  'CreditCard_CASH_CARD_RATIO',
                                  'CreditCard_CREDIT_LOAD',
                                  'CreditCard_DRAWINGS_RATIO',
                                  'CreditCard_INSTALLMENTS_PER_LOAN',
                                  'CreditCard_NO_LOANS',
                                  ]

    def fit(self, X):
        """Read the credit card file and build one feature row per customer.

        As in Bureau.fit, every aggregate lives on a frame indexed by the
        customer key, so values cannot be attached to the wrong customer by
        positional index alignment.

        Args:
            X: unused; the features come from ``self.filepath`` only.

        Returns:
            self, with ``self.credit_card_features`` set.
        """
        key = self.id_columns[1]
        credit_card = pd.read_csv(self.filepath)
        per_customer = credit_card.groupby(key)

        # Number of distinct previous loans per customer.
        loan_count = per_customer['SK_ID_PREV'].nunique()

        # Instalments paid: the maximum of the cumulative counter per loan,
        # summed over the customer's loans.
        instalments_per_loan = credit_card.groupby(
            [key, 'SK_ID_PREV'])['CNT_INSTALMENT_MATURE_CUM'].max()
        total_instalments = instalments_per_loan.groupby(level=0).sum()

        # Peak balance relative to the credit limit, per (loan, limit) pair,
        # then averaged per customer.
        peak_balance = credit_card.groupby(
            [key, 'SK_ID_PREV', 'AMT_CREDIT_LIMIT_ACTUAL'])['AMT_BALANCE'].max().reset_index()
        peak_balance['credit_load'] = peak_balance['AMT_BALANCE'] / peak_balance['AMT_CREDIT_LIMIT_ACTUAL']
        credit_load = peak_balance.groupby(key)['credit_load'].mean()

        atm_drawings = per_customer['AMT_DRAWINGS_ATM_CURRENT'].sum()
        total_drawings = per_customer['AMT_DRAWINGS_CURRENT'].sum()
        drawings_count = per_customer['CNT_DRAWINGS_CURRENT'].sum()

        features = pd.DataFrame({
            'CreditCard_NO_LOANS': loan_count,
            'CreditCard_INSTALLMENTS_PER_LOAN': total_instalments / loan_count,
            'CreditCard_CREDIT_LOAD': credit_load,
            'CreditCard_AVG_DPD': per_customer['SK_DPD'].mean(),
            # Both ratios keep the factor 100 of the original definition.
            'CreditCard_CASH_CARD_RATIO': 100 * (atm_drawings / total_drawings),
            'CreditCard_DRAWINGS_RATIO': 100 * (total_drawings / drawings_count),
        })

        self.credit_card_features = features.rename_axis(key).reset_index()[
            self.credit_card_names + [key]]

        return self

    def transform(self, X, **kwargs):
        """Left-join the fitted credit card features onto ``X``.

        Returns:
            dict with key ``'numerical_features'``; rows of ``X`` without a
            credit card record get NaN in every feature.

        Raises:
            pandas.errors.MergeError: if the join keys are not unique on both
                sides (``validate='one_to_one'``).
        """
        X = X.merge(self.credit_card_features,
                    left_on=self.id_columns[0],
                    right_on=self.id_columns[1],
                    how='left',
                    validate='one_to_one')

        return {'numerical_features': X[self.credit_card_names]}

    def load(self, filepath):
        """Restore the fitted feature table from ``filepath``."""
        self.credit_card_features = joblib.load(filepath)
        return self

    def save(self, filepath):
        """Persist the fitted feature table to ``filepath``."""
        joblib.dump(self.credit_card_features, filepath)
# --- pipeline_blocks.py: feature-extraction step factories -----------------

def _train_valid_feature_step(name, transformer, config, train_mode, **kwargs):
    """Wrap ``transformer`` in a Step fed with ``X`` from the 'input' node.

    In train mode a second step named ``<name>_valid`` is created as well. It
    uses the first step as its transformer and is fed with ``X_valid``.

    Args:
        name: name of the step; the validation twin gets the ``_valid`` suffix.
        transformer: transformer instance run by the step.
        config: object exposing ``pipeline.experiment_directory``.
        train_mode: when truthy, also build the validation step.
        **kwargs: forwarded unchanged to every ``Step``.

    Returns:
        ``(step, step_valid)`` in train mode, otherwise just ``step``.
    """
    step = Step(name=name,
                transformer=transformer,
                input_data=['input'],
                adapter=Adapter({'X': E('input', 'X')}),
                experiment_directory=config.pipeline.experiment_directory,
                **kwargs)
    if not train_mode:
        return step

    step_valid = Step(name='{}_valid'.format(name),
                      transformer=step,
                      input_data=['input'],
                      adapter=Adapter({'X': E('input', 'X_valid')}),
                      experiment_directory=config.pipeline.experiment_directory,
                      **kwargs)
    return step, step_valid


def _bureau_groupby_agg(config, train_mode, **kwargs):
    """Groupby aggregations read from the bureau file (``config.bureau``)."""
    return _train_valid_feature_step('bureau_groupby_agg',
                                     fe.GroupbyAggregationFromFile(**config.bureau),
                                     config, train_mode, **kwargs)


def _credit_card_balance_groupby_agg(config, train_mode, **kwargs):
    """Groupby aggregations read from the credit card balance file."""
    return _train_valid_feature_step('credit_card_balance_groupby_agg',
                                     fe.GroupbyAggregationFromFile(**config.credit_card_balance),
                                     config, train_mode, **kwargs)


def _installments_payments_groupby_agg(config, train_mode, **kwargs):
    """Groupby aggregations read from the installments payments file."""
    return _train_valid_feature_step('installments_payments_groupby_agg',
                                     fe.GroupbyAggregationFromFile(**config.installments_payments),
                                     config, train_mode, **kwargs)


def _pos_cash_balance_groupby_agg(config, train_mode, **kwargs):
    """Groupby aggregations read from the POS cash balance file."""
    return _train_valid_feature_step('pos_cash_balance_groupby_agg',
                                     fe.GroupbyAggregationFromFile(**config.pos_cash_balance),
                                     config, train_mode, **kwargs)


def _previous_applications_groupby_agg(config, train_mode, **kwargs):
    """Groupby aggregations read from the previous applications file."""
    return _train_valid_feature_step('previous_applications_groupby_agg',
                                     fe.GroupbyAggregationFromFile(**config.previous_applications),
                                     config, train_mode, **kwargs)


def _application(config, train_mode, **kwargs):
    """Hand-crafted features computed by ``fe.Application``."""
    return _train_valid_feature_step('application',
                                     fe.Application(),
                                     config, train_mode, **kwargs)


# NOTE: the same patch switches the transformer of the pre-existing `_bureau`
# factory from fe.GroupbyAggregationFromFile(**config.bureau) to
# fe.Bureau(**config.bureau). That function is only partially visible in the
# patch and is therefore not reproduced here.


def _credit_card_balance(config, train_mode, **kwargs):
    """Hand-crafted features computed by ``fe.CreditCardBalance``."""
    return _train_valid_feature_step('credit_card_balance',
                                     fe.CreditCardBalance(**config.credit_card_balance),
                                     config, train_mode, **kwargs)


# --- pipeline_config.py: aggregation recipes per auxiliary table ------------

_NUMERIC_AGGREGATIONS = ('mean', 'min', 'max', 'sum', 'var')


def _customer_aggregation_recipes(select_columns, aggregations=_NUMERIC_AGGREGATIONS):
    """Return one recipe per (aggregation, column) pair, grouped by SK_ID_CURR.

    The result is ordered aggregation-major: every column for the first
    aggregation, then every column for the second, and so on. Each recipe
    owns its own ``groupby`` list so recipes never share mutable state.
    """
    return [{'groupby': ['SK_ID_CURR'], 'select': select, 'agg': agg}
            for agg in aggregations
            for select in select_columns]


BUREAU_AGGREGATION_RECIPIES = [
    {'groupby': ['SK_ID_CURR'], 'select': 'CREDIT_TYPE', 'agg': 'count'},
    {'groupby': ['SK_ID_CURR'], 'select': 'CREDIT_ACTIVE', 'agg': 'size'},
] + _customer_aggregation_recipes(['AMT_ANNUITY',
                                   'AMT_CREDIT_SUM',
                                   'AMT_CREDIT_SUM_DEBT',
                                   'AMT_CREDIT_SUM_LIMIT',
                                   'AMT_CREDIT_SUM_OVERDUE',
                                   'AMT_CREDIT_MAX_OVERDUE',
                                   'CNT_CREDIT_PROLONG',
                                   'CREDIT_DAY_OVERDUE',
                                   'DAYS_CREDIT',
                                   'DAYS_CREDIT_ENDDATE',
                                   'DAYS_CREDIT_UPDATE',
                                   ])

CREDIT_CARD_BALANCE_AGGREGATION_RECIPIES = _customer_aggregation_recipes(
    ['AMT_BALANCE',
     'AMT_CREDIT_LIMIT_ACTUAL',
     'AMT_DRAWINGS_ATM_CURRENT',
     'AMT_DRAWINGS_CURRENT',
     'AMT_DRAWINGS_OTHER_CURRENT',
     'AMT_DRAWINGS_POS_CURRENT',
     'AMT_PAYMENT_CURRENT',
     'CNT_DRAWINGS_ATM_CURRENT',
     'CNT_DRAWINGS_CURRENT',
     'CNT_DRAWINGS_OTHER_CURRENT',
     'CNT_INSTALMENT_MATURE_CUM',
     'MONTHS_BALANCE',
     'SK_DPD',
     'SK_DPD_DEF',
     ])

INSTALLMENTS_PAYMENTS_AGGREGATION_RECIPIES = _customer_aggregation_recipes(
    ['AMT_INSTALMENT',
     'AMT_PAYMENT',
     'DAYS_ENTRY_PAYMENT',
     'DAYS_INSTALMENT',
     'NUM_INSTALMENT_NUMBER',
     'NUM_INSTALMENT_VERSION',
     ])

POS_CASH_BALANCE_AGGREGATION_RECIPIES = _customer_aggregation_recipes(
    ['MONTHS_BALANCE',
     'SK_DPD',
     'SK_DPD_DEF',
     ])

PREVIOUS_APPLICATION_AGGREGATION_RECIPIES = _customer_aggregation_recipes(
    ['AMT_ANNUITY',
     'AMT_APPLICATION',
     'AMT_CREDIT',
     'AMT_DOWN_PAYMENT',
     'AMT_GOODS_PRICE',
     'CNT_PAYMENT',
     'DAYS_DECISION',
     'HOUR_APPR_PROCESS_START',
     'RATE_DOWN_PAYMENT',
     ])

# NOTE: in the same patch the SOLUTION_CONFIG entries 'bureau',
# 'credit_card_balance', 'installments_payments', 'pos_cash_balance' and
# 'previous_applications' each take the matching list above as their
# 'groupby_aggregations', with id_columns ('SK_ID_CURR', 'SK_ID_CURR').
# The SOLUTION_CONFIG literal itself is only partially visible in the patch
# and is not reproduced here.