Skip to content
This repository has been archived by the owner on Jun 22, 2022. It is now read-only.

Commit

Permalink
Smart features update
Browse files Browse the repository at this point in the history
  • Loading branch information
Paulina Knut committed Jun 22, 2018
1 parent 4f9e7f6 commit 0c21d67
Show file tree
Hide file tree
Showing 6 changed files with 506 additions and 14 deletions.
203 changes: 203 additions & 0 deletions feature_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,3 +140,206 @@ def transform(self, X):
how='left')

return {'numerical_features': X[self.groupby_aggregations_names].astype(np.float32)}


class Application(BaseTransformer):
    """Derive ratio-style numerical features from the application table."""

    def __init__(self):
        super().__init__()
        # Names of the engineered columns that ``transform`` returns.
        self.application_names = ['ANNUITY_INCOME_PERCENTAGE',
                                  'CREDIT_TO_GOODS_RATIO',
                                  'DAYS_EMPLOYED_PERCENTAGE',
                                  'EXT_SOURCES_MEAN',
                                  'INCOME_CREDIT_PERCENTAGE',
                                  'INCOME_PER_PERSON',
                                  'PAYMENT_RATE']

    def transform(self, X, y=None):
        """Add the engineered columns to ``X`` and return them.

        ``X`` is modified in place: ``DAYS_EMPLOYED`` has the value 365243
        replaced by NaN and the columns listed in ``application_names`` are
        added (or overwritten).

        Args:
            X: DataFrame holding the raw application columns read below.
            y: Unused.

        Returns:
            dict with the single key ``'numerical_features'`` mapping to the
            sub-frame of ``X`` restricted to ``application_names``.
        """
        # 365243 is treated as "missing" (presumably a placeholder value in the
        # raw data - confirm).  Assign the result back instead of calling
        # ``replace(..., inplace=True)`` on the column selection: the chained
        # in-place form does not update ``X`` under pandas Copy-on-Write.
        X['DAYS_EMPLOYED'] = X['DAYS_EMPLOYED'].replace(365243, np.nan)

        # SIMPLE
        X['ANNUITY_INCOME_PERCENTAGE'] = X['AMT_ANNUITY'] / X['AMT_INCOME_TOTAL']
        X['CREDIT_TO_GOODS_RATIO'] = X['AMT_CREDIT'] / X['AMT_GOODS_PRICE']
        X['DAYS_EMPLOYED_PERCENTAGE'] = X['DAYS_EMPLOYED'] / X['DAYS_BIRTH']
        # Row-wise mean; pandas skips NaN sources by default.
        X['EXT_SOURCES_MEAN'] = X[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].mean(axis=1)
        X['INCOME_CREDIT_PERCENTAGE'] = X['AMT_INCOME_TOTAL'] / X['AMT_CREDIT']
        X['INCOME_PER_PERSON'] = X['AMT_INCOME_TOTAL'] / X['CNT_FAM_MEMBERS']
        X['PAYMENT_RATE'] = X['AMT_ANNUITY'] / X['AMT_CREDIT']

        return {'numerical_features': X[self.application_names]}


class Bureau(BaseTransformer):
    """Per-customer features aggregated from the credit bureau CSV.

    ``fit`` reads the file at ``filepath`` and keeps one row of features per
    value of ``id_columns[1]``; ``transform`` left-joins those rows onto the
    input frame using ``id_columns[0]`` as the left key.

    NOTE(review): all aggregations group by the literal column ``SK_ID_CURR``,
    so ``id_columns[1]`` is presumably ``'SK_ID_CURR'`` - confirm with callers.
    """

    def __init__(self, filepath, id_columns, **kwargs):
        # Initialise the base class, as ``Application`` does.
        super().__init__()
        self.filepath = filepath
        self.id_columns = id_columns
        # Names of the feature columns that ``transform`` returns.
        self.bureau_names = ['bureau_active_loans_percentage',
                             'bureau_average_creditdays_prolonged',
                             'bureau_average_enddate_future',
                             'bureau_average_loan_type',
                             'bureau_credit_enddate_percentage',
                             'bureau_days_credit_diff',
                             'bureau_debt_credit_ratio',
                             'bureau_loan_count',
                             'bureau_loan_types',
                             'bureau_overdue_debt_ratio'
                             ]

    def fit(self, X):
        """Read the bureau CSV and compute one feature row per customer.

        Args:
            X: Unused; the data comes from ``self.filepath``.

        Returns:
            self, with ``self.bureau_features`` populated.
        """
        bureau = pd.read_csv(self.filepath)

        # Assign the filled columns back rather than using chained
        # ``fillna(..., inplace=True)``, which does not update the frame under
        # pandas Copy-on-Write.
        zero_filled = ['AMT_CREDIT_SUM', 'AMT_CREDIT_SUM_DEBT',
                       'AMT_CREDIT_SUM_OVERDUE', 'CNT_CREDIT_PROLONG']
        bureau[zero_filled] = bureau[zero_filled].fillna(0)

        def per_customer(column, how):
            # Broadcast a per-customer aggregate onto every row of that
            # customer.  ``transform`` keeps the row index, so the values stay
            # aligned with SK_ID_CURR; assigning ``agg(...).reset_index()[col]``
            # would instead align on positions 0..n_groups-1.
            return bureau.groupby('SK_ID_CURR')[column].transform(how)

        # NUMBER OF PAST LOANS PER CUSTOMER
        bureau['bureau_loan_count'] = per_customer('DAYS_CREDIT', 'count')

        # NUMBER OF TYPES OF PAST LOANS PER CUSTOMER
        bureau['bureau_loan_types'] = per_customer('CREDIT_TYPE', 'nunique')

        # AVERAGE NUMBER OF PAST LOANS PER TYPE PER CUSTOMER
        bureau['bureau_average_loan_type'] = bureau['bureau_loan_count'] / bureau['bureau_loan_types']

        # % OF ACTIVE LOANS FROM BUREAU DATA (anything not 'Closed' counts as active)
        bureau['bureau_credit_active_binary'] = (bureau['CREDIT_ACTIVE'] != 'Closed').astype(int)
        bureau['bureau_active_loans_percentage'] = per_customer('bureau_credit_active_binary', 'mean')

        # AVERAGE NUMBER OF DAYS BETWEEN SUCCESSIVE PAST APPLICATIONS FOR EACH CUSTOMER
        # Order loans by DAYS_CREDIT (descending), take the gap to the previous
        # loan of the same customer (0 for the first one), then average the
        # gaps per customer.  Assignment aligns on the original row index.
        by_days_credit = bureau.sort_values('DAYS_CREDIT', ascending=False)
        bureau['bureau_days_between_loans'] = (
            (-by_days_credit['DAYS_CREDIT']).groupby(by_days_credit['SK_ID_CURR']).diff().fillna(0))
        bureau['bureau_days_credit_diff'] = per_customer('bureau_days_between_loans', 'mean')

        # % OF LOANS PER CUSTOMER WITH A POSITIVE DAYS_CREDIT_ENDDATE
        # (NaN end dates compare False and therefore count as 0)
        bureau['bureau_credit_enddate_binary'] = (bureau['DAYS_CREDIT_ENDDATE'] > 0).astype(int)
        bureau['bureau_credit_enddate_percentage'] = per_customer('bureau_credit_enddate_binary', 'mean')

        # AVERAGE NUMBER OF DAYS IN WHICH CREDIT EXPIRES IN FUTURE
        # Gaps between successive positive end dates of the same customer; rows
        # with a non-positive end date stay NaN and are skipped by the mean.
        future = bureau[bureau['bureau_credit_enddate_binary'] == 1].sort_values('DAYS_CREDIT_ENDDATE')
        bureau['bureau_days_enddate_diff'] = (
            future.groupby('SK_ID_CURR')['DAYS_CREDIT_ENDDATE'].diff().fillna(0))
        bureau['bureau_average_enddate_future'] = per_customer('bureau_days_enddate_diff', 'mean')

        # DEBT OVER CREDIT RATIO (a zero total credit yields inf/NaN)
        bureau['bureau_total_customer_debt'] = per_customer('AMT_CREDIT_SUM_DEBT', 'sum')
        bureau['bureau_total_customer_credit'] = per_customer('AMT_CREDIT_SUM', 'sum')
        bureau['bureau_debt_credit_ratio'] = (
            bureau['bureau_total_customer_debt'] / bureau['bureau_total_customer_credit'])

        # OVERDUE OVER DEBT RATIO (a zero total debt yields inf/NaN)
        bureau['bureau_total_customer_overdue'] = per_customer('AMT_CREDIT_SUM_OVERDUE', 'sum')
        bureau['bureau_overdue_debt_ratio'] = (
            bureau['bureau_total_customer_overdue'] / bureau['bureau_total_customer_debt'])

        # AVERAGE NUMBER OF LOANS PROLONGED
        bureau['bureau_average_creditdays_prolonged'] = per_customer('CNT_CREDIT_PROLONG', 'mean')

        # Every row of a customer now carries the same feature values, so
        # keeping the first row per id yields one correct row per customer.
        self.bureau_features = bureau[self.bureau_names +
                                      [self.id_columns[1]]].drop_duplicates(subset=self.id_columns[1])

        return self

    def transform(self, X, **kwargs):
        """Left-join the fitted bureau features onto ``X``.

        Args:
            X: DataFrame containing the column ``id_columns[0]``.

        Returns:
            dict with the single key ``'numerical_features'`` mapping to the
            merged frame restricted to ``bureau_names``.

        Raises:
            pandas.errors.MergeError: if the join keys are not unique on both
                sides (enforced by ``validate='one_to_one'``).
        """
        X = X.merge(self.bureau_features,
                    left_on=self.id_columns[0],
                    right_on=self.id_columns[1],
                    how='left',
                    validate='one_to_one')

        return {'numerical_features': X[self.bureau_names]}

    def load(self, filepath):
        """Restore ``bureau_features`` from a joblib dump and return self."""
        self.bureau_features = joblib.load(filepath)
        return self

    def save(self, filepath):
        """Dump ``bureau_features`` to ``filepath`` with joblib."""
        joblib.dump(self.bureau_features, filepath)


class CreditCardBalance(BaseTransformer):
    """Per-customer features aggregated from the credit card balance CSV.

    ``fit`` reads the file at ``filepath`` and keeps one row of features per
    value of ``id_columns[1]``; ``transform`` left-joins those rows onto the
    input frame using ``id_columns[0]`` as the left key.

    NOTE(review): all aggregations group by the literal column ``SK_ID_CURR``,
    so ``id_columns[1]`` is presumably ``'SK_ID_CURR'`` - confirm with callers.
    """

    def __init__(self, filepath, id_columns, **kwargs):
        # Initialise the base class, as ``Application`` does.
        super().__init__()
        self.filepath = filepath
        self.id_columns = id_columns
        # Names of the feature columns that ``transform`` returns.
        self.credit_card_names = ['CreditCard_AVG_DPD',
                                  'CreditCard_CASH_CARD_RATIO',
                                  'CreditCard_CREDIT_LOAD',
                                  'CreditCard_DRAWINGS_RATIO',
                                  'CreditCard_INSTALLMENTS_PER_LOAN',
                                  'CreditCard_NO_LOANS',
                                  ]

    def fit(self, X):
        """Read the credit card CSV and compute one feature row per customer.

        Args:
            X: Unused; the data comes from ``self.filepath``.

        Returns:
            self, with ``self.credit_card_features`` populated.
        """
        credit_card = pd.read_csv(self.filepath)
        customer = credit_card['SK_ID_CURR']

        def per_customer(column, how):
            # Broadcast a per-customer aggregate onto every row of that
            # customer.  ``transform`` keeps the row index, so the values stay
            # aligned with SK_ID_CURR; assigning ``agg(...).reset_index()[col]``
            # would instead align on positions 0..n_groups-1.
            return credit_card.groupby('SK_ID_CURR')[column].transform(how)

        # NUMBER OF LOANS PER CUSTOMER
        credit_card['CreditCard_NO_LOANS'] = per_customer('SK_ID_PREV', 'nunique')

        # RATE OF PAYBACK OF LOANS - NO OF INSTALMENTS PAID BY CUSTOMER PER LOAN
        # Take the maximum cumulative instalment count of each loan, sum those
        # maxima per customer, then map the totals back by SK_ID_CURR.
        instalments_per_loan = credit_card.groupby(
            by=['SK_ID_CURR', 'SK_ID_PREV'])['CNT_INSTALMENT_MATURE_CUM'].max()
        total_instalments = instalments_per_loan.groupby(level='SK_ID_CURR').sum()
        credit_card['TOTAL_INSTALMENTS'] = customer.map(total_instalments)
        credit_card['CreditCard_INSTALLMENTS_PER_LOAN'] = (
            credit_card['TOTAL_INSTALMENTS'] / credit_card['CreditCard_NO_LOANS'])

        # AVG % LOADING OF CREDIT LIMIT PER CUSTOMER
        # Peak balance over the limit for each (customer, loan, limit) group;
        # the limit is a grouping key, so it is constant within a group.
        peak_balance = credit_card.groupby(
            by=['SK_ID_CURR', 'SK_ID_PREV', 'AMT_CREDIT_LIMIT_ACTUAL'])['AMT_BALANCE'].max().reset_index()
        peak_balance['CREDIT_LOAD'] = peak_balance['AMT_BALANCE'] / peak_balance['AMT_CREDIT_LIMIT_ACTUAL']
        credit_card['CreditCard_CREDIT_LOAD'] = customer.map(
            peak_balance.groupby(by=['SK_ID_CURR'])['CREDIT_LOAD'].mean())

        # AVERAGE OF DAYS PAST DUE PER CUSTOMER
        credit_card['CreditCard_AVG_DPD'] = per_customer('SK_DPD', 'mean')

        # RATIO OF CASH VS CARD SWIPES (zero total drawings yield inf/NaN)
        credit_card['DRAWINGS_ATM'] = per_customer('AMT_DRAWINGS_ATM_CURRENT', 'sum')
        credit_card['DRAWINGS_TOTAL'] = per_customer('AMT_DRAWINGS_CURRENT', 'sum')
        credit_card['CreditCard_CASH_CARD_RATIO'] = 100 * (
            credit_card['DRAWINGS_ATM'] / credit_card['DRAWINGS_TOTAL'])

        # AVERAGE DRAWING PER CUSTOMER (zero drawing count yields inf/NaN)
        credit_card['NO_DRAWINGS'] = per_customer('CNT_DRAWINGS_CURRENT', 'sum')
        credit_card['CreditCard_DRAWINGS_RATIO'] = 100 * (
            credit_card['DRAWINGS_TOTAL'] / credit_card['NO_DRAWINGS'])

        # Every row of a customer now carries the same feature values, so
        # keeping the first row per id yields one correct row per customer.
        self.credit_card_features = credit_card[self.credit_card_names +
                                                [self.id_columns[1]]].drop_duplicates(subset=self.id_columns[1])

        return self

    def transform(self, X, **kwargs):
        """Left-join the fitted credit card features onto ``X``.

        Args:
            X: DataFrame containing the column ``id_columns[0]``.

        Returns:
            dict with the single key ``'numerical_features'`` mapping to the
            merged frame restricted to ``credit_card_names``.

        Raises:
            pandas.errors.MergeError: if the join keys are not unique on both
                sides (enforced by ``validate='one_to_one'``).
        """
        X = X.merge(self.credit_card_features,
                    left_on=self.id_columns[0],
                    right_on=self.id_columns[1],
                    how='left',
                    validate='one_to_one')

        return {'numerical_features': X[self.credit_card_names]}

    def load(self, filepath):
        """Restore ``credit_card_features`` from a joblib dump and return self."""
        self.credit_card_features = joblib.load(filepath)
        return self

    def save(self, filepath):
        """Dump ``credit_card_features`` to ``filepath`` with joblib."""
        joblib.dump(self.credit_card_features, filepath)
4 changes: 2 additions & 2 deletions neptune.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
project: ORGANIZATION/home-credit

name: home-credit-default-risk
tags: [solution-2]
tags: [solution-3]

metric:
channel: 'ROC_AUC'
Expand Down Expand Up @@ -33,7 +33,7 @@ parameters:

# Kaggle
kaggle_api: 0
kaggle_message: 'solution-2'
kaggle_message: 'solution-3'

# Data preparation
validation_size: 0.2
Expand Down
4 changes: 2 additions & 2 deletions neptune_random_search.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
project: ORGANIZATION/home-credit

name: home-credit-default-risk
tags: [solution-2]
tags: [solution-3]

metric:
channel: 'ROC_AUC'
Expand Down Expand Up @@ -33,7 +33,7 @@ parameters:

# Kaggle
kaggle_api: 0
kaggle_message: 'solution-2'
kaggle_message: 'solution-3'

# Data preparation
validation_size: 0.2
Expand Down
Loading

0 comments on commit 0c21d67

Please sign in to comment.