def outlier_removal(loan_data,
                    columns=('ApplicantIncome', 'CoapplicantIncome', 'LoanAmount'),
                    quantile=0.95):
    """Drop, in place, every row that is an upper outlier in any of ``columns``.

    A row is an outlier when its value in at least one of ``columns`` is
    strictly greater than that column's ``quantile`` (computed on the frame
    as passed in, before any row is removed).  NaN values never compare
    greater than the threshold, so rows with missing values are kept.

    Parameters
    ----------
    loan_data : pandas.DataFrame
        Frame to clean.  It is modified in place.
    columns : sequence of str, optional
        Numeric columns to check.
    quantile : float, optional
        Quantile used as the upper cut-off for each column.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with outlier rows removed.

    Notes
    -----
    Rows are removed by index label, so if the index contains duplicate
    labels every row sharing an outlier's label is removed as well.
    """
    columns = list(columns)
    thresholds = loan_data[columns].quantile(quantile)

    # Boolean mask instead of label lists: the previous version fed index
    # *labels* back into ``loan_data.index[...]`` as *positions*, which only
    # worked for a default 0..n-1 index and failed when nothing exceeded
    # the thresholds.
    is_outlier = (loan_data[columns] > thresholds).any(axis=1).values

    loan_data.drop(loan_data.index[is_outlier], inplace=True)
    return loan_data


def data_cleaning(data):
    """Impute missing values and split the loan data into train/test sets.

    ``LoanAmount`` is filled with its mean; each categorical column is filled
    with its most frequent value.  ``data`` is modified in place.

    Parameters
    ----------
    data : pandas.DataFrame
        Loan data containing the imputed columns and ``Loan_Status``.

    Returns
    -------
    tuple
        ``(X, y, X_train, X_test, y_train, y_test)`` where ``X`` is ``data``
        without ``Loan_Status``, ``y`` is the ``Loan_Status`` column, and the
        split uses ``test_size=0.25`` with ``random_state=9``.
    """
    import numpy as np

    # Kept from the original implementation; the split itself is already
    # made deterministic by ``random_state`` below.
    np.random.seed(9)

    data['LoanAmount'] = data['LoanAmount'].fillna(data['LoanAmount'].mean())

    cat_cols = ['Gender', 'Married', 'Dependents', 'Self_Employed',
                'Loan_Amount_Term', 'Credit_History']
    for col in cat_cols:
        modes = data[col].mode()
        if modes.empty:
            # Column is entirely NaN: there is no mode to impute with.
            continue
        # ``mode()`` returns a Series; passing it straight to ``fillna``
        # aligns on index and fills almost nothing.  Use the scalar value.
        data[col] = data[col].fillna(modes.iloc[0])

    X = data.drop('Loan_Status', axis=1)
    y = data['Loan_Status']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=9)
    return X, y, X_train, X_test, y_train, y_test
def data_cleaning_2(X_train, X_test, y_train, y_test):
    """Square-root transform numeric features and one-hot encode categoricals.

    The three numeric columns are replaced by their square roots.  The
    categorical columns are expanded with ``pd.get_dummies`` and one
    reference level per feature is dropped.  The test set is aligned to the
    columns produced for the training set, so both returned frames always
    have identical columns even when a category is absent from one split.
    The input frames are not modified.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature frames containing the numeric and categorical columns
        listed below.
    y_train, y_test
        Targets; returned unchanged.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` with transformed features.
    """
    num_cols = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount']
    cat_cols = ['Gender', 'Married', 'Dependents', 'Education',
                'Self_Employed', 'Loan_Amount_Term', 'Credit_History',
                'Property_Area']
    # One dummy per encoded feature is dropped as the reference level.
    reference_levels = ['Dependents_0', 'Gender_Female', 'Education_Graduate',
                        'Self_Employed_No', 'Married_No',
                        'Property_Area_Rural']

    # np.sqrt on a DataFrame returns a new frame, so the caller's data is
    # left untouched (no chained assignment into a slice).
    num_df_train = np.sqrt(X_train[num_cols])
    num_df_test = np.sqrt(X_test[num_cols])

    cat_df_train = pd.get_dummies(X_train[cat_cols])
    cat_df_test = pd.get_dummies(X_test[cat_cols])

    # Categories missing from the test split become all-zero columns and
    # categories unseen in training are discarded.
    cat_df_test = cat_df_test.reindex(columns=cat_df_train.columns,
                                      fill_value=0)

    # errors='ignore': a reference level may be absent from a small split.
    cat_df_train = cat_df_train.drop(reference_levels, axis=1,
                                     errors='ignore')
    cat_df_test = cat_df_test.drop(reference_levels, axis=1, errors='ignore')

    X_train = pd.concat([num_df_train, cat_df_train], axis=1)
    X_test = pd.concat([num_df_test, cat_df_test], axis=1)
    return X_train, X_test, y_train, y_test
def logistic_regression(X_train, X_test, y_train, y_test):
    """Fit a logistic regression on standardized features and score it.

    The three numeric columns are standardized with a ``StandardScaler``
    fitted on the training split only; the same fitted transform is applied
    to the test split.  A default ``LogisticRegression`` is then trained and
    evaluated.  The input frames are not modified.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature frames containing the columns to scale.
    y_train, y_test
        Training and test targets.

    Returns
    -------
    The confusion matrix of the test targets against the model's
    predictions on ``X_test``.
    """
    import numpy as np

    scaled_cols = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount']
    # Kept from the original implementation.
    # NOTE(review): presumably for solver reproducibility — confirm whether
    # passing random_state to LogisticRegression is preferred.
    np.random.seed(9)

    # Work on copies so the caller's frames are left untouched.
    X_train = X_train.copy()
    X_test = X_test.copy()

    # Fit on the training split only and actually apply the transform; the
    # previous version discarded the transformed values and then refit the
    # scaler on the test split, so the model saw unscaled features.
    scaler = StandardScaler()
    X_train[scaled_cols] = scaler.fit_transform(X_train[scaled_cols])
    X_test[scaled_cols] = scaler.transform(X_test[scaled_cols])

    lr = LogisticRegression()
    lr.fit(X_train, y_train)
    y_pred = lr.predict(X_test)
    return confusion_matrix(y_test, y_pred)
b/q03_logistic_regression/tests/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..e997bc1 Binary files /dev/null and b/q03_logistic_regression/tests/__pycache__/__init__.cpython-36.pyc differ diff --git a/q03_logistic_regression/tests/__pycache__/test_q03_logistic_regression.cpython-36.pyc b/q03_logistic_regression/tests/__pycache__/test_q03_logistic_regression.cpython-36.pyc new file mode 100644 index 0000000..ddd1eba Binary files /dev/null and b/q03_logistic_regression/tests/__pycache__/test_q03_logistic_regression.cpython-36.pyc differ