-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathCleaning.py
118 lines (76 loc) · 2.95 KB
/
Cleaning.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import pandas as pd
def create_column_dict(path, n_leading_cols=7):
    """
    Build a dictionary mapping each simple column id to a cleaned-up
    descriptive column name.

    Reads the header row and the first data row of the CSV file at
    `path`. The header row holds the descriptive column names, while
    the first data row holds the simpler unique id of each column.
    The first `n_leading_cols` columns are left out of the dictionary.

    NOTE(review): this docstring previously said the first 6 columns
    are ignored, but the code slices from position 7. The default of 7
    keeps the code's behavior -- confirm which one was intended.

    Each descriptive name is lower-cased and then simplified:
    separators (' - ', ' ', '-', '/') become '_', '=' becomes '_eqls_',
    '%' becomes 'pct', '+' becomes 'plus', and parentheses and periods
    are removed.

    Args:
        path: anything `pandas.read_csv` accepts as its first argument.
        n_leading_cols: number of leading columns to skip (default 7).

    Returns:
        dict whose keys are the values of the first data row (the
        simple ids) and whose values are the cleaned descriptive names.
        If two columns share the same id, the later column wins.
    """
    # Applied in order: ' - ' has to be collapsed before the single
    # ' ' and '-' rules run, otherwise it would turn into '___'.
    str_replace_list = [(' - ', '_'),
                        (' ', '_'),
                        ('-', '_'),
                        ('/', '_'),
                        ('=', '_eqls_'),
                        ('(', ''),
                        (')', ''),
                        ('%', 'pct'),
                        ('+', 'plus'),
                        ('.', '')]

    # nrows=1 -> the header becomes the column labels and the first
    # data row becomes row 0 of the frame.
    columns = pd.read_csv(path, nrows=1)
    id_by_name = columns.iloc[0, n_leading_cols:].to_dict()

    # Build the result in a single pass instead of swapping keys/values
    # first and then rewriting the dictionary while iterating over it.
    columns_dict = {}
    for name, col_id in id_by_name.items():
        cleaned = name.lower()
        for old, new in str_replace_list:
            cleaned = cleaned.replace(old, new)
        columns_dict[col_id] = cleaned
    return columns_dict
def column_na_df(df):
    """
    Generates a dataframe with one row per column of the
    DataFrame parameter (df).

    The result has two columns:
        'column'   - the column label from df
        'na_count' - the count of na's in that column

    Rows follow the column order of df and the result has a fresh
    0..n-1 index. The counts are computed for all columns at once,
    so duplicated column labels in df each get their own row.
    """
    # isna().sum() yields a Series indexed by df's column labels.
    na_counts = df.isna().sum()
    # Name the index 'column' so that reset_index turns it into a
    # regular column with that name next to the counts.
    na_count_df = na_counts.rename_axis('column').reset_index(name='na_count')
    return na_count_df
"""Check for all duplicates in the Dataframe
If there are any in the dataframe , remove them ,
if there are no duplicates ,
print there arent any duplicates in the dataframe"""
def check_duplicates(df):
    """
    Remove fully duplicated rows from df and report what was done.

    If df contains duplicated rows, prints how many there were and
    drops them IN PLACE (the caller's DataFrame is modified, keeping
    the first occurrence of each row). Otherwise prints that there
    are no duplicates.

    Returns the same DataFrame object that was passed in.
    """
    # Compute the duplicate count once; it is needed both for the
    # check and for the message.
    duplicate_count = df.duplicated().sum()
    if duplicate_count:
        print("There were {} duplicates and they have been removed".format(duplicate_count))
        # inplace=True on purpose: callers may rely on df being
        # cleaned even when they ignore the return value.
        df.drop_duplicates(inplace=True)
    else:
        print("You are all clear of duplicates")
    return df
import pandas as pd
"""Iterate through all columns and fill them
with the median for all numerical columns"""
# def filla_null(df):
# for col in df.columns :
# try:
# median = df[col].median()
# df[col] = df[col].fillna(value = median)
# except:
# continue
# df.columns = df.columns.fillna('median')