-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcomparison_new_old_results.py
executable file
·233 lines (179 loc) · 8.63 KB
/
comparison_new_old_results.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
#!/usr/bin/env python
# encoding: utf-8
import pandas as pd
import numpy as np
import csv
import math
# Get info from lookup files
from question_specific_lookups import universities_lookup
from question_specific_lookups import q4_lookup
from question_specific_lookups import q9_lookup
from question_specific_lookups import eq1_lookup
STOREFILENAME = './output/'
NEW_RESULTS = './output/summary_csvs/'
OLD_RESULTS = './results_from_original_2014_analysis/'
LIST_OF_RESULT_NAMES = [
'Question 1.csv',
'Question 2.csv',
'Question 3.csv',
'Question 4.csv',
'Question 5.csv',
'Question 6.csv',
'Question 7.csv',
'Question 8.csv',
'Question 9.csv',
'Question 10.csv',
'Question 11.csv',
'Extra question 1.csv',
'Extra question 2.csv',
'Extra question 3.csv',
'Extra question 4.csv',
]
def import_csv_to_df(filename):
"""
Imports a csv file into a Pandas dataframe
:params: get an xls file and a sheetname from that file
:return: a df
"""
return pd.read_csv(filename)
def export_to_csv(df, location, filename):
"""
Exports a df to a csv file
:params: a df and a location in which to save it
:return: nothing, saves a csv
"""
return df.to_csv(location + filename)
def create_dict_dfs(location, old):
"""
Creates a dict of dfs from a bunch of csvs, lowercases column names
:params: location of the csvs, a parameter which says whether working with the old or the new analysis results
:return: a dict of dfs
"""
def clean_by_replacing(df, dict_replace):
"""
Replaces the names used in the old analysis with the
names used in the new analysis to allow comparison
:params: a df using old names, and a dictionary imported from a lookup table
:return: a df using new names
"""
for key in dict_replace:
df.replace(key, dict_replace[key], inplace=True)
return df
dict_dfs = {}
for current in LIST_OF_RESULT_NAMES:
# Import
df_current = import_csv_to_df(location + current)
# make column names lowercase
df_current.columns = [x.lower() for x in df_current.columns]
# Go through the cols and if they're object types, convert the
# strings to lowercase
df_current = df_current.apply(lambda x: x.str.lower() if(x.dtype == 'object') else x)
if old == True:
# Deal with Q1 differences
if current == 'Question 1.csv':
# Call a local function that runs a replace function to change the names
df_current = clean_by_replacing(df_current, universities_lookup)
if current == 'Extra question 1.csv':
df_current = clean_by_replacing(df_current, eq1_lookup)
# Because we've replaced multiple old names with the same new one,
# we need to sum over the new names to get the total
df_current = df_current.groupby('unnamed: 0').number.sum().reset_index()
if old == False:
# Drop the percentage column (only in the new data), because I'm worried it might cause confusion
df_current.drop('percentage', 1, inplace=True)
# Deal with Q9 differences
if current == 'Question 4.csv':
df_current = clean_by_replacing(df_current, q4_lookup)
# Because we've replaced multiple old names with the same new one,
# we need to sum over the new names to get the total
df_current = df_current.groupby('unnamed: 0')['question 4: which funder currently provides the majority of your funding'].sum().reset_index()
df_current.sort_values('question 4: which funder currently provides the majority of your funding', ascending=False, inplace = True)
if current == 'Question 9.csv':
df_current = clean_by_replacing(df_current, q9_lookup)
# Store in dict of dfs
dict_dfs[current] = df_current
return dict_dfs
def compare_results(dfs_old, dfs_new):
"""
Compares results of two sets of dataframes
:params: two dicts of dfs containing results for comparison
:return: a dict of dfs showing percentage differences between
columns that have been compared
"""
dfs_summary_comparison = {}
# Go through the dfs containing the new results
for key in dfs_new:
# First of all, ensure that there's a corresponding
# dataframe in the old data, otherwise we're missing something
if dfs_old[key] is None:
print('We are missing a dataframe for ' + str(key))
# The field we want to join on is the first column in both dfs
# Hence grab the first column name of each df
old_df_join_col = dfs_old[key].columns[0]
new_df_join_col = dfs_new[key].columns[0]
# Now need to compare the new and old data. It's quite easy because in the old
# data, the relevant column is always called "number", so we set...
old_data_colname = 'number'
# and in the new data, the relevant column has the word "question"
# in it, so we set...
new_data_colname = [col for col in dfs_new[key].columns if 'question' in col][0]
# Now we create a new dataframe by merging the new and old data (for each question, we're still in the for loop above)
df_compare = pd.merge(dfs_new[key], dfs_old[key], left_on=new_df_join_col, right_on=old_df_join_col, how='outer')
# Make the col names more intuitive
df_compare.rename(columns = {old_data_colname:'old_analysis', new_data_colname:'new_analysis'}, inplace = True)
# Use the first column (called 'unnamed: 0'), which holds the answer options, as the index
df_compare.set_index('unnamed: 0', drop = True, inplace = True)
df_compare['new_analysis_percentage'] = round(100*(df_compare['new_analysis'])/df_compare['new_analysis'].sum(),0)
df_compare['old_analysis_percentage'] = round(100*(df_compare['old_analysis'])/df_compare['old_analysis'].sum(),0)
df_compare['new_minus_old_percentage'] = df_compare['new_analysis_percentage']-df_compare['old_analysis_percentage']
# Store results in dict of dfs
dfs_summary_comparison[key] = df_compare
return dfs_summary_comparison
def responses(dfs_summary_comparison):
question_list = []
new_list = []
old_list = []
for current in LIST_OF_RESULT_NAMES:
question_list.append(current[:-4])
new_list.append(int(dfs_summary_comparison[current]['new_analysis'].sum()))
old_list.append(int(dfs_summary_comparison[current]['old_analysis'].sum()))
responses = {'question': question_list, 'new_analysis': new_list, 'old_analysis': old_list}
df_responses = pd.DataFrame.from_dict(responses)
df_responses.set_index('question', drop = True, inplace = True)
df_responses['percentage_diff'] = round(100*(df_responses['new_analysis']-df_responses['old_analysis'])/df_responses['old_analysis'],0)
return df_responses
def differences(dfs_summary_comparison):
question_list = []
percentages = []
for current in LIST_OF_RESULT_NAMES:
question_list.append(current[:-4])
# Need the abs() to ensure that positive and negative differences don't cancel each other
# and make the results look better than they are
percentages.append(round(abs(dfs_summary_comparison[current]['new_minus_old_percentage']).mean(),1))
totals = {'question': question_list, 'av_percent_difference': percentages}
df_totals = pd.DataFrame.from_dict(totals)
df_totals.set_index('question', drop = True, inplace = True)
return df_totals
def main():
"""
Main function to run program
"""
# Read old and new results into separate dict of dfs
dfs_old = create_dict_dfs(OLD_RESULTS, True)
dfs_new = create_dict_dfs(NEW_RESULTS, False)
dfs_summary_comparison = {}
# Compare results
dfs_summary_comparison = compare_results(dfs_old, dfs_new)
# Save the comparisons to csvs
for key in dfs_summary_comparison:
export_to_csv(dfs_summary_comparison[key], STOREFILENAME + 'comparison_summary_csvs/', key)
# Get total responses per question and compare
df_responses = responses(dfs_summary_comparison)
# Save totals to csv
export_to_csv(df_responses, STOREFILENAME + 'comparison_summary_csvs/', 'responses_per_question.csv')
# Get total responses per question and compare
df_differences = differences(dfs_summary_comparison)
# Save totals to csv
export_to_csv(df_differences, STOREFILENAME + 'comparison_summary_csvs/', 'differences_per_question.csv')
if __name__ == '__main__':
main()