-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathanalyse_survey.py
421 lines (314 loc) · 15.1 KB
/
analyse_survey.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
#!/usr/bin/env python
# encoding: utf-8
import pandas as pd
import sys
# Get details for plots from look up table
from column_name_renaming import col_shortener
from column_name_renaming import sort_no_further_analysis
from column_name_renaming import yes_no_analysis
from column_name_renaming import add_an_other_category
from column_name_renaming import scale_analysis
from column_name_renaming import worded_scale_analysis
from bivariate_instructions import which_by_which
DATAFILELOC = "./data/"
DATAFILENAME = "SoftwareSurveyInterimData.csv"
CSVSTORE = "./output_csv/"
BIVARIATESTORE = "./output_csv/bivariate/"
def print_df(df):
    """
    Debug helper: print a dataframe in full, with row/column truncation disabled.

    Use this function BEFORE the df has been converted to dict.
    :param df: any dataframe
    :return: the same dataframe, unchanged
    """
    show_everything = pd.option_context('display.max_rows', None, 'display.max_columns', None)
    with show_everything:
        print(df)
    return df
def print_dict(df):
    """
    Debug helper: print a dict of dataframes one level at a time.

    Use this function AFTER the df has been converted to dict.
    :param df: a dict of dataframes (or any nested mapping)
    :return: the same dict, unchanged
    """
    for outer_key in df:
        print('1: ', outer_key)
        inner = df[outer_key]
        for inner_key in inner:
            print('2: ', inner_key)
            print('3: ', inner[inner_key])
        print('\n')
    return df
def import_csv_to_df(location, filename):
    """
    Imports a csv file into a Pandas dataframe.

    :param location: directory containing the file (including trailing separator)
    :param filename: name of the csv file to read
    :return: a df, using the file's first row as the header
    """
    # header=0 makes the first csv row the column names
    return pd.read_csv(location + filename, header=0)
def export_to_csv(df, location, filename, index_write):
    """
    Exports a df to a csv file.
    :params: a df, a location in which to save it, a filename (without extension)
             and whether to write the index
    :return: nothing, saves a csv
    """
    target_path = location + filename + '.csv'
    return df.to_csv(target_path, index=index_write)
def remove_columns(df):
    """
    Removes a specified column from the data. In this case we remove the RecordedDate column.
    :param df
    :return: the main df, but without the specified column
    """
    # Drop in place so the caller's dataframe is modified directly
    df.drop(columns=['RecordedDate'], inplace=True)
    return df
def remove_indices(df):
    """
    Removes the 2nd and 3rd line and all unfinished surveys from the data set.
    :param df
    :return: the main df, but without the specified rows
    """
    # Drop the first two remaining rows, one at a time (Qualtrics metadata rows)
    for _ in range(2):
        df.drop(df.index[0], inplace=True)
    # Keep only the finished surveys; the column holds the string 'False' for unfinished ones
    df = df[df.Finished != 'False']
    return df
def clean_col_names(df):
    """
    Takes a dict from a look up file and uses it to rename the columns
    to something shorter and tidier
    :param df: the main df
    :return: the main df with new col names
    """
    # index=str coerces the row labels to strings while the columns are renamed
    renamed_df = df.rename(columns=col_shortener, index=str)
    return renamed_df
def shorten_faculties(df):
    """
    There's no need to have "Faculty of" before each faculty name in the output data. It just makes it clunky
    so removing it here.
    :param df: the main df
    :return: the main df with "Faculty of " removed from "faculty" col
    """
    # Each pair is (pattern to find, replacement); applied in order, in place
    substitutions = [
        (r'Faculty of ', r''),
        (r'Strathclyde Business School', r'Business School'),
    ]
    for pattern, replacement in substitutions:
        df['faculty'].replace(to_replace=pattern, value=replacement, regex=True, inplace=True)
    return df
def replace_separator(df):
    """
    The Qualtrics separator of multiple choice answers is a comma. This needs to be replaced by a semicolon
    to get the correct counts of the individual answers.
    : param df: the main df
    : return: the main df with ',' replaced by ';' in the funders column
    """
    funders_col = 'funders'
    df[funders_col] = df[funders_col].str.replace(',', ';')
    return df
def get_counts(df):
    """
    Get summary dataframes. Where needed split up the multiple answer responses so that they fall into simple bins.
    :param df: the main df
    :return: dict of dfs holding summaries on the answers to each question
    """
    # Initialise a dict into which I shall store the results dfs
    summary_dfs = {}
    # Go through each col, get the counts of each question, calculate a percentage and then store as a result df
    for current_col in df.columns:
        # value_counts gives one row per distinct answer; sort=False keeps encounter order
        df_counts = pd.DataFrame(data = (df[current_col].value_counts(sort=False)), columns = [current_col])
        # Shift the index into a new column needed for the next bit
        df_counts['answers'] = df_counts.index
        # Multi-choice questions provide multiple answers separated by semicolon. Need to separate these up and count
        # them individually. The method I use (with Stack etc) below fails on answers that are numeric, so I filter them
        # out. Obviously, anything with a semicolon in it is not numeric!
        if df_counts['answers'].dtype == 'object':
            # split each combined answer on ';', stack the pieces into rows, then sum the counts per individual answer
            df_counts = (df_counts.set_index(current_col)['answers'].str.split(';', expand=True).stack().reset_index(name='answers').groupby('answers', as_index=False)[current_col].sum())
            # restore 'answers' as the index after the groupby turned it into a column
            df_counts.set_index('answers', inplace=True)
        # Percentage of total responses for this question, rounded to whole numbers
        df_counts['percentage'] = round(100 * df_counts[current_col] / df_counts[current_col].sum(), 0)
        # Save as dict of dfs
        summary_dfs[current_col] = df_counts
    return summary_dfs
def clean_funders(summary_dfs):
    """
    Removes the 'Other:' answer from the question and corrects the counts.
    :param summary_dfs: dict of dfs holding summaries on the answers to each question
    :return: dict of dfs holding summaries on the answers to each question - but with the funder question cleaned.
    """
    funders_key = 'funders'
    df_funders = summary_dfs[funders_key]
    # Strip the free-text "Other:" option from the summary, in place
    df_funders.drop(index='Other:', inplace=True)
    # Removing an option changes the denominator, so the percentages must be recomputed
    df_funders['percentage'] = (100 * df_funders[funders_key] / df_funders[funders_key].sum()).round(0)
    return summary_dfs
def change_lows_to_other(summary_dfs):
    """
    Questions with an "other" response can generate lots of low value responses. Convert all answers that gain
    5 or fewer responses into an "other" category to make things cleaner.
    :param summary_dfs: dict of dfs holding summaries on the answers to each question
    :return: dict of dfs holding summaries on the answers to each question - but with an other response used to mop up
             answers with low response numbers
    """
    for key in add_an_other_category:
        df_temp = summary_dfs[key]
        # New index makes the current index a col that we can work with
        df_temp.reset_index(inplace=True)
        # Make a Boolean mask of all the rows that have 5 or fewer responses (le = <=) and use it
        # to change the category to 'other'
        mask = df_temp[key].le(5)
        df_temp.loc[mask, 'answers'] = 'others_cummulative'
        # Collapse all the 'other' rows into a single row, sum the result and convert back to a df
        df_temp = pd.DataFrame(df_temp.groupby('answers')[key].sum())
        df_temp.columns = [key]
        # Add a percentage col, because the last one was stripped by the groupby operation
        df_temp['percentage'] = round(100 * df_temp[key] / df_temp[key].sum(), 0)
        # Replace the original df in the dict of dfs
        summary_dfs[key] = df_temp
    return summary_dfs
def find_number_responses(summary_dfs, df):
    """
    Finds the number of people who responded to each question and saves it as a csv
    :param summary_dfs: dict of dfs holding summaries on the answers to each question
    :param df: the main dataframe
    :return: nothing
    """
    # Start from the per-question count totals
    response_list = {key: summary_dfs[key][key].sum() for key in summary_dfs}
    # This approach doesn't work for multiple choice questions (you get over-counting for anyone that added more than
    # one response). For these questions, to get the response number we must go back to the original df
    multi_questions = ['funders']
    for current_question in multi_questions:
        answered = df.dropna(axis=0, subset=[current_question])
        # Write the actual number of responses over the current number in the response list
        response_list[current_question] = len(answered)
    df_responses = pd.DataFrame(response_list.items(), columns=['answers', 'responses'])
    export_to_csv(df_responses, CSVSTORE, 'responses', False)
    return
def sort_and_save(summary_dfs):
    """
    Takes the summary dfs that merely need sorted. Uses info from the sort_no_further_analysis var to know which dfs to
    process
    :param summary_dfs: dict of dfs holding summaries on the answers to each question
    :return: a better ordered dict of dfs holding summaries on the answers to each question
    """
    for key in sort_no_further_analysis:
        # Highest response counts first, then replace the original df in the dict of dfs
        summary_dfs[key] = summary_dfs[key].sort_values(by=key, ascending=False)
    return summary_dfs
def yes_and_no(summary_dfs):
    """
    Takes summary dfs of yes-and-no questions, sorts them yes first, then no. Uses the yes_no_analysis var to work out
    which dfs to process
    :param summary_dfs: dict of dfs holding summaries on the answers to each question
    :return: a better ordered dict of dfs holding summaries on the answers to each question
    """
    for key in yes_no_analysis:
        # Sorting the 'yes'/'no' index in descending order puts 'yes' first,
        # which is the display order we want; replace the df in the dict
        summary_dfs[key] = summary_dfs[key].sort_index(ascending=False)
    return summary_dfs
def scale_question_analysis(summary_dfs):
    """
    To make plotting easy, this sorts the questions with a 1 to 5 answer scale, and saves them back into the dict of
    dfs.
    :param summary_dfs: dict of dfs holding summaries on the answers to each question
    :return: a better ordered dict of dfs holding summaries on the answers to each question
    """
    for key in scale_analysis:
        # Ascending index order puts the scale answers 1..5 in their natural order;
        # replace the original df in the dict of dfs
        summary_dfs[key] = summary_dfs[key].sort_index(ascending=True)
    return summary_dfs
def scale_worded_questions(summary_dfs):
    """
    Some questions have wordy answers that I want to present in a particular order. This puts them in that order using
    an ordered categorical, then saves the re-ordered dfs back into the summary dfs.
    :param summary_dfs: dict of dfs holding summaries on the answers to each question
    :return: a better ordered dict of dfs holding summaries on the answers to each question
    """
    # Desired display order for each group of questions
    confidence_order = ['Not heard of it', 'Not confident', 'Confident', 'Very confident']
    funding_order = ['Yes',
                     'No (but we DID expect to write software as part of the project)',
                     'No (we did NOT expect to write software as part of the project)']
    suitability_order = ['Perfect', 'Suitable', 'Unsuitable']
    answer_orders = {
        'version_control': confidence_order,
        'continuous_integration': confidence_order,
        'unit_regression_testing': confidence_order,
        'funds_for_development': funding_order,
        'hire_full_time_developer': suitability_order,
        'hire_rse': suitability_order,
    }
    for key in worded_scale_analysis:
        df_temp = summary_dfs[key]
        # New index makes the current index a col that we can work with
        df_temp.reset_index(inplace=True)
        if key in answer_orders:
            # An ordered categorical lets us sort a column by an arbitrary, pre-defined list
            df_temp['answers'] = pd.Categorical(df_temp['answers'],
                                                categories=answer_orders[key], ordered=True)
            df_temp.sort_values('answers', inplace=True)
        # Now that the hard work's done, set the answers col back to be the index and save back into the dict
        # of dfs
        df_temp.set_index('answers', inplace=True)
        summary_dfs[key] = df_temp
    return summary_dfs
def bivariate_analysis(df, summary_dfs):
    """
    Conducts bivariate analysis as instructed by the imported which_by_which
    dict. Each key in the dict is one question from the survey which will be segmented
    by the question(s) held in the value(s) of the dict.
    :param df: the main df
    :param summary_dfs: dict of dfs holding summaries on the answers to each question
    :return: nothing, saves one csv per (segment answer, question) pair
    """
    counter = 0
    for denominator in which_by_which:
        # The distinct answers to the segmenting question define the segments
        differentiator = list(summary_dfs[denominator].index)
        counter += 1
        for current_diff in differentiator:
            # Restrict the survey data to respondents who gave this particular answer
            temp_df=df[df[denominator]==current_diff]
            for current_question in which_by_which[denominator]:
                # Count the answers to the segmented question within this segment
                df_counts = pd.DataFrame(data=(temp_df[current_question].value_counts(sort=False)), columns=[current_question])
                df_counts['answers'] = df_counts.index
                # Multi-choice questions provide multiple answers separated by semicolon. Need to separate these up and count
                # them individually. The method I use (with Stack etc) below fails on answers that are numeric, so I filter them
                # out. Obviously, anything with a semicolon in it is not numeric!
                # Needs an "and length isn't 0" bit because the function doesn't work for zero-length dfs
                if df_counts['answers'].dtype == 'object' and len(df_counts) != 0:
                    df_counts = (
                        df_counts.set_index(current_question)['answers'].str.split(';', expand=True).stack().reset_index(
                            name='answers').groupby('answers', as_index=False)[current_question].sum())
                    # restore 'answers' as the index after the groupby turned it into a column
                    df_counts.set_index('answers', inplace=True)
                df_counts['percentage'] = round(100 * df_counts[current_question] / df_counts[current_question].sum(), 0)
                # "_sep_" acts as a field separator in the filename so downstream tooling can parse it
                filename = str(counter) + "_sep_" + str(current_diff) + "_sep_" + str(current_question) + '_' + str(denominator)
                export_to_csv(df_counts, BIVARIATESTORE, filename, True)
    return
def write_out_summaries(summary_dfs):
    """
    Write the summary dfs out to csvs
    :param summary_dfs: dict of dfs holding summaries on the answers to each question
    :return: nothing
    """
    # One csv per question, named after the question key, index included
    for question_name, summary in summary_dfs.items():
        export_to_csv(summary, CSVSTORE, question_name, True)
    return
def main():
    """
    Main function to run program
    """
    # Load and tidy the raw survey export
    survey_df = import_csv_to_df(DATAFILELOC, DATAFILENAME)
    survey_df = remove_columns(survey_df)
    survey_df = remove_indices(survey_df)
    survey_df = clean_col_names(survey_df)
    survey_df = shorten_faculties(survey_df)
    survey_df = replace_separator(survey_df)
    # Summarise the answers to each question
    summary_dfs = get_counts(survey_df)
    # summary_dfs = clean_funders(summary_dfs)
    # summary_dfs = change_lows_to_other(summary_dfs)
    find_number_responses(summary_dfs, survey_df)
    # Prepare data for later graphing
    summary_dfs = sort_and_save(summary_dfs)
    summary_dfs = yes_and_no(summary_dfs)
    summary_dfs = scale_question_analysis(summary_dfs)
    summary_dfs = scale_worded_questions(summary_dfs)
    write_out_summaries(summary_dfs)
    # Conduct bivariate analysis
    bivariate_analysis(survey_df, summary_dfs)


if __name__ == '__main__':
    main()