-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy path01_PRE_PROCESS_RAW_SURVEY_DATA.py
336 lines (289 loc) · 19.1 KB
/
01_PRE_PROCESS_RAW_SURVEY_DATA.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
# -*- coding: utf-8 -*-
#
# a simple Python script that,
#
# * reformats the raw TAB-separated value file from Google Forms into a final
# data input (removes sensitive columns, adds new summary columns, fixes
# typos in country names, etc.)
# * generates an output format that covers free-form text from a manually
# curated list of responses.
import os
import sys
import anvio.utils as u
# Make sure the raw Google Forms export is here before doing anything else.
# Without it there is nothing to (re)generate, so the script stops: quietly
# (exit status 0) when the formatted `mentorship.tsv` already exists, and with
# an error status when neither file is present.
if not os.path.exists('mentorship-RAW.tsv'):
    if os.path.exists('mentorship.tsv'):
        print("You don't seem to have the raw input file available in this directory. But you do have "
              "the formatted output. Well. GOOD. All OK, but there will be no data for wisdom.")
        sys.exit()
    else:
        # the file name is spelled exactly as the code looks for it, since the
        # difference matters on case-sensitive file systems.
        print("You don't seem to have either `mentorship-RAW.tsv`, nor the `mentorship.tsv` (which is "
              "generated from the `mentorship-RAW.tsv` by this program) in this directory. Something "
              "is wrong here but you can always reach out to meren at uchicago.edu.")
        sys.exit(-1)

# read the raw data. the nested loops further down treat `m` as a dictionary
# of dictionaries: one entry per response, each mapping a question to its answer.
m = u.get_TAB_delimited_file_as_dictionary('mentorship-RAW.tsv')
# Google Forms uses the full question text as the column header. Each pair
# below gives the single-word column name we use for our sanity, followed by
# the question it stands for. The question text is used as a lookup key, so it
# has to stay character-for-character identical to the header in the raw file
# (typos included).
_COLUMN_NAME_AND_QUESTION = (
    ("mentee_current",
     'You are currently'),
    ("mentee_then",
     'You worked (or are still working) with the mentor you have in mind as:'),
    ("mentor_then",
     'Your mentor was (or is):'),
    ("discipline",
     'If you had to chose one, which one of the following categories would best describe your relationship with science as an ECR or when you were an ECR?'),
    ("expertise",
     'How would you describe your ECR expertise? (e.g., Microbial Ecologist, Biochemist, Computer Scientist, Civil Engineer, etc.)'),
    ("mentee_gender",
     'Please chose your gender (if you are no longer a trainee, respond with your gender at the time you were a trainee). The purpose of this question is to generate enough data, if possible, to not completely miss issues related or specific to gender.'),
    ("mentor_gender",
     "Please chose, to the best of your knowledge, your mentor's gender (similar to the previous one, this question is here in an attempt to generate enough data to not completely miss mentorship patterns related or specific to gender)."),
    ("country",
     'Where were you (or still are) working as an ECR? Please enter a country name (please avoid using acronyms and spell out the full country name in English (i.g., United States, United Kingdom, Mexico, etc)).'),
    ("state",
     'If you are/were in the United States, which state? (please avoid using acronyms (i.e., Illinois, Massachusetts, etc)). Leave this blank if you were/are not working in the United Sates OR if you think this information, combined with your other responses, can identify you.'),
    ("mentee_minority",
     'Except your gender, were you / are you a member of a minority group in your workplace based on your ethnic background and/or religion?'),
    ("mentor_minority",
     'Except their gender, and to the best of your knowledge, was/is your mentor a member of a minority group in the workplace based on their ethnic background and/or religion?'),
    ("experience_with_mentor",
     'Considering its entirety and its influence on your wellbeing and career, how would you characterize your experience with your mentor?'),
    ("mentor_seen_by_colleagues",
     'Do you think your mentor was (or is) considered a good scientist by their colleagues?'),
    ("mentor_num_trainees",
     'How many trainees your mentor was (or is) responsible for during the time you worked with them as an ECR (or currently, if you are still advised by this mentor)?'),
    ("mentor_mentee_meeting_time",
     'On average, how much time your mentor was (or is) able to dedicate to you for 1-on-1 interactions outside of group meetings?'),
    ("mentee_experience_was_common",
     'Was/is your experience with your mentor comparable to the experience of the other trainees who worked/are working with them?'),
    ("mentee_influence_on_project",
     'To what extent did/does your mentor encourage you to define your project or influence its trajectory?'),
    ("mentor_awareness_on_shortcomings",
     'Do you think your mentor is/was aware of their shortcomings in mentoring trainees?'),
    ("mentor_awareness_on_strengths",
     'Do you think your mentor is/was aware of their strengths in mentoring trainees?'),
    ("mentor_asking_feedback",
     'How frequently did/does your mentor ask for feedback from their trainees on their mentorship style?'),
    ("mentor_makes_easy_mentees_to_raise_concerns",
     "Do you think your mentor made/makes it easy for their trainees to raise their concerns about the mentorship they're receiving?"),
    ("code_of_conduct",
     'Was/is there a public document defining the code of conduct, expectations, or the group culture provided by your mentor?'),
    ("mentor_biggest_shortcomings",
     'What is/was the BIGGEST SHORTCOMING of their mentorship style? Please consider describing how did (or does) this shortcoming influence your work, your wellbeing, or your career (you can list multiple, but please list each shortcoming and its influence as a separate paragraph).'),
    ("mentor_biggest_strengths",
     'What is/was the MOST ADMIRABLE trait of their mentorship style? Please consider describing how this trait positively influenced (or influences) your daily work, your wellbeing, or your career (you can list multiple, but please list each shortcoming and its influence as a separate paragraph).'),
    ("mentee_advice_to_mentor",
     'Based on your experience, what advice would you give to your mentor so they could do better?'),
    ("mentee_advice_to_mentees",
     'Based on your experience, what advice would you give to those who are getting ready to pursue a career in science and are looking for mentors?'),
)

# question text -> single-word column name
keys = {question: column_name for column_name, question in _COLUMN_NAME_AND_QUESTION}
# Columns of the output file generated from the raw input, in the order they
# are reported. Only the columns named here are passed on as `headers` when the
# table is stored; `expertise`, `state`, and the free-form text answers are
# not in the list.
headers = [
    'timestamp',
    'mentee_current',
    'mentee_current_summary',
    'mentee_then',
    'mentor_then',
    'discipline',
    'mentee_gender',
    'mentor_gender',
    'country',
    'mentee_minority',
    'mentor_minority',
    'experience_with_mentor',
    'mentor_seen_by_colleagues',
    'mentor_num_trainees',
    'mentor_mentee_meeting_time',
    'mentee_experience_was_common',
    'mentee_experience_was_common_summary',
    'mentee_influence_on_project',
    'mentor_awareness_on_shortcomings',
    'mentor_awareness_on_shortcomings_summary',
    'mentor_awareness_on_strengths',
    'mentor_awareness_on_strengths_summary',
    'mentor_asking_feedback',
    'mentor_has_ever_asked_feedback',
    'mentor_makes_easy_mentees_to_raise_concerns',
    'mentor_makes_easy_mentees_to_raise_concerns_summary',
    'code_of_conduct',
]
# Build `d`, the cleaned-up version of the raw table `m`: every verbose
# question is renamed through `keys`, complex answers are summarized into
# simpler ones (some into extra `*_summary` columns), and typos (i.e., those
# in country names) are fixed. The result is stored as `mentorship.tsv`.

# Questions answered on a "1"-"5" scale that get an extra summary column:
#   column -> (summary column, label for "1"/"2", label for "4"/"5", label for anything else)
_SCALE_SUMMARIES = {
    'mentee_experience_was_common': (
        'mentee_experience_was_common_summary', "Specific to few", "Common to most", None),
    'mentor_awareness_on_shortcomings': (
        'mentor_awareness_on_shortcomings_summary', "Aware", "Not Aware", "Neutral"),
    'mentor_awareness_on_strengths': (
        'mentor_awareness_on_strengths_summary', "Aware", "Not Aware", "Neutral"),
    'mentor_makes_easy_mentees_to_raise_concerns': (
        'mentor_makes_easy_mentees_to_raise_concerns_summary', "Made it difficult", "Made it easy", "Neutral"),
}

# Broad category for the mentee's current position (`mentee_current_summary`).
_CAREER_STAGE_SUMMARIES = {
    "an Associate Professor (or equivalent)": "Academic (Non-ECR)",
    "a Graduate Student": "Academic (ECR)",
    "a Postdoctoral Researcher": "Academic (ECR)",
    "an Assistant Professor (or equivalent)": "Academic (Non-ECR)",
    "a Professor (or equivalent)": "Academic (Non-ECR)",
    "working in academia as a non-ECR": "Academic (Non-ECR)",
    "other / prefer not to say": "Non-Academic",
    "working in industry": "Non-Academic",
}

# Short labels for the position columns (`mentor_then`, `mentee_current`,
# `mentee_then`). Answers that are not listed are kept as they are.
_CAREER_STAGE_LABELS = {
    "an Associate Professor (or equivalent)": "Associate Prof",
    "a Graduate Student": "Grad student",
    "a Postdoctoral Researcher": "Postdoc",
    "an Assistant Professor (or equivalent)": "Assistant Prof",
    "a Professor (or equivalent)": "Full Prof",
    "working in academia as a non-ECR": "Non-ECR academic",
    "other / prefer not to say": "Other",
}

# "15-20" and "More than 20" are deliberately merged into a single ">15" bin.
_NUM_TRAINEES_LABELS = {
    "Less than 5": "<5",
    "More than 20": ">15",
    "15-20": ">15",
}

# (lower-case fragment, canonical country name), checked in this order.
_COUNTRY_FRAGMENTS = (
    ('india', "India"),
    ('netherl', "The Netherlands"),
    ('canada', "Canada"),
    ('germ', "Germany"),
    ('gernany', "Germany"),
    ('china', "China"),
    ('korea', "South Korea"),
)


def _summarize_scale(answer, low, high, otherwise):
    """Collapse a "1"-"5" answer into `low` (1-2), `high` (4-5) or `otherwise`."""
    if answer in ("1", "2"):
        return low
    if answer in ("4", "5"):
        return high
    return otherwise


def _normalize_colleague_opinion(answer):
    """Shorten the answer to the 'considered a good scientist' question."""
    # the exact 'no comment' option has to be tested first since it would not
    # match either of the prefixes below.
    if answer == "I don't know / Prefer not to comment":
        return "No comment"
    if answer.startswith('No'):
        return "Not considered good"
    if answer.startswith('Yes'):
        return "Considered good"
    return answer


def _normalize_discipline(answer):
    """Return the title-cased text in front of the first '(' of `answer`.

    An answer without any parenthesis is kept in full (slicing at the position
    returned by `str.find` would chop off its last two characters).
    """
    return answer.split('(')[0].strip().title()


def _normalize_country(answer):
    """Map a free-form country answer to a canonical country name.

    Continents are reported as "[none entered]"; anything unrecognized is
    returned stripped and title-cased.
    """
    lowered = answer.lower()
    stripped = lowered.strip()

    if 'states' in lowered or 'oregon' in lowered or stripped == 'us':
        return "United States"
    if 'kingdom' in lowered or stripped == 'uk':
        return "United Kingdom"

    for fragment, country in _COUNTRY_FRAGMENTS:
        if fragment in lowered:
            return country

    # continents are matched at the start of a word, so that 'Asia' or
    # 'Western Europe' are caught but 'Malaysia' is not.
    words = ''.join(ch if ch.isalpha() else ' ' for ch in lowered).split()
    if any(word.startswith(('asia', 'europe')) for word in words):
        return "[none entered]"

    return answer.strip().title()


d = {}
for timestamp, answers in m.items():
    row = d[timestamp] = {}

    for question, answer in answers.items():
        # a question that is not described in `keys` raises a KeyError here.
        column = keys[question]

        # unless one of the rules below says otherwise, the answer is kept as is
        row[column] = answer

        if column in _SCALE_SUMMARIES:
            summary_column, low, high, otherwise = _SCALE_SUMMARIES[column]
            row[summary_column] = _summarize_scale(answer, low, high, otherwise)

        if column == 'mentor_seen_by_colleagues':
            row[column] = _normalize_colleague_opinion(answer)
        elif column == 'discipline':
            row[column] = _normalize_discipline(answer)
        elif column == 'mentor_asking_feedback':
            row['mentor_has_ever_asked_feedback'] = "No" if answer == "Never" else "Yes"
            if answer == "More frequently":
                row[column] = "Frequently"
        elif column == 'mentor_num_trainees':
            row[column] = _NUM_TRAINEES_LABELS.get(answer, answer)
        elif column == 'country':
            row[column] = _normalize_country(answer)
        elif column in ('mentee_gender', 'mentor_gender'):
            if 'queer' in answer.lower():
                row[column] = "Queer / Non-conforming"

        # the summary is derived from the original answer, which is then
        # replaced with its short label.
        if column == 'mentee_current':
            row['mentee_current_summary'] = _CAREER_STAGE_SUMMARIES.get(answer, answer)
        if column in ('mentor_then', 'mentee_current', 'mentee_then'):
            row[column] = _CAREER_STAGE_LABELS.get(answer, answer)

# report the kraken!
u.store_dict_as_TAB_delimited_file(d, 'mentorship.tsv', headers=headers)
# the rest of this code is to report statements in a reproducible fashion.

# free-form text columns to report, in the order their sections are written.
wisdom_keys = ["mentor_biggest_shortcomings", "mentor_biggest_strengths", "mentee_advice_to_mentor", "mentee_advice_to_mentees"]

# introductory paragraph of each section (may contain inline HTML). the
# paragraph for strengths talks about 'this aspect' rather than 'this
# shortcoming', which was a leftover from the paragraph above it.
wisdom_questions = {
    "mentor_biggest_shortcomings": "In this section you will find the words of ECRs to describe '<b>the BIGGEST SHORTCOMING of the mentorship they have received</b>' from a mentor of theirs, considering how did this shortcoming influenced their work, wellbeing, and/or career. You will see that even mentees who overall had a positive experience with their mentors suffered from some aspects of the mentorship they have received.",
    "mentor_biggest_strengths": "In this section you will find the words of ECRs to describe '<b>the MOST ADMIRABLE aspect of the mentorship they have received</b>' from a mentor of theirs, considering how did this aspect influenced their work, wellbeing, and/or career. You will see that even mentees who had a negative experience with their mentor had benefited from some aspects of the mentorship style they have received.",
    "mentee_advice_to_mentor": 'In this section you will find the words of ECRs to describe the advice they would have given to their mentor so they could do better.',
    "mentee_advice_to_mentees": 'In this section you will find the words of ECRs to describe what advice they would have given to those who are getting ready to pursue a career in science and are looking for mentors.',
}

# title of each section.
wisdom_subtitles = {
    "mentor_biggest_shortcomings": 'Mentees report on biggest shortcomings of their mentors',
    "mentor_biggest_strengths": 'Mentees report on biggest strengths of their mentors',
    "mentee_advice_to_mentor": 'Mentees advise their mentors to do better',
    "mentee_advice_to_mentees": 'Mentees advise future mentees',
}
# identifiers (one per line) of the remarks that should make it to the final
# report. this information is read from a file we put together AFTER going
# through the `mentorship_wisdom_all.txt` this script generates further down
# and CURATING it by hand, so the file will not be there on the very first
# run, in which case nothing is marked to be kept.
if os.path.exists('mentorship_wisdom_keys_to_keep.txt'):
    # the `with` block makes sure the file is closed once it is read, and
    # blank lines are skipped instead of ending up as an empty identifier.
    with open('mentorship_wisdom_keys_to_keep.txt', encoding='utf-8') as keys_to_keep_file:
        mentorship_wisdom_keys_to_keep = {line.strip() for line in keys_to_keep_file if line.strip()}
else:
    mentorship_wisdom_keys_to_keep = set()
# Dump every non-empty remark into `mentorship_wisdom_all.txt`, one remark per
# line with three TAB-separated fields: the identifier (`<column>!<timestamp>`),
# a status flag, and the remark itself. The flag is `R` unless the identifier
# is already listed in `mentorship_wisdom_keys_to_keep`. This file is what we
# survey to choose the remarks to report (essentially we opened it in EXCEL,
# removed the letter `R` from the second column of those we wished to keep,
# and updated `mentorship_wisdom_keys_to_keep` with those keys).
with open('mentorship_wisdom_all.txt', 'w') as all_remarks:
    for topic in wisdom_keys:
        for timestamp, response in d.items():
            remark = response[topic]
            if not remark:
                # nothing was written for this question
                continue

            identifier = f"{topic}!{timestamp}"
            status = '' if identifier in mentorship_wisdom_keys_to_keep else 'R'

            # surrounding whitespace goes first, then leading/trailing dashes
            cleaned = remark.strip().strip('-')

            all_remarks.write(f"{identifier}\t{status}\t{cleaned}\n")
# keep the ones set to be kept, and report a markdown formatted output that
# is included from the blog post. every remark becomes an HTML <blockquote>
# followed by a short description of who said it.

# how the "1"-"5" answer to the overall experience question is worded. any
# other answer (including "3") is reported as neutral.
_EXPERIENCE_LABELS = {
    "1": 'a <span style="color:red;">very negative experience</span>',
    "2": 'a <span style="color:red;">negative experience</span>',
    "4": 'a <span style="color:green;">positive experience</span>',
    "5": 'a <span style="color:green;">very positive experience</span>',
}
_NEUTRAL_EXPERIENCE_LABEL = 'a <span style="color:orange;">neutral experience</span>'

# the encoding is explicit since the output contains non-ASCII characters (the
# gender symbols below) and should not depend on the locale of the machine.
with open('mentorship_wisdom.md', 'w', encoding='utf-8') as f:
    f.write("## The words of early career researchers\n\n")

    for key in wisdom_keys:
        f.write(f'### {wisdom_subtitles[key]}\n\n')
        f.write(f'{wisdom_questions[key]}\n\n')

        for timestamp in d:
            identifier = f"{key}!{timestamp}"
            if identifier not in mentorship_wisdom_keys_to_keep:
                continue

            v = d[timestamp]
            if not v[key]:
                continue

            # only remarks from mentees who reported 'Man' or 'Woman' are included
            if v['mentee_gender'] not in ('Man', 'Woman'):
                continue

            # a run of two or more spaces is turned into a single paragraph
            # break, while single spaces between words are left alone.
            # NOTE(review): this assumes paragraph breaks show up as runs of
            # spaces in the raw file -- confirm against the data.
            paragraphs = [paragraph.strip() for paragraph in v[key].strip().split('  ')]
            wisdom = '<br /><br />'.join(paragraph for paragraph in paragraphs if paragraph)

            G = '♂' if v['mentee_gender'] == 'Man' else '♀'
            # NOTE(review): unlike the mentee's, the mentor's gender is not
            # limited to 'Man' / 'Woman' above, so any other answer is shown
            # with '♀' here.
            g = '♂' if v['mentor_gender'] == 'Man' else '♀'

            E = _EXPERIENCE_LABELS.get(v['experience_with_mentor'], _NEUTRAL_EXPERIENCE_LABEL)

            if key == 'mentor_biggest_shortcomings':
                R = f"reporting on the <b>biggest shortcomings</b> of their mentor ({g})"
            elif key == 'mentor_biggest_strengths':
                R = f"reporting on the <b>most admirable qualities</b> of their mentor ({g})"
            elif key == 'mentee_advice_to_mentor':
                R = f"shares <b>their advice</b> for their mentor ({g})"
            elif key == 'mentee_advice_to_mentees':
                R = "shares <b>their advice</b> for future mentees"

            f.write("<blockquote>\n")
            f.write(f"{wisdom}\n")
            f.write(f'<div class="blockquote-author"><b>{v["mentee_then"]}</b> ({G}) had {E} in <b>{v["country"]}</b><br />{R}</div>\n')
            f.write("</blockquote>\n\n")