-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathtest-BiobankRead-Bash-script.py
546 lines (525 loc) · 18.7 KB
/
test-BiobankRead-Bash-script.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
# -*- coding: utf-8 -*-
"""
Created on Mon May 21 14:38:15 2018
@author: wcrum
"""
import os
import subprocess
import sys
# Reference list of variable names. It is not needed to run this script: the
# name `variables` is not read by any of the code below. The names used in
# varList / filterList further down (e.g. 'Body mass index (BMI)') appear here.
# NOTE(review): presumably these are the UK Biobank field descriptions that the
# extraction scripts accept -- confirm against the data dictionary. Entries are
# kept verbatim, including their original spelling.
variables = ['Encoded anonymised participant ID',
'Heel ultrasound method',
'Weight method',
'Sex',
'Year of birth',
'Hand grip strength (left)',
'Hand grip strength (right)',
'Waist circumference',
'Hip circumference',
'Standing height',
'Month of birth',
'Date of attending assessment centre',
'UK Biobank assessment centre',
'Cancer year/age first occurred',
'Non-cancer illness year/age first occurred',
'Systolic blood pressure, manual reading',
'Diastolic blood pressure, manual reading',
'Pulse rate (during blood-pressure measurement)',
'Pulse rate, automated reading',
'Birth weight known',
'Place of birth in UK - north co-ordinate',
'Place of birth in UK - east co-ordinate',
'Number of self-reported cancers',
'Number of self-reported non-cancer illnesses',
'Townsend deprivation index at recruitment',
'Type of accommodation lived in',
'Own or rent accommodation lived in',
'Time employed in main current job',
'Length of working week for main job',
'Frequency of travelling from home to job workplace',
'Distance between home and job workplace',
'Job involves mainly walking or standing',
'Job involves heavy manual or physical work',
'Job involves shift work',
'Number of days/week walked 10+ minutes',
'Duration of walks',
'Number of days/week of moderate physical activity 10+ minutes',
'Duration of moderate activity',
'Number of days/week of vigorous physical activity 10+ minutes',
'Duration of vigorous activity',
'Frequency of friend/family visits',
'Time spend outdoors in summer',
'Time spent outdoors in winter',
'Length of mobile phone use',
'Weekly usage of mobile phone in last 3 months',
'Sleep duration',
'Getting up in morning',
'Morning/evening person (chronotype)',
'Nap during day',
'Sleeplessness / insomnia',
'Current tobacco smoking',
'Past tobacco smoking',
'Cooked vegetable intake',
'Salad / raw vegetable intake',
'Fresh fruit intake',
'Dried fruit intake',
'Oily fish intake',
'Non-oily fish intake',
'Processed meat intake',
'Poultry intake',
'Alcohol intake frequency.',
'Country of birth (UK/elsewhere)',
'Breastfed as a baby',
'Handedness (chirality/laterality)',
'Skin colour',
'Ease of skin tanning',
'Childhood sunburn occasions',
'Adopted as a child',
'Part of a multiple birth',
'Able to confide',
'Wears glasses or contact lenses',
'Age started wearing glasses or contact lenses',
'Chest pain or discomfort',
'Taking other prescription medications',
'Light smokers, at least 100 smokes in lifetime',
'Ever had breast cancer screening / mammogram',
'Years since last breast cancer screening / mammogram',
'Ever had cervical smear test',
'Years since last cervical smear test',
'Age when periods started (menarche)',
'Had menopause',
'Age high blood pressure diagnosed',
'Forced vital capacity (FVC)',
'Forced expiratory volume in 1-second (FEV1)',
'Peak expiratory flow (PEF)',
'Foot measured for bone density',
'Fractured heel',
'Contra-indications for spirometry',
'Caffeine drink within last hour',
'Used an inhaler for chest within last hour',
'Pregnant',
'Ankle spacing width',
'Heel Broadband ultrasound attenuation, direct entry',
'Heel quantitative ultrasound index (QUI), direct entry',
'Heel bone mineral density (BMD)',
'Weight, manual entry',
'Job involve night shift work',
'Age started smoking in current smokers',
'Chest pain or discomfort walking normally',
'Chest pain due to walking ceases when standing still',
'Age angina diagnosed',
'Former alcohol drinker',
'Chest pain or discomfort when walking uphill or hurrying',
'Age heart attack diagnosed',
'Age stroke diagnosed',
'Diastolic blood pressure, automated reading',
'Systolic blood pressure, automated reading',
'Stiffness method',
'Pulse rate',
'Pulse wave reflection index',
'Pulse wave peak to peak time',
'Pulse wave pressure versus time response curve',
'Pulse wave velocity (manual entry)',
'Average monthly red wine intake',
'Average monthly champagne plus white wine intake',
'Average monthly beer plus cider intake',
'Average monthly spirits intake',
'Average monthly fortified wine intake',
'Leg pain on walking',
'Intra-ocular pressure (IOP) method (right)',
'Intra-ocular pressure (IOP) method (left)',
'Corneal hysteresis (right)',
'Corneal hysteresis (left)',
'Leg pain when standing still or sitting',
'Leg pain in calf/calves',
'Leg pain when walking uphill or hurrying',
'Leg pain when walking normally',
'Which eye(s) affected by hypermetropia (long sight)',
'Which eye(s) affected by myopia (short sight)',
'Doctor restricts physical activity due to heart condition',
'Chest pain felt during physical activity',
'Qualifications',
'Gas or solid-fuel cooking/heating',
'Heating type(s) in home',
'Current employment status',
'Transport type for commuting to job workplace',
'Reason for glasses/contact lenses',
'Vascular/heart problems diagnosed by doctor',
'Blood clot, DVT, bronchitis, emphysema, asthma, rhinitis, eczema, allergy diagnosed by doctor',
'Medication for cholesterol, blood pressure, diabetes, or take exogenous hormones',
'Medication for pain relief, constipation, heartburn',
'Vitamin and mineral supplements',
'Leisure/social activities',
'Medication for cholesterol, blood pressure or diabetes',
'Mineral and other dietary supplements',
'Medication for pain relief, constipation, heartburn (pilot)',
'Medication for smoking cessation, constipation, heartburn, allergies (pilot)',
'Vitamin and mineral supplements (pilot)',
'Regular use of hands-free device/speakerphone with mobile phone (pilot)',
'Data points for blow (pilot)',
'Vitamin supplements (pilot)',
'Frequency of friend/family visits (pilot)',
'Time using mobile phone in last 3 months (pilot)',
'Other dietary supplements (pilot)',
'Gas or solid-fuel cooking/heating (pilot)',
'Cancer code, self-reported',
'Non-cancer illness code, self-reported',
'Interpolated Year when cancer first diagnosed',
'Interpolated Age of participant when cancer first diagnosed',
'Interpolated Year when non-cancer illness first diagnosed',
'Interpolated Age of participant when non-cancer illness first diagnosed',
'Method of recording time when cancer first diagnosed',
'Method of recording time when non-cancer illness first diagnosed',
'Sitting height',
'Birth weight',
'Acceptability of each blow result (text)',
'Acceptability of each blow result (text) (pilot)',
'Reason for skipping weight',
'Reason for skipping spirometry',
'Reason for skipping grip strength (right)',
'Reason for skipping grip strength (left)',
'Reason for skipping waist',
'Reason for skipping hip measurement',
'Reason for skipping standing height',
'Reason for skipping sitting height',
'Reason for skipping arterial stiffness',
'Reason for skipping IOP (right)',
'Reason for skipping IOP (left)',
'Reason for skipping ECG',
'Reason ECG not completed',
'Reason at-rest ECG performed without bicycle',
'Number of diet questionnaires completed',
'When diet questionnaire completion requested',
'Day-of-week questionnaire completion requested',
'Day-of-week questionnaire completed',
'Hour-of-day questionnaire completed',
'Duration of questionnaire',
'Delay between questionnaire request and completion',
'Vitamin and/or mineral supplement use',
'Reason for not eating or drinking normally',
'Type of special diet followed',
'Types of spreads/sauces consumed',
'Type of meals eaten',
'Type of fat/oil used in cooking',
'Type of sliced bread eaten',
'Type of baguette eaten',
'Type of large bap eaten',
'Type of bread roll eaten',
'Size of white wine glass drunk',
'Size of red wine glass drunk',
'Size of rose wine glass drunk',
'Liquid used to make porridge',
'Type of yogurt eaten',
'Country of Birth (non-UK origin)',
'Smoking status',
'Alcohol drinker status',
'Current employment status - corrected',
'Reproduciblity of spirometry measurement using ERS/ATS criteria',
'Ethnic background',
'Body mass index (BMI)',
'Weight',
'Age when attended assessment centre',
'Pulse wave Arterial Stiffness index',
'Age at recruitment',
'Genotype measurement batch',
'Genetic sex',
'Heterozygosity',
'Heterozygosity, PCA corrected',
'Missingness',
'Genetic ethnic grouping',
'Genotype measurement plate',
'Genotype measurement well',
'Genetic principal components',
'Recommended genomic analysis exclusions',
'Genetic relatedness pairing',
'Genetic relatedness factor',
'Genetic relatedness IBS0',
'UKBiLEVE Affymetrix quality control for samples',
'Chromosome 1 genotype results',
'Chromosome 2 genotype results',
'Chromosome 3 genotype results',
'Chromosome 4 genotype results',
'Chromosome 5 genotype results',
'Chromosome 1 genotype intensities',
'Chromosome 2 genotype intensities',
'Chromosome 3 genotype intensities',
'Chromosome 4 genotype intensities',
'Chromosome 5 genotype intensities',
'Chromosome 6 genotype intensities',
'Chromosome 7 genotype intensities',
'Chromosome 8 genotype intensities',
'Nitrogen dioxide air pollution; 2010',
'Nitrogen oxides air pollution; 2010',
'Particulate matter air pollution (pm10); 2010',
'Particulate matter air pollution (pm2.5); 2010',
'Particulate matter air pollution (pm2.5) absorbance; 2010',
'Particulate matter air pollution 2.5-10um; 2010',
'Traffic intensity on the nearest road',
'Inverse distance to the nearest road',
'Traffic intensity on the nearest major road',
'Inverse distance to the nearest major road',
'Total traffic load on major roads',
'Close to major road',
'Sum of road length of major roads within 100m',
'Nitrogen dioxide air pollution; 2005',
'Nitrogen dioxide air pollution; 2006',
'Nitrogen dioxide air pollution; 2007',
'Particulate matter air pollution (pm10); 2007',
'Average daytime sound level of noise pollution',
'Average evening sound level of noise pollution',
'Average night-time sound level of noise pollution',
'Average 16-hour sound level of noise pollution',
'Average 24-hour sound level of noise pollution',
'Microalbumin in urine',
'Microalbumin in urine result flag',
'Creatinine (enzymatic) in urine',
'Creatinine (enzymatic) in urine result flag',
'Potassium in urine',
'Potassium in urine result flag',
'Sodium in urine',
'Sodium in urine result flag',
'Date of death',
'Underlying (primary) cause of death: ICD10',
'Contributory (secondary) causes of death: ICD10',
'Age at death',
'Description of cause of death',
'Episodes containing "Diagnoses - secondary ICD9" data',
'Episodes containing "Diagnoses - secondary ICD10" data',
'Episodes containing "Dates of operations" data',
'Episodes containing "Episode start date" data',
'Episodes containing "Episode end date" data',
'Episodes containing "Date of admission to hospital" data',
'Episodes containing "Date of discharge from hospital" data',
'Episodes containing "Operation status" data',
'Episodes containing "Diagnoses - main ICD10" data',
'Episodes containing "Diagnoses - main ICD9" data',
'Episodes containing "Operative procedure - main OPCS" data',
'Episodes containing "Date of operation" data',
'Episodes containing "Source of inpatient record" data',
'Food weight',
'Energy',
'Protein',
'Fat',
'Carbohydrate',
'Saturated fat',
'Polyunsaturated fat',
'Total sugars',
'Englyst dietary fibre',
'Portion size',
'Iron',
'Vitamin B6',
'Vitamin B12',
'Folate',
'Vitamin C',
'Potassium',
'Magnesium',
'Retinol',
'Carotene',
'Typical diet yesterday',
'Vitamin D',
'Alcohol',
'Starch',
'Calcium',
'Vitamin E',
'Drinking water intake',
'Low calorie drink intake',
'Fizzy drink intake',
'Squash intake',
'Orange juice intake',
'Grapefruit juice intake',
'Pure fruit/vegetable juice intake',
'Fruit smoothie intake',
'Alcohol consumed',
'Red wine intake',
'Rose wine intake',
'White wine intake',
'Breakfast cereal consumed',
'Porridge intake',
'Muesli intake',
'Oat crunch intake',
'Sweetened cereal intake',
'Type milk consumed',
'Bread consumed',
'Sliced bread intake',
'Baguette intake',
'Bap intake',
'Bread roll intake',
'Naan bread intake',
'Garlic bread intake',
'Crispbread intake',
'Oatcakes intake',
'Other bread intake',
'Double crust pastry intake',
'Single crust pastry intake',
'Crumble intake',
'Pizza intake',
'Pancake intake',
'Scotch pancake intake',
'Yorkshire pudding intake',
'Indian snacks intake',
'Croissant intake',
'Danish pastry intake',
'Scone intake',
'Yogurt/ice-cream consumers',
'Yogurt intake',
'Ice-cream intake',
'Dessert consumers',
'Milk-based pudding intake',
'Other milk-based pudding intake',
'Soya dessert intake',
'Fruitcake intake',
'Cake intake',
'Doughnut intake',
'Sponge pudding intake',
'Cheesecake intake',
'Cheese consumers',
'Low fat hard cheese intake',
'Hard cheese intake',
'Soft cheese intake',
'Blue cheese intake',
'Meat consumers',
'Sausage intake',
'Beef intake',
'Pork intake',
'Lamb intake',
'Crumbed or deep-fried poultry intake',
'Vegetarian alternatives intake',
'Vegetarian sausages/burgers intake',
'Tofu intake',
'Quorn intake',
'Other vegetarian alternative intake',
'Spreads/sauces consumers',
'No fat for cooking',
'Vegetable consumers',
'Baked bean intake',
'Pulses intake',
'Fried potatoes intake',
'Boiled/baked potatoes intake',
'Butter/margarine added to potatoes',
'Mashed potato intake',
'Added salt to food',
'Vitamin supplement user',
'Time spent doing vigorous physical activity',
'Time spent doing moderate physical activity',
'Time spent doing light physical activity',
'When diet questionnaire completed',
'When diet questionnaire started',
'Invitation to complete online 24-hour recall dietary questionnaire, acceptance',
'Invitation to complete online 24-hour recall dietary questionnaire, date sent']
# ----> SET SOMETHING HERE!
# Pick the script under test by changing the key used to look up scriptnum.
scriptList = ['extract_variables.py', 'extract_death.py', 'extract_SR.py', 'HES_extract.py']
scriptdic = dict(VAR=0, DEATH=1, SR=2, HES=3)
scriptnum = scriptdic['VAR']
# The name of the script to test
scriptname = scriptList[scriptnum]

# Where the data is stored.
# Alternative (Linux) locations, kept for reference:
#csvpath = '/media/storage/UkBiobank/Application 236/R4528/ukb4528.csv'
#htmlpath = '/media/storage/UkBiobank/Application 236/R4528/ukb4528.html'
#exclpath = ''
#hespath = '/media/storage/UkBiobank/HES data/ukb_HES_236.tsv'
# All four input files live in the same directory, so spell it out only once.
_datadir = 'Z:\\EABOAGYE\\Users\\wcrum\\Projects\\UKBB\\UKBB-data-2018'
csvpath = _datadir + '\\ukb21204.csv'
htmlpath = _datadir + '\\ukb21204.html'
exclpath = _datadir + '\\w10035_20180503_exclusions.csv'
hespath = _datadir + '\\ukb.tsv'

# Output: outfile is outname placed inside the outpath directory.
#outpath = '/media/storage/codes/BiobankRead-Bash/'
#outname = 'testnewVARpartial'
outpath = 'H:\\IC-Stuff\\software\\Biobank'
outname = 'testnewHESfile'
outfile = os.path.join(outpath, outname)
# Construct script path and arguments for each script.
# bbreadargs is a list whose first element is the script name; every following
# element is one option already formatted as text, carrying its own leading
# space and its value(s). Keep that convention when adding options.
if scriptname == 'extract_variables.py':
    # Variables and conditions. The inner double quotes keep each multi-word
    # name together as a single argument on the command line.
    varList = ['"Age when attended assessment centre"', '"Body mass index (BMI)"', '"Age high blood pressure diagnosed"']
    #varList = ['"Age"']
    varString = ' '.join(varList)
    #varString = 'H:\\IC-Stuff\\software\\Biobank\\BiobankRead-Bash\\vars_test.txt'
    #filterList = ['"Age when attended assessment centre>50"', '"Age when attended assessment centre<70"', '"Body mass index (BMI)>=23"', '"Body mass index (BMI)<=30"']
    # NOTE FILTER STRINGS ARE SEARCHED FOR IN VARIABLES SO "BMI" MATCHES "Body mass index (BMI)"
    # "Age" CAN BE USED TO ENSURE THAT AGE AT ALL SPECIFIED ASSESSMENTS CONTAINING AGE HAPPENED WITHIN THE RANGE
    filterList = ['"Age when attended assessment centre>50"', '"Age when attended assessment centre<70"', '"BMI>=23"', '"BMI<=30"']
    #filterList = ['"Age>50"', '"Age<70"', '"BMI>=23"', '"BMI<=30"']
    # Command string
    # Disabled alternative that passes the data paths explicitly:
    #bbreadargs = [scriptname,
    #              ' --csv '+csvpath,
    #              ' --html '+htmlpath,
    #              ' --vars ' + varString,
    #              ' --filter ' + ' '.join(filterList),
    #              ' --remove_outliers True',
    #              ' --baseline_only False',
    #              ## ' --excl '+exclpath,
    #              ' --cov_corr True',
    #              ' --aver_visits False',
    #              ' --combine partial',
    #              ' --out ' + outfile]
    # NOTE(review): unlike the other branches, the active list below passes no
    # --csv / --html options -- confirm extract_variables.py can locate the
    # data without them.
    bbreadargs = [scriptname,
                  ' --vars ' + varString,
                  ' --filter ' + ' '.join(filterList),
                  ' --remove_outliers True',
                  ' --baseline_only False',
                  ' --cov_corr True',
                  ' --aver_visits False',
                  ' --combine partial',
                  ' --out ' + outfile]
elif scriptname == 'extract_death.py':
    # Data file for codes or use list e.g. ' --codes C34 C42'
    codespath = 'H:\\IC-Stuff\\software\\Biobank\\codes.txt'
    # Command string (the exclusions option is currently disabled)
    # ' --excl '+ exclpath,
    bbreadargs = [scriptname,
                  ' --csv ' + csvpath,
                  ' --html ' + htmlpath,
                  ' --codes ' + codespath,
                  ' --primary True',
                  ' --secondary False',
                  ' --out ' + outfile]
elif scriptname == 'extract_SR.py':
    # Command string
    # Note use of "" to prevent argparse breaking up the disease string
    bbreadargs = [scriptname,
                  ' --csv '+csvpath,
                  ' --html '+htmlpath,
                  ' --baseline_only False',
                  ' --disease "lung cancer" "breast cancer"',
                  ' --SRcancer True',
                  ' --excl '+exclpath,
                  ' --out ' + outfile]
elif scriptname == 'HES_extract.py':
    # Data file for codes or use list e.g. ' --codes C34 C42'
    codespath = 'H:\\IC-Stuff\\software\\Biobank\\codes.txt'
    # ICD9, ICD10, OPCS
    codetype = 'ICD10'
    # epistart or admidate
    datetype = 'epistart'
    # Command string
    bbreadargs = [scriptname,
                  ' --csv '+csvpath,
                  ' --html '+htmlpath,
                  ' --tsv '+hespath,
                  ' --excl '+exclpath,
                  ' --codes '+codespath,
                  ' --codeType '+ codetype,
                  ' --dateType '+ datetype,
                  ' --firstvisit True',
                  ' --baseline True',
                  ' --out ' + outfile]
else:
    # Format the message as one string: passing several arguments to print()
    # would display a tuple under Python 2, which the rest of this file targets.
    print('error: scriptname = %s not recognised' % scriptname)
    sys.exit(1)
# Make command-line
# bbreadargs holds pre-formatted text fragments: the script name followed by
# option strings that each start with a space and may contain their own
# double quotes. Concatenate them verbatim into ONE command string.
# (Previously the list was nested inside ['python.exe', [...]]; that only
# produced this same string through a quirk of Python 2's Windows
# argument-list conversion and fails outright on Python 3.)
# NOTE: fragments are not re-quoted here, so a data path containing spaces
# must be wrapped in double quotes where it is defined.
subprocessargs = 'python.exe ' + ''.join(bbreadargs)
# Output command line
print(subprocessargs)
# Run as sub-process to ensure arguments are passed correctly.
# shell=True lets the shell split the command string and honour the embedded
# double quotes around multi-word values.
print('')
print('Running as sub-process, please wait...')
returncode = subprocess.call(subprocessargs, shell=True)
if returncode != 0:
    # Report a failing script and propagate its status instead of ignoring it.
    print('error: %s exited with status %d' % (scriptname, returncode))
    sys.exit(returncode)