-
Notifications
You must be signed in to change notification settings - Fork 0
/
prepare_pheno_file.py
50 lines (43 loc) · 2.81 KB
/
prepare_pheno_file.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
##
# Generates a "pheno" file from a raw UKBB phenotype file (generated by the ukbconv tool).
# Individuals are filtered by pop_file, and raw phenotypes are processed according to a conditions file.
#
##
import argparse
import os
import numpy as np
import pandas as pd
import simplejson
import constants
if __name__=="__main__":
parser = argparse.ArgumentParser(description="Generates \"pheno\" file from a raw UKBB's phenotpe file (generated by ukbconv tool). \nIndividuals are filtered by pop_file and raw phenotypes are processed by conditions file.")
parser.add_argument('-r', '--raw_phenotype_file', help="a raw UKBB's phenotpe file (generated by ukbconv tool)", default=os.path.join(constants.DATASETS_PATH, "ukbb","ukb_code6.csv"))
parser.add_argument('-pop', '--pop_file', help='A pop file for filtering individuals', default=os.path.join(constants.DATASETS_PATH, "ukbb","pop.panel.gbr"))
parser.add_argument('-pheno', '--pheno_file', help="A pheno file destination path", default=os.path.join(constants.DATASETS_PATH, "ukbb","pheno_ctrt_gbr"))
parser.add_argument('-c', '--conditions_file', help="condition file for processing phenotypes", default=os.path.join(constants.DATASETS_PATH, "ukbb", "conditions_ctrt.json"))
args = parser.parse_args()
raw_phenotype_file=args.raw_phenotype_file
pheno_file=args.pheno_file
pop_file=args.pop_file
conditions = simplejson.load(open(args.conditions_file))
df_raw_pheno=pd.read_csv(raw_phenotype_file, sep=',', index_col=0, dtype=str)
filtered_pop=df_raw_pheno.index
if pop_file!= '':
filtered_pop=pd.read_csv(pop_file, index_col=0, sep='\t').index
df_raw_pheno=df_raw_pheno.loc[filtered_pop]
headers=df_raw_pheno.columns.values
headers_prefix=set(np.unique([a.split("-")[0] for a in headers])).intersection(conditions.keys())
headers_dict = {a : [b for b in headers if b.startswith(a+"-")] for a in headers_prefix}
df_vals=pd.DataFrame(index=df_raw_pheno.index, columns=["IID"]+ list(headers_dict.keys()))
for k,v in headers_dict.items():
print(k)
if conditions[k]=='all':
df_vals.loc[:,k]=df_raw_pheno.loc[:,v].apply(lambda row : ",".join(np.unique(row.dropna()).astype(str)) , axis=1)
elif k.split("_")[-1]=='count':
df_vals.loc[:,k]=df_raw_pheno.loc[:,v].apply(lambda row : int(np.sum([a in conditions[k] for a in row.dropna().astype(str)])) , axis=1)
elif conditions[k]=='max':
df_vals.loc[:,k]=df_raw_pheno.loc[:,v].apply(lambda row : np.nanmax([np.nan] + list(row.dropna().astype(float))) , axis=1)
else:
df_vals.loc[:,k]=df_raw_pheno.loc[:,v].apply(lambda row: int(bool(np.sum([a in conditions[k] for a in row.dropna().astype(str)])))+1 , axis=1)
df_vals.loc[:,'IID']=df_vals.index
df_vals.dropna().to_csv(pheno_file, sep='\t', index_label='FID')