-
Notifications
You must be signed in to change notification settings - Fork 0
/
make_genotype_chunk.py
109 lines (81 loc) · 4.27 KB
/
make_genotype_chunk.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
#break the cau dosages into five chunks like the ALL dosages
#this is to speed up imputation
import pandas as pd
#read the chrom1 and take the header, cause some chroms do not have header
#i read in only two rows to make it fast
#we will use the header to complete the other chroms
#this is to avoid key erro when i start accessing the df by col name
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("chr", action="store", help="put chromosome no")
parser.add_argument("chunk", action="store", help="put chromosome chunk no")
args = parser.parse_args()
chrom = str(args.chr)
chunk = str(args.chunk)
for_header = pd.read_csv("/home/pokoro/data/mesa_models/all/ALL_chr"+chrom+"_genotype_chunk1.txt.gz", sep="\t", nrows=2)
header=list(for_header.columns)
#because the headers in the chroms change between header and no header
#I use condition to capture it
if header[0] == "id":
#read without header
all1 = pd.read_csv("/home/pokoro/data/mesa_models/all/ALL_chr"+chrom+"_genotype_chunk"+chunk+".txt.gz", sep="\t")
all1.columns = header #put header
tr_chr1 = pd.read_csv("/home/pokoro/data/mesa_models/mesa_pheno/thrombotic/cau_imputation_dosage_chr"+chrom+".txt",sep="\t")
snps = list(all1["id"])
tr_chr1_chunk = tr_chr1[(tr_chr1["id"].isin(snps))]
#this is slow
tr_chr1_chunk.to_csv("/home/pokoro/data/mesa_models/mesa_pheno/thrombotic/cau_imputation_dosage_chr"+chrom+"_chunk"+chunk+".txt", header=True, index=False, sep="\t")
#index=False is rownames=False
else:
#read in chrom1 and take its header
for_header = pd.read_csv("/home/pokoro/data/mesa_models/all/ALL_chr1_genotype_chunk1.txt.gz", sep="\t", nrows=2)
header=list(for_header.columns)
#read without header
all1 = pd.read_csv("/home/pokoro/data/mesa_models/all/ALL_chr"+chrom+"_genotype_chunk"+chunk+".txt.gz", sep="\t", header=None)
all1.columns = header #put header
tr_chr1 = pd.read_csv("/home/pokoro/data/mesa_models/mesa_pheno/thrombotic/cau_imputation_dosage_chr"+chrom+".txt",sep="\t")
snps = list(all1["id"])
tr_chr1_chunk = tr_chr1[(tr_chr1["id"].isin(snps))]
#this is slow
tr_chr1_chunk.to_csv("/home/pokoro/data/mesa_models/mesa_pheno/thrombotic/cau_imputation_dosage_chr"+chrom+"_chunk"+chunk+".txt", header=True, index=False, sep="\t")
#index=False is rownames=False
"""
#read without header
all1 = pd.read_csv("/home/pokoro/data/mesa_models/all/ALL_chr"+chrom+"_genotype_chunk"+chunk+".txt.gz", sep="\t", header=None)
all1.columns = header #put header
tr_chr1 = pd.read_csv("/home/pokoro/data/mesa_models/mesa_pheno/thrombotic/cau_imputation_dosage_chr"+chrom+".txt",sep="\t")
snps = list(all1["id"])
tr_chr1_chunk = tr_chr1[(tr_chr1["id"].isin(snps))]
#this is slow
tr_chr1_chunk.to_csv("/home/pokoro/data/mesa_models/mesa_pheno/thrombotic/cau_imputation_chr"+chrom+"_chunk"+chunk+".txt", header=True, index=False, sep="\t")
#index=False is rownames=False
"""
#This is very fast
#tr_chr1_chunk.to_hdf("tr_chr_chunk3.txt", header=True, index=False, key="stage")
#However, it writes the object as a binary file, and can only be read by
#pd.read_hdf()
#this makes it hard to deal the df for model building.
#I will figure it out later
#read_tr2 = pd.read_hdf("tr_chr_chunk3.txt")
#That will require me to make the few changes below in get_maf_filtered_genotype
"""
def get_maf_filtered_genotype(genotype_file_name, maf):
gt_df = pd.read_hdf(genotype_file_name)
effect_allele_freqs = gt_df.mean(axis=1)
effect_allele_freqs = [ x / 2 for x in effect_allele_freqs ]
effect_allele_boolean = pd.Series([ ((x >= maf) & (x <= (1 - maf))) for x in effect_allele_freqs ]).values
gt_df = gt_df.loc[ effect_allele_boolean ]
gt_df = gt_df.T
return gt_df
def get_maf_filtered_genotype_csv(genotype_file_name, maf):
gt_df = pd.read_csv(genotype_file_name, 'r', header = 0, index_col = 0,delimiter='\t')
effect_allele_freqs = gt_df.mean(axis=1)
effect_allele_freqs = [ x / 2 for x in effect_allele_freqs ]
effect_allele_boolean = pd.Series([ ((x >= maf) & (x <= (1 - maf))) for x in effect_allele_freqs ]).values
gt_df = gt_df.loc[ effect_allele_boolean ]
gt_df = gt_df.T
return gt_df
test_gt_df1.drop("id", inplace=True) #Just to drop the id column
test_gt_df1 = test_gt_df1.rename(columns=test_gt_df1.iloc[0]) #to remove the row
#that was used as the column
"""