-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathmerge_pulldown.py
151 lines (136 loc) · 6.39 KB
/
merge_pulldown.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
import argparse
import filecmp
import fileinput
import shutil
from collections import OrderedDict
# return true if snp files are the same
# return false otherwise
def compare_snp_files(snp_file_array):
for other_file in snp_file_array[1:]:
if filecmp.cmp(snp_file_array[0], other_file, shallow=False) == False:
return False
return True
def check_snp_file(snp_filename):
BASES = set('ACGT')
count = 0
with open(snp_filename, 'r') as snps:
for snp_line in snps:
fields = snp_line.split()
if len(fields) != 6:
raise ValueError("does not have expected 6 fields: '{}'".format(snp_line))
chromosome = fields[1]
if not chromosome.isdigit() or (int(chromosome) < 1 or int(chromosome) > 24):
raise ValueError("bad chromosome '{}', {}".format(snp_line, chromosome))
try:
cm = float(fields[2])
except:
raise ValueError("nonfloat '{}': {}".format(snp_line, fields[2]))
position = fields[3]
if not position.isdigit():
raise ValueError("bad position '{}': {}".format(snp_line, position))
allele1 = fields[4]
allele2 = fields[5]
if (allele1 not in BASES) or (allele2 not in BASES) or (allele1 == allele2):
raise ValueError("bad alleles '{}': {} {}".format(snp_line, allele1, allele2))
count += 1
return count
# combine the contents of multiple individual files into one
def merge_ind_files(output_filename, input_ind_filenames, max_overlap=1):
ind_counts = [] # individual counts for each individual file
individuals_geno_offsets_dict = OrderedDict() # mapping of geno sets to individuals by geno offset, treating geno data as one array
VALID_SEX = frozenset('MFU')
with open(output_filename, 'w') as out:
overall_line_count = 0
for ind_filename in input_ind_filenames:
prior_files_total = 0 if len(ind_counts) == 0 else overall_line_count
with open(ind_filename, 'r') as ind_file:
for line in ind_file:
fields = line.split()
if len(fields) > 0:
individual = fields[0]
if individual not in individuals_geno_offsets_dict:
out.write(line)
individuals_geno_offsets_dict[individual] = [overall_line_count]
else:
# make sure there are no duplicate entries for an individual
if len(individuals_geno_offsets_dict[individual]) >= max_overlap:
raise ValueError('individual {} appears too many times'.format(individual))
else:
individuals_geno_offsets_dict[individual].append(overall_line_count)
overall_line_count += 1
# sex is one of M,F,U
sex = fields[1]
if sex not in VALID_SEX:
raise ValueError('invalid sex {} in {}'.format(sex, line))
ind_counts.append(overall_line_count - prior_files_total)
individuals_geno_offsets = [individuals_geno_offsets_dict[x] for x in individuals_geno_offsets_dict]
return ind_counts, individuals_geno_offsets
# First values are preferred, array must be non-empty
def merge_genotypes(array):
geno = '9'
VALID_GENO = frozenset('029')
for value in array:
if geno not in VALID_GENO:
raise ValueError('invalid genotype value {}'.format(geno))
geno = value
if geno != '9':
break
return geno
def merge_geno_files(output_filename, input_geno_filenames, ind_counts, individuals_geno_offsets, num_snps):
geno_files = []
VALID_GENO = frozenset('029')
try:
# open files
for geno_filename in input_geno_filenames:
f = open(geno_filename, 'r')
geno_files.append(f)
with open(output_filename, 'w') as out:
for i in range(num_snps):
# read genotypes from all files into memory
raw_genotypes_for_snp = ''
for file_index in range(len(geno_files)):
geno = geno_files[file_index].readline().strip()
# check that there is one value for each individual
if len(geno) != ind_counts[file_index]:
raise ValueError('{}: expected {} genotypes, but found {}'.format(input_geno_filenames[file_index], ind_counts[file_index], len(geno)))
# check geno values a
values_appearing = set(geno)
if not (values_appearing <= VALID_GENO):
raise ValueError('invalid genotype value(s) {}'.format(values_appearing - VALID_GENO))
raw_genotypes_for_snp += geno
# merge genotypes
merged_genotypes = []
for offsets_for_individual in individuals_geno_offsets:
individual_genos = [raw_genotypes_for_snp[offset] for offset in offsets_for_individual]
individual_geno = merge_genotypes(individual_genos)
merged_genotypes.append(individual_geno)
out.write(''.join(str(g) for g in merged_genotypes))
out.write('\n')
# check that there is no leftover values
for file_index in range(len(geno_files)):
geno = geno_files[file_index].readline().strip()
if len(geno) > 0:
raise ValueError('extra values in {}'.format(input_geno_filenames[file_index]))
finally:
for f in geno_files:
f.close()
# Perform snp file comparison (currently SNP files must be identical)
# Merge ind and genotype files
def merge_geno_snp_ind(geno_filenames, snp_filenames, ind_filenames, output_stem, max_overlap):
num_snps = check_snp_file(snp_filenames[0])
if not compare_snp_files(snp_filenames):
raise ValueError('SNP file mismatch')
shutil.copyfile(snp_filenames[0], "{}.snp".format(output_stem) )
ind_counts, individuals_geno_offsets = merge_ind_files("{}.ind".format(output_stem), ind_filenames, max_overlap)
merge_geno_files("{}.geno".format(output_stem), geno_filenames, ind_counts, individuals_geno_offsets, num_snps)
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Merge pulldown eigenstrat results. SNP sets must be the same.")
parser.add_argument('-i', "--input_stems", help="List of stems, with each stem having three files, for example: a0.{geno,snp,ind} or b0.b1.{geno,snp,ind}.", nargs='+', required=True)
parser.add_argument('-o', "--output_stem", help="Stem for output files: output.{geno,snp,ind}.", default='out')
parser.add_argument('-m', "--max_overlap", help="Number of results files an individual can appear in. For UDG half+minus, this should be 2. Merged results are the first non-empty value, or empty. If an individual should only appear in one result set, this should be 1.", type=int, default=1)
args = parser.parse_args()
geno_filenames = ["{}.geno".format(stem) for stem in args.input_stems]
snp_filenames = ["{}.snp".format(stem) for stem in args.input_stems]
ind_filenames = ["{}.ind".format(stem) for stem in args.input_stems]
output_stem = args.output_stem
merge_geno_snp_ind(geno_filenames, snp_filenames, ind_filenames, output_stem, args.max_overlap)