-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathGenome_phasing.py
173 lines (151 loc) · 7.06 KB
/
Genome_phasing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
#!/usr/bin/env python3
# -*- encoding:utf-8 -*-
# @Author : Haoran Pan
# date: 2020/11/28
import sys
import gzip
import time
import Fontcolor
import argparse
__author__ = 'Haoran Pan'
__mail__ = 'haoran_pan@qq.com'
__date__ = '20201207'
__version__ = 'v1.1'
def depth_count(depthfile,outputfile):
reads_num = 0
coverage_depth = 0
contig = ''
result = open(outputfile,'w')
with open(depthfile) as f:
for line in f:
line_list = line.split('\t')
if reads_num == 0:
coverage_depth += int(line_list[2].rstrip())
reads_num += 1
contig = line_list[0].rstrip()
elif reads_num != 0 and line_list[0] == contig:
coverage_depth += int(line_list[2].rstrip())
reads_num += 1
else:
depth = float(coverage_depth) / float(reads_num)
reads_num = 0
coverage_depth = 0
output = contig + '\t' + str(depth)
result.write(output.rstrip()+'\n')
last_depth = float(coverage_depth)/float(reads_num)
result.write(contig+'\t'+str(last_depth)+'\n')
result.close()
def calcultate_sample_length(fq_gz_file):
start = time.time()
reads_num = 0
line_index = 0
reads_length = 0
with gzip.open(fq_gz_file) as fq1:
for line in fq1:
line_index +=1
if line_index %4 == 1:
reads_num += 1
elif line_index % 4 ==2:
reads_length += len(line)
end = time.time()
Fontcolor.Tips_output('File {} reads num are {}'.format(str(fq_gz_file),reads_num))
Fontcolor.Tips_output('File {} reads length are {}'.format(str(fq_gz_file),reads_length))
Fontcolor.Tips_output('Time cost: {:.5f}'.format(end-start))
return reads_length
def cal_genome_size(genome_fasta):
fa_dict = {}
with open(genome_fasta) as f:
for line in f:
if line.startswith('>'):
fa_key = line.strip().split('>')[-1].split(' ')[0]
fa_dict[fa_key] = ''
else:
fa_dict[fa_key] += line.strip().upper().replace('\n', '')
genome_size = 0
for key in fa_dict.keys():
genome_size += len(fa_dict[key])
Fontcolor.Tips_output('Genome size is {} bp'.format(genome_size))
return genome_size
def run_step2(fq1file,fq2file,genomefile):
fq1length = calcultate_sample_length(fq1file)
fq2length = calcultate_sample_length(fq2file)
genomesize = cal_genome_size(genomefile)
std_depth = round((fq1length + fq2length) / genomesize)
Fontcolor.Tips_output('Standard Sequencing Depth is {}X'.format(std_depth))
return std_depth
def Fa2dict(fasta_file):
fa_dict = {}
with open(fasta_file) as f:
for line in f:
if line.startswith('>'):
fa_key = line.strip().split('>')[-1].split(' ')[0]
fa_dict[fa_key] = ''
else:
fa_dict[fa_key] += line.strip().upper().replace('\n', '')
return fa_dict
def ctg_phasing_base_depth(fasta_dict,depthcountfile,std_depth,Ploidy,result):
depth_file = open(depthcountfile,'r')
std_depth = int(std_depth)
ploidy = int(Ploidy)
phasing_file=open('Phasing_record.txt','w')
last_output = open(result,'w')
for dep_line in depth_file:
dep_line_list=dep_line.rstrip().split('\t')
raw_depth=float(dep_line_list[1])/float(std_depth)
depth=int(round(raw_depth))
if depth > 1 :
phasing_file.write(dep_line.rstrip()+'\t'+str(raw_depth)+'\n')
if depth - 1 < ploidy:
for i in range(depth-1):
last_output.write('>'+dep_line_list[0]+'_'+str(i)+'\n')
last_output.write(fasta_dict[dep_line_list[0]]+'\n')
else:
for i in range(ploidy - 1):
last_output.write('>' + dep_line_list[0] + '_' + str(i) + '\n')
last_output.write(fasta_dict[dep_line_list[0]] + '\n')
phasing_file.close()
last_output.close()
depth_file.close()
def main(args):
depthfile = args.depth
depthfileout = args.depthout
Fontcolor.Log_output('-' * 10 + 'step 1 beginning !' + '-' * 10)
depth_count(depthfile,depthfileout)
Fontcolor.Log_output('-' * 10 + 'step 1 completed !' + '-' * 10)
if not args.stdth:
fq1_file = args.fastq1
fq2_file = args.fastq2
genome_file = args.genome
Fontcolor.Log_output('-' * 10 + 'step 2 beginning !' + '-' * 10)
std_depth = run_step2(fq1_file,fq2_file,genome_file)
Fontcolor.Log_output('-' * 10 + 'step 2 completed !' + '-' * 10)
else:
Fontcolor.Log_output('-' * 10 + 'step 2 beginning !' + '-' * 10)
std_depth = args.stdth
Fontcolor.Tips_output('Input std depth is {}X'.format(std_depth))
Fontcolor.Log_output('-' * 10 + 'step 2 completed !' + '-' * 10)
Ploidy = args.ploidy
output = args.output
genome_file = args.genome
Fontcolor.Log_output('-' * 10 + 'step 3 beginning !' + '-' * 10)
fasta_dict = Fa2dict(genome_file)
ctg_phasing_base_depth(fasta_dict,depthfileout,std_depth,Ploidy,output)
Fontcolor.Log_output('-' * 10 + 'step 3 completed !' + '-' * 10)
if __name__ == '__main__':
parser = argparse.ArgumentParser(
prog=sys.argv[0],
formatter_class=argparse.RawTextHelpFormatter,
description='''Polyploid genome phasing:Use Second-generation sequencing reads mapping Third-generation sequencing genome''',
usage="python {} -d depth -o depthout -f1 fq1 -f2 fq2 -g genome -p ploidy -O output\n\t\tor\n\t\tpython {} -d depth -o depthout -s std_depth -g genome -p ploidy -O output".format(sys.argv[0],sys.argv[0]),
epilog='author:\t{0}\nmail:\t{1}\ndate:\t{2}\nversion:\t{3}'.format(__author__, __mail__, __date__,
__version__))
parser.add_argument('-d', '--depth', required=True, help='Input samtools depth outputfile')
parser.add_argument('-o', '--depthout', required=True, help='Output ctg depth result(.tab)')
parser.add_argument('-f1', '--fastq1', required=None, help='Input sample1 file(fastq1.gz)')
parser.add_argument('-f2', '--fastq2', required=None, help='Input sample2 file(fastq2.gz)')
parser.add_argument('-s', '--stdth', required=None, help='Input the standard depth of known sequencing data, which is equal to the sequencing data/genome size(bp)')
parser.add_argument('-g', '--genome', required=True, help='Input Thr Sequencing assmbly genome(.fasta)')
parser.add_argument('-p', '--ploidy', required=True, help='Input Ploidy of species(eg:8)')
parser.add_argument('-O', '--output', required=True, help='Output the fasta sequence of the contig that only needs to be phased(.fasta)')
args = parser.parse_args()
main(args)