-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathfastq_compare.py
executable file
·104 lines (80 loc) · 3.35 KB
/
fastq_compare.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
#!/usr/bin/env python
from __future__ import division
import sys
import Bio.SeqIO
def read_fastq(fastqFile, qFormat):
"""
Read the fastq file given as argument
Calculate average base quality score
Output the number of reads in that file, the average
base quality score of those sequences,
and their average length
"""
readScores = []
readLengths = []
for readNumber, read in enumerate(Bio.SeqIO.parse(fastqFile, qFormat)):
readID = read.id
readSeq = read.seq
# If the sequence is of length 0, skip to next sequence in file
if len(readSeq) == 0:
continue
readScoreList = read.letter_annotations["phred_quality"]
# Get the average base quality score for this sequence
readScoreAvg = sum(readScoreList) / len(readScoreList)
readScores.append(readScoreAvg)
readLengths.append(len(readSeq))
#print readID, len(readSeq), readScoreAvg
avgScore = sum(readScores) / len(readScores)
avgLength = sum(readLengths) / len(readLengths)
return str(readNumber + 1), str(round(avgScore, 2)), str(round(avgLength, 2))
def write_report(in_data, out_data, reportPath):
"""
Takes the path of the report to be written
Takes the data from input and output read of FASTQ files
Write that data to the report file
"""
# test
reportFile = open(reportPath, 'w')
in_readNum, in_avgScore, in_avgLength = in_data
out_readNum, out_avgScore, out_avgLength = out_data
reportFile.write("FASTQ trimming report\n\n")
reportFile.write("FastqFile \t Read# \t AvgScore \t AvgLength\n")
reportFile.write("Before \t" + in_readNum + " \t " + in_avgScore + " \t " + in_avgLength + "\n")
reportFile.write("After \t" + out_readNum + " \t " + out_avgScore + " \t " + out_avgLength + "\n")
reportFile.close()
#
# Run the script
#
if len(sys.argv) == 5:
# fastq file path before trim
before_fastq = sys.argv[1]
# fastq file path after trim
after_fastq = sys.argv[2]
# format of the fastq file
fastqFormat = sys.argv[3]
# path of the log file to be written
reportPath = sys.argv[4]
# Allowed fastq format, and the corresponding code to give to SeqIO
allowedFormats = {'phred33' : 'fastq',
'phred64' : 'fastq-illumina',
'solexa' : 'fastq-solexa'}
# Make sure the format is known
try:
assert fastqFormat in allowedFormats.keys()
except AssertionError as e:
print "The format type has to be one of the following", allowedFormats
print e
raise
#in_readNum, in_avgScore, in_avgLength = read_fastq(in_fastq)
#out_readNum, out_avgScore, out_avgLength = read_fastq(out_fastq)
# The function read_fastq() is called on the BEFORE and AFTER trim fastq files
# and the path of the output is also given
write_report(read_fastq(before_fastq, allowedFormats[fastqFormat]),
read_fastq(after_fastq, allowedFormats[fastqFormat]),
reportPath)
else:
print "usage: `python fastq_compare.py BEFORE.fq AFTER.fq <quality_coding> <outfile>`"
print "options for quality_coding:"
print "phred33 : Sanger and Illumina reads produced with CASAVA 1.8+"
print "phred64 : Illumina reads produced with CASAVA 1.3+"
print "solexa : Solexa reads"