forked from cjjossart/qc_reports
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathqc_report_stats.py
123 lines (101 loc) · 4.16 KB
/
qc_report_stats.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#! /usr/bin/env python3
from lib2to3.pgen2.token import NUMBER
import os
import sys
import pandas as pd
# read input
# Look into collection of read in them
# Open *stats.txt file
# initialize variables
# read length before trimming
read_length_before_trim = 0
# reads before trimming and add to array
reads_before_trim = 0
# reads after trimming and add to array
reads_after_trim = 0
# Paired reads after trimming
paired_reads_after_trim = 0
# unpaired reads after trimming
unpaired_reads_after_trim = 0
# output the numerator (reads count * reads avg length) for the coverage calculation to be divided by reference length in the module
# coverage_numer = reads_after_trim * #read_avg_length
with open(os.path.join(os.path.dirname(__file__), "/home/tdotrang/qc_reports/ERR2172266.stats.txt"), "r") as f:
lines = f.readlines()
reads_before_trim = lines[1].split(" ")[2]
print(reads_before_trim)
read_length_before_trim = lines[3].split(" ")[2]
print(read_length_before_trim)
reads_after_trim = lines[6].split(" ")[2]
print(reads_after_trim)
paired_reads_after_trim = lines[9].split(" ")[5]
print(paired_reads_after_trim)
unpaired_reads_after_trim = lines[11].split(" ")[5]
print(unpaired_reads_after_trim)
coverage_numer = float(read_length_before_trim) * float(reads_before_trim)
print(coverage_numer)
f.close()
# Close *stats.txt file
# Open *base_content.txt
with open(os.path.join(os.path.dirname(__file__), "/home/tdotrang/qc_reports/ERR2172266.base_content.txt"), "r") as f:
# parse only GC lines and calculate average GC content
# Collect the percentages of GC content
# Collect the frequencey of reads with specific GC content
# Multply the percenatage columns by the frequency columns
# Sum the frequencies
# Sum the (percentages*frequncies)
# divide the (sum(percentages)*sum(frequncies)) by total frequencies
f.close()
# Close *base_content.txt
# Open qa*base_content.txt
# parse only GC lines and calculate average GC content
# Collect the percentages of GC content
# Collect the frequencey of reads with specific GC content
# Multply the percenatage columns by the frequency columns
# Sum the frequencies
# Sum the (percentages*frequncies)
# divide the (sum(percentages)*sum(frequncies)) by total frequencies
# Close qa*base_content.txt
# THE qa.* FILES ARE THE AFTER DATA!!!!!!!!
# Open *for_qual_histograms.txt
f = pd.read_csv('/home/tdotrang/qc_reports/qa.ERR2172266.for_qual_histogram.txt',
delim_whitespace=True, index_col=None)
f['x'] = f['Score']*f['readsNum']
sum_reads_num = sum(f['readsNum'])
print("sum of reads: ", sum_reads_num)
phred_avg = sum(f['x'])/sum_reads_num
print("phred quality score: \n", f)
print("phred average: ", phred_avg)
# no pandas methods
# with open(os.path.join(os.path.dirname(__file__), "/home/tdotrang/qc_reports/ERR2172266.for_qual_histogram.txt"), "r") as f:
# # Collect the PHRED scores
# phred_score_array = []
# next(f)
# for line in f:
# phred_score_array.append(line.strip().split('\t')[0:2])
# for i in phred_score_array:
# phred_freq = [0] * [1]
# i.append(phred_freq)
# print(phred_score_array)
# Collect the frequencey of reads with specific PHRED content
# Check if we want read number or read bases?
# Multply the PHRED columns by the frequency columns
# Sum the frequencies
# Sum the (PHRED*frequncies)
# divide the (sum(PHRED)*sum(frequncies)) by total frequencies
# Close *for_qual_histograms.txt
# Open qa*for_qual_histograms.txt
f = pd.read_csv('/home/tdotrang/qc_reports/ERR2172266.for_qual_histogram.txt',
delim_whitespace=True, index_col=None)
f['x'] = f['Score']*f['readsNum']
sum_reads_num = sum(f['readsNum'])
print("AFTER sum of reads: ", sum_reads_num)
phred_avg = sum(f['x'])/sum_reads_num
print("phred quality score: \n", f)
print("AFTER phred average: ", phred_avg)
# Collect the PHRED scores
# Collect the frequencey of reads with specific PHRED content
# Multply the PHRED columns by the frequency columns
# Sum the frequencies
# Sum the (PHRED*frequncies)
# divide the (sum(PHRED)*sum(frequncies)) by total frequencies
# Close qa*for_qual_histograms.txt