-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmeluch_deduper.py
executable file
·130 lines (107 loc) · 4.72 KB
/
meluch_deduper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
#!/usr/bin/env python
import argparse
import bioinfo
import re
# assumption: the input SAM file is sorted
############################## FUNCTIONS ##############################
def pos_adjust(stranded: bool, lmap: int, cig: str) -> int:
"""Parses CIGAR string and returns a corrected leftmost start position
based on strandedness and soft clipping."""
# if read is on the - strand
if stranded:
# find numbers before each of the CIGAR letters and sum them
M: int = sum([int(i) for i in re.findall(r'(\d+)M', cig)])
D: int = sum([int(i) for i in re.findall(r'(\d+)D', cig)])
N: int = sum([int(i) for i in re.findall(r'(\d+)N', cig)])
S: int = 0
# if there is soft clipping at the end, add that
soft = re.search(r'(\d+)S\Z', cig)
if soft:
S = int(soft.group(1))
return lmap + M + D + N + S
# if read is on the + strand
else:
# if soft clipping at the beginning of the read, subtract it from pos
soft = re.search(r'^(\d+)S', cig)
if soft:
return lmap - int(soft.group(1))
else:
return lmap
############################## GET INPUT FILES ##############################
parser = argparse.ArgumentParser(description="Get file paths for a SORTED SAM input file, output SAM file, and file containing the list of UMIs.")
parser.add_argument("-f", "--file", help="Absolute path to SORTED SAM input file", required=True)
parser.add_argument("-o", "--outfile", help="Absolute path to output SAM File", required=True)
parser.add_argument("-u", "--UMI", help="File containing the list of UMIs", required=True)
args = parser.parse_args()
inpath: str = args.file
outpath: str = args.outfile
umipath: str = args.UMI
# test files
# inpath = "/projects/bgmp/bmeluch/bioinfo/Bi624/Deduper-bmeluch/test.sam"
# outpath = "/projects/bgmp/bmeluch/bioinfo/Bi624/Deduper-bmeluch/Part3/test_out.sam"
# inpath = "/projects/bgmp/bmeluch/bioinfo/Bi624/Deduper-bmeluch/Part1/unit-test_input.sam"
# outpath = "/projects/bgmp/bmeluch/bioinfo/Bi624/Deduper-bmeluch/Part1/test_output.sam"
# umipath = "/projects/bgmp/bmeluch/bioinfo/Bi624/Deduper-bmeluch/Part1/unit-test_UMIs.txt"
############################## IMPORT UMIS ##############################
# Read file with list of known UMIs, save UMI sequences into a set
umis = set()
with open(umipath, 'r') as umifile:
for line in umifile:
i = line.strip()
if bioinfo.validate_base_seq(i):
umis.add(i)
############################## COUNTERS AND HOLDERS ##############################
# counts of things
headercount: int = 0
uniqcount: int = 0
wrongumicount: int = 0
dupsremoved: int = 0
# counts of unique reads per chrom
chroms: dict = {}
# set holding past read info
archive = set()
############################## MAIN LOGIC ##############################
with open(inpath, 'r') as infile, open(outpath, 'w') as outfile:
for line in infile:
# Write out header lines
if line[0] == "@":
outfile.write(line)
headercount += 1
# Once at real alignment lines, split by whitespace into component fields
else:
# Pull out important data from the resulting list
alignment = line.strip().split()
umi: str = alignment[0][-8:]
stranded: bool = ((int(alignment[1]) and 16) == 16)
chrom: str = alignment[2]
lmap: int = int(alignment[3])
cig: str = alignment[5]
# If UMI is not recognized, do not output read, move to next line
if umi not in umis:
wrongumicount += 1
continue
# Adjust start position based on strandedness, CIGAR string
lmap = pos_adjust(stranded, lmap, cig)
# current read data as tuple
current: tuple = (umi, stranded, chrom, lmap)
# Duplicate check? UMI, strand, chromosome, leftmost mapping position
if current in archive:
dupsremoved += 1
else:
if chrom in chroms:
chroms[chrom] += 1
else:
# when you hit a new chromosome, clear archive to save memory
chroms[chrom] = 1
archive = set()
# write out read, count a unique read written out, save
outfile.write(line)
uniqcount += 1
archive.add(current)
############################## COUNTS OUTPUT ##############################
print("Header lines:\t\t\t", headercount)
print("Total unique reads written out:\t", uniqcount)
print("Wrong UMIs:\t\t\t", wrongumicount)
print("Duplicates removed:\t\t", dupsremoved)
print("Unique reads per chromosome:")
print(chroms)