diff --git a/scripts/chain_invert.py b/scripts/chain_invert.py index 76ac2a6..e61774d 100644 --- a/scripts/chain_invert.py +++ b/scripts/chain_invert.py @@ -1,34 +1,35 @@ -''' +""" Switch the source and target reference of a chain file -''' +""" import argparse -import sys import re +import sys def parse_args(): parser = argparse.ArgumentParser() parser.add_argument( - '-c', '--chain', required=True, - help='Path to the input chain file.' + "-c", "--chain", required=True, help="Path to the input chain file." ) parser.add_argument( - '-o', '--out', default='', - help='Path to the output verbose chain file. [empty string]' + "-o", + "--out", + default="", + help="Path to the output verbose chain file. [empty string]", ) args = parser.parse_args() return args def invert(in_fn: str, out_fn: str) -> None: - f = open(in_fn, 'r') - fo = open(out_fn, 'w') + f = open(in_fn, "r") + fo = open(out_fn, "w") for line in f: if not line: continue line = line.rstrip() - fields = re.split(r'[\s\t]+', line) - if fields[0] == 'chain': + fields = re.split(r"[\s\t]+", line) + if fields[0] == "chain": source = fields[2] source_len = int(fields[3]) s_start = int(fields[5]) @@ -39,41 +40,41 @@ def invert(in_fn: str, out_fn: str) -> None: d_start = int(fields[10]) d_end = int(fields[11]) changed = fields[:2] + fields[7:12] + fields[2:7] + [fields[12]] - if strand == '-': + if strand == "-": changed[5] = str(dest_len - d_end) changed[6] = str(dest_len - d_start) changed[10] = str(source_len - s_end) changed[11] = str(source_len - s_start) - changed[4] = '+' - changed[9] = '-' - changed_str = ' '.join(changed) - print(f'{changed_str}', file=fo) + changed[4] = "+" + changed[9] = "-" + changed_str = " ".join(changed) + print(f"{changed_str}", file=fo) chain_list = [] elif len(fields) == 3: l = int(fields[0]) ds = int(fields[1]) dd = int(fields[2]) changed = [fields[0], fields[2], fields[1]] - changed_str = ' '.join(changed) - if strand == '+': + changed_str = " ".join(changed) + if strand == "+": chain_list += [fields[0], fields[2], fields[1]] else: chain_list += [fields[0], fields[1], fields[2]] - elif len(fields) == 1 and fields[0] != '': + elif len(fields) == 1 and fields[0] != "": l = int(fields[0]) chain_list.append(fields[0]) - if strand == '-': + if strand == "-": chain_list = chain_list[::-1] for i in range(int(len(chain_list) / 3)): - p = ' '.join(chain_list[i*3: (i+1)*3]) + p = " ".join(chain_list[i * 3 : (i + 1) * 3]) print(p, file=fo) print(chain_list[-1], file=fo) - print('', file=fo) + print("", file=fo) -if __name__ == '__main__': +if __name__ == "__main__": args = parse_args() - print('Input chain:', args.chain, file=sys.stderr) - print('Output chain (verbose):', args.out, file=sys.stderr) + print("Input chain:", args.chain, file=sys.stderr) + print("Output chain (verbose):", args.out, file=sys.stderr) invert(in_fn=args.chain, out_fn=args.out) diff --git a/scripts/chain_utils.py b/scripts/chain_utils.py index 0ab6ba2..d4c1e68 100644 --- a/scripts/chain_utils.py +++ b/scripts/chain_utils.py @@ -1,4 +1,4 @@ -''' +""" This script includes two functionalities: chain2vcf and chain2bed chain2vcf: Converts a chain file to a VCF file. 
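Both chain_invert.py above and chain_utils.py below operate on UCSC chain records, whose header line carries 13 whitespace-separated fields: chain, score, tName, tSize, tStrand, tStart, tEnd, qName, qSize, qStrand, qStart, qEnd, id. As a minimal sketch (not part of the patch) of the field swap that invert() performs, using the example header from the UCSC chain-format documentation; the '-'-strand coordinate flipping is omitted here:

import re

# 13 header fields: chain score tName tSize tStrand tStart tEnd qName qSize qStrand qStart qEnd id
header = re.split(
    r"[\s\t]+",
    "chain 4900 chrY 58368225 + 25985403 25985638 chr5 151006098 - 43257292 43257528 1",
)
# Same shuffle as invert(): keep "chain" and the score, swap the target
# block (fields 2-6) with the query block (fields 7-11), and keep the id.
swapped = header[:2] + header[7:12] + header[2:7] + [header[12]]
print(" ".join(swapped))
# chain 4900 chr5 151006098 - 43257292 43257528 chrY 58368225 + 25985403 25985638 1

On a '-'-strand chain, invert() additionally rewrites each swapped start/end pair as size - end and size - start and resets the strands (changed[4] = "+", changed[9] = "-") before printing.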
@@ -19,54 +19,61 @@ Example: python chain_utils.py chain2bed -i hg38.t2t-chm13-v1.0.over.chain -c 1-23 -o hg38.t2t-chm13-v1.0.conf_regions.bed -''' +""" import argparse import sys + def parse_args(): parser = argparse.ArgumentParser() subparsers = parser.add_subparsers() - #subparsers = parser.add_subparsers(dest='subparser') - parser_c2v = subparsers.add_parser('chain2vcf') + # subparsers = parser.add_subparsers(dest='subparser') + parser_c2v = subparsers.add_parser("chain2vcf") parser_c2v.add_argument( - '-i', '--in_chain', required=True, - help='Path to the chain file to be converted.' + "-i", + "--in_chain", + required=True, + help="Path to the chain file to be converted.", ) parser_c2v.add_argument( - '-c', '--chain_ids', required=True, - help='Chain ids to be converted, separated by commas and hyphens. E.g., "1-9", "1,5-8".' + "-c", + "--chain_ids", + required=True, + help='Chain ids to be converted, separated by commas and hyphens. E.g., "1-9", "1,5-8".', ) parser_c2v.add_argument( - '-o', '--out_vcf', - help='Path to the output VCF file.' + "-o", "--out_vcf", help="Path to the output VCF file." ) parser_c2v.set_defaults(func=chain2vcf) - parser_c2b = subparsers.add_parser('chain2bed') + parser_c2b = subparsers.add_parser("chain2bed") parser_c2b.add_argument( - '-i', '--in_chain', required=True, - help='Path to the chain file to be converted.' + "-i", + "--in_chain", + required=True, + help="Path to the chain file to be converted.", ) parser_c2b.add_argument( - '-c', '--chain_ids', required=True, - help='Chain ids to be converted, separated by commas and hyphens. E.g., "1-9", "1,5-8".' + "-c", + "--chain_ids", + required=True, + help='Chain ids to be converted, separated by commas and hyphens. E.g., "1-9", "1,5-8".', ) parser_c2b.add_argument( - '-o', '--out_bed', - help='Path to the output BED file.' + "-o", "--out_bed", help="Path to the output BED file." 
) parser_c2b.set_defaults(func=chain2bed) args = parser.parse_args() args.func(args) - #kwargs = vars(parser.parse_args()) - #globals()[kwargs.pop('subparser')](**kwargs) + # kwargs = vars(parser.parse_args()) + # globals()[kwargs.pop('subparser')](**kwargs) -class Chain_Fields(): +class Chain_Fields: HEADER = 0 SCORE = 1 TNAME = 2 @@ -85,10 +92,10 @@ class Chain_Fields(): ALN_DQ = 2 -class Chain2Bed(): - def __init__(self, out_bed=sys.stdout, chain_hdr='', CF=None): - assert(chain_hdr[CF.HEADER] == 'chain') - assert(chain_hdr[CF.TNAME] == chain_hdr[CF.QNAME]) +class Chain2Bed: + def __init__(self, out_bed=sys.stdout, chain_hdr="", CF=None): + assert chain_hdr[CF.HEADER] == "chain" + assert chain_hdr[CF.TNAME] == chain_hdr[CF.QNAME] self.out_bed = out_bed self.CF = CF self.contig = chain_hdr[self.CF.QNAME] @@ -98,22 +105,29 @@ def __init__(self, out_bed=sys.stdout, chain_hdr='', CF=None): def add(self, chain_aln): if len(self.bed_records) == 0 or self.tstart != self.bed_records[-1][1]: - self.bed_records.append([self.tstart, self.tstart + int(chain_aln[self.CF.ALN_SIZE])]) + self.bed_records.append( + [self.tstart, self.tstart + int(chain_aln[self.CF.ALN_SIZE])] + ) else: - self.bed_records[-1][1] = self.tstart + int(chain_aln[self.CF.ALN_SIZE]) + self.bed_records[-1][1] = self.tstart + int( + chain_aln[self.CF.ALN_SIZE] + ) - self.tstart += int(chain_aln[self.CF.ALN_SIZE]) + int(chain_aln[self.CF.ALN_DT]) - self.qstart += int(chain_aln[self.CF.ALN_SIZE]) + int(chain_aln[self.CF.ALN_DQ]) + self.tstart += int(chain_aln[self.CF.ALN_SIZE]) + int( + chain_aln[self.CF.ALN_DT] + ) + self.qstart += int(chain_aln[self.CF.ALN_SIZE]) + int( + chain_aln[self.CF.ALN_DQ] + ) def write(self): for record in self.bed_records: - print(f'{self.contig}\t{record[0]}\t{record[1]}', file=self.out_bed) - + print(f"{self.contig}\t{record[0]}\t{record[1]}", file=self.out_bed) -class Chain2Vcf(): - def __init__(self, out_vcf=sys.stdout, chain_hdr='', CF=None): - assert(chain_hdr[CF.HEADER] == 'chain') +class Chain2Vcf: + def __init__(self, out_vcf=sys.stdout, chain_hdr="", CF=None): + assert chain_hdr[CF.HEADER] == "chain" # assert(chain_hdr[CF.TNAME] == chain_hdr[CF.QNAME]) self.out_vcf = out_vcf @@ -127,32 +141,41 @@ def __init__(self, out_vcf=sys.stdout, chain_hdr='', CF=None): if chain_hdr[self.CF.TNAME] == chain_hdr[self.CF.QNAME]: # INS w.r.t the query sequence. 
if self.qstart < self.tstart: - ref = 'A' - qry = 'A' * (self.tstart - self.qstart + 1) - print(f'{self.contig}\t{self.qstart}\t.\t{ref}\t{qry}\t.\tAUTO\tTPOS={self.tstart}', - file=self.out_vcf) + ref = "A" + qry = "A" * (self.tstart - self.qstart + 1) + print( + f"{self.contig}\t{self.qstart}\t.\t{ref}\t{qry}\t.\tAUTO\tTPOS={self.tstart}", + file=self.out_vcf, + ) # DEL elif self.qstart > self.tstart: - ref = 'A' * (self.qstart - self.tstart + 1) - qry = 'A' - print(f'{self.contig}\t{self.qstart}\t.\t{ref}\t{qry}\t.\tAUTO\tTPOS={self.tstart}', - file=self.out_vcf) + ref = "A" * (self.qstart - self.tstart + 1) + qry = "A" + print( + f"{self.contig}\t{self.qstart}\t.\t{ref}\t{qry}\t.\tAUTO\tTPOS={self.tstart}", + file=self.out_vcf, + ) else: - len_t = int(chain_hdr[self.CF.TEND]) - int(chain_hdr[self.CF.TSTART]) - len_q = int(chain_hdr[self.CF.QEND]) - int(chain_hdr[self.CF.QSTART]) + len_t = int(chain_hdr[self.CF.TEND]) - int( + chain_hdr[self.CF.TSTART] + ) + len_q = int(chain_hdr[self.CF.QEND]) - int( + chain_hdr[self.CF.QSTART] + ) if len_t > len_q: - ref = 'A' - qry = 'A' * (len_t - len_q + 1) - print(f'{self.contig}\t{self.qstart}\t.\t{ref}\t{qry}\t.\tAUTO\tTPOS={self.tstart}', - file=self.out_vcf) + ref = "A" + qry = "A" * (len_t - len_q + 1) + print( + f"{self.contig}\t{self.qstart}\t.\t{ref}\t{qry}\t.\tAUTO\tTPOS={self.tstart}", + file=self.out_vcf, + ) elif len_q > len_t: - ref = 'A' * (len_q - len_t + 1) - qry = 'A' - print(f'{self.contig}\t{self.qstart - len_q}\t.\t{ref}\t{qry}\t.\tAUTO\tTPOS={self.tstart}', - file=self.out_vcf) - - - + ref = "A" * (len_q - len_t + 1) + qry = "A" + print( + f"{self.contig}\t{self.qstart - len_q}\t.\t{ref}\t{qry}\t.\tAUTO\tTPOS={self.tstart}", + file=self.out_vcf, + ) def add(self, chain_aln): self.tstart += int(chain_aln[self.CF.ALN_SIZE]) @@ -161,29 +184,32 @@ def add(self, chain_aln): len_t = int(chain_aln[self.CF.ALN_DT]) len_q = int(chain_aln[self.CF.ALN_DQ]) - if (len_t > len_q): + if len_t > len_q: # Insertion wrt the query seq len_t -= len_q len_q = 0 - seq_t = 'A' * (len_t + 1) - seq_q = 'A' * (len_q + 1) - print(f'{self.contig}\t{self.qstart}\t.\t{seq_q}\t{seq_t}\t.\tAUTO\t' + - f'TPOS={self.tstart};ALN_DT={int(chain_aln[self.CF.ALN_DT])};ALN_DQ={int(chain_aln[self.CF.ALN_DQ])}', - file=self.out_vcf) - elif (len_t < len_q): + seq_t = "A" * (len_t + 1) + seq_q = "A" * (len_q + 1) + print( + f"{self.contig}\t{self.qstart}\t.\t{seq_q}\t{seq_t}\t.\tAUTO\t" + + f"TPOS={self.tstart};ALN_DT={int(chain_aln[self.CF.ALN_DT])};ALN_DQ={int(chain_aln[self.CF.ALN_DQ])}", + file=self.out_vcf, + ) + elif len_t < len_q: # Deletion wrt the query seq len_q -= len_t len_t = 0 - seq_t = 'A' * (len_t + 1) - seq_q = 'A' * (len_q + 1) - print(f'{self.contig}\t{self.qstart}\t.\t{seq_q}\t{seq_t}\t.\tAUTO\t' + - f'TPOS={self.tstart};ALN_DT={int(chain_aln[self.CF.ALN_DT])};ALN_DQ={int(chain_aln[self.CF.ALN_DQ])}', - file=self.out_vcf) + seq_t = "A" * (len_t + 1) + seq_q = "A" * (len_q + 1) + print( + f"{self.contig}\t{self.qstart}\t.\t{seq_q}\t{seq_t}\t.\tAUTO\t" + + f"TPOS={self.tstart};ALN_DT={int(chain_aln[self.CF.ALN_DT])};ALN_DQ={int(chain_aln[self.CF.ALN_DQ])}", + file=self.out_vcf, + ) self.tstart += int(chain_aln[self.CF.ALN_DT]) self.qstart += int(chain_aln[self.CF.ALN_DQ]) - # Parse raw `chain_ids` and return a list of chain ids. # # Input: @@ -192,9 +218,9 @@ def add(self, chain_aln): # A list containing individual chain ids. 
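# For example, a sketch of the intended expansion (the body of the
# `len(c) == 2` branch is elided by the hunk below):
#   parse_chain_id('1,5-8') -> ids 1, 5, 6, 7, 8
#   parse_chain_id('1-3')   -> ids 1, 2, 3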
def parse_chain_id(chain_ids): list_chain = [] - chain_ids = chain_ids.split(',') + chain_ids = chain_ids.split(",") for c in chain_ids: - c = c.split('-') + c = c.split("-") if len(c) == 1: list_chain.append(c[0]) elif len(c) == 2: @@ -205,7 +231,7 @@ def get_contig_name(fn, chain_ids, CF): dict_cid_contig = {} - with open(fn, 'r') as f: + with open(fn, "r") as f: for line in f: line = line.split() if len(line) == 13: @@ -215,35 +241,43 @@ def write_vcf_hdr(f_out, chain_ids, dict_cid_contig): - print('##fileformat=VCFv4.3', file=f_out) - print('##FILTER=<ID=AUTO,Description="Record generated automatically from a chain file">', file=f_out) + print("##fileformat=VCFv4.3", file=f_out) + print( + '##FILTER=<ID=AUTO,Description="Record generated automatically from a chain file">', file=f_out + ) print(dict_cid_contig) for cid, contig in dict_cid_contig.items(): - print(f'##contig=<ID={contig}>', file=f_out) - print('##INFO=<ID=TPOS,Number=1,Type=Integer,Description="Position on the target reference">', - file=f_out) - print('##INFO=<ID=ALN_DT,Number=1,Type=Integer,Description="Chain dt (gap length on the target reference)">', - file=f_out) - print('##INFO=<ID=ALN_DQ,Number=1,Type=Integer,Description="Chain dq (gap length on the query reference)">', - file=f_out) - print('#CHROM POS ID REF ALT QUAL FILTER INFO', file=f_out) + print(f"##contig=<ID={contig}>", file=f_out) + print( + '##INFO=<ID=TPOS,Number=1,Type=Integer,Description="Position on the target reference">', + file=f_out, + ) + print( + '##INFO=<ID=ALN_DT,Number=1,Type=Integer,Description="Chain dt (gap length on the target reference)">', + file=f_out, + ) + print( + '##INFO=<ID=ALN_DQ,Number=1,Type=Integer,Description="Chain dq (gap length on the query reference)">', + file=f_out, + ) + print("#CHROM POS ID REF ALT QUAL FILTER INFO", file=f_out) def chain2bed(args): - print('chain2bed', file=sys.stderr) + print("chain2bed", file=sys.stderr) # Constants. CF = Chain_Fields() chain_ids = parse_chain_id(args.chain_ids) - print('Chain IDs to be processed', chain_ids, file=sys.stderr) + print("Chain IDs to be processed", chain_ids, file=sys.stderr) if not args.out_bed: f_out = sys.stdout else: - f_out = open(args.out_bed, 'w') + f_out = open(args.out_bed, "w") - f = open(args.in_chain, 'r') - current_id = '' + f = open(args.in_chain, "r") + current_id = "" chain = None for line in f: line = line.split() @@ -255,7 +289,7 @@ def chain2bed(args): del chain chain = None elif len(line) == 0: - current_id = '' + current_id = "" if chain: del chain chain = None @@ -269,7 +303,6 @@ def chain2bed(args): chain = None - def chain2vcf(args): # Constants. CF = Chain_Fields() @@ -278,29 +311,34 @@ def chain2vcf(args): if not args.out_vcf: f_out = sys.stdout else: - f_out = open(args.out_vcf, 'w') + f_out = open(args.out_vcf, "w") - write_vcf_hdr(f_out, chain_ids, - get_contig_name(args.in_chain, chain_ids, CF)) + write_vcf_hdr( + f_out, chain_ids, get_contig_name(args.in_chain, chain_ids, CF) + ) - f = open(args.in_chain, 'r') - current_id = '' + f = open(args.in_chain, "r") + current_id = "" chain = None for line in f: line = line.split() # Chain header if len(line) == 13: if line[CF.CID] in chain_ids: - # if line[CF.QNAME] in ['chr2']: + # if line[CF.QNAME] in ['chr2']: chain = Chain2Vcf(out_vcf=f_out, chain_hdr=line, CF=CF) if line[CF.TNAME] != line[CF.QNAME]: - print('[Error] Contig names mismatch', - line[CF.TNAME], line[CF.QNAME], file=sys.stderr) + print( + "[Error] Contig names mismatch", + line[CF.TNAME], + line[CF.QNAME], + file=sys.stderr, + ) exit() del chain chain = None elif len(line) == 0: - current_id = '' + current_id = "" if chain: del chain chain = None @@ -313,6 +351,5 @@ def chain2vcf(args): chain = None -if __name__ == '__main__': +if __name__ == "__main__": parse_args() - diff --git a/scripts/collect_perf.py b/scripts/collect_perf.py index a5374b7..fad8eba 100644 --- a/scripts/collect_perf.py +++ b/scripts/collect_perf.py @@ -1,4 +1,4 @@ -''' +""" Collect computational performance from a collection of GNU time reports.
Usage: @@ -11,34 +11,47 @@ Nae-Chyun Chen Johns Hopkins University 2021-2022 -''' +""" import argparse -import os -import pandas as pd import sys +import pandas as pd + + def parse_args(): parser = argparse.ArgumentParser() parser.add_argument( - '-a', '--aln-log', - help='Paths to the GNU time log for full alignment.') - parser.add_argument( - '-an', '--aln-name', default='aln', - help='Label of the full alignment experiment [aln].') + "-a", "--aln-log", help="Path to the GNU time log for full alignment." ) parser.add_argument( - '-l', '--leviosam-logs', - action='append', required=True, - help='Paths to GNU time logs for the levioSAM pipeline.') + "-an", + "--aln-name", + default="aln", + help="Label of the full alignment experiment [aln].", ) parser.add_argument( - '-ln', '--leviosam-name', default='leviosam', - help='Label of the levioSAM experiment [leviosam].') + "-l", + "--leviosam-logs", + action="append", + required=True, + help="Paths to GNU time logs for the levioSAM pipeline.", ) parser.add_argument( - '-ll', '--labels', default='', - help=('Customized labels for each `-l` items, separated by commas.' 'Length should be equal to number of `-l`')) + "-ln", + "--leviosam-name", + default="leviosam", + help="Label of the levioSAM experiment [leviosam].", ) parser.add_argument( - '-o', '--output', - help='Path to the output TSV file.') + "-ll", + "--labels", + default="", + help=( + "Customized labels for each `-l` item, separated by commas. " + "Length should be equal to the number of `-l` options." + ), + ) + parser.add_argument("-o", "--output", help="Path to the output TSV file.") args = parser.parse_args() return args @@ -46,29 +59,31 @@ def parse_args(): def collect_perf_core(f, ls_perf) -> None: for line in f: line = line.rstrip() - if line.count('Command being timed:') > 0: + if line.count("Command being timed:") > 0: cmd = line.split('"')[1].split() - program = cmd[0].split('/')[-1] - task = program + '_' + cmd[1] - elif line.count('User time (seconds):') > 0: - usr_time = float(line.split('):')[1]) - elif line.count('System time (seconds):') > 0: - sys_time = float(line.split('):')[1]) - elif line.count('Elapsed (wall clock) time (h:mm:ss or m:ss):') > 0: - wt = line.split('):')[1].split(':') + program = cmd[0].split("/")[-1] + task = program + "_" + cmd[1] + elif line.count("User time (seconds):") > 0: + usr_time = float(line.split("):")[1]) + elif line.count("System time (seconds):") > 0: + sys_time = float(line.split("):")[1]) + elif line.count("Elapsed (wall clock) time (h:mm:ss or m:ss):") > 0: + wt = line.split("):")[1].split(":") if len(wt) == 3: - wall_time = 60 * 60 * float(wt[0]) + 60 * float(wt[1]) + float(wt[2]) + wall_time = ( + 60 * 60 * float(wt[0]) + 60 * float(wt[1]) + float(wt[2]) + ) elif len(wt) == 2: wall_time = 60 * float(wt[0]) + float(wt[1]) else: - print('error - invalid wall time format', file=sys.stderr) + print("error - invalid wall time format", file=sys.stderr) exit(1) - elif line.count('Maximum resident set size (kbytes):') > 0: - max_rss = int(line.split('):')[1]) + elif line.count("Maximum resident set size (kbytes):") > 0: + max_rss = int(line.split("):")[1]) cpu_time = usr_time + sys_time - ls_perf.append([ task, usr_time, sys_time, cpu_time, wall_time, max_rss]) + ls_perf.append( + [task, usr_time, sys_time, cpu_time, wall_time, max_rss] + ) return @@ -76,7 +91,7 @@ def collect_perf_list(logs_list, cols): print(logs_list, file=sys.stderr) ls_perf = [] for log in logs_list: - f = open(log, 'r') + f = open(log, "r") # task = 
os.path.basename(log).split('.')[0] collect_perf_core(f, ls_perf) f.close() @@ -85,7 +100,7 @@ def summarize_df(df, cols): - l_sum = ['summary'] + l_sum = ["summary"] for i in range(1, 5): l_sum.append(sum(df.iloc[:, i])) l_sum.append(max(df.iloc[:, 5])) @@ -95,34 +110,42 @@ def collect_perf(args): - COLS = ['Task', 'User time (s)', 'System time (s)', - 'CPU time (s)', 'Wall time (s)', 'Max RSS (KB)'] + COLS = [ + "Task", + "User time (s)", + "System time (s)", + "CPU time (s)", + "Wall time (s)", + "Max RSS (KB)", + ] if args.labels: - args.labels = args.labels.split(',') + args.labels = args.labels.split(",") if len(args.labels) != len(args.leviosam_logs): print( - f'Error. len(args.labels)={len(args.labels)} != len(args.leviosam_log)={args.leviosam_log}', - file=sys.stderr) + f"Error. len(args.labels)={len(args.labels)} != len(args.leviosam_logs)={len(args.leviosam_logs)}", + file=sys.stderr, + ) df_l = collect_perf_list(args.leviosam_logs, cols=COLS) if args.labels: - df_l['Method'] = args.labels + df_l["Method"] = args.labels else: - df_l['Method'] = args.leviosam_name + df_l["Method"] = args.leviosam_name if args.aln_log: df_a = collect_perf_list([args.aln_log], cols=COLS) - df_a['Method'] = args.aln_name + df_a["Method"] = args.aln_name df = pd.concat([df_l, df_a]) else: df = df_l if args.output: - df.to_csv(args.output, sep='\t', index=False) + df.to_csv(args.output, sep="\t", index=False) else: print(df) -if __name__ == '__main__': + +if __name__ == "__main__": args = parse_args() collect_perf(args) diff --git a/scripts/compare_fastq.py b/scripts/compare_fastq.py index 62e3412..8a32eea 100644 --- a/scripts/compare_fastq.py +++ b/scripts/compare_fastq.py @@ -1,70 +1,89 @@ -''' +""" Compare two sets of FASTQ files Nae-Chyun Chen Johns Hopkins University 2021 -''' +""" import argparse -import pysam import sys +import pysam + + def parse_args(): parser = argparse.ArgumentParser() parser.add_argument( - '-g1', '--gold_r1', required=True, - help='Path to the truth FASTQ-R1 file. [required]' + "-g1", + "--gold_r1", + required=True, + help="Path to the truth FASTQ-R1 file. [required]", ) parser.add_argument( - '-g2', '--gold_r2', required=True, - help='Path to the truth FASTQ-R2 file. [required]' + "-g2", + "--gold_r2", + required=True, + help="Path to the truth FASTQ-R2 file. [required]", ) parser.add_argument( - '-r1', '--input_r1', required=True, - help='Path to the truth FASTQ-R1 file. [required]' + "-r1", + "--input_r1", + required=True, + help="Path to the input FASTQ-R1 file. [required]", ) parser.add_argument( - '-r2', '--input_r2', required=True, - help='Path to the truth FASTQ-R2 file. [required]' + "-r2", + "--input_r2", + required=True, + help="Path to the input FASTQ-R2 file. [required]", ) parser.add_argument( - '-k', '--sampled_len', default=30, type=int, - help='Only store the first `k`-bp of a seq to save memory. [30]' + "-k", + "--sampled_len", + default=30, + type=int, + help="Only store the first `k`-bp of a seq to save memory. [30]", ) parser.add_argument( - '--num_printed_err', default=3, type=int, - help='Num of printed errors during processing. [3]' + "--num_printed_err", + default=3, + type=int, + help="Num of printed errors during processing. [3]", ) parser.add_argument( - '--small_gold', action='store_true', - help='Activate to save gold reads in dicts. [off]' + "--small_gold", + action="store_true", + help="Activate to save gold reads in dicts. 
[off]", ) args = parser.parse_args() return args def compare_fastq_large_gold( - gold_fn1: str, gold_fn2: str, - input_fn1: str, input_fn2: str, - k: int, max_cnt: int + gold_fn1: str, + gold_fn2: str, + input_fn1: str, + input_fn2: str, + k: int, + max_cnt: int, ) -> None: - print(f'Reading r1...', file=sys.stderr) + print(f"Reading r1...", file=sys.stderr) r1 = {} with pysam.FastxFile(input_fn1) as f: for r in f: r1[r.name] = r.sequence[:k] - print(f'R1: {len(r1)}') - print(f'Reading r2...', file=sys.stderr) + print(f"R1: {len(r1)}") + print(f"Reading r2...", file=sys.stderr) r2 = {} with pysam.FastxFile(input_fn2) as f: for r in f: r2[r.name] = r.sequence[:k] - print(f'R2: {len(r2)}') + print(f"R2: {len(r2)}") if len(r1) != len(r2): - print(f'Warning: lengths of R1 and R2 differ') + print(f"Warning: lengths of R1 and R2 differ") err_dict = {} - print(f'Reading g1...', file=sys.stderr) + print(f"Reading g1...", file=sys.stderr) with pysam.FastxFile(gold_fn1) as f: for r in f: name = r.name.split()[0] @@ -73,12 +92,12 @@ def compare_fastq_large_gold( if r.sequence[:k] != r1[name][:k]: print(name) print(r) - print(' ' + r.sequence[:k]) - print(' ' + r1[name][:k]) + print(" " + r.sequence[:k]) + print(" " + r1[name][:k]) print() err_dict[name] = [1, 0] print(len(err_dict)) - print(f'Reading g2...', file=sys.stderr) + print(f"Reading g2...", file=sys.stderr) with pysam.FastxFile(gold_fn2) as f: for r in f: if r.name in r2: @@ -90,32 +109,35 @@ def compare_fastq_large_gold( else: err_dict[name] = [0, 1] - err = {'only_r1': 0, 'only_r2': 0, 'both': 0, 'correct': 0} - printed_err = {'only_r1': 0, 'only_r2': 0, 'both': 0, 'correct': 0} + err = {"only_r1": 0, "only_r2": 0, "both": 0, "correct": 0} + printed_err = {"only_r1": 0, "only_r2": 0, "both": 0, "correct": 0} for i, (n, result) in enumerate(err_dict.items()): if result == [1, 1]: - err['both'] += 1 - if printed_err['both'] < max_cnt: - printed_err['both'] += 1 - print(f'[both] {n}') + err["both"] += 1 + if printed_err["both"] < max_cnt: + printed_err["both"] += 1 + print(f"[both] {n}") elif result == [0, 1]: - err['only_r2'] += 1 - if printed_err['only_r2'] < max_cnt: - printed_err['only_r2'] += 1 - print(f'[only_r2] {n}') + err["only_r2"] += 1 + if printed_err["only_r2"] < max_cnt: + printed_err["only_r2"] += 1 + print(f"[only_r2] {n}") elif result == [1, 0]: - err['only_r1'] += 1 - if printed_err['only_r1'] < max_cnt: - printed_err['only_r1'] += 1 - print(f'[only_r1] {n}') - err['correct'] = len(r1) - err['only_r1'] - err['only_r2'] - err['both'] + err["only_r1"] += 1 + if printed_err["only_r1"] < max_cnt: + printed_err["only_r1"] += 1 + print(f"[only_r1] {n}") + err["correct"] = len(r1) - err["only_r1"] - err["only_r2"] - err["both"] print(err) def compare_fastq( - gold_fn1: str, gold_fn2: str, - input_fn1: str, input_fn2: str, - k: int, max_cnt: int + gold_fn1: str, + gold_fn2: str, + input_fn1: str, + input_fn2: str, + k: int, + max_cnt: int, ) -> None: gold_r1 = {} with pysam.FastxFile(gold_fn1) as f: @@ -135,36 +157,36 @@ def compare_fastq( for r in f: r2[r.name] = [r.sequence[:k]] - err = {'only_r1': 0, 'only_r2': 0, 'both': 0, 'correct': 0} - printed_err = {'only_r1': 0, 'only_r2': 0, 'both': 0, 'correct': 0} + err = {"only_r1": 0, "only_r2": 0, "both": 0, "correct": 0} + printed_err = {"only_r1": 0, "only_r2": 0, "both": 0, "correct": 0} for i, (n, v1) in enumerate(r1.items()): - r1_diff = (gold_r1[n] != v1) - r2_diff = (gold_r2[n] != r2[n]) + r1_diff = gold_r1[n] != v1 + r2_diff = gold_r2[n] != r2[n] if r1_diff and r2_diff: - 
err['both'] += 1 + err["both"] += 1 elif r1_diff: - err['only_r1'] += 1 - if printed_err['only_r1'] < max_cnt: - printed_err['only_r1'] += 1 - print(f'[only_r1] {n}') + err["only_r1"] += 1 + if printed_err["only_r1"] < max_cnt: + printed_err["only_r1"] += 1 + print(f"[only_r1] {n}") elif r2_diff: - err['only_r2'] += 1 - if printed_err['only_r2'] < max_cnt: - printed_err['only_r2'] += 1 - print(f'[only_r2] {n}') + err["only_r2"] += 1 + if printed_err["only_r2"] < max_cnt: + printed_err["only_r2"] += 1 + print(f"[only_r2] {n}") else: - err['correct'] += 1 + err["correct"] += 1 print(err) -if __name__ == '__main__': +if __name__ == "__main__": args = parse_args() - print(f'gold_fn1 = {args.gold_r1}') - print(f'gold_fn2 = {args.gold_r2}') - print(f'fn1 = {args.input_r1}') - print(f'fn2 = {args.input_r2}') + print(f"gold_fn1 = {args.gold_r1}") + print(f"gold_fn2 = {args.gold_r2}") + print(f"fn1 = {args.input_r1}") + print(f"fn2 = {args.input_r2}") if args.small_gold: compare_fastq( @@ -173,7 +195,7 @@ def compare_fastq( input_fn1=args.input_r1, input_fn2=args.input_r2, k=args.sampled_len, - max_cnt=args.num_printed_err + max_cnt=args.num_printed_err, ) else: compare_fastq_large_gold( @@ -182,6 +204,5 @@ def compare_fastq( input_fn1=args.input_r1, input_fn2=args.input_r2, k=args.sampled_len, - max_cnt=args.num_printed_err + max_cnt=args.num_printed_err, ) - diff --git a/scripts/compare_sam-test.py b/scripts/compare_sam-test.py index fc7aa71..5298b82 100644 --- a/scripts/compare_sam-test.py +++ b/scripts/compare_sam-test.py @@ -1,48 +1,74 @@ +import unittest + import compare_sam import pysam -import unittest + def create_sam_header(name: list, length: list) -> pysam.AlignmentHeader: assert len(name) == len(length) - sq = '' + sq = "" for i, n in enumerate(name): - sq += f'@SQ SN:{n} LN:{length[i]}\n' + sq += f"@SQ SN:{n} LN:{length[i]}\n" pg = f'@PG ID:bowtie2 PN:bowtie2 VN:2.4.2 CL:"bowtie2-align-s --wrapper basic-0 -x chm13.draft_v1.0 -S chm13_1.0.sam -1 HG002.R1.fq -2 HG002.R2.fq"' - return pysam.AlignmentHeader.from_text( - sq + pg - ) + return pysam.AlignmentHeader.from_text(sq + pg) class TestCompareSam(unittest.TestCase): @classmethod def setUpClass(cls): cls.chm13_v1_0_header = create_sam_header( - [f'chr{i}' for i in range(1,23)], - [248387497, 242696747, 201106605, 193575430, - 182045437, 172126870, 160567423, 146259322, - 150617274, 134758122, 135127772, 133324781, - 114240146, 101219177, 100338308, 96330493, - 84277185, 80542536, 61707359, 66210247, - 45827691, 51353906]) - + [f"chr{i}" for i in range(1, 23)], + [ + 248387497, + 242696747, + 201106605, + 193575430, + 182045437, + 172126870, + 160567423, + 146259322, + 150617274, + 134758122, + 135127772, + 133324781, + 114240146, + 101219177, + 100338308, + 96330493, + 84277185, + 80542536, + 61707359, + 66210247, + 45827691, + 51353906, + ], + ) + def test_calc_identity(self): def _test_calc_identity(b, q, gold_idy): - baseline = pysam.AlignedSegment.fromstring(b, self.chm13_v1_0_header) + baseline = pysam.AlignedSegment.fromstring( + b, self.chm13_v1_0_header + ) query = pysam.AlignedSegment.fromstring(q, self.chm13_v1_0_header) summ = compare_sam.CompareSamSummary() idy = summ._calc_identity(query=query, baseline=baseline) self.assertEqual(idy, gold_idy) + test_cases = [ - ['A00744:46:HV3C3DSXX:2:1101:13946:8171 145 chr1 164180484 42 9M2I140M = 164179905 -728 ATATGTATATGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTATTCAGAGCTCAGATATCTGTGACTGTTTTTTCTCTTTTCCATACTTTTCTTTGGGGCAGCTGTTATCTCTGTGGCCTTGAGCTTGATC 
F:FFFFFFFFFFFFFFFFF,F:FFFFFFFFFFFFFFFFFFFFF:FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF:F,FFFFFFFFFF,FFFFF,FFFFFFFFFFFFFFFFFFFFFFFFF,FFFFFFFFFFFFFFFFFFFFFFFFF', - 'A00744:46:HV3C3DSXX:2:1101:13946:8171 145 chr1 164180472 42 62M10D89M = 164179905 652819 ATATGTATATGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTATTCAGAGCTCAGATATCTGTGACTGTTTTTTCTCTTTTCCATACTTTTCTTTGGGGCAGCTGTTATCTCTGTGGCCTTGAGCTTGATC F:FFFFFFFFFFFFFFFFF,F:FFFFFFFFFFFFFFFFFFFFF:FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF:F,FFFFFFFFFF,FFFFF,FFFFFFFFFFFFFFFFFFFFFFFFF,FFFFFFFFFFFFFFFFFFFFFFFFF', - 89/151], - ['A00744:46:HV3C3DSXX:2:1101:4589:4930 161 chr8 8132831 42 151M = 8133221 541 CCCTGGGATGAATGATATAGGTTCATTCATTGAACCTCATTCATTCAACCTCACTGTTGGCAGTGGGAAGTTCCTACTGGGGTAGCAAAACTGACAATAGGAGTCTAGAGCTGTCATTAATTTCTTTCTCCTCTGTATGGAGAAAGCTTGC FFFFFFFFFFFFFFFFFFFFF:FFFFFFFFFFFFFFFFFFFFFFFFFFFFFF:FFFFFFFFFFFFFFF:FFFFFFFFFFFFFFFFFF:FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF:FFFF:FFFFFFFF', - 'A00744:46:HV3C3DSXX:2:1101:4589:4930 161 chr8 8132845 24 4M14I133M = 8133221 3473114 GCAAGCTTTCTCCATACAGAGGAGAAAGAAATTAATGACAGCTCTAGACTCCTATTGTCAGTTTTGCTACCCCAGTAGGAACTTCCCACTGCCAACAGTGAGGTTGAATGAATGAGGTTCAATGAATGAACCTATATCATTCATCCCAGGG FFFFFFFF:FFFF:FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF:FFFFFFFFFFFFFFFFFF:FFFFFFFFFFFFFFF:FFFFFFFFFFFFFFFFFFFFFFFFFFFFFF:FFFFFFFFFFFFFFFFFFFFF', - 133/151] + [ + "A00744:46:HV3C3DSXX:2:1101:13946:8171 145 chr1 164180484 42 9M2I140M = 164179905 -728 ATATGTATATGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTATTCAGAGCTCAGATATCTGTGACTGTTTTTTCTCTTTTCCATACTTTTCTTTGGGGCAGCTGTTATCTCTGTGGCCTTGAGCTTGATC F:FFFFFFFFFFFFFFFFF,F:FFFFFFFFFFFFFFFFFFFFF:FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF:F,FFFFFFFFFF,FFFFF,FFFFFFFFFFFFFFFFFFFFFFFFF,FFFFFFFFFFFFFFFFFFFFFFFFF", + "A00744:46:HV3C3DSXX:2:1101:13946:8171 145 chr1 164180472 42 62M10D89M = 164179905 652819 ATATGTATATGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTATTCAGAGCTCAGATATCTGTGACTGTTTTTTCTCTTTTCCATACTTTTCTTTGGGGCAGCTGTTATCTCTGTGGCCTTGAGCTTGATC F:FFFFFFFFFFFFFFFFF,F:FFFFFFFFFFFFFFFFFFFFF:FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF:F,FFFFFFFFFF,FFFFF,FFFFFFFFFFFFFFFFFFFFFFFFF,FFFFFFFFFFFFFFFFFFFFFFFFF", + 89 / 151, + ], + [ + "A00744:46:HV3C3DSXX:2:1101:4589:4930 161 chr8 8132831 42 151M = 8133221 541 CCCTGGGATGAATGATATAGGTTCATTCATTGAACCTCATTCATTCAACCTCACTGTTGGCAGTGGGAAGTTCCTACTGGGGTAGCAAAACTGACAATAGGAGTCTAGAGCTGTCATTAATTTCTTTCTCCTCTGTATGGAGAAAGCTTGC FFFFFFFFFFFFFFFFFFFFF:FFFFFFFFFFFFFFFFFFFFFFFFFFFFFF:FFFFFFFFFFFFFFF:FFFFFFFFFFFFFFFFFF:FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF:FFFF:FFFFFFFF", + "A00744:46:HV3C3DSXX:2:1101:4589:4930 161 chr8 8132845 24 4M14I133M = 8133221 3473114 GCAAGCTTTCTCCATACAGAGGAGAAAGAAATTAATGACAGCTCTAGACTCCTATTGTCAGTTTTGCTACCCCAGTAGGAACTTCCCACTGCCAACAGTGAGGTTGAATGAATGAGGTTCAATGAATGAACCTATATCATTCATCCCAGGG FFFFFFFF:FFFF:FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF:FFFFFFFFFFFFFFFFFF:FFFFFFFFFFFFFFF:FFFFFFFFFFFFFFFFFFFFFFFFFFFFFF:FFFFFFFFFFFFFFFFFFFFF", + 133 / 151, + ], ] for t in test_cases: _test_calc_identity(t[0], t[1], t[2]) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/scripts/compare_sam.py b/scripts/compare_sam.py index c4bf3c0..98a81b3 100644 --- a/scripts/compare_sam.py +++ b/scripts/compare_sam.py @@ -1,88 +1,111 @@ -''' +""" Compares two SAM/BAM files and report a summary. 
Nae-Chyun Chen Johns Hopkins University 2021 -''' +""" import argparse -import pysam import re import sys +import pysam + + def parse_args(): parser = argparse.ArgumentParser() parser.add_argument( - '-q', '--input_query', - help='Path to a query SAM/BAM file.' + "-q", "--input_query", help="Path to a query SAM/BAM file." ) parser.add_argument( - '-b', '--input_baseline', - help='Path to a baseline SAM/BAM file.' + "-b", "--input_baseline", help="Path to a baseline SAM/BAM file." ) parser.add_argument( - '-o', '--out', default='', - help='Path to the ouput report. ['': print to sys.stdout]' + "-o", + "--out", + default="", + help="Path to the output report. ['': print to sys.stdout]", ) parser.add_argument( - '-c', '--num_err_printed', default=10, type=int, - help='Number of printed errors. Set to -1 to print all. [10]' + "-c", + "--num_err_printed", + default=10, + type=int, + help="Number of printed errors. Set to -1 to print all. [10]", ) parser.add_argument( - '-cp', '--categories_printed', default='pos_idy', - help=('Caterogies that erroneous records are printed. ' 'Options: "pos", "idy", "pos_idy", "cigar", "tlen". ' 'Set to "none" to turn this off. ' 'Split by commas, e.g. `--categories_printed pos, idy`. ' '["pos_idy"]' ) + "-cp", + "--categories_printed", + default="pos_idy", + help=( + "Categories for which erroneous records are printed. " + 'Options: "pos", "idy", "pos_idy", "cigar", "tlen". ' + 'Set to "none" to turn this off. ' + "Split by commas, e.g. `--categories_printed pos,idy`. " + '["pos_idy"]' + ), ) parser.add_argument( - '-p', '--allowed_posdiff', default=1, type=int, - help='Allowed bases of positional difference. [1]' + "-p", + "--allowed_posdiff", + default=1, + type=int, + help="Allowed bases of positional difference. [1]", ) parser.add_argument( - '-i', '--identity_cutoff', default=0.8, type=float, + "-i", + "--identity_cutoff", + default=0.8, + type=float, help=( - 'Identify cutoff. An alignment with an identity >= this value ' 'is considered as correct. [0.8]') + "Identity cutoff. An alignment with an identity >= this value " + "is considered correct. [0.8]" + ), ) parser.add_argument( - '-m', '--min_mapq', default=0, type=int, - help='Min MAPQ to consider. Alignments with lower MAPQ are considered as invalid. [0]' + "-m", + "--min_mapq", + default=0, + type=int, + help="Min MAPQ to consider. Alignments with lower MAPQ are considered invalid. [0]", ) parser.add_argument( - '-mr', '--max_posdiff_reported', default=sys.maxsize, type=int, + "-mr", + "--max_posdiff_reported", + default=sys.maxsize, + type=int, help=( - 'Only report records within this value of positional difference. ' - 'Set to a negative value to turn off. [sys.maxsize]') + "Only report records within this value of positional difference. " + "Set to a negative value to turn off. 
[sys.maxsize]" + ), ) args = parser.parse_args() return args class SamUtils: - ''' Convert a typical CIGAR string to the expanded form + """Convert a typical CIGAR string to the expanded form Example: s = CompareSamSummary(0) print(s._expand_cigar('3M2D5M')) # MMMDDMMMMM - ''' + """ + @staticmethod def _expand_cigar(cigarstring: str, consumes: str) -> str: - re_cigar = re.compile('[MIDS]+') + re_cigar = re.compile("[MIDS]+") ops = re_cigar.findall(cigarstring) lens = re_cigar.split(cigarstring) - ecigar = '' + ecigar = "" for i, op in enumerate(ops): - if consumes == 'ref' and not (op in ['M', 'D']): + if consumes == "ref" and not (op in ["M", "D"]): continue - elif consumes == 'query' and not (op in ['M', 'I', 'S']): + elif consumes == "query" and not (op in ["M", "I", "S"]): continue ecigar += int(lens[i]) * op return ecigar - ''' Return true if a cigar operator consumes QUERY + """ Return true if a cigar operator consumes QUERY Op table: 0 M BAM_CMATCH 1 I BAM_CINS @@ -94,12 +117,14 @@ def _expand_cigar(cigarstring: str, consumes: str) -> str: 7 = BAM_CEQUAL 8 X BAM_CDIFF 9 B BAM_CBACK - ''' + """ + @staticmethod def cigar_op_consumes_query(op: int) -> bool: return op in [0, 1, 4, 7, 8] - ''' Return true if a cigar operator consumes REFERENCE ''' + """ Return true if a cigar operator consumes REFERENCE """ + @staticmethod def cigar_op_consumes_ref(op: int) -> bool: return op in [0, 2, 3, 7, 8] @@ -114,8 +139,8 @@ def __init__(self, cigartuple, q_offset, r_offset) -> None: self.intercept = r_offset - q_offset def __repr__(self) -> str: - return f'{self.op}\t{self.size}\t{self.q_offset:<10d}\t{self.intercept}' - + return f"{self.op}\t{self.size}\t{self.q_offset:<10d}\t{self.intercept}" + def truncate_front(self, y) -> int: assert y.q_offset > self.q_offset diff = y.q_offset - self.q_offset @@ -124,7 +149,7 @@ def truncate_front(self, y) -> int: self.r_offset += diff def overlap(self, y) -> int: - assert (self.q_offset == y.q_offset) + assert self.q_offset == y.q_offset if self.op == y.op and self.intercept == y.intercept: return min(self.size, y.size) else: @@ -146,9 +171,9 @@ def __init__(self, aln) -> None: self.cigar = stats def __repr__(self) -> str: - out = 'OP\tSIZE\tQ_OFFSET\tINTETCEPT\n' + out = "OP\tSIZE\tQ_OFFSET\tINTETCEPT\n" for s in self.cigar: - out += (s.__repr__() + '\n') + out += s.__repr__() + "\n" return out def __getitem__(self, key): @@ -161,26 +186,32 @@ def pop(self, key) -> None: self.cigar.pop(key) -class CompareSamSummary(): - ''' +class CompareSamSummary: + """ Inputs: - allowed_posdiff: allowed bases of positional difference - num_err_printed: number of printed errors - max_posdiff_reported: only report records within this value of positional difference - fn_out: path to the ouput report - aln_filter: alignment filtering criteria - ''' + """ + def __init__( - self, allowed_posdiff: int=1, num_err_printed: int=10, - max_posdiff_reported: int=sys.maxsize, identity_cutoff: float=0.8, - fn_out: str='', aln_filter: dict={'MAPQ': 0}) -> None: + self, + allowed_posdiff: int = 1, + num_err_printed: int = 10, + max_posdiff_reported: int = sys.maxsize, + identity_cutoff: float = 0.8, + fn_out: str = "", + aln_filter: dict = {"MAPQ": 0}, + ) -> None: self.posdiff = [] self.mapq_diff = [] self.cigar_diff = [] self.records = [] # Unaligned or invalid alignments self.unmapped_records = [[], []] - self.invalid_records = [[], []] # MAPQ = 255 + self.invalid_records = [[], []] # MAPQ = 255 self.identity = [] self.tlendiff = [] @@ -191,7 +222,7 @@ def __init__( self.fn_out = 
fn_out self.aln_filter = aln_filter - ''' Caculate CIGAR identity between two sequences + """ Calculate CIGAR identity between two sequences Core algorithmic idea: In each iteration, compare the query_offsets between two sequences. Truncate the sequence with a smaller query_offset (there can not be @@ -202,7 +233,8 @@ def __init__( This algorithm can be executed in O(len(rx)+len(ry)) time, where rx and ry are the number of CIGAR runs for the input sequences respectively. - ''' + """ + def _calc_identity( self, query: pysam.AlignedSegment, baseline: pysam.AlignedSegment ) -> float: @@ -232,73 +264,94 @@ def _calc_identity( q.truncate_front(b) else: b.truncate_front(q) - + return idy / query.infer_read_length() - ''' + """ Calculate the difference of the leftmost aligned positions between query and baseline. If query and baseline are aligned to different contigs, return -1. Otherwise results are always >= 0. - ''' - def _calc_posdiff(self, query: pysam.AlignedSegment, baseline: pysam.AlignedSegment) -> int: + """ + + def _calc_posdiff( + self, query: pysam.AlignedSegment, baseline: pysam.AlignedSegment + ) -> int: if query.reference_name == baseline.reference_name: return abs(query.reference_start - baseline.reference_start) else: return -1 - - ''' + + """ Calculate the difference of TLEN between query and baseline. If query and baseline are aligned to different contigs, return -1. Otherwise results are always >= 0. - ''' - def _calc_tlendiff(self, query: pysam.AlignedSegment, baseline: pysam.AlignedSegment) -> int: + """ + + def _calc_tlendiff( + self, query: pysam.AlignedSegment, baseline: pysam.AlignedSegment ) -> int: if query.reference_name == baseline.reference_name: return abs(query.template_length - baseline.template_length) else: return -1 - ''' + """ Check if either of query or baseline is unmapped. For an unmapped alignment, add it to `self.unmapped_records`. If any alignment in the pair is unmapped, return true; otherwise return false. - ''' - def _check_unmap(self, query: pysam.AlignedSegment, baseline: pysam.AlignedSegment) -> bool: + """ + + def _check_unmap( + self, query: pysam.AlignedSegment, baseline: pysam.AlignedSegment ) -> bool: is_unmapped = False if query.is_unmapped: self.unmapped_records[0].append( - query.query_name + ('_1' if query.is_read1 else '_2')) + query.query_name + ("_1" if query.is_read1 else "_2") + ) is_unmapped = True if baseline.is_unmapped: self.unmapped_records[1].append( - baseline.query_name + ('_1' if baseline.is_read1 else '_2')) + baseline.query_name + ("_1" if baseline.is_read1 else "_2") + ) is_unmapped = True return is_unmapped - ''' + """ Check if either of query or baseline has low MAPQ. For an alignment that does not pass the filter, add it to `self.invalid_records`. If any alignment in the pair does not pass, return true; otherwise return false.
- ''' + """ + def _check_low_qual( self, query: pysam.AlignedSegment, baseline: pysam.AlignedSegment ) -> bool: is_low_qual = False - if query.mapping_quality == 255 or query.mapping_quality < self.aln_filter['MAPQ']: + if ( + query.mapping_quality == 255 + or query.mapping_quality < self.aln_filter["MAPQ"] + ): self.invalid_records[0].append( - query.query_name + ('_1' if query.is_read1 else '_2')) + query.query_name + ("_1" if query.is_read1 else "_2") + ) is_low_qual = True - if baseline.mapping_quality == 255 or baseline.mapping_quality < self.aln_filter['MAPQ']: + if ( + baseline.mapping_quality == 255 + or baseline.mapping_quality < self.aln_filter["MAPQ"] + ): self.invalid_records[1].append( - baseline.query_name + ('_1' if baseline.is_read1 else '_2')) + baseline.query_name + ("_1" if baseline.is_read1 else "_2") + ) is_low_qual = True return is_low_qual - ''' + """ Read one record from query and baseline and update the summary Inputs: - query/baseline (pysam alignment record) - ''' + """ + def update( self, query: pysam.AlignedSegment, baseline: pysam.AlignedSegment ) -> None: @@ -314,7 +367,7 @@ def update( # Mapped but invalid if self._check_low_qual(query, baseline): return - + self.posdiff.append(self._calc_posdiff(query, baseline)) self.records.append([query, baseline]) self.mapq_diff.append(query.mapping_quality - baseline.mapping_quality) @@ -322,118 +375,172 @@ def update( self.identity.append(self._calc_identity(query, baseline)) self.tlendiff.append(self._calc_tlendiff(query, baseline)) - ''' Report summary results ''' - def _print_records(self, f_out, by='pos') -> None: + """ Report summary results """ + + def _print_records(self, f_out, by="pos") -> None: cnt = 0 show = False for i, rec in enumerate(self.records): if cnt >= self.num_err_printed: break - if by == 'pos': - show = (self.posdiff[i] > self.allowed_posdiff and - self.posdiff[i] <= self.max_posdiff_reported) - elif by == 'idy': - show = (self.identity[i] < self.identity_cutoff) - elif by == 'pos_idy': - show = (self.posdiff[i] > self.allowed_posdiff and - self.posdiff[i] <= self.max_posdiff_reported) - show &= (self.identity[i] < self.identity_cutoff) - elif by == 'cigar': + if by == "pos": + show = ( + self.posdiff[i] > self.allowed_posdiff + and self.posdiff[i] <= self.max_posdiff_reported + ) + elif by == "idy": + show = self.identity[i] < self.identity_cutoff + elif by == "pos_idy": + show = ( + self.posdiff[i] > self.allowed_posdiff + and self.posdiff[i] <= self.max_posdiff_reported + ) + show &= self.identity[i] < self.identity_cutoff + elif by == "cigar": show = not self.cigar_diff[i] - elif by == 'tlen': - show = (self.tlendiff[i] >= 50 and self.tlendiff[i] >= 0) + elif by == "tlen": + show = self.tlendiff[i] >= 50 and self.tlendiff[i] >= 0 if show: query = rec[0] baseline = rec[1] msg_query = ( - ' ' - f'{query.flag:5d}\t{query.reference_name:6s}\t' - f'{query.reference_start+1:10d}\t' - f'{query.mapping_quality:3d}\t{query.cigarstring}\t' - f'{query.template_length}\tAS:i:{query.get_tag("AS")}') - #f'NM:i:{query.get_tag("NM")}') + " " + f"{query.flag:5d}\t{query.reference_name:6s}\t" + f"{query.reference_start+1:10d}\t" + f"{query.mapping_quality:3d}\t{query.cigarstring}\t" + f'{query.template_length}\tAS:i:{query.get_tag("AS")}' + ) + # f'NM:i:{query.get_tag("NM")}') msg_baseline = ( - ' ' - f'{baseline.flag:5d}\t{baseline.reference_name:6s}\t' - f'{baseline.reference_start+1:10d}\t' - f'{baseline.mapping_quality:3d}\t{baseline.cigarstring}\t' - 
f'{baseline.template_length}\tAS:i:{baseline.get_tag("AS")}') - #f'NM:i:{baseline.get_tag("NM")}')` - print(f'{query.query_name}', file=f_out) - print(f' p_diff = {self.posdiff[i]:<10d}\t{msg_query}', file=f_out) - print(f' idy = {self.identity[i]:.4f}\t{msg_baseline}', file=f_out) + " " + f"{baseline.flag:5d}\t{baseline.reference_name:6s}\t" + f"{baseline.reference_start+1:10d}\t" + f"{baseline.mapping_quality:3d}\t{baseline.cigarstring}\t" + f'{baseline.template_length}\tAS:i:{baseline.get_tag("AS")}' + ) + # f'NM:i:{baseline.get_tag("NM")}')` + print(f"{query.query_name}", file=f_out) + print( + f" p_diff = {self.posdiff[i]:<10d}\t{msg_query}", + file=f_out, + ) + print( + f" idy = {self.identity[i]:.4f}\t{msg_baseline}", + file=f_out, + ) cnt += 1 return - def report(self, cat_printed: list=[]) -> None: - if self.fn_out == '': + def report(self, cat_printed: list = []) -> None: + if self.fn_out == "": f_out = sys.stdout else: - f_out = open(self.fn_out, 'w') + f_out = open(self.fn_out, "w") if len(self.posdiff) == 0: - print('Zero matched records. Exit.', file=f_out) + print("Zero matched records. Exit.", file=f_out) exit(1) - print('## Position', file=f_out) - num_pos_match = sum([i >= 0 and i < self.allowed_posdiff for i in self.posdiff]) - print(f'{num_pos_match / len(self.posdiff):.6f} ' - f'({num_pos_match}/{len(self.posdiff)})', - file=f_out) - if self.num_err_printed > 1 and 'pos' in cat_printed: - self._print_records(f_out, by='pos') + print("## Position", file=f_out) + num_pos_match = sum( + [i >= 0 and i < self.allowed_posdiff for i in self.posdiff] + ) + print( + f"{num_pos_match / len(self.posdiff):.6f} " + f"({num_pos_match}/{len(self.posdiff)})", + file=f_out, + ) + if self.num_err_printed > 1 and "pos" in cat_printed: + self._print_records(f_out, by="pos") - print('## Identity', file=f_out) + print("## Identity", file=f_out) num_idy = sum([i >= self.identity_cutoff for i in self.identity]) - print(f'{num_idy / len(self.identity):.6f} ({num_idy}/{len(self.identity)})', - file=f_out) - if self.num_err_printed > 1 and 'idy' in cat_printed: - self._print_records(f_out, by='idy') - print('## Average Identity', file=f_out) + print( + f"{num_idy / len(self.identity):.6f} ({num_idy}/{len(self.identity)})", + file=f_out, + ) + if self.num_err_printed > 1 and "idy" in cat_printed: + self._print_records(f_out, by="idy") + print("## Average Identity", file=f_out) avg_idy = sum([i for i in self.identity]) / len(self.identity) - print(f'{avg_idy:.6f}', file=f_out) - - print('## Position || Identity', file=f_out) - num_pos_idy = sum([d >= self.identity_cutoff or (self.posdiff[i] >= 0 and self.posdiff[i] < self.allowed_posdiff) for i, d in enumerate(self.identity)]) - print(f'{num_pos_idy / len(self.identity):.6f} ({num_pos_idy}/{len(self.identity)})', - file=f_out) - if self.num_err_printed > 1 and 'pos_idy' in cat_printed: - self._print_records(f_out, by='pos_idy') - - print('## MAPQ', file=f_out) - print((f'{self.mapq_diff.count(0) / len(self.mapq_diff):.6f} ' - f'({self.mapq_diff.count(0)}/{len(self.mapq_diff)})'), file=f_out) - - print('## CIGAR', file=f_out) - print((f'{self.cigar_diff.count(True) / len(self.cigar_diff):.6f} ' - f'({self.cigar_diff.count(True)}/{len(self.cigar_diff)})'), file=f_out) - if self.num_err_printed > 1 and 'cigar' in cat_printed: - self._print_records(f_out, by='cigar') - - print('## TLEN', file=f_out) + print(f"{avg_idy:.6f}", file=f_out) + + print("## Position || Identity", file=f_out) + num_pos_idy = sum( + [ + d >= self.identity_cutoff + or ( + 
self.posdiff[i] >= 0 + and self.posdiff[i] < self.allowed_posdiff + ) + for i, d in enumerate(self.identity) + ] + ) + print( + f"{num_pos_idy / len(self.identity):.6f} ({num_pos_idy}/{len(self.identity)})", + file=f_out, + ) + if self.num_err_printed > 1 and "pos_idy" in cat_printed: + self._print_records(f_out, by="pos_idy") + + print("## MAPQ", file=f_out) + print( + ( + f"{self.mapq_diff.count(0) / len(self.mapq_diff):.6f} " + f"({self.mapq_diff.count(0)}/{len(self.mapq_diff)})" + ), + file=f_out, + ) + + print("## CIGAR", file=f_out) + print( + ( + f"{self.cigar_diff.count(True) / len(self.cigar_diff):.6f} " + f"({self.cigar_diff.count(True)}/{len(self.cigar_diff)})" + ), + file=f_out, + ) + if self.num_err_printed > 1 and "cigar" in cat_printed: + self._print_records(f_out, by="cigar") + + print("## TLEN", file=f_out) num_tlen_match = sum([i <= 50 and i >= 0 for i in self.tlendiff]) - print(f'{num_tlen_match / len(self.tlendiff):.6f} ' - f'({num_tlen_match}/{len(self.tlendiff)})', - file=f_out) - if self.num_err_printed > 1 and 'tlen' in cat_printed: - self._print_records(f_out, by='tlen') + print( + f"{num_tlen_match / len(self.tlendiff):.6f} " + f"({num_tlen_match}/{len(self.tlendiff)})", + file=f_out, + ) + if self.num_err_printed > 1 and "tlen" in cat_printed: + self._print_records(f_out, by="tlen") - print('## Unaligned', file=f_out) + print("## Unaligned", file=f_out) set_unaligned = set(self.unmapped_records[0] + self.unmapped_records[1]) - print((f'{len(set_unaligned)} (query={len(self.unmapped_records[0])}, ' - f'baseline={len(self.unmapped_records[1])})'), file=f_out) + print( + ( + f"{len(set_unaligned)} (query={len(self.unmapped_records[0])}, " + f"baseline={len(self.unmapped_records[1])})" + ), + file=f_out, + ) - print('## Invalid (MAPQ=255)', file=f_out) + print("## Invalid (MAPQ=255)", file=f_out) set_invalid = set(self.invalid_records[0] + self.invalid_records[1]) - print((f'{len(set_invalid)} (query={len(self.invalid_records[0])}, ' - f'baseline={len(self.invalid_records[1])})'), file=f_out) + print( + ( + f"{len(set_invalid)} (query={len(self.invalid_records[0])}, " + f"baseline={len(self.invalid_records[1])})" + ), + file=f_out, + ) + + +""" Read a SAM/BAM file as a dictionary (key: query name; value: pysam alignment record) """ -''' Read a SAM/BAM file as a dictionary (key: query name; value: pysam alignment record) ''' def read_sam_as_dict(fn: str) -> dict: dict_reads = {} - f = pysam.AlignmentFile(fn, 'r') + f = pysam.AlignmentFile(fn, "r") for read in f: if read.is_secondary or read.is_supplementary: continue @@ -452,29 +559,30 @@ def read_sam_as_dict(fn: str) -> dict: def compare_sam(args): - aln_filter = {'MAPQ': args.min_mapq} + aln_filter = {"MAPQ": args.min_mapq} summary = CompareSamSummary( allowed_posdiff=args.allowed_posdiff, identity_cutoff=args.identity_cutoff, num_err_printed=args.num_err_printed, max_posdiff_reported=args.max_posdiff_reported, - fn_out=args.out, aln_filter=aln_filter) + fn_out=args.out, + aln_filter=aln_filter, + ) dict_query = read_sam_as_dict(args.input_query) dict_baseline = read_sam_as_dict(args.input_baseline) for _, [name, [first_seg, second_seg]] in enumerate(dict_query.items()): if dict_baseline.get(name): summary.update(query=first_seg, baseline=dict_baseline[name][0]) summary.update(query=second_seg, baseline=dict_baseline[name][1]) - if args.categories_printed == 'none': + if args.categories_printed == "none": cat_printed = [] else: - cat_printed = args.categories_printed.split(',') + cat_printed = 
args.categories_printed.split(",") for c in cat_printed: - assert c in ['pos', 'idy', 'pos_idy', 'cigar', 'tlen'] + assert c in ["pos", "idy", "pos_idy", "cigar", "tlen"] summary.report(cat_printed) -if __name__ == '__main__': +if __name__ == "__main__": args = parse_args() compare_sam(args) - diff --git a/scripts/extract_seq_from_bed.py b/scripts/extract_seq_from_bed.py index ff7cd85..cbe1648 100644 --- a/scripts/extract_seq_from_bed.py +++ b/scripts/extract_seq_from_bed.py @@ -1,28 +1,36 @@ -''' +""" Extract subsequences from a FASTA file using info from a BED Nae-Chyun Chen Johns Hopkins University 2021 -''' +""" import argparse -import pysam import sys + import leviosam_utils + def parse_args(): parser = argparse.ArgumentParser() parser.add_argument( - '-f', '--fasta', required=True, - help='Path to the input FASTA file. [required]' + "-f", + "--fasta", + required=True, + help="Path to the input FASTA file. [required]", ) parser.add_argument( - '-b', '--bed', required=True, - help='Path to the input BED file. [required]' + "-b", + "--bed", + required=True, + help="Path to the input BED file. [required]", ) parser.add_argument( - '-o', '--out', default='', - help='Path to the extracted subsequences (FASTA file). ['': print to sys.stdout]' + "-o", + "--out", + default="", + help="Path to the extracted subsequences (FASTA file). ['': print to sys.stdout]", ) args = parser.parse_args() return args @@ -30,25 +38,23 @@ def parse_args(): def extract_seq_from_bed(args): ref = leviosam_utils.read_fasta(args.fasta) - if args.out == '': + if args.out == "": fo = sys.stdout else: - fo = open(args.out, 'w') + fo = open(args.out, "w") - fb = open(args.bed, 'r') + fb = open(args.bed, "r") for line in fb: - line = line.split('\t') + line = line.split("\t") contig = line[0] start = int(line[1]) end = int(line[2]) # Note that FASTA coords are usually represented in 1-based; # other systems here are 0-based - print(f'>{contig}:{start+1}-{end}', file=fo) - print(f'{ref[contig][start:end]}', file=fo) + print(f">{contig}:{start+1}-{end}", file=fo) + print(f"{ref[contig][start:end]}", file=fo) -if __name__ == '__main__': +if __name__ == "__main__": args = parse_args() extract_seq_from_bed(args) - - diff --git a/scripts/extract_unpaired_reads.py b/scripts/extract_unpaired_reads.py index 89243bf..366c3b6 100644 --- a/scripts/extract_unpaired_reads.py +++ b/scripts/extract_unpaired_reads.py @@ -1,4 +1,4 @@ -''' +""" Extract the mate of reads in a single-end FASTQ from a BAM file This script is designed for the pipeline where @@ -10,50 +10,64 @@ Nae-Chyun Chen Johns Hopkins University 2021 -''' +""" import argparse import os -import pysam import sys +import pysam + + def parse_args(): parser = argparse.ArgumentParser() parser.add_argument( - '-i', '--input', required=True, - help='Path to a SAM/BAM. [required]' + "-i", "--input", required=True, help="Path to a SAM/BAM. [required]" ) parser.add_argument( - '-s', '--reads', required=True, - help='Path to a single-end FASTQ. [required]' + "-s", + "--reads", + required=True, + help="Path to a single-end FASTQ. [required]", ) parser.add_argument( - '-op', '--out_prefix', required=True, - help='Prefix of outputs (a properly-paired BAM and a pair of FASTQs. [required]' + "-op", + "--out_prefix", + required=True, + help="Prefix of outputs (a properly-paired BAM and a pair of FASTQs). 
[required]", ) args = parser.parse_args() return args def reverse_complement(seq): - d = {'A': 'T', 'a': 'T', 'C': 'G', 'c': 'G', - 'G': 'C', 'g': 'G', 'T': 'A', 't': 'A', - 'N': 'N'} - rc = '' + d = { + "A": "T", + "a": "T", + "C": "G", + "c": "G", + "G": "C", + "g": "G", + "T": "A", + "t": "A", + "N": "N", + } + rc = "" for s in seq: if s in d: rc += d[s] else: - print(f'Base "{s}" is not a known nucleotide and is converted to N', - file=sys.stderr) - rc += 'N' + print( + f'Base "{s}" is not a known nucleotide and is converted to N', + file=sys.stderr, + ) + rc += "N" return rc[::-1] -def extract_unpaired_reads( - fn_reads: str, out_prefix: str, fn_input: str): +def extract_unpaired_reads(fn_reads: str, out_prefix: str, fn_input: str): dict_reads = {} - read_name = '' - with open(fn_reads, 'r') as fr: + read_name = "" + with open(fn_reads, "r") as fr: for i, line in enumerate(fr): line = line.rstrip() if i % 4 == 0: @@ -67,30 +81,34 @@ def extract_unpaired_reads( elif i % 4 == 3: dict_reads[read_name][1] = line read_name = None - print(f'Read {i/4} records from {fn_reads}', file=sys.stderr) + print(f"Read {i/4} records from {fn_reads}", file=sys.stderr) - fn_paired_output = out_prefix + '-paired.bam' - fn_fq1 = out_prefix + '-singleton-R1.fq' - fn_fq2 = out_prefix + '-singleton-R2.fq' + fn_paired_output = out_prefix + "-paired.bam" + fn_fq1 = out_prefix + "-singleton-R1.fq" + fn_fq2 = out_prefix + "-singleton-R2.fq" for fn in [fn_paired_output, fn_fq1, fn_fq2]: if os.path.exists(fn): - print((f'Error: file {fn} exists. ' - f'Please remove it if you want to proceed'), - file=sys.stderr) + print( + ( + f"Error: file {fn} exists. " + f"Please remove it if you want to proceed" + ), + file=sys.stderr, + ) exit(1) - fa = pysam.AlignmentFile(fn_input, 'r') - fo_paired = pysam.AlignmentFile(fn_paired_output, 'wb', template=fa) - fo_fq1 = open(fn_fq1, 'w') - fo_fq2 = open(fn_fq2, 'w') - print(f'Read from {fn_input}') - print(f'Write filtered BAM to {fn_paired_output}') - print(f'Write extracted R1 to {fn_fq1}') - print(f'Write extracted R2 to {fn_fq2}') + fa = pysam.AlignmentFile(fn_input, "r") + fo_paired = pysam.AlignmentFile(fn_paired_output, "wb", template=fa) + fo_fq1 = open(fn_fq1, "w") + fo_fq2 = open(fn_fq2, "w") + print(f"Read from {fn_input}") + print(f"Write filtered BAM to {fn_paired_output}") + print(f"Write extracted R1 to {fn_fq1}") + print(f"Write extracted R2 to {fn_fq2}") for read in fa: if read.query_name in dict_reads: seq = read.get_forward_sequence() - qual = ''.join(map(lambda x: chr( x+33 ), read.query_qualities)) + qual = "".join(map(lambda x: chr(x + 33), read.query_qualities)) # See https://github.com/pysam-developers/pysam/issues/839#issuecomment-560130300 # if read.is_reverse: # qual = read.query_qualities[::-1] @@ -99,17 +117,30 @@ def extract_unpaired_reads( # qual = read.query_qualities # seq = read.query_sequence singleton = dict_reads[read.query_name] - if read.is_read1 and not read.is_secondary and not read.is_supplementary: - fo_fq1.write(f'@{read.query_name}\n{seq}\n+\n{qual}\n') - fo_fq2.write(f'@{read.query_name}\n{singleton[0]}\n+\n{singleton[1]}\n') - elif read.is_read2 and not read.is_secondary and not read.is_supplementary: - fo_fq2.write(f'@{read.query_name}\n{seq}\n+\n{qual}\n') - fo_fq1.write(f'@{read.query_name}\n{singleton[0]}\n+\n{singleton[1]}\n') + if ( + read.is_read1 + and not read.is_secondary + and not read.is_supplementary + ): + fo_fq1.write(f"@{read.query_name}\n{seq}\n+\n{qual}\n") + fo_fq2.write( + 
f"@{read.query_name}\n{singleton[0]}\n+\n{singleton[1]}\n" + ) + elif ( + read.is_read2 + and not read.is_secondary + and not read.is_supplementary + ): + fo_fq2.write(f"@{read.query_name}\n{seq}\n+\n{qual}\n") + fo_fq1.write( + f"@{read.query_name}\n{singleton[0]}\n+\n{singleton[1]}\n" + ) else: fo_paired.write(read) -if __name__ == '__main__': +if __name__ == "__main__": args = parse_args() extract_unpaired_reads( - fn_reads=args.reads, out_prefix=args.out_prefix, fn_input=args.input) + fn_reads=args.reads, out_prefix=args.out_prefix, fn_input=args.input + ) diff --git a/scripts/fai_to_bed.py b/scripts/fai_to_bed.py index abb118d..bb91392 100644 --- a/scripts/fai_to_bed.py +++ b/scripts/fai_to_bed.py @@ -1,42 +1,44 @@ -''' +""" Convert a FAI file to a BED file Nae-Chyun Chen Johns Hopkins University 2021 -''' +""" import argparse -import pysam import sys def parse_args(): parser = argparse.ArgumentParser() parser.add_argument( - '-f', '--fai', required=True, - help='Path to the input FAI file. [required]' + "-f", + "--fai", + required=True, + help="Path to the input FAI file. [required]", ) parser.add_argument( - '-o', '--out', default='', - help='Path to the ouput BED file. ['': print to sys.stdout]' + "-o", + "--out", + default="", + help="Path to the ouput BED file. [" ": print to sys.stdout]", ) args = parser.parse_args() return args def fai_to_bed(args): - fb = open(args.fai, 'r') - if args.out == '': + fb = open(args.fai, "r") + if args.out == "": fo = sys.stdout else: - fo = open(args.out, 'w') + fo = open(args.out, "w") for line in fb: line = line.split() - print(f'{line[0]}\t0\t{line[1]}', file=fo) + print(f"{line[0]}\t0\t{line[1]}", file=fo) -if __name__ == '__main__': +if __name__ == "__main__": args = parse_args() fai_to_bed(args) - diff --git a/scripts/filter_bed_by_size.py b/scripts/filter_bed_by_size.py index c336309..2bdd1f3 100644 --- a/scripts/filter_bed_by_size.py +++ b/scripts/filter_bed_by_size.py @@ -1,47 +1,53 @@ -''' +""" Filter a BED file by segment size. Nae-Chyun Chen Johns Hopkins University 2021 -''' +""" import argparse -import pysam import sys + def parse_args(): parser = argparse.ArgumentParser() parser.add_argument( - '-b', '--input_bed', required=True, - help='Path to the input mappability BED file. [required]' + "-b", + "--input_bed", + required=True, + help="Path to the input mappability BED file. [required]", ) parser.add_argument( - '-o', '--out', default='', - help='Path to the ouput report. ['': print to sys.stdout]' + "-o", + "--out", + default="", + help="Path to the ouput report. [" ": print to sys.stdout]", ) parser.add_argument( - '-s', '--size', type=int, - help='Min allowed size of a BED record (smaller ones are removed). [required]' + "-s", + "--size", + type=int, + help="Min allowed size of a BED record (smaller ones are removed). 
[required]", ) args = parser.parse_args() return args def filter_bed_by_size(args): - fb = open(args.input_bed, 'r') - if args.out == '': + fb = open(args.input_bed, "r") + if args.out == "": fo = sys.stdout else: - fo = open(args.out, 'w') + fo = open(args.out, "w") for line in fb: line = line.split() start = int(line[1]) end = int(line[2]) if end - start >= args.size: - print(f'{line[0]}\t{start}\t{end}', file=fo) + print(f"{line[0]}\t{start}\t{end}", file=fo) -if __name__ == '__main__': +if __name__ == "__main__": args = parse_args() filter_bed_by_size(args) diff --git a/scripts/gen_length_map.py b/scripts/gen_length_map.py index 7df83a2..6d29de9 100644 --- a/scripts/gen_length_map.py +++ b/scripts/gen_length_map.py @@ -1,32 +1,26 @@ import argparse import sys + def parse_args(): parser = argparse.ArgumentParser() - parser.add_argument( - '-f', '--faidx', - help='Path to the .fa.idx file.' - ) - parser.add_argument( - '-o', '--out', - help='Path to the output length map.' - ) + parser.add_argument("-f", "--faidx", help="Path to the .fa.idx file.") + parser.add_argument("-o", "--out", help="Path to the output length map.") args = parser.parse_args() return args def gen_length_map(args): - f = open(args.faidx, 'r') + f = open(args.faidx, "r") if not args.out: f_out = sys.stdout else: - f_out = open(args.out, 'w') + f_out = open(args.out, "w") for line in f: line = line.split() - print(f'{line[0]}\t{line[1]}', file=f_out) + print(f"{line[0]}\t{line[1]}", file=f_out) -if __name__ == '__main__': +if __name__ == "__main__": args = parse_args() gen_length_map(args) - diff --git a/scripts/get_low_identity_regions.py b/scripts/get_low_identity_regions.py index 83f8829..dda562b 100644 --- a/scripts/get_low_identity_regions.py +++ b/scripts/get_low_identity_regions.py @@ -1,33 +1,41 @@ -''' +""" Extract low identity regions from a chain summary Nae-Chyun Chen Johns Hopkins University 2021 -''' +""" import argparse -import pysam import sys + import pandas as pd def parse_args(): parser = argparse.ArgumentParser() parser.add_argument( - '-s', '--summary', required=True, - help='Path to the input chain summary file. [required]' + "-s", + "--summary", + required=True, + help="Path to the input chain summary file. [required]", ) parser.add_argument( - '-o', '--out', default='', - help='Path to the ouput BED file. ['': print to sys.stdout]' + "-o", + "--out", + default="", + help="Path to the ouput BED file. [" ": print to sys.stdout]", ) parser.add_argument( - '-l', '--identity_cutoff', required=True, type=float, - help='Identity cutoff. Values lower than this will be excluded. [required]' + "-l", + "--identity_cutoff", + required=True, + type=float, + help="Identity cutoff. Values lower than this will be excluded. [required]", ) parser.add_argument( - '--source', action='store_true', - help='Set to write wrt the source reference; otherwise write wrt the dest reference.' 
+ "--source", + action="store_true", + help="Set to write wrt the source reference; otherwise write wrt the dest reference.", ) args = parser.parse_args() return args @@ -37,26 +45,27 @@ def get_low_identity_regions( summary: str, out: str, identity_cutoff: float, source: bool ) -> None: if source: - print('Output with respect to the source reference', file=sys.stderr) + print("Output with respect to the source reference", file=sys.stderr) else: - print('Output with respect to the dest reference', file=sys.stderr) + print("Output with respect to the dest reference", file=sys.stderr) - df = pd.read_csv(summary, sep='\t') - fo = open(out, 'w') + df = pd.read_csv(summary, sep="\t") + fo = open(out, "w") for i in range(df.shape[0]): if df.iloc[i][1] < identity_cutoff: rec = df.iloc[i] if source: - print(f'{rec[2]}\t{rec[3]}\t{rec[4]}', file=fo) + print(f"{rec[2]}\t{rec[3]}\t{rec[4]}", file=fo) else: - print(f'{rec[6]}\t{rec[7]}\t{rec[8]}', file=fo) + print(f"{rec[6]}\t{rec[7]}\t{rec[8]}", file=fo) fo.close() -if __name__ == '__main__': +if __name__ == "__main__": args = parse_args() get_low_identity_regions( summary=args.summary, out=args.out, identity_cutoff=args.identity_cutoff, - source=args.source) + source=args.source, + ) diff --git a/scripts/get_mappable_regions.py b/scripts/get_mappable_regions.py index ef4b9e8..16bfad1 100644 --- a/scripts/get_mappable_regions.py +++ b/scripts/get_mappable_regions.py @@ -1,32 +1,42 @@ -''' +""" Reads a mappability BED file and returns highly-mappable regions Nae-Chyun Chen Johns Hopkins University 2021 -''' +""" import argparse -import typing -import pysam import sys +import typing + def parse_args(): parser = argparse.ArgumentParser() parser.add_argument( - '-b', '--input_bed', required=True, - help='Path to the input mappability BED file. [required]' + "-b", + "--input_bed", + required=True, + help="Path to the input mappability BED file. [required]", ) parser.add_argument( - '-o', '--out', default='', - help='Path to the ouput report. ['': print to sys.stdout]' + "-o", + "--out", + default="", + help="Path to the ouput report. [" ": print to sys.stdout]", ) parser.add_argument( - '-c', '--min_mappability', default=1.0, type=float, - help='Min mappability to be considered as highly-mappable. [1.0]' + "-c", + "--min_mappability", + default=1.0, + type=float, + help="Min mappability to be considered as highly-mappable. [1.0]", ) parser.add_argument( - '-k', '--kmer_size', required=True, type=int, - help='Kmer size for each BED record. [required]' + "-k", + "--kmer_size", + required=True, + type=int, + help="Kmer size for each BED record. 
[required]", ) args = parser.parse_args() return args @@ -36,23 +46,23 @@ def print_to_bed( chrom: chr, high_map_start: int, low_map_start: int, fo: typing.TextIO ) -> None: if high_map_start < low_map_start: - print(f'{chrom}\t{high_map_start}\t{low_map_start}', file=fo) + print(f"{chrom}\t{high_map_start}\t{low_map_start}", file=fo) def is_mappable(fields: list, min_mappability: float) -> bool: - if fields[3] == 'inf' or float(fields[3]) < min_mappability: + if fields[3] == "inf" or float(fields[3]) < min_mappability: return False return True def get_mappable_regions(args): - fb = open(args.input_bed, 'r') - if args.out == '': + fb = open(args.input_bed, "r") + if args.out == "": fo = sys.stdout else: - fo = open(args.out, 'w') + fo = open(args.out, "w") - chrom = '' + chrom = "" high_map_start = 0 low_map_start = 0 state = None @@ -67,21 +77,21 @@ def get_mappable_regions(args): if not is_mappable(line, args.min_mappability): # mappable to unmappable - if state != 'unmappable': + if state != "unmappable": low_map_start = int(line[1]) print_to_bed(chrom, high_map_start, low_map_start, fo) - state = 'unmappable' + state = "unmappable" high_map_start = low_map_start + args.kmer_size else: - if state != 'mappable': + if state != "mappable": if int(line[1]) > high_map_start: high_map_start = int(line[1]) - state = 'mappable' + state = "mappable" - if state == 'mappable': - print_to_bed(chrom, high_map_start, int(line[1])+args.kmer_size, fo) + if state == "mappable": + print_to_bed(chrom, high_map_start, int(line[1]) + args.kmer_size, fo) -if __name__ == '__main__': +if __name__ == "__main__": args = parse_args() get_mappable_regions(args) diff --git a/scripts/leviosam_utils-test.py b/scripts/leviosam_utils-test.py index 2b28836..0aea3ab 100644 --- a/scripts/leviosam_utils-test.py +++ b/scripts/leviosam_utils-test.py @@ -1,20 +1,26 @@ -''' +""" Tests for levioSAM utilities Nae-Chyun Chen Johns Hopkins University 2021 -''' +""" import unittest -import subprocess -import mask_fasta_with_bed + import leviosam_utils +import mask_fasta_with_bed + class TestMakeFastaWithBed(unittest.TestCase): def test_chr1_1(self): - ref = mask_fasta_with_bed.mask_fasta_with_bed(bed='testdata/chr1_test1.bed', fasta='testdata/chr1_1_100000.fa') - gold_ref = leviosam_utils.read_fasta('testdata/mask_fasta_with_bed-chr1_test1.fa.results') + ref = mask_fasta_with_bed.mask_fasta_with_bed( + bed="testdata/chr1_test1.bed", fasta="testdata/chr1_1_100000.fa" + ) + gold_ref = leviosam_utils.read_fasta( + "testdata/mask_fasta_with_bed-chr1_test1.fa.results" + ) self.assertDictEqual(ref, gold_ref) -if __name__ == '__main__': + +if __name__ == "__main__": unittest.main() diff --git a/scripts/leviosam_utils.py b/scripts/leviosam_utils.py index fde5e65..32f6e97 100644 --- a/scripts/leviosam_utils.py +++ b/scripts/leviosam_utils.py @@ -1,20 +1,21 @@ -''' +""" Utils for levioSAM Nae-Chyun Chen Johns Hopkins University 2021 -''' +""" import pysam -''' -Read a FASTA file as a dict if a file name is given. If not, return an empty dict. -''' + def read_fasta(ref_fn: str) -> dict: + """Reads a FASTA file as a dict if a file name is given. + + If no file name, returns an empty dict. 
+ """ ref = {} - if ref_fn != '': + if ref_fn != "": f = pysam.FastaFile(ref_fn) for r in f.references: ref[r] = f[r].upper() return ref - diff --git a/scripts/mask_fasta_with_bed.py b/scripts/mask_fasta_with_bed.py index e6be228..2b8e5ef 100644 --- a/scripts/mask_fasta_with_bed.py +++ b/scripts/mask_fasta_with_bed.py @@ -1,38 +1,43 @@ -''' +""" Mask sequences in a FASTA with annotations in a BED file Nae-Chyun Chen Johns Hopkins University 2021 -''' +""" import argparse -import leviosam_utils -import pysam import sys +import leviosam_utils + + def parse_args(): parser = argparse.ArgumentParser() parser.add_argument( - '-f', '--fasta', required=True, - help='Path to the input FASTA file. [required]' + "-f", + "--fasta", + required=True, + help="Path to the input FASTA file. [required]", ) parser.add_argument( - '-b', '--bed', required=True, - help='Path to the input BED file. [required]' + "-b", + "--bed", + required=True, + help="Path to the input BED file. [required]", ) parser.add_argument( - '-o', '--out', default='', - help='Path to the ouput FASTA file. ['': print to sys.stdout]' + "-o", + "--out", + default="", + help="Path to the ouput FASTA file. [" ": print to sys.stdout]", ) args = parser.parse_args() return args -def mask_fasta_with_bed( - fasta: str, bed: str -) -> dict: +def mask_fasta_with_bed(fasta: str, bed: str) -> dict: ref = leviosam_utils.read_fasta(fasta) - fb = open(bed, 'r') + fb = open(bed, "r") for line in fb: line = line.split() @@ -41,28 +46,26 @@ def mask_fasta_with_bed( end = int(line[2]) size = end - start assert size >= 0 - ref[contig] = ref[contig][:start] + 'N' * size + ref[contig][end:] + ref[contig] = ref[contig][:start] + "N" * size + ref[contig][end:] fb.close() return ref def print_fasta(ref: dict, out: str) -> None: - if out == '': + if out == "": fo = sys.stdout else: - fo = open(out, 'w') + fo = open(out, "w") for i, (k, v) in enumerate(ref.items()): - print(f'>{k}', file=fo) + print(f">{k}", file=fo) for i in range(0, len(v), 60): - print(f'{v[i:i+60]}', file=fo) + print(f"{v[i:i+60]}", file=fo) fo.close() -if __name__ == '__main__': +if __name__ == "__main__": args = parse_args() ref = mask_fasta_with_bed(fasta=args.fasta, bed=args.bed) print_fasta(ref=ref, out=args.out) - - diff --git a/scripts/sam_qname_to_bed.py b/scripts/sam_qname_to_bed.py index 0a0e4e4..8f2b5c4 100644 --- a/scripts/sam_qname_to_bed.py +++ b/scripts/sam_qname_to_bed.py @@ -1,4 +1,4 @@ -''' +""" Convert SAM QNAME to BED records This only works for QNAMEs like `chr1:1-100, which would be converted @@ -7,52 +7,69 @@ Nae-Chyun Chen Johns Hopkins University 2021 -''' +""" import argparse -import pysam import sys +import pysam + + def parse_args(): parser = argparse.ArgumentParser() parser.add_argument( - '-s', '--sam', required=True, - help='Path to the input SAM file. [required]' + "-s", + "--sam", + required=True, + help="Path to the input SAM file. [required]", ) parser.add_argument( - '-f', '--flag_include', default=None, type=int, action='append', - help='Only process records with specified flags. [None]' + "-f", + "--flag_include", + default=None, + type=int, + action="append", + help="Only process records with specified flags. [None]", ) parser.add_argument( - '-F', '--flag_exclude', default=None, type=int, action='append', - help='Exclude records with specified flags. [None]' + "-F", + "--flag_exclude", + default=None, + type=int, + action="append", + help="Exclude records with specified flags. 
[None]", ) parser.add_argument( - '-m', '--exclude_aln_num_range', default=None, type=str, - help='Only print records outside a range of alignments, e.g. setting `-m 1-10` prints qnames aligned zero times or >10 times. [None]' + "-m", + "--exclude_aln_num_range", + default=None, + type=str, + help="Only print records outside a range of alignments, e.g. setting `-m 1-10` prints qnames aligned zero times or >10 times. [None]", ) parser.add_argument( - '-o', '--out', default='', - help='Path to the ouput BED file. ['': print to sys.stdout]' + "-o", + "--out", + default="", + help="Path to the ouput BED file. [" ": print to sys.stdout]", ) args = parser.parse_args() return args def print_record(qname, fo) -> None: - qname = qname.split(':') + qname = qname.split(":") contig = qname[0] - start = int(qname[1].split('-')[0]) - 1 - end = qname[1].split('-')[1] - print(f'{contig}\t{start}\t{end}', file=fo) + start = int(qname[1].split("-")[0]) - 1 + end = qname[1].split("-")[1] + print(f"{contig}\t{start}\t{end}", file=fo) def sam_qname_to_bed(args) -> None: f = pysam.AlignmentFile(args.sam) - if args.out == '': + if args.out == "": fo = sys.stdout else: - fo = open(args.out, 'w') - + fo = open(args.out, "w") + aln_dict = {} for r in f: qname = r.query_name @@ -81,13 +98,12 @@ def sam_qname_to_bed(args) -> None: if args.exclude_aln_num_range is None: print_record(qname, fo) else: - min_exc_aln_num = int(args.exclude_aln_num_range.split('-')[0]) - max_exc_aln_num = int(args.exclude_aln_num_range.split('-')[1]) + min_exc_aln_num = int(args.exclude_aln_num_range.split("-")[0]) + max_exc_aln_num = int(args.exclude_aln_num_range.split("-")[1]) if times < min_exc_aln_num or times > max_exc_aln_num: print_record(qname, fo) -if __name__ == '__main__': +if __name__ == "__main__": args = parse_args() sam_qname_to_bed(args) - diff --git a/scripts/sample_fq.py b/scripts/sample_fq.py index 57906c6..3aa8786 100644 --- a/scripts/sample_fq.py +++ b/scripts/sample_fq.py @@ -1,51 +1,47 @@ -''' +""" @A00744:46:HV3C3DSXX:2:2551:23493:1063 GACAGCGGGCACTCAGTTCCACTAACGAATGATGCCCGTGTGGACAGACAGAATGATGGACAGGCAGATGAATGCGTGGGCTTTATGTGAAACAGGTCCTCTTGGTTGTTGACAAGATACTGTTTTAAAGTTCCATTTTGCCATACTTCGA + FF,F::::,FFF::F:FFFFFF,F:F:F:F:FFF,FFFFFFFFFFFFFFFFFFFFF,::FFF::FFFFF:FFFFFF:F:FF,FF:F:FFFFF,FF:FFF:F:FFF:FFFF:FFFF::F:FFFFFF:F::FF:F,FFFFF::FFFFFFFF,F -''' +""" import argparse import gzip import random -import sys + def parse_args(): parser = argparse.ArgumentParser() + parser.add_argument("-r1", "--read1", help="Path to R1 FASTQ") + parser.add_argument("-r2", "--read2", help="Path to R2 FASTQ") + parser.add_argument("-op", "--out_prefix", help="Output FASTQ prefix") parser.add_argument( - '-r1', '--read1', - help='Path to R1 FASTQ' - ) - parser.add_argument( - '-r2', '--read2', - help='Path to R2 FASTQ' - ) - parser.add_argument( - '-op', '--out_prefix', - help='Output FASTQ prefix' - ) - parser.add_argument( - '-s', '--sample_rate', type=float, default=0, - help=('Sample rate (e.g. -s 0.05 samples ~5% of the reads). ' - 'Should be within 0 and 1. [0].') + "-s", + "--sample_rate", + type=float, + default=0, + help=( + "Sample rate (e.g. -s 0.05 samples ~5% of the reads). " + "Should be within 0 and 1. [0]." 
+ ), ) args = parser.parse_args() return args def sample_fq(args): - f1 = gzip.open(args.read1, 'rb') - f2 = gzip.open(args.read2, 'rb') - fo1 = gzip.open(args.out_prefix + '-R1.fq.gz', 'wb') - fo2 = gzip.open(args.out_prefix + '-R2.fq.gz', 'wb') - + f1 = gzip.open(args.read1, "rb") + f2 = gzip.open(args.read2, "rb") + fo1 = gzip.open(args.out_prefix + "-R1.fq.gz", "wb") + fo2 = gzip.open(args.out_prefix + "-R2.fq.gz", "wb") + rd = 0 for l1 in f1: # l2 = f2.readline().decode('ascii').rstrip() l2 = f2.readline() # l1 = l1.decode('ascii').rstrip() # if l1.startswith('@'): - if l1.startswith(b'@'): + if l1.startswith(b"@"): rd = random.random() if rd <= 1 - args.sample_rate: # print(l1, file=sys.stderr) @@ -54,9 +50,8 @@ def sample_fq(args): fo2.write(l2) -if __name__ == '__main__': +if __name__ == "__main__": args = parse_args() if args.read1 is None: exit(1) sample_fq(args) - diff --git a/scripts/summarize_aln_features.py b/scripts/summarize_aln_features.py index 68565f0..e0b91e0 100644 --- a/scripts/summarize_aln_features.py +++ b/scripts/summarize_aln_features.py @@ -1,44 +1,59 @@ -''' +""" Summarize an alignment tag for a BAM file and show quantiles Nae-Chyun Chen Johns Hopkins University 2021 -''' +""" import argparse + import numpy as np import pysam -import sys + def parse_args(): parser = argparse.ArgumentParser() parser.add_argument( - '-f', '--input', required=True, - help='Path to the input SAM/BAM file. [required]' + "-f", + "--input", + required=True, + help="Path to the input SAM/BAM file. [required]", ) parser.add_argument( - '-t', '--tag', default=None, type=str, - help='Alignment tag to summarize, e.g. AS. [None]' + "-t", + "--tag", + default=None, + type=str, + help="Alignment tag to summarize, e.g. AS. [None]", ) parser.add_argument( - '-q', '--mapq', action='store_true', - help='Set to summarize MAPQ. [False]' + "-q", + "--mapq", + action="store_true", + help="Set to summarize MAPQ. [False]", ) parser.add_argument( - '-T', '--isize', action='store_true', - help='Set to summarize template length (absolute value). [False]' + "-T", + "--isize", + action="store_true", + help="Set to summarize template length (absolute value). [False]", ) parser.add_argument( - '-c', '--clipped_fraction', action='store_true', - help='Set to summarzie clipped fraction. [False]' + "-c", + "--clipped_fraction", + action="store_true", + help="Set to summarzie clipped fraction. 
[False]", ) args = parser.parse_args() return args def summarize_aln_features( - fn_input: str, tag: str=None, - mapq: bool=False, isize: int=None, clipped_fraction: float=None + fn_input: str, + tag: str = None, + mapq: bool = False, + isize: int = None, + clipped_fraction: float = None, ) -> None: results = [] failed_cnt = 0 @@ -61,13 +76,15 @@ def summarize_aln_features( else: other_cnt += 1 elif clipped_fraction: - results.append(read.query_alignment_length / read.query_length) + results.append( + read.query_alignment_length / read.query_length + ) except: failed_cnt += 1 - print(f'Summarized {total} primary and aligned reads') + print(f"Summarized {total} primary and aligned reads") if failed_cnt > 0: - print(f'{failed_cnt} reads failed') + print(f"{failed_cnt} reads failed") results = np.array(sorted(results)) l = len(results) @@ -75,41 +92,43 @@ def summarize_aln_features( v = np.std(results) if tag: - print(f'{tag} distribution:') + print(f"{tag} distribution:") elif mapq: - print('MAPQ distribution:') + print("MAPQ distribution:") elif isize: - print('Template length distribution:') + print("Template length distribution:") elif clipped_fraction: - print('Clipped fraction distribution:') - print(f' mean = {m:.2f}') - print(f' std = {v:.2f}') - print(f'-3 std = {m - 3 * v:.2f}') - print(f'-2 std = {m - 2 * v:.2f}') - print(f'-1 std = {m - v:.2f}') - print(f'+1 std = {m + v:.2f}') - print(f'+2 std = {m + 2 * v:.2f}') - print(f'+3 std = {m + 3 * v:.2f}') + print("Clipped fraction distribution:") + print(f" mean = {m:.2f}") + print(f" std = {v:.2f}") + print(f"-3 std = {m - 3 * v:.2f}") + print(f"-2 std = {m - 2 * v:.2f}") + print(f"-1 std = {m - v:.2f}") + print(f"+1 std = {m + v:.2f}") + print(f"+2 std = {m + 2 * v:.2f}") + print(f"+3 std = {m + 3 * v:.2f}") print() - print(f' min = {results[0]}') - print(f' 1% = {results[: int(l * 0.01)][-1]}') - print(f' 3% = {results[: int(l * 0.03)][-1]}') - print(f' 5% = {results[: int(l * 0.05)][-1]}') - print(f' 10% = {results[: int(l * 0.10)][-1]}') - print(f' 20% = {results[: int(l * 0.20)][-1]}') - print(f' 50% = {results[: int(l * 0.50)][-1]}') - print(f' 80% = {results[: int(l * 0.80)][-1]}') - print(f' 90% = {results[: int(l * 0.90)][-1]}') - print(f' 95% = {results[: int(l * 0.95)][-1]}') - print(f' 97% = {results[: int(l * 0.97)][-1]}') - print(f' 99% = {results[: int(l * 0.99)][-1]}') - print(f' max = {results[-1]}') + print(f" min = {results[0]}") + print(f" 1% = {results[: int(l * 0.01)][-1]}") + print(f" 3% = {results[: int(l * 0.03)][-1]}") + print(f" 5% = {results[: int(l * 0.05)][-1]}") + print(f" 10% = {results[: int(l * 0.10)][-1]}") + print(f" 20% = {results[: int(l * 0.20)][-1]}") + print(f" 50% = {results[: int(l * 0.50)][-1]}") + print(f" 80% = {results[: int(l * 0.80)][-1]}") + print(f" 90% = {results[: int(l * 0.90)][-1]}") + print(f" 95% = {results[: int(l * 0.95)][-1]}") + print(f" 97% = {results[: int(l * 0.97)][-1]}") + print(f" 99% = {results[: int(l * 0.99)][-1]}") + print(f" max = {results[-1]}") if isize: - print(f'not properly paird = {other_cnt} ({int(other_cnt / total * 100)}%)') + print( + f"not properly paird = {other_cnt} ({int(other_cnt / total * 100)}%)" + ) -if __name__ == '__main__': +if __name__ == "__main__": args = parse_args() fn_input = args.input tag = args.tag @@ -117,12 +136,24 @@ def summarize_aln_features( isize = args.isize clipped_fraction = args.clipped_fraction - check_options = [tag != None, mapq != False, isize != False, clipped_fraction != False] + check_options = [ + tag != None, + 
mapq != False, + isize != False, + clipped_fraction != False, + ] if sum(check_options) != 1: - print('Error: Only one out of the tag/mapq/isize/clipped_fraction options can be set') - print('[tag, mapq, isize, clipped_fraction]') + print( + "Error: Only one out of the tag/mapq/isize/clipped_fraction options can be set" + ) + print("[tag, mapq, isize, clipped_fraction]") print(check_options) exit(1) - summarize_aln_features(fn_input=fn_input, tag=tag, mapq=mapq, - isize=isize, clipped_fraction=clipped_fraction) + summarize_aln_features( + fn_input=fn_input, + tag=tag, + mapq=mapq, + isize=isize, + clipped_fraction=clipped_fraction, + ) diff --git a/scripts/verbosify_chain.py b/scripts/verbosify_chain.py index 5913fe8..d45fadf 100644 --- a/scripts/verbosify_chain.py +++ b/scripts/verbosify_chain.py @@ -1,86 +1,104 @@ -''' +""" Add info for a chain file to facilitate debugging Nae-Chyun Chen Johns Hopkins University 2021 -''' +""" import argparse -import leviosam_utils -import pysam import re import sys +import leviosam_utils + + def parse_args(): parser = argparse.ArgumentParser() parser.add_argument( - '-c', '--chain', required=True, - help='Path to the input chain file.' + "-c", "--chain", required=True, help="Path to the input chain file." ) parser.add_argument( - '-o', '--out', default='', - help='Path to the output verbose chain file. [empty string]' + "-o", + "--out", + default="", + help="Path to the output verbose chain file. [empty string]", ) parser.add_argument( - '-b', '--bed_prefix', default='', - help='Prefix to the BED files that include all the chain segments. [empty string]' + "-b", + "--bed_prefix", + default="", + help="Prefix to the BED files that include all the chain segments. [empty string]", ) parser.add_argument( - '-s', '--summary', default='', - help='Path to the output summary. Leave empty for no output. [empty string]' + "-s", + "--summary", + default="", + help="Path to the output summary. Leave empty for no output. [empty string]", ) parser.add_argument( - '-f1', '--ref1', default='', - help='Path to the source reference (optional).' + "-f1", + "--ref1", + default="", + help="Path to the source reference (optional).", ) parser.add_argument( - '-f2', '--ref2', default='', - help='Path to the destination reference (optional).' 
+ "-f2", + "--ref2", + default="", + help="Path to the destination reference (optional).", ) args = parser.parse_args() return args def reverse_complement(seq): - d = {'A': 'T', 'a': 'T', 'C': 'G', 'c': 'G', - 'G': 'C', 'g': 'G', 'T': 'A', 't': 'A', - 'N': 'N'} - rc = '' + d = { + "A": "T", + "a": "T", + "C": "G", + "c": "G", + "G": "C", + "g": "G", + "T": "A", + "t": "A", + "N": "N", + } + rc = "" for s in seq: if s in d: rc += d[s] else: - print(f'Base "{s}" is not a known nucleotide and is converted to N', - file=sys.stderr) - rc += 'N' + print( + f'Base "{s}" is not a known nucleotide and is converted to N', + file=sys.stderr, + ) + rc += "N" return rc[::-1] def compute_hamming_dist( - forward, - ref1, contig1, start1, end1, - ref2, contig2, start2, end2 + forward, ref1, contig1, start1, end1, ref2, contig2, start2, end2 ): if contig1 in ref1: - s1 = ref1[contig1][start1: end1] + s1 = ref1[contig1][start1:end1] else: - print(f'Warning : {contig1} not in ref1', file=sys.stderr) + print(f"Warning : {contig1} not in ref1", file=sys.stderr) return 0 if contig2 in ref2: - s2 = ref2[contig2][start2: end2] + s2 = ref2[contig2][start2:end2] if not forward: s2 = reverse_complement(s2) else: - print(f'Warning : {contig2} not in ref2', file=sys.stderr) + print(f"Warning : {contig2} not in ref2", file=sys.stderr) return 0 try: - assert (len(s1) == len(s2)) + assert len(s1) == len(s2) except: - print('Error: lengths do not match', file=sys.stderr) + print("Error: lengths do not match", file=sys.stderr) print(len(s1), len(s2), file=sys.stderr) - print(f'{contig1}:{start1}-{end1}', file=sys.stderr) - print(f'{contig2}:{start2}-{end2}', file=sys.stderr) + print(f"{contig1}:{start1}-{end1}", file=sys.stderr) + print(f"{contig2}:{start2}-{end2}", file=sys.stderr) exit(1) if len(s1) == 0: @@ -93,26 +111,38 @@ def compute_hamming_dist( return idy -''' Write a chain record in the BED format ''' +""" Write a chain record in the BED format """ + + def write_to_summary(fs_fn, fs, strand, l, hd, source, s_start, dest, d_start): if fs_fn: - if strand == '+': - print((f'{l}\t{hd:.6f}\t{source}\t{s_start}\t{s_start+l}\t+' - f'\t{dest}\t{d_start}\t{d_start+l}'), file=fs) + if strand == "+": + print( + ( + f"{l}\t{hd:.6f}\t{source}\t{s_start}\t{s_start+l}\t+" + f"\t{dest}\t{d_start}\t{d_start+l}" + ), + file=fs, + ) else: - print((f'{l}\t{hd:.6f}\t{source}\t{s_start}\t{s_start+l}\t+' - f'\t{dest}\t{d_start-l}\t{d_start}'), file=fs) + print( + ( + f"{l}\t{hd:.6f}\t{source}\t{s_start}\t{s_start+l}\t+" + f"\t{dest}\t{d_start-l}\t{d_start}" + ), + file=fs, + ) def verbosify_chain(args): - f = open(args.chain, 'r') - if args.out == '': + f = open(args.chain, "r") + if args.out == "": fo = sys.stderr else: - fo = open(args.out, 'w') - if args.bed_prefix != '': - f_sbed = open(args.bed_prefix + '-source.bed', 'w') - f_dbed = open(args.bed_prefix + '-dest.bed', 'w') + fo = open(args.out, "w") + if args.bed_prefix != "": + f_sbed = open(args.bed_prefix + "-source.bed", "w") + f_dbed = open(args.bed_prefix + "-dest.bed", "w") ref1 = leviosam_utils.read_fasta(args.ref1) ref2 = leviosam_utils.read_fasta(args.ref2) @@ -122,9 +152,12 @@ def verbosify_chain(args): hd = None if args.summary: assert check_hdist == True - fs = open(args.summary, 'w') + fs = open(args.summary, "w") # Write header - print(f'SIZE\tHDIST\tSOURCE\tS_START\tS_END\tSTRAND\tDEST\tD_START\tD_END', file=fs) + print( + f"SIZE\tHDIST\tSOURCE\tS_START\tS_END\tSTRAND\tDEST\tD_START\tD_END", + file=fs, + ) else: fs = None @@ -134,8 +167,8 @@ def 
verbosify_chain(args): if not line: continue line = line.rstrip() - fields = re.split(r'[\s\t]+', line) - if fields[0] == 'chain': + fields = re.split(r"[\s\t]+", line) + if fields[0] == "chain": print(line, file=fo) source = fields[2] s_start = int(fields[5]) @@ -143,7 +176,7 @@ def verbosify_chain(args): dest = fields[7] dest_len = int(fields[8]) strand = fields[9] - if strand == '+': + if strand == "+": d_start = int(fields[10]) d_end = int(fields[11]) else: @@ -154,81 +187,121 @@ def verbosify_chain(args): total_bases += l ds = int(fields[1]) dd = int(fields[2]) - if strand == '+': - msg = f'\t{source}:{s_start}-{s_start+l}=>{dest}:{d_start}-{d_start+l} ({d_start-s_start})' - if args.bed_prefix != '': - f_sbed.write(f'{source}\t{s_start}\t{s_start+l}\n') - f_dbed.write(f'{dest}\t{d_start}\t{d_start+l}\n') + if strand == "+": + msg = f"\t{source}:{s_start}-{s_start+l}=>{dest}:{d_start}-{d_start+l} ({d_start-s_start})" + if args.bed_prefix != "": + f_sbed.write(f"{source}\t{s_start}\t{s_start+l}\n") + f_dbed.write(f"{dest}\t{d_start}\t{d_start+l}\n") if check_hdist: hd = compute_hamming_dist( True, - ref1, source, s_start, s_start+l, - ref2, dest, d_start, d_start+l) - msg += f'\t{hd}' + ref1, + source, + s_start, + s_start + l, + ref2, + dest, + d_start, + d_start + l, + ) + msg += f"\t{hd}" else: - msg = f'\t{source}:{s_start}-{s_start+l}=>{dest}:{d_start}-{d_start-l} ({d_start-s_start})' - if args.bed_prefix != '': - f_sbed.write(f'{source}\t{s_start}\t{s_start+l}\n') - f_dbed.write(f'{dest}\t{d_start-l}\t{d_start}\n') + msg = f"\t{source}:{s_start}-{s_start+l}=>{dest}:{d_start}-{d_start-l} ({d_start-s_start})" + if args.bed_prefix != "": + f_sbed.write(f"{source}\t{s_start}\t{s_start+l}\n") + f_dbed.write(f"{dest}\t{d_start-l}\t{d_start}\n") if check_hdist: hd = compute_hamming_dist( False, - ref1, source, s_start, s_start+l, - ref2, dest, d_start-l, d_start) - msg += f'\t{hd}' + ref1, + source, + s_start, + s_start + l, + ref2, + dest, + d_start - l, + d_start, + ) + msg += f"\t{hd}" print(line + msg, file=fo) - write_to_summary(args.summary, fs, strand, l, hd, source, s_start, dest, d_start) - if strand == '+': - s_start += (l + ds) - d_start += (l + dd) + write_to_summary( + args.summary, fs, strand, l, hd, source, s_start, dest, d_start + ) + if strand == "+": + s_start += l + ds + d_start += l + dd else: - s_start += (l + ds) - d_start -= (l + dd) + s_start += l + ds + d_start -= l + dd # write_to_summary(args.summary, fs, strand, l, hd, source, s_start, dest, d_start) if check_hdist: - total_bases_idy += (l * hd) - elif len(fields) == 1 and fields[0] != '': + total_bases_idy += l * hd + elif len(fields) == 1 and fields[0] != "": l = int(fields[0]) total_bases += l - if strand == '+': - msg = f'\t\t\t{source}:{s_start}-{s_start+l}=>{dest}:{d_start}-{d_start+l}' - if args.bed_prefix != '': - f_sbed.write(f'{source}\t{s_start}\t{s_start+l}\n') - f_dbed.write(f'{dest}\t{d_start}\t{d_start+l}\n') + if strand == "+": + msg = f"\t\t\t{source}:{s_start}-{s_start+l}=>{dest}:{d_start}-{d_start+l}" + if args.bed_prefix != "": + f_sbed.write(f"{source}\t{s_start}\t{s_start+l}\n") + f_dbed.write(f"{dest}\t{d_start}\t{d_start+l}\n") if check_hdist: hd = compute_hamming_dist( True, - ref1, source, s_start, s_start+l, - ref2, dest, d_start, d_start+l) - msg += f'\t{hd}' + ref1, + source, + s_start, + s_start + l, + ref2, + dest, + d_start, + d_start + l, + ) + msg += f"\t{hd}" else: - msg = f'\t\t\t{source}:{s_start}-{s_start+l}=>{dest}:{d_start}-{d_start-l}' - if args.bed_prefix != '': - 
f_sbed.write(f'{source}\t{s_start}\t{s_start+l}\n') - f_dbed.write(f'{dest}\t{d_start-l}\t{d_start}\n') + msg = f"\t\t\t{source}:{s_start}-{s_start+l}=>{dest}:{d_start}-{d_start-l}" + if args.bed_prefix != "": + f_sbed.write(f"{source}\t{s_start}\t{s_start+l}\n") + f_dbed.write(f"{dest}\t{d_start-l}\t{d_start}\n") if check_hdist: hd = compute_hamming_dist( False, - ref1, source, s_start, s_start+l, - ref2, dest, d_start-l, d_start) - msg += f'\t{hd}' - write_to_summary(args.summary, fs, strand, l, hd, source, s_start, dest, d_start) + ref1, + source, + s_start, + s_start + l, + ref2, + dest, + d_start - l, + d_start, + ) + msg += f"\t{hd}" + write_to_summary( + args.summary, fs, strand, l, hd, source, s_start, dest, d_start + ) if check_hdist: - total_bases_idy += (l * hd) - print(line + msg + '\n', file=fo) + total_bases_idy += l * hd + print(line + msg + "\n", file=fo) - print(f'Total number of gapless aligned bases = {total_bases}', file=sys.stderr) - print(f'Total number of gapless matched bases = {total_bases_idy}', file=sys.stderr) + print( + f"Total number of gapless aligned bases = {total_bases}", + file=sys.stderr, + ) + print( + f"Total number of gapless matched bases = {total_bases_idy}", + file=sys.stderr, + ) -if __name__ == '__main__': +if __name__ == "__main__": args = parse_args() - print('Input chain:', args.chain, file=sys.stderr) - print('Output chain (verbose):', args.out, file=sys.stderr) + print("Input chain:", args.chain, file=sys.stderr) + print("Output chain (verbose):", args.out, file=sys.stderr) - assert (args.ref1 != '' and args.ref2 != '') or (args.ref1 == '' and args.ref2 == '') - print('Ref1:', args.ref1, file=sys.stderr) - print('Ref2:', args.ref2, file=sys.stderr) + assert (args.ref1 != "" and args.ref2 != "") or ( + args.ref1 == "" and args.ref2 == "" + ) + print("Ref1:", args.ref1, file=sys.stderr) + print("Ref2:", args.ref2, file=sys.stderr) verbosify_chain(args)
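A note on the coordinate bookkeeping in verbosify_chain.py above, which is easy to misread in diff form: within a chain, the source cursor always advances by size + ds, while the destination cursor advances by size + dd on a "+"-strand chain and retreats by size + dd on a "-"-strand chain, so each gapless block covers [d_start, d_start + size) or [d_start - size, d_start) respectively. Below is a minimal, self-contained sketch of that walk; the chain_blocks helper and the sample record are hypothetical, assuming the UCSC chain header layout "chain score tName tSize tStrand tStart tEnd qName qSize qStrand qStart qEnd id", and this is an illustration rather than part of the patch.

def chain_blocks(header: str, data_lines: list):
    """Yield (src, s_lo, s_hi, dst, d_lo, d_hi) per gapless block,
    with both intervals in forward-strand coordinates."""
    f = header.split()
    src, s_start = f[2], int(f[5])
    dst, dest_len, strand = f[7], int(f[8]), f[9]
    # On a "-" strand chain, dest coords are given on the reverse strand;
    # start from the forward-strand top end and walk downward.
    d_start = int(f[10]) if strand == "+" else dest_len - int(f[10])
    for line in data_lines:
        parts = line.split()
        size = int(parts[0])
        # The final line of a chain is a bare block size with no gaps.
        ds, dd = (int(parts[1]), int(parts[2])) if len(parts) == 3 else (0, 0)
        if strand == "+":
            yield src, s_start, s_start + size, dst, d_start, d_start + size
            d_start += size + dd
        else:
            yield src, s_start, s_start + size, dst, d_start - size, d_start
            d_start -= size + dd
        s_start += size + ds


if __name__ == "__main__":
    # Toy "-" strand record: qSize=2000, qStart=100, so the forward-strand
    # destination cursor starts at 2000 - 100 = 1900 and walks downward.
    hdr = "chain 100 chrS 1000 + 10 40 chrD 2000 - 100 130 1"
    for blk in chain_blocks(hdr, ["20 5 5", "5"]):
        print(blk)
    # ('chrS', 10, 30, 'chrD', 1880, 1900)
    # ('chrS', 35, 40, 'chrD', 1870, 1875)

Running the sketch on the toy record prints destination intervals that walk downward from qSize - qStart, matching the f_dbed.write(f"{dest}\t{d_start-l}\t{d_start}\n") lines in the hunks above.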
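Separately, one behavior worth flagging in the reverse_complement helpers that appear (reformatted but otherwise unchanged) in both extract_unpaired_reads.py and verbosify_chain.py: their lookup tables map "g" to "G", so a lowercase g complements to G rather than C, while the other lowercase bases (a, c, t) follow an uppercase-output convention. A corrected table under that same convention, offered as a sketch rather than as a proposed hunk:

# Complement table following the uppercase-output convention used in the
# scripts above; note "g" maps to "C" here (the diffed tables map it to "G").
_COMP = {"A": "T", "a": "T", "C": "G", "c": "G",
         "G": "C", "g": "C", "T": "A", "t": "A", "N": "N"}


def reverse_complement(seq: str) -> str:
    # Unknown bases become "N", mirroring the N fallback in the original
    # helpers (which also emit a warning to stderr before substituting).
    return "".join(_COMP.get(base, "N") for base in reversed(seq))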