From 56dc1a893c6618080067acaff0f7f6a9e0aa3290 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Wed, 17 Apr 2024 22:15:40 -0700 Subject: [PATCH 1/8] TST: stop coordinate to exclusive --- woltka/tests/test_align.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/woltka/tests/test_align.py b/woltka/tests/test_align.py index 12a10a5..5447193 100644 --- a/woltka/tests/test_align.py +++ b/woltka/tests/test_align.py @@ -375,13 +375,13 @@ def test_parse_sam_file_ex_ft(self): ) obs = list(parse_sam_file_ex_ft(iter(sam), {'G1'})) exp = [ - ('S2', [('G2', None, 50, 80, 129), - ('G3', None, 50, 80, 129)]), - ('S2/1', [('G3', None, 50, 80, 129)]), - ('S2/2', [('G4', None, 50, 80, 129)]), - ('S4', [('G6', None, 50, 80, 129)]), - ('S4/1', [('G6', None, 50, 80, 129)]), - ('S4/2', [('G6', None, 50, 80, 129)]) + ('S2', [('G2', None, 50, 80, 130), + ('G3', None, 50, 80, 130)]), + ('S2/1', [('G3', None, 50, 80, 130)]), + ('S2/2', [('G4', None, 50, 80, 130)]), + ('S4', [('G6', None, 50, 80, 130)]), + ('S4/1', [('G6', None, 50, 80, 130)]), + ('S4/2', [('G6', None, 50, 80, 130)]) ] self.assertEqual(len(obs), len(exp)) for o, e in zip(obs, exp): From 3ec85e3d2aa946fd351e83c941592fef215a8d90 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Wed, 17 Apr 2024 22:18:48 -0700 Subject: [PATCH 2/8] BUG: switch stop coordinate to exclusive to be consistent with bedtools --- woltka/tests/test_align.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/woltka/tests/test_align.py b/woltka/tests/test_align.py index 5447193..3323c80 100644 --- a/woltka/tests/test_align.py +++ b/woltka/tests/test_align.py @@ -304,13 +304,13 @@ def test_parse_sam_file_ex(self): ) obs = list(parse_sam_file_ex(iter(sam))) exp = [ - ('S1/1', [('NC_123456', None, 100, 26, 125)]), - ('S1/2', [('NC_123456', None, 80, 151, 230)]), - ('S2', [('NC_789012', None, 90, 186, 280)]), - ('S3/1', [('NC_123456', None, 100, 452, 551), - ('NC_345678', None, 100, 133, 232)]), - ('S3/2', [('NC_123456', None, 95, 378, 477), - ('NC_345678', None, 95, 261, 355)]) + ('S1/1', [('NC_123456', None, 100, 26, 126)]), + ('S1/2', [('NC_123456', None, 80, 151, 231)]), + ('S2', [('NC_789012', None, 90, 186, 281)]), + ('S3/1', [('NC_123456', None, 100, 452, 552), + ('NC_345678', None, 100, 133, 233)]), + ('S3/2', [('NC_123456', None, 95, 378, 478), + ('NC_345678', None, 95, 261, 356)]) ] self.assertEqual(len(obs), len(exp)) for o, e in zip(obs, exp): From 75567567edc229cdd045dc0dd608a1ce61eca37b Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Wed, 17 Apr 2024 22:25:03 -0700 Subject: [PATCH 3/8] TST: remove left shift --- woltka/tests/test_coverage.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/woltka/tests/test_coverage.py b/woltka/tests/test_coverage.py index 2b094cf..70eee60 100644 --- a/woltka/tests/test_coverage.py +++ b/woltka/tests/test_coverage.py @@ -38,8 +38,8 @@ def test_merge_ranges(self): self.assertListEqual(obs, exp) # ranges that are connected (but not overlapped) - obs = merge_ranges([1, 2, 3, 4, 5, 6]) - exp = [1, 6] + obs = merge_ranges([1, 2, 2, 3, 3, 4]) + exp = [1, 4] self.assertListEqual(obs, exp) # empty range list @@ -79,8 +79,8 @@ def test_parse_ranges(self): ('S2', 'G2'): [0, [26, 100, 151, 200]], ('S2', 'G3'): [0, [1, 50, 76, 125]], ('S2', 'G4'): [0, [26, 75, 101, 150]], - ('S3', 'G1'): [0, [1, 200]], - ('S3', 'G2'): [0, [1, 100]]} + ('S3', 'G1'): [0, [1, 50, 51, 100, 101, 150, 151, 200]], + ('S3', 'G2'): [0, [1, 50, 51, 100]]} self.assertDictEqual(obs, exp) def test_calc_coverage(self): @@ -97,8 +97,8 @@ def test_calc_coverage(self): 'S2': {'G2': [26, 100, 151, 200], 'G3': [1, 50, 76, 125], 'G4': [26, 75, 101, 150]}, - 'S3': {'G1': [1, 200], - 'G2': [1, 100]}} + 'S3': {'G1': [1, 50, 51, 100, 101, 150, 151, 200], + 'G2': [1, 50, 51, 100]}} self.assertDictEqual(obs, exp) def test_write_coverage(self): From 2e91f97da8ac76c5ed5ab7046d5a4b99d0ca3e0b Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Wed, 17 Apr 2024 22:25:24 -0700 Subject: [PATCH 4/8] BUG: remove left shift in range calculation --- woltka/coverage.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/woltka/coverage.py b/woltka/coverage.py index b1151bf..a527e34 100644 --- a/woltka/coverage.py +++ b/woltka/coverage.py @@ -50,7 +50,7 @@ def merge_ranges(ranges): if cend is None: # case 1: no active range, start active range cstart, cend = start, end - elif cend >= start - 1: + elif cend >= start: # case 2: active range continues through this range # extend active range cend = max(cend, end) From 81b087737d83d476f64b7d487569e775ef646820 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Wed, 17 Apr 2024 22:27:11 -0700 Subject: [PATCH 5/8] DOC: changelog mention --- CHANGELOG.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7ecc49d..90e9335 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,10 @@ # Change Log +## Version 0.1.6-dev + +### Fixed +- Corrected coverage calculation to (1) remove an unnecessary left shift on start and (2) record the stop coordinate as exclusive in accordance with bedtools ([#204](https://github.com/qiyunzhu/woltka/pull/204)). + ## Version 0.1.6 (2/22/2024) ### Changed From 036948c202f66f45368935fc8d464d171c4e2311 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Wed, 17 Apr 2024 22:47:54 -0700 Subject: [PATCH 6/8] Missed an add --- woltka/align.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/woltka/align.py b/woltka/align.py index 5eba418..6dba1e0 100644 --- a/woltka/align.py +++ b/woltka/align.py @@ -457,7 +457,7 @@ def parse_sam_file_ex(fh): this, pool = qname, ([], [], []) # append - pool[mate].append((rname, None, length, pos, pos + offset - 1)) + pool[mate].append((rname, None, length, pos, pos + offset)) # final yield if pool[0]: @@ -590,7 +590,7 @@ def parse_sam_file_ex_ft(fh, excl): pos = int(pos) length, offset = cigar_to_lens(cigar) pool[int(flag) >> 6 & 3].append(( - rname, None, length, pos, pos + offset - 1)) + rname, None, length, pos, pos + offset)) elif keep: if rname in excl: @@ -599,7 +599,7 @@ def parse_sam_file_ex_ft(fh, excl): pos = int(pos) length, offset = cigar_to_lens(cigar) pool[int(flag) >> 6 & 3].append(( - rname, None, length, pos, pos + offset - 1)) + rname, None, length, pos, pos + offset)) if pool[0]: yield this, pool[0] @@ -718,7 +718,7 @@ def parse_sam_file_pd(fh, n=65536): # # this is slow, because of function all # chunk['length'], offset = zip(*chunk['cigar'].apply( # cigar_to_lens)) - # chunk['right'] = chunk['pos'] + offset - 1 + # chunk['right'] = chunk['pos'] + offset # # this is slow, because of function all # # chunk['qname'] = chunk[['qname', 'flag']].apply( # # qname_by_flag, axis=1) From e591c964856018de45f1d7a74300586b9e3e7910 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Wed, 17 Apr 2024 22:57:17 -0700 Subject: [PATCH 7/8] Reflect adjusted stop coordinate --- woltka/tests/test_ordinal.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/woltka/tests/test_ordinal.py b/woltka/tests/test_ordinal.py index 10704f8..10bfc97 100644 --- a/woltka/tests/test_ordinal.py +++ b/woltka/tests/test_ordinal.py @@ -265,9 +265,9 @@ def test_ordinal_parser_dummy(self): 'NC_123456': {0: 100, 1: 80}, 'NC_789012': {2: 90}}) self.assertDictEqual(obs[2], { - 'NC_123456': [(26, True, False, 0), (125, False, False, 0), - (151, True, False, 1), (230, False, False, 1)], - 'NC_789012': [(186, True, False, 2), (280, False, False, 2)]}) + 'NC_123456': [(26, True, False, 0), (126, False, False, 0), + (151, True, False, 1), (231, False, False, 1)], + 'NC_789012': [(186, True, False, 2), (281, False, False, 2)]}) def test_ordinal_mapper(self): # uses the same example as above, with some noises From b7ca01539dc3d3120b3dfd7cc989a701792ff22a Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Wed, 17 Apr 2024 22:58:56 -0700 Subject: [PATCH 8/8] Reflect adjusted stop coordinate --- woltka/tests/test_workflow.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/woltka/tests/test_workflow.py b/woltka/tests/test_workflow.py index b54c9ab..5604da7 100644 --- a/woltka/tests/test_workflow.py +++ b/woltka/tests/test_workflow.py @@ -57,8 +57,8 @@ def test_coverage(self): with open(join(outcov_dir, 'S04.cov'), 'r') as f: obs = f.read().splitlines() self.assertEqual(len(obs), 1078) - self.assertEqual(obs[10], 'G000007265\t2092666\t2092815') - self.assertEqual(obs[200], 'G000215745\t768758\t769038') + self.assertEqual(obs[10], 'G000007265\t2092666\t2092816') + self.assertEqual(obs[200], 'G000215745\t768758\t769039') remove(output_fp) rmtree(outcov_dir)