qiyunzhu · wasade · Apr 18, 2024 · Apr 18, 2024 · Apr 18, 2024 · Apr 18, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,10 @@
 # Change Log
 
+## Version 0.1.6-dev
+
+### Fixed
+- Corrected coverage calculation to (1) remove an unnecessary left shift (`- 1`) of the start coordinate when merging ranges and (2) record the stop coordinate as exclusive, in accordance with bedtools ([#204](https://github.com/qiyunzhu/woltka/pull/204)).
+
 ## Version 0.1.6 (2/22/2024)
 
 ### Changed

diff --git a/woltka/align.py b/woltka/align.py
@@ -457,7 +457,7 @@ def parse_sam_file_ex(fh):
             this, pool = qname, ([], [], [])
 
         # append
-        pool[mate].append((rname, None, length, pos, pos + offset - 1))
+        pool[mate].append((rname, None, length, pos, pos + offset))
 
     # final yield
     if pool[0]:
@@ -590,7 +590,7 @@ def parse_sam_file_ex_ft(fh, excl):
                 pos = int(pos)
                 length, offset = cigar_to_lens(cigar)
                 pool[int(flag) >> 6 & 3].append((
-                    rname, None, length, pos, pos + offset - 1))
+                    rname, None, length, pos, pos + offset))
 
         elif keep:
             if rname in excl:
@@ -599,7 +599,7 @@ def parse_sam_file_ex_ft(fh, excl):
                 pos = int(pos)
                 length, offset = cigar_to_lens(cigar)
                 pool[int(flag) >> 6 & 3].append((
-                    rname, None, length, pos, pos + offset - 1))
+                    rname, None, length, pos, pos + offset))
 
     if pool[0]:
         yield this, pool[0]
@@ -718,7 +718,7 @@ def parse_sam_file_pd(fh, n=65536):
     #         # this is slow, because of function all
     #         chunk['length'], offset = zip(*chunk['cigar'].apply(
     #             cigar_to_lens))
-    #         chunk['right'] = chunk['pos'] + offset - 1
+    #         chunk['right'] = chunk['pos'] + offset
     #         # this is slow, because of function all
     #         # chunk['qname'] = chunk[['qname', 'flag']].apply(
     #         #   qname_by_flag, axis=1)

diff --git a/woltka/coverage.py b/woltka/coverage.py
@@ -50,7 +50,7 @@ def merge_ranges(ranges):
         if cend is None:
             # case 1: no active range, start active range
             cstart, cend = start, end
-        elif cend >= start - 1:
+        elif cend >= start:
             # case 2: active range continues through this range
             # extend active range
             cend = max(cend, end)

diff --git a/woltka/tests/test_align.py b/woltka/tests/test_align.py
@@ -304,13 +304,13 @@ def test_parse_sam_file_ex(self):
         )
         obs = list(parse_sam_file_ex(iter(sam)))
         exp = [
-            ('S1/1', [('NC_123456', None, 100, 26,  125)]),
-            ('S1/2', [('NC_123456', None,  80, 151, 230)]),
-            ('S2',   [('NC_789012', None,  90, 186, 280)]),
-            ('S3/1', [('NC_123456', None, 100, 452, 551),
-                      ('NC_345678', None, 100, 133, 232)]),
-            ('S3/2', [('NC_123456', None,  95, 378, 477),
-                      ('NC_345678', None,  95, 261, 355)])
+            ('S1/1', [('NC_123456', None, 100, 26,  126)]),
+            ('S1/2', [('NC_123456', None,  80, 151, 231)]),
+            ('S2',   [('NC_789012', None,  90, 186, 281)]),
+            ('S3/1', [('NC_123456', None, 100, 452, 552),
+                      ('NC_345678', None, 100, 133, 233)]),
+            ('S3/2', [('NC_123456', None,  95, 378, 478),
+                      ('NC_345678', None,  95, 261, 356)])
         ]
         self.assertEqual(len(obs), len(exp))
         for o, e in zip(obs, exp):
@@ -375,13 +375,13 @@ def test_parse_sam_file_ex_ft(self):
         )
         obs = list(parse_sam_file_ex_ft(iter(sam), {'G1'}))
         exp = [
-            ('S2',   [('G2', None, 50, 80, 129),
-                      ('G3', None, 50, 80, 129)]),
-            ('S2/1', [('G3', None, 50, 80, 129)]),
-            ('S2/2', [('G4', None, 50, 80, 129)]),
-            ('S4',   [('G6', None, 50, 80, 129)]),
-            ('S4/1', [('G6', None, 50, 80, 129)]),
-            ('S4/2', [('G6', None, 50, 80, 129)])
+            ('S2',   [('G2', None, 50, 80, 130),
+                      ('G3', None, 50, 80, 130)]),
+            ('S2/1', [('G3', None, 50, 80, 130)]),
+            ('S2/2', [('G4', None, 50, 80, 130)]),
+            ('S4',   [('G6', None, 50, 80, 130)]),
+            ('S4/1', [('G6', None, 50, 80, 130)]),
+            ('S4/2', [('G6', None, 50, 80, 130)])
         ]
         self.assertEqual(len(obs), len(exp))
         for o, e in zip(obs, exp):

diff --git a/woltka/tests/test_coverage.py b/woltka/tests/test_coverage.py
@@ -38,8 +38,8 @@ def test_merge_ranges(self):
         self.assertListEqual(obs, exp)
 
         # ranges that are connected (but not overlapped)
-        obs = merge_ranges([1, 2, 3, 4, 5, 6])
-        exp = [1, 6]
+        obs = merge_ranges([1, 2, 2, 3, 3, 4])
+        exp = [1, 4]
         self.assertListEqual(obs, exp)
 
         # empty range list
@@ -79,8 +79,8 @@ def test_parse_ranges(self):
                ('S2', 'G2'): [0, [26, 100, 151, 200]],
                ('S2', 'G3'): [0, [1, 50, 76, 125]],
                ('S2', 'G4'): [0, [26, 75, 101, 150]],
-               ('S3', 'G1'): [0, [1, 200]],
-               ('S3', 'G2'): [0, [1, 100]]}
+               ('S3', 'G1'): [0, [1, 50, 51, 100, 101, 150, 151, 200]],
+               ('S3', 'G2'): [0, [1, 50, 51, 100]]}
         self.assertDictEqual(obs, exp)
 
     def test_calc_coverage(self):
@@ -97,8 +97,8 @@ def test_calc_coverage(self):
                'S2': {'G2': [26, 100, 151, 200],
                       'G3': [1, 50, 76, 125],
                       'G4': [26, 75, 101, 150]},
-               'S3': {'G1': [1, 200],
-                      'G2': [1, 100]}}
+               'S3': {'G1': [1, 50, 51, 100, 101, 150, 151, 200],
+                      'G2': [1, 50, 51, 100]}}
         self.assertDictEqual(obs, exp)
 
     def test_write_coverage(self):

diff --git a/woltka/tests/test_ordinal.py b/woltka/tests/test_ordinal.py
@@ -265,9 +265,9 @@ def test_ordinal_parser_dummy(self):
             'NC_123456': {0: 100, 1: 80},
             'NC_789012': {2: 90}})
         self.assertDictEqual(obs[2], {
-            'NC_123456': [(26,  True, False, 0), (125, False, False, 0),
-                          (151, True, False, 1), (230, False, False, 1)],
-            'NC_789012': [(186, True, False, 2), (280, False, False, 2)]})
+            'NC_123456': [(26,  True, False, 0), (126, False, False, 0),
+                          (151, True, False, 1), (231, False, False, 1)],
+            'NC_789012': [(186, True, False, 2), (281, False, False, 2)]})
 
     def test_ordinal_mapper(self):
         # uses the same example as above, with some noises

diff --git a/woltka/tests/test_workflow.py b/woltka/tests/test_workflow.py
@@ -57,8 +57,8 @@ def test_coverage(self):
         with open(join(outcov_dir, 'S04.cov'), 'r') as f:
             obs = f.read().splitlines()
         self.assertEqual(len(obs), 1078)
-        self.assertEqual(obs[10], 'G000007265\t2092666\t2092815')
-        self.assertEqual(obs[200], 'G000215745\t768758\t769038')
+        self.assertEqual(obs[10], 'G000007265\t2092666\t2092816')
+        self.assertEqual(obs[200], 'G000215745\t768758\t769039')
         remove(output_fp)
         rmtree(outcov_dir)