Cleanup + Bug fixes in z_scores module. (#49)

Jenny pointed out some bugs in the peak calling code. This PR resolves those errors (hopefully) and also cleans up the z score calling code.
gwlilabmit · Mar 3, 2023 · 361407b · 361407b
1 parent 9cace9a
commit 361407b
Show file tree

Hide file tree

Showing 5 changed files with 206 additions and 339 deletions.
diff --git a/rendseq/file_funcs.py b/rendseq/file_funcs.py
@@ -8,7 +8,7 @@
 from pandas import read_csv
 
 
-def validate_reads(reads):
+def _validate_reads(reads):
     """Make sure the given reads meet our format requirements.
 
     Parameters
@@ -48,7 +48,7 @@ def write_wig(wig_track, wig_file_name, chrom_name):
         - wig_track (required) - the wig data you wish to write (in 2xn array)
         - wig_file_name (string) - the new file you will write to
     """
-    validate_reads(wig_track)
+    _validate_reads(wig_track)
     d_inds = where(wig_track[:, 0] < 1)
     wig_track = delete(wig_track, d_inds, axis=0)
     with open(wig_file_name, "w+", encoding="utf-8") as wig_file:
@@ -83,7 +83,7 @@ def open_wig(filename):
         chrom = line[line.rfind("=") + 1 :].rstrip()
     # next we read all the wig file data and return that if it's valid:
     reads = asarray(read_csv(filename, sep="\t", header=1, names=["bp", "count"]))
-    validate_reads(reads)
+    _validate_reads(reads)
     return reads, chrom
 
 

diff --git a/rendseq/zscores.py b/rendseq/zscores.py
@@ -5,48 +5,62 @@
 import warnings
 from os.path import abspath
 
-from numpy import mean, std, zeros
+from numpy import mean, nan, std, zeros
 
-from rendseq.file_funcs import make_new_dir, open_wig, validate_reads, write_wig
+from rendseq.file_funcs import _validate_reads, make_new_dir, open_wig, write_wig
 
 
-def _adjust_down(cur_ind, target_val, reads):
+def _get_lower_reads(cur_ind, target_start, target_stop, reads):
     """Calculate the lower reads index in range for the z-score calculation."""
-    validate_reads(reads)
-    cur_ind = min(cur_ind, len(reads) - 1)
-    while reads[cur_ind, 0] > target_val:
+    cur_ind = max(min(cur_ind, len(reads) - 1), 0)
+    start_ind = cur_ind
+    vals = [0 for _ in range(int(reads[cur_ind, 0] - target_start - 1))]
+    while cur_ind >= 0 and reads[cur_ind, 0] >= target_stop:
+        if cur_ind != start_ind:
+            vals.extend(
+                0 for _ in range(int(reads[cur_ind, 0] - reads[cur_ind - 1, 0] - 1))
+            )
+        vals.append(reads[cur_ind, 1])
         cur_ind -= 1
 
-        if cur_ind == 0:
-            break
-    return cur_ind
+    return vals
 
 
-def _adjust_up(cur_ind, target_val, reads):
-    """Calculate the higher reads index in range for the z-score calculation."""
-    if len(reads) < 1:
-        raise ValueError("requires non-empty reads")
+def _get_upper_reads(cur_ind, target_start, target_stop, reads):
+    """Fetch the upper reads needed for z score calculation with zero padding."""
+    cur_ind = min(max(cur_ind, 0), len(reads) - 1)
+    start_ind = cur_ind
+    vals = [0 for _ in range(int(reads[cur_ind, 0] - target_start - 1))]
 
-    cur_ind = min(max(cur_ind, 0), len(reads))
-
-    while reads[cur_ind, 0] < target_val:
-        if cur_ind >= len(reads) - 1:
-            break
+    while reads[cur_ind, 0] < target_stop and not cur_ind >= len(reads) - 1:
+        if cur_ind != start_ind:
+            vals.extend(
+                0 for _ in range(int(reads[cur_ind, 0] - reads[cur_ind - 1, 0] - 1))
+            )
+        vals.append(reads[cur_ind, 1])
         cur_ind += 1
 
-    return cur_ind
+    return vals
 
 
-def _z_score(val, v_mean, v_std):
+def _calc_z_score(vals, calc_val):
     """Calculate a z-score given a value, mean, and standard deviation.
 
+    vals = values to calculate the z score with respect to.
+    calc_val = value to calculate z score for.
+
     NOTE: The z_score() of a constant vector is 0
     """
-    score = 0 if v_std == 0 else (val - v_mean) / v_std
-    return score
+    v_std = std(vals)
+    v_mean = mean(vals)
+    if nan in [v_mean, v_std]:
+        return calc_val
+    if v_std == 0:
+        return 0 if v_mean == calc_val else (calc_val - v_mean) / 0.2
+    return (calc_val - v_mean) / v_std
 
 
-def _remove_outliers(vals):
+def _remove_outliers(vals, method="remove_by_std"):
     """Normalize window of reads by removing outliers (values 2.5 std > mean).
 
     Parameters
@@ -59,74 +73,28 @@ def _remove_outliers(vals):
             removed.
     """
     normalized_vals = vals
-    if len(vals) > 1:
-        v_mean = mean(vals)
+
+    if method == "remove_by_std" and len(vals) > 1:
         v_std = std(vals)
         if v_std != 0:
-            normalized_vals = [v for v in vals if abs(_z_score(v, v_mean, v_std)) < 2.5]
+            normalized_vals = [v for v in vals if abs(_calc_z_score(vals, v)) < 2.5]
 
     return normalized_vals
 
 
-def _calc_score(vals, min_r, cur_val):
-    """Compute the z score.
-
-    Parameters
-    ----------
-        -vals raw read count values array
-        -min_r: the minumum number of reads needed to calculate score
-        -cur_val: the value for which the z score is being calculated
-
-    Returns
-    -------
-        -score: the zscore for the current value, or None if insufficent reads
-    """
-    score = None
-    if sum(vals + cur_val) > min_r:
-        v_mean = mean(vals)
-        v_std = std(vals)
-
-        score = _z_score(cur_val, v_mean, v_std)
-
-    return score
-
-
-def score_helper(start, stop, min_r, reads, i):
-    """Find the z-score of reads[i] relative to the subsection of reads.
-
-    Goes from start to stop, with a read cutoff of min_r
-    """
-    reads_outlierless = _remove_outliers(list(reads[start:stop, 1]))
-    return _calc_score(reads_outlierless, min_r, reads[i, 1])
-
-
-def validate_gap_window(gap, w_sz):
+def _validate_gap_window(gap, w_sz):
     """Check that gap and window size are reasonable in r/l_score_helper."""
     if w_sz < 1:
         raise ValueError("Window size must be larger than 1 to find a z-score")
     if gap < 0:
         raise ValueError("Gap size must be at least zero to find a z-score")
     if gap == 0:
-        warnings.warn("Warning...a gap size of 0 includes the current position.")
-
-
-def _l_score_helper(gap, w_sz, min_r, reads, i):
-    """Find the z_score based on reads to the left of the current pos."""
-    validate_gap_window(gap, w_sz)
-    l_start = _adjust_up(i - (gap + w_sz), reads[i, 0] - (gap + w_sz), reads)
-    l_stop = _adjust_up(i - gap, reads[i, 0] - gap, reads)
-    return score_helper(l_start, l_stop, min_r, reads, i)
-
-
-def _r_score_helper(gap, w_sz, min_r, reads, i):
-    """Find the z_score based on reads to the right of the current pos."""
-    validate_gap_window(gap, w_sz)
-    r_start = _adjust_down(i + gap, reads[i, 0] + gap, reads)
-    r_stop = _adjust_down(i + gap + w_sz, reads[i, 0] + gap + w_sz, reads)
-    return score_helper(r_start, r_stop, min_r, reads, i)
+        warnings.warn(
+            "Warning...a gap size of 0 includes the current position.", stacklevel=2
+        )
 
 
-def z_scores(reads, gap=5, w_sz=50, min_r=20):
+def z_scores(reads, gap=5, w_sz=50):
     """Perform modified z-score transformation of reads.
 
     Parameters
@@ -136,48 +104,41 @@ def z_scores(reads, gap=5, w_sz=50, min_r=20):
             interest that should be excluded in the z_score calculation.
         -w_sz (integer): the max distance (in nt) away from the current position
             one should include in zscore calulcation.
-        -min_r (integer): density threshold. If there are less than this number
-            of reads going into the z_score calculation for a point that point
-            is excluded.  note this is sum of reads in the window
-        -file_name (string): the base file_name, can be passed in to customize
-            the message printed
 
     Returns
     -------
-        -z_score (2xn array): a 2xn array with the first column being position
+        -z_scores (2xn array): a 2xn array with the first column being position
             and the second column being the z_score.
     """
+    _validate_gap_window(gap, w_sz)
+    _validate_reads(reads)
     # make array of zscores - same length as raw reads, trimming based on window size:
-    z_score = zeros([len(reads) - 2 * (gap + w_sz), 2])
+    z_scores = zeros([len(reads) - 2 * (gap + w_sz), 2])
 
     # first column of return array is the location of the raw reads
-    z_score[:, 0] = reads[gap + w_sz : len(reads) - (gap + w_sz), 0]
+    z_scores[:, 0] = reads[gap + w_sz : len(reads) - (gap + w_sz), 0]
 
     # Iterate through each valid read, recording z-score
     for i in range((gap + w_sz + 1), (len(reads) - (gap + w_sz))):
         # calculate the z score with values from the left:
-        l_score = _l_score_helper(gap, w_sz, min_r, reads, i)
+        l_vals = _get_upper_reads(
+            i + gap, reads[i, 0] + gap, reads[i, 0] + gap + w_sz, reads
+        )
+        l_score = _calc_z_score(_remove_outliers(l_vals), reads[i, 1])
+
         # calculate z score with reads from the right:
-        r_score = _r_score_helper(gap, w_sz, min_r, reads, i)
+        r_vals = _get_lower_reads(
+            i - gap, reads[i, 0] - gap, reads[i, 0] - gap - w_sz, reads
+        )
+        r_score = _calc_z_score(_remove_outliers(r_vals), reads[i, 1])
 
         # The location in which this z-score should go into the final array
         i_score_pos = i - (gap + w_sz)
 
-        # set the zscore to be the smaller valid score of the left/right scores
-        # If neither score is valid, Z-score is 0
-        z_score[i_score_pos, 1] = 0
-        if l_score is not None:
-            if r_score is not None:
-                z_score[i_score_pos, 1] = (
-                    r_score if abs(r_score) < abs(l_score) else l_score
-                )
-            else:
-                z_score[i_score_pos, 1] = l_score
-
-        elif r_score is not None:
-            z_score[i_score_pos, 1] = r_score
+        # set the zscore to be the smaller valid score of the left/right scores.
+        z_scores[i_score_pos, 1] = r_score if abs(r_score) < abs(l_score) else l_score
 
-    return z_score
+    return z_scores
 
 
 def parse_args_zscores(args):
@@ -211,16 +172,6 @@ def parse_args_zscores(args):
                                         Default to 50.",
         default=50,
     )
-    parser.add_argument(
-        "--min_r",
-        help="min_r (integer): density threshold.\
-                                        If there are less than this number of\
-                                        reads going into the z_score\
-                                        calculation for a point that point is\
-                                        excluded.  note this is sum of reads in\
-                                        the window.  Default is 20",
-        default=20,
-    )
     parser.add_argument(
         "--save_file",
         help="Save the z_scores file as a new\
@@ -243,30 +194,31 @@ def main_zscores():
     filename = args.filename
     print(f"Calculating zscores for file {filename}.")
     reads, chrom = open_wig(filename)
-    z_score = z_scores(
-        reads, gap=int(args.gap), w_sz=int(args.w_sz), min_r=int(args.min_r)
-    )
+    z_score = z_scores(reads, gap=int(args.gap), w_sz=int(args.w_sz))
 
     # Save file, if applicable
     if args.save_file:
-        filename = abspath(filename).replace("\\", "/")
-        file_loc = filename[: filename.rfind("/")]
-        z_score_dir = make_new_dir([file_loc, "/Z_scores"])
-        file_start = filename[filename.rfind("/") : filename.rfind(".")]
-        z_score_file = "".join([z_score_dir, file_start, "_zscores.wig"])
-        write_wig(z_score, z_score_file, chrom)
-        print(f"Wrote z_scores to {z_score_file}")
-
+        _save_zscore(filename, z_score, chrom)
     print(
         "\n".join(
             [
-                f"Ran zscores.py with the following settings:",
+                "Ran zscores.py with the following settings:",
                 f"gap: {args.gap}, w_sz: {args.w_sz},",
-                f"min_r: {args.min_r}, file_name: {args.filename}",
+                f"file_name: {args.filename}",
             ]
         )
     )
 
 
+def _save_zscore(filename, z_score, chrom):
+    filename = abspath(filename).replace("\\", "/")
+    file_loc = filename[: filename.rfind("/")]
+    z_score_dir = make_new_dir([file_loc, "/Z_scores"])
+    file_start = filename[filename.rfind("/") : filename.rfind(".")]
+    z_score_file = "".join([z_score_dir, file_start, "_zscores.wig"])
+    write_wig(z_score, z_score_file, chrom)
+    print(f"Wrote z_scores to {z_score_file}")
+
+
 if __name__ == "__main__":
     main_zscores()
diff --git a/tests/test_file_funcs.py b/tests/test_file_funcs.py
@@ -4,28 +4,28 @@
 from numpy import array
 from numpy.testing import assert_array_equal
 
-from rendseq.file_funcs import make_new_dir, open_wig, validate_reads, write_wig
+from rendseq.file_funcs import _validate_reads, make_new_dir, open_wig, write_wig
 
 
 class TestValidateReads:
     def test_correct(self):
         """validate a correct read array"""
         try:
-            validate_reads(array([[1, 2], [3, 4]]))
+            _validate_reads(array([[1, 2], [3, 4]]))
         except Exception as e:
             assert False, f"validate_reads invalid exception: {e}"
 
     def test_incorrect_dim(self):
         """read array has too many columns"""
         with pytest.raises(ValueError) as e_info:
-            validate_reads(array([[1, 2, 3], [4, 5, 6]]))
+            _validate_reads(array([[1, 2, 3], [4, 5, 6]]))
 
         assert e_info.value.args[0] == "reads must be (n,2), not (2, 3)"
 
     def test_incorrect_type(self):
         """read array isn't actually an array"""
         with pytest.raises(ValueError) as e_info:
-            validate_reads([1, 2, 3])
+            _validate_reads([1, 2, 3])
 
         assert e_info.value.args[0] == "reads must be numpy array, not <class 'list'>"