Skip to content

Commit

Permalink
Cleanup + Bug fixes in z_scores module. (#49)
Browse files Browse the repository at this point in the history
Jenny pointed out some bugs in the peak calling code.  This PR resolves those errors (hopefully) and also cleans up the z score calling code.
  • Loading branch information
miraep8 authored Mar 3, 2023
1 parent 9cace9a commit 361407b
Show file tree
Hide file tree
Showing 5 changed files with 206 additions and 339 deletions.
6 changes: 3 additions & 3 deletions rendseq/file_funcs.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from pandas import read_csv


def validate_reads(reads):
def _validate_reads(reads):
"""Make sure the given reads meet our format requirements.
Parameters
Expand Down Expand Up @@ -48,7 +48,7 @@ def write_wig(wig_track, wig_file_name, chrom_name):
- wig_track (required) - the wig data you wish to write (in 2xn array)
- wig_file_name (string) - the new file you will write to
"""
validate_reads(wig_track)
_validate_reads(wig_track)
d_inds = where(wig_track[:, 0] < 1)
wig_track = delete(wig_track, d_inds, axis=0)
with open(wig_file_name, "w+", encoding="utf-8") as wig_file:
Expand Down Expand Up @@ -83,7 +83,7 @@ def open_wig(filename):
chrom = line[line.rfind("=") + 1 :].rstrip()
# next we read all the wig file data and return that if it's valid:
reads = asarray(read_csv(filename, sep="\t", header=1, names=["bp", "count"]))
validate_reads(reads)
_validate_reads(reads)
return reads, chrom


Expand Down
200 changes: 76 additions & 124 deletions rendseq/zscores.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,48 +5,62 @@
import warnings
from os.path import abspath

from numpy import mean, std, zeros
from numpy import mean, nan, std, zeros

from rendseq.file_funcs import make_new_dir, open_wig, validate_reads, write_wig
from rendseq.file_funcs import _validate_reads, make_new_dir, open_wig, write_wig


def _adjust_down(cur_ind, target_val, reads):
def _get_lower_reads(cur_ind, target_start, target_stop, reads):
"""Calculate the lower reads index in range for the z-score calculation."""
validate_reads(reads)
cur_ind = min(cur_ind, len(reads) - 1)
while reads[cur_ind, 0] > target_val:
cur_ind = max(min(cur_ind, len(reads) - 1), 0)
start_ind = cur_ind
vals = [0 for _ in range(int(reads[cur_ind, 0] - target_start - 1))]
while cur_ind >= 0 and reads[cur_ind, 0] >= target_stop:
if cur_ind != start_ind:
vals.extend(
0 for _ in range(int(reads[cur_ind, 0] - reads[cur_ind - 1, 0] - 1))
)
vals.append(reads[cur_ind, 1])
cur_ind -= 1

if cur_ind == 0:
break
return cur_ind
return vals


def _adjust_up(cur_ind, target_val, reads):
"""Calculate the higher reads index in range for the z-score calculation."""
if len(reads) < 1:
raise ValueError("requires non-empty reads")
def _get_upper_reads(cur_ind, target_start, target_stop, reads):
"""Fetch the upper reads needed for z score calculation with zero padding."""
cur_ind = min(max(cur_ind, 0), len(reads) - 1)
start_ind = cur_ind
vals = [0 for _ in range(int(reads[cur_ind, 0] - target_start - 1))]

cur_ind = min(max(cur_ind, 0), len(reads))

while reads[cur_ind, 0] < target_val:
if cur_ind >= len(reads) - 1:
break
while reads[cur_ind, 0] < target_stop and not cur_ind >= len(reads) - 1:
if cur_ind != start_ind:
vals.extend(
0 for _ in range(int(reads[cur_ind, 0] - reads[cur_ind - 1, 0] - 1))
)
vals.append(reads[cur_ind, 1])
cur_ind += 1

return cur_ind
return vals


def _z_score(val, v_mean, v_std):
def _calc_z_score(vals, calc_val):
"""Calculate a z-score given a value, mean, and standard deviation.
vals = values to calculate the z score with respect to.
calc_val = value to calculate z score for.
NOTE: The z_score() of a constant vector is 0
"""
score = 0 if v_std == 0 else (val - v_mean) / v_std
return score
v_std = std(vals)
v_mean = mean(vals)
if nan in [v_mean, v_std]:
return calc_val
if v_std == 0:
return 0 if v_mean == calc_val else (calc_val - v_mean) / 0.2
return (calc_val - v_mean) / v_std


def _remove_outliers(vals):
def _remove_outliers(vals, method="remove_by_std"):
"""Normalize window of reads by removing outliers (values 2.5 std > mean).
Parameters
Expand All @@ -59,74 +73,28 @@ def _remove_outliers(vals):
removed.
"""
normalized_vals = vals
if len(vals) > 1:
v_mean = mean(vals)

if method == "remove_by_std" and len(vals) > 1:
v_std = std(vals)
if v_std != 0:
normalized_vals = [v for v in vals if abs(_z_score(v, v_mean, v_std)) < 2.5]
normalized_vals = [v for v in vals if abs(_calc_z_score(vals, v)) < 2.5]

return normalized_vals


def _calc_score(vals, min_r, cur_val):
"""Compute the z score.
Parameters
----------
-vals raw read count values array
-min_r: the minumum number of reads needed to calculate score
-cur_val: the value for which the z score is being calculated
Returns
-------
-score: the zscore for the current value, or None if insufficent reads
"""
score = None
if sum(vals + cur_val) > min_r:
v_mean = mean(vals)
v_std = std(vals)

score = _z_score(cur_val, v_mean, v_std)

return score


def score_helper(start, stop, min_r, reads, i):
"""Find the z-score of reads[i] relative to the subsection of reads.
Goes from start to stop, with a read cutoff of min_r
"""
reads_outlierless = _remove_outliers(list(reads[start:stop, 1]))
return _calc_score(reads_outlierless, min_r, reads[i, 1])


def validate_gap_window(gap, w_sz):
def _validate_gap_window(gap, w_sz):
"""Check that gap and window size are reasonable in r/l_score_helper."""
if w_sz < 1:
raise ValueError("Window size must be larger than 1 to find a z-score")
if gap < 0:
raise ValueError("Gap size must be at least zero to find a z-score")
if gap == 0:
warnings.warn("Warning...a gap size of 0 includes the current position.")


def _l_score_helper(gap, w_sz, min_r, reads, i):
"""Find the z_score based on reads to the left of the current pos."""
validate_gap_window(gap, w_sz)
l_start = _adjust_up(i - (gap + w_sz), reads[i, 0] - (gap + w_sz), reads)
l_stop = _adjust_up(i - gap, reads[i, 0] - gap, reads)
return score_helper(l_start, l_stop, min_r, reads, i)


def _r_score_helper(gap, w_sz, min_r, reads, i):
"""Find the z_score based on reads to the right of the current pos."""
validate_gap_window(gap, w_sz)
r_start = _adjust_down(i + gap, reads[i, 0] + gap, reads)
r_stop = _adjust_down(i + gap + w_sz, reads[i, 0] + gap + w_sz, reads)
return score_helper(r_start, r_stop, min_r, reads, i)
warnings.warn(
"Warning...a gap size of 0 includes the current position.", stacklevel=2
)


def z_scores(reads, gap=5, w_sz=50, min_r=20):
def z_scores(reads, gap=5, w_sz=50):
"""Perform modified z-score transformation of reads.
Parameters
Expand All @@ -136,48 +104,41 @@ def z_scores(reads, gap=5, w_sz=50, min_r=20):
interest that should be excluded in the z_score calculation.
-w_sz (integer): the max distance (in nt) away from the current position
one should include in zscore calulcation.
-min_r (integer): density threshold. If there are less than this number
of reads going into the z_score calculation for a point that point
is excluded. note this is sum of reads in the window
-file_name (string): the base file_name, can be passed in to customize
the message printed
Returns
-------
-z_score (2xn array): a 2xn array with the first column being position
-z_scores (2xn array): a 2xn array with the first column being position
and the second column being the z_score.
"""
_validate_gap_window(gap, w_sz)
_validate_reads(reads)
# make array of zscores - same length as raw reads, trimming based on window size:
z_score = zeros([len(reads) - 2 * (gap + w_sz), 2])
z_scores = zeros([len(reads) - 2 * (gap + w_sz), 2])

# first column of return array is the location of the raw reads
z_score[:, 0] = reads[gap + w_sz : len(reads) - (gap + w_sz), 0]
z_scores[:, 0] = reads[gap + w_sz : len(reads) - (gap + w_sz), 0]

# Iterate through each valid read, recording z-score
for i in range((gap + w_sz + 1), (len(reads) - (gap + w_sz))):
# calculate the z score with values from the left:
l_score = _l_score_helper(gap, w_sz, min_r, reads, i)
l_vals = _get_upper_reads(
i + gap, reads[i, 0] + gap, reads[i, 0] + gap + w_sz, reads
)
l_score = _calc_z_score(_remove_outliers(l_vals), reads[i, 1])

# calculate z score with reads from the right:
r_score = _r_score_helper(gap, w_sz, min_r, reads, i)
r_vals = _get_lower_reads(
i - gap, reads[i, 0] - gap, reads[i, 0] - gap - w_sz, reads
)
r_score = _calc_z_score(_remove_outliers(r_vals), reads[i, 1])

# The location in which this z-score should go into the final array
i_score_pos = i - (gap + w_sz)

# set the zscore to be the smaller valid score of the left/right scores
# If neither score is valid, Z-score is 0
z_score[i_score_pos, 1] = 0
if l_score is not None:
if r_score is not None:
z_score[i_score_pos, 1] = (
r_score if abs(r_score) < abs(l_score) else l_score
)
else:
z_score[i_score_pos, 1] = l_score

elif r_score is not None:
z_score[i_score_pos, 1] = r_score
# set the zscore to be the smaller valid score of the left/right scores.
z_scores[i_score_pos, 1] = r_score if abs(r_score) < abs(l_score) else l_score

return z_score
return z_scores


def parse_args_zscores(args):
Expand Down Expand Up @@ -211,16 +172,6 @@ def parse_args_zscores(args):
Default to 50.",
default=50,
)
parser.add_argument(
"--min_r",
help="min_r (integer): density threshold.\
If there are less than this number of\
reads going into the z_score\
calculation for a point that point is\
excluded. note this is sum of reads in\
the window. Default is 20",
default=20,
)
parser.add_argument(
"--save_file",
help="Save the z_scores file as a new\
Expand All @@ -243,30 +194,31 @@ def main_zscores():
filename = args.filename
print(f"Calculating zscores for file {filename}.")
reads, chrom = open_wig(filename)
z_score = z_scores(
reads, gap=int(args.gap), w_sz=int(args.w_sz), min_r=int(args.min_r)
)
z_score = z_scores(reads, gap=int(args.gap), w_sz=int(args.w_sz))

# Save file, if applicable
if args.save_file:
filename = abspath(filename).replace("\\", "/")
file_loc = filename[: filename.rfind("/")]
z_score_dir = make_new_dir([file_loc, "/Z_scores"])
file_start = filename[filename.rfind("/") : filename.rfind(".")]
z_score_file = "".join([z_score_dir, file_start, "_zscores.wig"])
write_wig(z_score, z_score_file, chrom)
print(f"Wrote z_scores to {z_score_file}")

_save_zscore(filename, z_score, chrom)
print(
"\n".join(
[
f"Ran zscores.py with the following settings:",
"Ran zscores.py with the following settings:",
f"gap: {args.gap}, w_sz: {args.w_sz},",
f"min_r: {args.min_r}, file_name: {args.filename}",
f"file_name: {args.filename}",
]
)
)


def _save_zscore(filename, z_score, chrom):
filename = abspath(filename).replace("\\", "/")
file_loc = filename[: filename.rfind("/")]
z_score_dir = make_new_dir([file_loc, "/Z_scores"])
file_start = filename[filename.rfind("/") : filename.rfind(".")]
z_score_file = "".join([z_score_dir, file_start, "_zscores.wig"])
write_wig(z_score, z_score_file, chrom)
print(f"Wrote z_scores to {z_score_file}")


if __name__ == "__main__":
main_zscores()
8 changes: 4 additions & 4 deletions tests/test_file_funcs.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,28 +4,28 @@
from numpy import array
from numpy.testing import assert_array_equal

from rendseq.file_funcs import make_new_dir, open_wig, validate_reads, write_wig
from rendseq.file_funcs import _validate_reads, make_new_dir, open_wig, write_wig


class TestValidateReads:
def test_correct(self):
"""validate a correct read array"""
try:
validate_reads(array([[1, 2], [3, 4]]))
_validate_reads(array([[1, 2], [3, 4]]))
except Exception as e:
assert False, f"validate_reads invalid exception: {e}"

def test_incorrect_dim(self):
"""read array has too many columns"""
with pytest.raises(ValueError) as e_info:
validate_reads(array([[1, 2, 3], [4, 5, 6]]))
_validate_reads(array([[1, 2, 3], [4, 5, 6]]))

assert e_info.value.args[0] == "reads must be (n,2), not (2, 3)"

def test_incorrect_type(self):
"""read array isn't actually an array"""
with pytest.raises(ValueError) as e_info:
validate_reads([1, 2, 3])
_validate_reads([1, 2, 3])

assert e_info.value.args[0] == "reads must be numpy array, not <class 'list'>"

Expand Down
Loading

0 comments on commit 361407b

Please sign in to comment.