Skip to content

Commit

Permalink
Make window step default to window size
Browse files Browse the repository at this point in the history
  • Loading branch information
tomwhite authored and mergify[bot] committed Nov 19, 2020
1 parent 25eb5ca commit 1880cfd
Show file tree
Hide file tree
Showing 5 changed files with 36 additions and 27 deletions.
10 changes: 5 additions & 5 deletions sgkit/stats/popgen.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ def diversity(
[0.5 , 0.5 ]])
>>> # Divide into windows of size three (variants)
>>> ds = sg.window(ds, size=3, step=3)
>>> ds = sg.window(ds, size=3)
>>> sg.diversity(ds)["stat_diversity"].values # doctest: +NORMALIZE_WHITESPACE
array([[1.83333333, 1.83333333],
[1. , 1. ]])
Expand Down Expand Up @@ -239,7 +239,7 @@ def divergence(
[0.625 , 0.5 ]]])
>>> # Divide into windows of size three (variants)
>>> ds = sg.window(ds, size=3, step=3)
>>> ds = sg.window(ds, size=3)
>>> sg.divergence(ds)["stat_divergence"].values # doctest: +NORMALIZE_WHITESPACE
array([[[1.83333333, 1.5 ],
[1.5 , 1.83333333]],
Expand Down Expand Up @@ -431,7 +431,7 @@ def Fst(
[ 0.2 , nan]]])
>>> # Divide into windows of size three (variants)
>>> ds = sg.window(ds, size=3, step=3)
>>> ds = sg.window(ds, size=3)
>>> sg.Fst(ds)["stat_Fst"].values # doctest: +NORMALIZE_WHITESPACE
array([[[ nan, -0.22222222],
[-0.22222222, nan]],
Expand Down Expand Up @@ -523,7 +523,7 @@ def Tajimas_D(
[-3.35891429, -3.35891429]])
>>> # Divide into windows of size three (variants)
>>> ds = sg.window(ds, size=3, step=3)
>>> ds = sg.window(ds, size=3)
>>> sg.Tajimas_D(ds)["stat_Tajimas_D"].values # doctest: +NORMALIZE_WHITESPACE
array([[-0.22349574, -0.22349574],
[-2.18313233, -2.18313233]])
Expand Down Expand Up @@ -661,7 +661,7 @@ def pbs(
>>> ds = ds.assign_coords({"cohorts_0": cohort_names, "cohorts_1": cohort_names, "cohorts_2": cohort_names})
>>> # Divide into two windows of size three (variants)
>>> ds = sg.window(ds, size=3, step=3)
>>> ds = sg.window(ds, size=3)
>>> sg.pbs(ds)["stat_pbs"].sel(cohorts_0="co_0", cohorts_1="co_1", cohorts_2="co_2").values # doctest: +NORMALIZE_WHITESPACE
array([ 0. , -0.160898])
"""
Expand Down
36 changes: 16 additions & 20 deletions sgkit/tests/test_popgen.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ def test_diversity__windowed(sample_size):
sample_cohorts = np.full_like(ts.samples(), 0)
ds["sample_cohort"] = xr.DataArray(sample_cohorts, dims="samples")
ds = ds.assign_coords({"cohorts": ["co_0"]})
ds = window(ds, size=25, step=25)
ds = window(ds, size=25)
ds = diversity(ds)
div = ds["stat_diversity"].sel(cohorts="co_0").compute()

Expand All @@ -103,7 +103,7 @@ def test_diversity__windowed(sample_size):
ds = count_variant_alleles(ts_to_dataset(ts)) # type: ignore[no-untyped-call]
ac = ds["variant_allele_count"].values
mpd = allel.mean_pairwise_difference(ac, fill=0)
ska_div = allel.moving_statistic(mpd, np.sum, size=25, step=25)
ska_div = allel.moving_statistic(mpd, np.sum, size=25)
np.testing.assert_allclose(
div[:-1], ska_div
) # scikit-allel has final window missing
Expand Down Expand Up @@ -159,7 +159,7 @@ def test_divergence__windowed(sample_size, n_cohorts, chunks):
ds["sample_cohort"] = xr.DataArray(sample_cohorts, dims="samples")
cohort_names = [f"co_{i}" for i in range(n_cohorts)]
ds = ds.assign_coords({"cohorts_0": cohort_names, "cohorts_1": cohort_names})
ds = window(ds, size=25, step=25)
ds = window(ds, size=25)
ds = divergence(ds)
div = ds["stat_divergence"].values
# test off-diagonal entries, by replacing diagonal with NaNs
Expand Down Expand Up @@ -192,7 +192,7 @@ def test_divergence__windowed_scikit_allel_comparison(sample_size, n_cohorts, ch
ds["sample_cohort"] = xr.DataArray(sample_cohorts, dims="samples")
cohort_names = [f"co_{i}" for i in range(n_cohorts)]
ds = ds.assign_coords({"cohorts_0": cohort_names, "cohorts_1": cohort_names})
ds = window(ds, size=25, step=25)
ds = window(ds, size=25)
ds = divergence(ds)
div = ds["stat_divergence"].values
# test off-diagonal entries, by replacing diagonal with NaNs
Expand All @@ -205,7 +205,7 @@ def test_divergence__windowed_scikit_allel_comparison(sample_size, n_cohorts, ch
ac1 = ds1["variant_allele_count"].values
ac2 = ds2["variant_allele_count"].values
mpd = allel.mean_pairwise_difference_between(ac1, ac2, fill=0)
ska_div = allel.moving_statistic(mpd, np.sum, size=25, step=25) # noqa: F841
ska_div = allel.moving_statistic(mpd, np.sum, size=25) # noqa: F841
# TODO: investigate why numbers are different
np.testing.assert_allclose(
div[:-1], ska_div
Expand All @@ -226,7 +226,7 @@ def test_Fst__Hudson(sample_size):
cohort_names = [f"co_{i}" for i in range(n_cohorts)]
ds = ds.assign_coords({"cohorts_0": cohort_names, "cohorts_1": cohort_names})
n_variants = ds.dims["variants"]
ds = window(ds, size=n_variants, step=n_variants) # single window
ds = window(ds, size=n_variants) # single window
ds = Fst(ds, estimator="Hudson")
fst = ds.stat_Fst.sel(cohorts_0="co_0", cohorts_1="co_1").values

Expand Down Expand Up @@ -254,7 +254,7 @@ def test_Fst__Nei(sample_size, n_cohorts):
cohort_names = [f"co_{i}" for i in range(n_cohorts)]
ds = ds.assign_coords({"cohorts_0": cohort_names, "cohorts_1": cohort_names})
n_variants = ds.dims["variants"]
ds = window(ds, size=n_variants, step=n_variants) # single window
ds = window(ds, size=n_variants) # single window
ds = Fst(ds, estimator="Nei")
fst = ds.stat_Fst.values

Expand Down Expand Up @@ -289,7 +289,7 @@ def test_Fst__windowed(sample_size, n_cohorts, chunks):
ds["sample_cohort"] = xr.DataArray(sample_cohorts, dims="samples")
cohort_names = [f"co_{i}" for i in range(n_cohorts)]
ds = ds.assign_coords({"cohorts_0": cohort_names, "cohorts_1": cohort_names})
ds = window(ds, size=25, step=25)
ds = window(ds, size=25)
fst_ds = Fst(ds, estimator="Nei")
fst = fst_ds["stat_Fst"].values

Expand All @@ -312,7 +312,7 @@ def test_Fst__windowed(sample_size, n_cohorts, chunks):

ac1 = fst_ds.cohort_allele_count.values[:, 0, :]
ac2 = fst_ds.cohort_allele_count.values[:, 1, :]
ska_fst = allel.moving_hudson_fst(ac1, ac2, size=25, step=25)
ska_fst = allel.moving_hudson_fst(ac1, ac2, size=25)

np.testing.assert_allclose(
fst[:-1], ska_fst
Expand All @@ -326,7 +326,7 @@ def test_Tajimas_D(sample_size):
sample_cohorts = np.full_like(ts.samples(), 0)
ds["sample_cohort"] = xr.DataArray(sample_cohorts, dims="samples")
n_variants = ds.dims["variants"]
ds = window(ds, size=n_variants, step=n_variants) # single window
ds = window(ds, size=n_variants) # single window
ds = Tajimas_D(ds)
d = ds.stat_Tajimas_D.compute()
ts_d = ts.Tajimas_D()
Expand All @@ -348,7 +348,7 @@ def test_pbs(sample_size, n_cohorts):
cohort_names = [f"co_{i}" for i in range(n_cohorts)]
ds = ds.assign_coords({"cohorts_0": cohort_names, "cohorts_1": cohort_names})
n_variants = ds.dims["variants"]
ds = window(ds, size=n_variants, step=n_variants) # single window
ds = window(ds, size=n_variants) # single window

ds = pbs(ds)
stat_pbs = ds["stat_pbs"]
Expand All @@ -360,9 +360,7 @@ def test_pbs(sample_size, n_cohorts):

ska_pbs_value = np.full([1, n_cohorts, n_cohorts, n_cohorts], np.nan)
for i, j, k in itertools.combinations(range(n_cohorts), 3):
ska_pbs_value[0, i, j, k] = allel.pbs(
ac1, ac2, ac3, window_size=n_variants, window_step=n_variants
)
ska_pbs_value[0, i, j, k] = allel.pbs(ac1, ac2, ac3, window_size=n_variants)

np.testing.assert_allclose(stat_pbs, ska_pbs_value)

Expand All @@ -382,7 +380,7 @@ def test_pbs__windowed(sample_size, n_cohorts, chunks):
ds["sample_cohort"] = xr.DataArray(sample_cohorts, dims="samples")
cohort_names = [f"co_{i}" for i in range(n_cohorts)]
ds = ds.assign_coords({"cohorts_0": cohort_names, "cohorts_1": cohort_names})
ds = window(ds, size=25, step=25)
ds = window(ds, size=25)

ds = pbs(ds)
stat_pbs = ds["stat_pbs"].values
Expand All @@ -396,9 +394,7 @@ def test_pbs__windowed(sample_size, n_cohorts, chunks):
n_windows = ds.dims["windows"] - 1
ska_pbs_value = np.full([n_windows, n_cohorts, n_cohorts, n_cohorts], np.nan)
for i, j, k in itertools.combinations(range(n_cohorts), 3):
ska_pbs_value[:, i, j, k] = allel.pbs(
ac1, ac2, ac3, window_size=25, window_step=25
)
ska_pbs_value[:, i, j, k] = allel.pbs(ac1, ac2, ac3, window_size=25)

np.testing.assert_allclose(stat_pbs[:-1], ska_pbs_value)

Expand All @@ -418,7 +414,7 @@ def test_Garud_h(n_variants, n_samples, n_contigs, n_cohorts, chunks):
[np.full_like(subset, i) for i, subset in enumerate(subsets)]
)
ds["sample_cohort"] = xr.DataArray(sample_cohorts, dims="samples")
ds = window(ds, size=3, step=3)
ds = window(ds, size=3)

gh = Garud_h(ds)
h1 = gh.stat_Garud_h1.values
Expand All @@ -431,7 +427,7 @@ def test_Garud_h(n_variants, n_samples, n_contigs, n_cohorts, chunks):
gt = ds.call_genotype.values[:, sample_cohorts == c, :]
ska_gt = allel.GenotypeArray(gt)
ska_ha = ska_gt.to_haplotypes()
ska_h = allel.moving_garud_h(ska_ha, size=3, step=3)
ska_h = allel.moving_garud_h(ska_ha, size=3)

np.testing.assert_allclose(h1[:, c], ska_h[0])
np.testing.assert_allclose(h12[:, c], ska_h[1])
Expand Down
10 changes: 10 additions & 0 deletions sgkit/tests/test_window.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,16 @@ def test_window():
window(ds, 2, 2)


def test_window__default_step():
ds = simulate_genotype_call_dataset(n_variant=10, n_sample=3, seed=0)
assert not has_windows(ds)
ds = window(ds, 2)
assert has_windows(ds)
np.testing.assert_equal(ds[window_contig].values, [0, 0, 0, 0, 0])
np.testing.assert_equal(ds[window_start].values, [0, 2, 4, 6, 8])
np.testing.assert_equal(ds[window_stop].values, [2, 4, 6, 8, 10])


@pytest.mark.parametrize(
"n_variant, n_contig, window_contigs_exp, window_starts_exp, window_stops_exp",
[
Expand Down
1 change: 1 addition & 0 deletions sgkit/variables.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ def register_variable(cls, spec: Spec) -> Tuple[str, Spec]:
if spec.default_name in cls.registered_variables:
raise ValueError(f"`{spec.default_name}` already registered")
cls.registered_variables[spec.default_name] = spec
print(spec.__doc__)
return spec.default_name, spec

@classmethod
Expand Down
6 changes: 4 additions & 2 deletions sgkit/window.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Any, Callable, Iterable, Tuple, Union
from typing import Any, Callable, Iterable, Optional, Tuple, Union

import dask.array as da
import numpy as np
Expand All @@ -15,7 +15,7 @@
def window(
ds: Dataset,
size: int,
step: int,
step: Optional[int] = None,
merge: bool = True,
) -> Dataset:
"""Add fixed-size windowing information to a dataset.
Expand All @@ -32,6 +32,7 @@ def window(
The window size (number of variants).
step
The distance (number of variants) between start positions of windows.
Defaults to ``size``.
merge
If True (the default), merge the input dataset and the computed
output variables into a single dataset, otherwise return only
Expand All @@ -47,6 +48,7 @@ def window(
- :data:`sgkit.variables.window_stop_spec` (windows):
The index values of window stop positions.
"""
step = step or size
n_variants = ds.dims["variants"]
n_contigs = len(ds.attrs["contigs"])
contig_ids = np.arange(n_contigs)
Expand Down

0 comments on commit 1880cfd

Please sign in to comment.