Make window step default to window size

sgkit-dev · Nov 19, 2020 · 1880cfd · 1880cfd
1 parent 25eb5ca
commit 1880cfd
Show file tree

Hide file tree

Showing 5 changed files with 36 additions and 27 deletions.
diff --git a/sgkit/stats/popgen.py b/sgkit/stats/popgen.py
@@ -76,7 +76,7 @@ def diversity(
         [0.5       , 0.5       ]])
 
     >>> # Divide into windows of size three (variants)
-    >>> ds = sg.window(ds, size=3, step=3)
+    >>> ds = sg.window(ds, size=3)
     >>> sg.diversity(ds)["stat_diversity"].values # doctest: +NORMALIZE_WHITESPACE
     array([[1.83333333, 1.83333333],
         [1.        , 1.        ]])
@@ -239,7 +239,7 @@ def divergence(
             [0.625     , 0.5       ]]])
 
     >>> # Divide into windows of size three (variants)
-    >>> ds = sg.window(ds, size=3, step=3)
+    >>> ds = sg.window(ds, size=3)
     >>> sg.divergence(ds)["stat_divergence"].values # doctest: +NORMALIZE_WHITESPACE
     array([[[1.83333333, 1.5       ],
             [1.5       , 1.83333333]],
@@ -431,7 +431,7 @@ def Fst(
             [ 0.2       ,         nan]]])
 
     >>> # Divide into windows of size three (variants)
-    >>> ds = sg.window(ds, size=3, step=3)
+    >>> ds = sg.window(ds, size=3)
     >>> sg.Fst(ds)["stat_Fst"].values # doctest: +NORMALIZE_WHITESPACE
     array([[[        nan, -0.22222222],
             [-0.22222222,         nan]],
@@ -523,7 +523,7 @@ def Tajimas_D(
         [-3.35891429, -3.35891429]])
 
     >>> # Divide into windows of size three (variants)
-    >>> ds = sg.window(ds, size=3, step=3)
+    >>> ds = sg.window(ds, size=3)
     >>> sg.Tajimas_D(ds)["stat_Tajimas_D"].values # doctest: +NORMALIZE_WHITESPACE
     array([[-0.22349574, -0.22349574],
         [-2.18313233, -2.18313233]])
@@ -661,7 +661,7 @@ def pbs(
     >>> ds = ds.assign_coords({"cohorts_0": cohort_names, "cohorts_1": cohort_names, "cohorts_2": cohort_names})
 
     >>> # Divide into two windows of size three (variants)
-    >>> ds = sg.window(ds, size=3, step=3)
+    >>> ds = sg.window(ds, size=3)
     >>> sg.pbs(ds)["stat_pbs"].sel(cohorts_0="co_0", cohorts_1="co_1", cohorts_2="co_2").values # doctest: +NORMALIZE_WHITESPACE
     array([ 0.      , -0.160898])
     """

diff --git a/sgkit/tests/test_popgen.py b/sgkit/tests/test_popgen.py
@@ -87,7 +87,7 @@ def test_diversity__windowed(sample_size):
     sample_cohorts = np.full_like(ts.samples(), 0)
     ds["sample_cohort"] = xr.DataArray(sample_cohorts, dims="samples")
     ds = ds.assign_coords({"cohorts": ["co_0"]})
-    ds = window(ds, size=25, step=25)
+    ds = window(ds, size=25)
     ds = diversity(ds)
     div = ds["stat_diversity"].sel(cohorts="co_0").compute()
 
@@ -103,7 +103,7 @@ def test_diversity__windowed(sample_size):
     ds = count_variant_alleles(ts_to_dataset(ts))  # type: ignore[no-untyped-call]
     ac = ds["variant_allele_count"].values
     mpd = allel.mean_pairwise_difference(ac, fill=0)
-    ska_div = allel.moving_statistic(mpd, np.sum, size=25, step=25)
+    ska_div = allel.moving_statistic(mpd, np.sum, size=25)
     np.testing.assert_allclose(
         div[:-1], ska_div
     )  # scikit-allel has final window missing
@@ -159,7 +159,7 @@ def test_divergence__windowed(sample_size, n_cohorts, chunks):
     ds["sample_cohort"] = xr.DataArray(sample_cohorts, dims="samples")
     cohort_names = [f"co_{i}" for i in range(n_cohorts)]
     ds = ds.assign_coords({"cohorts_0": cohort_names, "cohorts_1": cohort_names})
-    ds = window(ds, size=25, step=25)
+    ds = window(ds, size=25)
     ds = divergence(ds)
     div = ds["stat_divergence"].values
     # test off-diagonal entries, by replacing diagonal with NaNs
@@ -192,7 +192,7 @@ def test_divergence__windowed_scikit_allel_comparison(sample_size, n_cohorts, ch
     ds["sample_cohort"] = xr.DataArray(sample_cohorts, dims="samples")
     cohort_names = [f"co_{i}" for i in range(n_cohorts)]
     ds = ds.assign_coords({"cohorts_0": cohort_names, "cohorts_1": cohort_names})
-    ds = window(ds, size=25, step=25)
+    ds = window(ds, size=25)
     ds = divergence(ds)
     div = ds["stat_divergence"].values
     # test off-diagonal entries, by replacing diagonal with NaNs
@@ -205,7 +205,7 @@ def test_divergence__windowed_scikit_allel_comparison(sample_size, n_cohorts, ch
     ac1 = ds1["variant_allele_count"].values
     ac2 = ds2["variant_allele_count"].values
     mpd = allel.mean_pairwise_difference_between(ac1, ac2, fill=0)
-    ska_div = allel.moving_statistic(mpd, np.sum, size=25, step=25)  # noqa: F841
+    ska_div = allel.moving_statistic(mpd, np.sum, size=25)  # noqa: F841
     # TODO: investigate why numbers are different
     np.testing.assert_allclose(
         div[:-1], ska_div
@@ -226,7 +226,7 @@ def test_Fst__Hudson(sample_size):
     cohort_names = [f"co_{i}" for i in range(n_cohorts)]
     ds = ds.assign_coords({"cohorts_0": cohort_names, "cohorts_1": cohort_names})
     n_variants = ds.dims["variants"]
-    ds = window(ds, size=n_variants, step=n_variants)  # single window
+    ds = window(ds, size=n_variants)  # single window
     ds = Fst(ds, estimator="Hudson")
     fst = ds.stat_Fst.sel(cohorts_0="co_0", cohorts_1="co_1").values
 
@@ -254,7 +254,7 @@ def test_Fst__Nei(sample_size, n_cohorts):
     cohort_names = [f"co_{i}" for i in range(n_cohorts)]
     ds = ds.assign_coords({"cohorts_0": cohort_names, "cohorts_1": cohort_names})
     n_variants = ds.dims["variants"]
-    ds = window(ds, size=n_variants, step=n_variants)  # single window
+    ds = window(ds, size=n_variants)  # single window
     ds = Fst(ds, estimator="Nei")
     fst = ds.stat_Fst.values
 
@@ -289,7 +289,7 @@ def test_Fst__windowed(sample_size, n_cohorts, chunks):
     ds["sample_cohort"] = xr.DataArray(sample_cohorts, dims="samples")
     cohort_names = [f"co_{i}" for i in range(n_cohorts)]
     ds = ds.assign_coords({"cohorts_0": cohort_names, "cohorts_1": cohort_names})
-    ds = window(ds, size=25, step=25)
+    ds = window(ds, size=25)
     fst_ds = Fst(ds, estimator="Nei")
     fst = fst_ds["stat_Fst"].values
 
@@ -312,7 +312,7 @@ def test_Fst__windowed(sample_size, n_cohorts, chunks):
 
     ac1 = fst_ds.cohort_allele_count.values[:, 0, :]
     ac2 = fst_ds.cohort_allele_count.values[:, 1, :]
-    ska_fst = allel.moving_hudson_fst(ac1, ac2, size=25, step=25)
+    ska_fst = allel.moving_hudson_fst(ac1, ac2, size=25)
 
     np.testing.assert_allclose(
         fst[:-1], ska_fst
@@ -326,7 +326,7 @@ def test_Tajimas_D(sample_size):
     sample_cohorts = np.full_like(ts.samples(), 0)
     ds["sample_cohort"] = xr.DataArray(sample_cohorts, dims="samples")
     n_variants = ds.dims["variants"]
-    ds = window(ds, size=n_variants, step=n_variants)  # single window
+    ds = window(ds, size=n_variants)  # single window
     ds = Tajimas_D(ds)
     d = ds.stat_Tajimas_D.compute()
     ts_d = ts.Tajimas_D()
@@ -348,7 +348,7 @@ def test_pbs(sample_size, n_cohorts):
     cohort_names = [f"co_{i}" for i in range(n_cohorts)]
     ds = ds.assign_coords({"cohorts_0": cohort_names, "cohorts_1": cohort_names})
     n_variants = ds.dims["variants"]
-    ds = window(ds, size=n_variants, step=n_variants)  # single window
+    ds = window(ds, size=n_variants)  # single window
 
     ds = pbs(ds)
     stat_pbs = ds["stat_pbs"]
@@ -360,9 +360,7 @@ def test_pbs(sample_size, n_cohorts):
 
     ska_pbs_value = np.full([1, n_cohorts, n_cohorts, n_cohorts], np.nan)
     for i, j, k in itertools.combinations(range(n_cohorts), 3):
-        ska_pbs_value[0, i, j, k] = allel.pbs(
-            ac1, ac2, ac3, window_size=n_variants, window_step=n_variants
-        )
+        ska_pbs_value[0, i, j, k] = allel.pbs(ac1, ac2, ac3, window_size=n_variants)
 
     np.testing.assert_allclose(stat_pbs, ska_pbs_value)
 
@@ -382,7 +380,7 @@ def test_pbs__windowed(sample_size, n_cohorts, chunks):
     ds["sample_cohort"] = xr.DataArray(sample_cohorts, dims="samples")
     cohort_names = [f"co_{i}" for i in range(n_cohorts)]
     ds = ds.assign_coords({"cohorts_0": cohort_names, "cohorts_1": cohort_names})
-    ds = window(ds, size=25, step=25)
+    ds = window(ds, size=25)
 
     ds = pbs(ds)
     stat_pbs = ds["stat_pbs"].values
@@ -396,9 +394,7 @@ def test_pbs__windowed(sample_size, n_cohorts, chunks):
     n_windows = ds.dims["windows"] - 1
     ska_pbs_value = np.full([n_windows, n_cohorts, n_cohorts, n_cohorts], np.nan)
     for i, j, k in itertools.combinations(range(n_cohorts), 3):
-        ska_pbs_value[:, i, j, k] = allel.pbs(
-            ac1, ac2, ac3, window_size=25, window_step=25
-        )
+        ska_pbs_value[:, i, j, k] = allel.pbs(ac1, ac2, ac3, window_size=25)
 
     np.testing.assert_allclose(stat_pbs[:-1], ska_pbs_value)
 
@@ -418,7 +414,7 @@ def test_Garud_h(n_variants, n_samples, n_contigs, n_cohorts, chunks):
         [np.full_like(subset, i) for i, subset in enumerate(subsets)]
     )
     ds["sample_cohort"] = xr.DataArray(sample_cohorts, dims="samples")
-    ds = window(ds, size=3, step=3)
+    ds = window(ds, size=3)
 
     gh = Garud_h(ds)
     h1 = gh.stat_Garud_h1.values
@@ -431,7 +427,7 @@ def test_Garud_h(n_variants, n_samples, n_contigs, n_cohorts, chunks):
         gt = ds.call_genotype.values[:, sample_cohorts == c, :]
         ska_gt = allel.GenotypeArray(gt)
         ska_ha = ska_gt.to_haplotypes()
-        ska_h = allel.moving_garud_h(ska_ha, size=3, step=3)
+        ska_h = allel.moving_garud_h(ska_ha, size=3)
 
         np.testing.assert_allclose(h1[:, c], ska_h[0])
         np.testing.assert_allclose(h12[:, c], ska_h[1])

diff --git a/sgkit/tests/test_window.py b/sgkit/tests/test_window.py
@@ -84,6 +84,16 @@ def test_window():
         window(ds, 2, 2)
 
 
+def test_window__default_step():
+    ds = simulate_genotype_call_dataset(n_variant=10, n_sample=3, seed=0)
+    assert not has_windows(ds)
+    ds = window(ds, 2)
+    assert has_windows(ds)
+    np.testing.assert_equal(ds[window_contig].values, [0, 0, 0, 0, 0])
+    np.testing.assert_equal(ds[window_start].values, [0, 2, 4, 6, 8])
+    np.testing.assert_equal(ds[window_stop].values, [2, 4, 6, 8, 10])
+
+
 @pytest.mark.parametrize(
     "n_variant, n_contig, window_contigs_exp, window_starts_exp, window_stops_exp",
     [

diff --git a/sgkit/variables.py b/sgkit/variables.py
@@ -37,6 +37,7 @@ def register_variable(cls, spec: Spec) -> Tuple[str, Spec]:
         if spec.default_name in cls.registered_variables:
             raise ValueError(f"`{spec.default_name}` already registered")
         cls.registered_variables[spec.default_name] = spec
+        print(spec.__doc__)
         return spec.default_name, spec
 
     @classmethod

diff --git a/sgkit/window.py b/sgkit/window.py
@@ -1,4 +1,4 @@
-from typing import Any, Callable, Iterable, Tuple, Union
+from typing import Any, Callable, Iterable, Optional, Tuple, Union
 
 import dask.array as da
 import numpy as np
@@ -15,7 +15,7 @@
 def window(
     ds: Dataset,
     size: int,
-    step: int,
+    step: Optional[int] = None,
     merge: bool = True,
 ) -> Dataset:
     """Add fixed-size windowing information to a dataset.
@@ -32,6 +32,7 @@ def window(
         The window size (number of variants).
     step
         The distance (number of variants) between start positions of windows.
+        Defaults to ``size``.
     merge
         If True (the default), merge the input dataset and the computed
         output variables into a single dataset, otherwise return only
@@ -47,6 +48,7 @@ def window(
     - :data:`sgkit.variables.window_stop_spec` (windows):
       The index values of window stop positions.
     """
+    step = step or size
     n_variants = ds.dims["variants"]
     n_contigs = len(ds.attrs["contigs"])
     contig_ids = np.arange(n_contigs)