Commit

Merge branch 'master' into load_and_save

mergify[bot] authored Nov 20, 2020
2 parents 13559a9 + 1880cfd commit 05cd754
Showing 7 changed files with 54 additions and 30 deletions.
7 changes: 4 additions & 3 deletions sgkit/stats/association.py
@@ -96,9 +96,10 @@ def linear_regression(
T = B / np.sqrt(RSS / dof / XLPS)
assert T.shape == (n_loop_covar, n_outcome)
# Match to p-values
# Note: t dist not implemented in Dask so this will
# coerce result to numpy (`T` will still be da.Array)
P = 2 * stats.distributions.t.sf(np.abs(T), dof)
# Note: t dist not implemented in Dask so this must be delayed
P = da.map_blocks(
lambda t: 2 * stats.distributions.t.sf(np.abs(t), dof), T, dtype="float64"
)
assert P.shape == (n_loop_covar, n_outcome)

return LinearRegressionResult(beta=B, t_value=T, p_value=P)
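The change above replaces an eager SciPy call with a delayed one: `stats.distributions.t.sf` is a NumPy/SciPy routine, so wrapping it in `da.map_blocks` keeps the p-value computation lazy rather than materializing `T`. A minimal standalone sketch of the same pattern, using a hypothetical `dof` and a small synthetic `T` (not the values computed in `linear_regression`):

import dask.array as da
import numpy as np
from scipy import stats

dof = 10  # hypothetical degrees of freedom
T = da.from_array(np.linspace(-3.0, 3.0, 8), chunks=4)  # lazy array of t-statistics

# map_blocks applies the SciPy call block-wise, so P stays a dask array
P = da.map_blocks(lambda t: 2 * stats.distributions.t.sf(np.abs(t), dof), T, dtype="float64")

assert isinstance(P, da.Array)  # still lazy, mirroring the new lazy-results test below
print(P.compute())              # two-sided p-values are only materialized on compute()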
10 changes: 5 additions & 5 deletions sgkit/stats/popgen.py
@@ -76,7 +76,7 @@ def diversity(
[0.5 , 0.5 ]])
>>> # Divide into windows of size three (variants)
>>> ds = sg.window(ds, size=3, step=3)
>>> ds = sg.window(ds, size=3)
>>> sg.diversity(ds)["stat_diversity"].values # doctest: +NORMALIZE_WHITESPACE
array([[1.83333333, 1.83333333],
[1. , 1. ]])
@@ -239,7 +239,7 @@ def divergence(
[0.625 , 0.5 ]]])
>>> # Divide into windows of size three (variants)
>>> ds = sg.window(ds, size=3, step=3)
>>> ds = sg.window(ds, size=3)
>>> sg.divergence(ds)["stat_divergence"].values # doctest: +NORMALIZE_WHITESPACE
array([[[1.83333333, 1.5 ],
[1.5 , 1.83333333]],
@@ -431,7 +431,7 @@ def Fst(
[ 0.2 , nan]]])
>>> # Divide into windows of size three (variants)
>>> ds = sg.window(ds, size=3, step=3)
>>> ds = sg.window(ds, size=3)
>>> sg.Fst(ds)["stat_Fst"].values # doctest: +NORMALIZE_WHITESPACE
array([[[ nan, -0.22222222],
[-0.22222222, nan]],
@@ -523,7 +523,7 @@ def Tajimas_D(
[-3.35891429, -3.35891429]])
>>> # Divide into windows of size three (variants)
>>> ds = sg.window(ds, size=3, step=3)
>>> ds = sg.window(ds, size=3)
>>> sg.Tajimas_D(ds)["stat_Tajimas_D"].values # doctest: +NORMALIZE_WHITESPACE
array([[-0.22349574, -0.22349574],
[-2.18313233, -2.18313233]])
@@ -661,7 +661,7 @@ def pbs(
>>> ds = ds.assign_coords({"cohorts_0": cohort_names, "cohorts_1": cohort_names, "cohorts_2": cohort_names})
>>> # Divide into two windows of size three (variants)
>>> ds = sg.window(ds, size=3, step=3)
>>> ds = sg.window(ds, size=3)
>>> sg.pbs(ds)["stat_pbs"].sel(cohorts_0="co_0", cohorts_1="co_1", cohorts_2="co_2").values # doctest: +NORMALIZE_WHITESPACE
array([ 0. , -0.160898])
"""
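The docstring edits above all reflect the new windowing default: when `step` is omitted it falls back to `size`, so `sg.window(ds, size=3)` builds the same non-overlapping three-variant windows as `sg.window(ds, size=3, step=3)`. A minimal usage sketch on a small simulated dataset (hypothetical simulation parameters, not the docstring data):

import sgkit as sg
from sgkit.testing import simulate_genotype_call_dataset

ds = simulate_genotype_call_dataset(n_variant=9, n_sample=4, seed=0)

# step defaults to size, so these two calls define identical windows
ds_default = sg.window(ds, size=3)
ds_explicit = sg.window(ds, size=3, step=3)
assert (ds_default.window_start.values == ds_explicit.window_start.values).all()
assert (ds_default.window_stop.values == ds_explicit.window_stop.values).all()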
14 changes: 14 additions & 0 deletions sgkit/tests/test_association.py
@@ -1,13 +1,15 @@
import warnings
from typing import Any, Dict, List, Optional, Sequence, Tuple

import dask.array as da
import numpy as np
import pandas as pd
import pytest
import xarray as xr
from pandas import DataFrame
from xarray import Dataset

from sgkit import variables
from sgkit.stats.association import gwas_linear_regression, linear_regression
from sgkit.typing import ArrayLike

@@ -171,6 +173,18 @@ def validate(dfp: DataFrame, dft: DataFrame) -> None:
validate(dfp, dft)


def test_gwas_linear_regression__lazy_results(ds):
res = gwas_linear_regression(
ds, dosage="dosage", covariates="covar_0", traits="trait_0"
)
for v in [
variables.variant_beta,
variables.variant_t_value,
variables.variant_p_value,
]:
assert isinstance(res[v].data, da.Array)


def test_gwas_linear_regression__multi_trait(ds):
def run(traits: Sequence[str]) -> Dataset:
return gwas_linear_regression(
36 changes: 16 additions & 20 deletions sgkit/tests/test_popgen.py
@@ -87,7 +87,7 @@ def test_diversity__windowed(sample_size):
sample_cohorts = np.full_like(ts.samples(), 0)
ds["sample_cohort"] = xr.DataArray(sample_cohorts, dims="samples")
ds = ds.assign_coords({"cohorts": ["co_0"]})
ds = window(ds, size=25, step=25)
ds = window(ds, size=25)
ds = diversity(ds)
div = ds["stat_diversity"].sel(cohorts="co_0").compute()

@@ -103,7 +103,7 @@ def test_diversity__windowed(sample_size):
ds = count_variant_alleles(ts_to_dataset(ts)) # type: ignore[no-untyped-call]
ac = ds["variant_allele_count"].values
mpd = allel.mean_pairwise_difference(ac, fill=0)
ska_div = allel.moving_statistic(mpd, np.sum, size=25, step=25)
ska_div = allel.moving_statistic(mpd, np.sum, size=25)
np.testing.assert_allclose(
div[:-1], ska_div
) # scikit-allel has final window missing
@@ -159,7 +159,7 @@ def test_divergence__windowed(sample_size, n_cohorts, chunks):
ds["sample_cohort"] = xr.DataArray(sample_cohorts, dims="samples")
cohort_names = [f"co_{i}" for i in range(n_cohorts)]
ds = ds.assign_coords({"cohorts_0": cohort_names, "cohorts_1": cohort_names})
ds = window(ds, size=25, step=25)
ds = window(ds, size=25)
ds = divergence(ds)
div = ds["stat_divergence"].values
# test off-diagonal entries, by replacing diagonal with NaNs
@@ -192,7 +192,7 @@ def test_divergence__windowed_scikit_allel_comparison(sample_size, n_cohorts, ch
ds["sample_cohort"] = xr.DataArray(sample_cohorts, dims="samples")
cohort_names = [f"co_{i}" for i in range(n_cohorts)]
ds = ds.assign_coords({"cohorts_0": cohort_names, "cohorts_1": cohort_names})
ds = window(ds, size=25, step=25)
ds = window(ds, size=25)
ds = divergence(ds)
div = ds["stat_divergence"].values
# test off-diagonal entries, by replacing diagonal with NaNs
@@ -205,7 +205,7 @@ def test_divergence__windowed_scikit_allel_comparison(sample_size, n_cohorts, ch
ac1 = ds1["variant_allele_count"].values
ac2 = ds2["variant_allele_count"].values
mpd = allel.mean_pairwise_difference_between(ac1, ac2, fill=0)
ska_div = allel.moving_statistic(mpd, np.sum, size=25, step=25) # noqa: F841
ska_div = allel.moving_statistic(mpd, np.sum, size=25) # noqa: F841
# TODO: investigate why numbers are different
np.testing.assert_allclose(
div[:-1], ska_div
@@ -226,7 +226,7 @@ def test_Fst__Hudson(sample_size):
cohort_names = [f"co_{i}" for i in range(n_cohorts)]
ds = ds.assign_coords({"cohorts_0": cohort_names, "cohorts_1": cohort_names})
n_variants = ds.dims["variants"]
ds = window(ds, size=n_variants, step=n_variants) # single window
ds = window(ds, size=n_variants) # single window
ds = Fst(ds, estimator="Hudson")
fst = ds.stat_Fst.sel(cohorts_0="co_0", cohorts_1="co_1").values

@@ -254,7 +254,7 @@ def test_Fst__Nei(sample_size, n_cohorts):
cohort_names = [f"co_{i}" for i in range(n_cohorts)]
ds = ds.assign_coords({"cohorts_0": cohort_names, "cohorts_1": cohort_names})
n_variants = ds.dims["variants"]
ds = window(ds, size=n_variants, step=n_variants) # single window
ds = window(ds, size=n_variants) # single window
ds = Fst(ds, estimator="Nei")
fst = ds.stat_Fst.values

@@ -289,7 +289,7 @@ def test_Fst__windowed(sample_size, n_cohorts, chunks):
ds["sample_cohort"] = xr.DataArray(sample_cohorts, dims="samples")
cohort_names = [f"co_{i}" for i in range(n_cohorts)]
ds = ds.assign_coords({"cohorts_0": cohort_names, "cohorts_1": cohort_names})
ds = window(ds, size=25, step=25)
ds = window(ds, size=25)
fst_ds = Fst(ds, estimator="Nei")
fst = fst_ds["stat_Fst"].values

@@ -312,7 +312,7 @@ def test_Fst__windowed(sample_size, n_cohorts, chunks):

ac1 = fst_ds.cohort_allele_count.values[:, 0, :]
ac2 = fst_ds.cohort_allele_count.values[:, 1, :]
ska_fst = allel.moving_hudson_fst(ac1, ac2, size=25, step=25)
ska_fst = allel.moving_hudson_fst(ac1, ac2, size=25)

np.testing.assert_allclose(
fst[:-1], ska_fst
@@ -326,7 +326,7 @@ def test_Tajimas_D(sample_size):
sample_cohorts = np.full_like(ts.samples(), 0)
ds["sample_cohort"] = xr.DataArray(sample_cohorts, dims="samples")
n_variants = ds.dims["variants"]
ds = window(ds, size=n_variants, step=n_variants) # single window
ds = window(ds, size=n_variants) # single window
ds = Tajimas_D(ds)
d = ds.stat_Tajimas_D.compute()
ts_d = ts.Tajimas_D()
@@ -348,7 +348,7 @@ def test_pbs(sample_size, n_cohorts):
cohort_names = [f"co_{i}" for i in range(n_cohorts)]
ds = ds.assign_coords({"cohorts_0": cohort_names, "cohorts_1": cohort_names})
n_variants = ds.dims["variants"]
ds = window(ds, size=n_variants, step=n_variants) # single window
ds = window(ds, size=n_variants) # single window

ds = pbs(ds)
stat_pbs = ds["stat_pbs"]
@@ -360,9 +360,7 @@ def test_pbs(sample_size, n_cohorts):

ska_pbs_value = np.full([1, n_cohorts, n_cohorts, n_cohorts], np.nan)
for i, j, k in itertools.combinations(range(n_cohorts), 3):
ska_pbs_value[0, i, j, k] = allel.pbs(
ac1, ac2, ac3, window_size=n_variants, window_step=n_variants
)
ska_pbs_value[0, i, j, k] = allel.pbs(ac1, ac2, ac3, window_size=n_variants)

np.testing.assert_allclose(stat_pbs, ska_pbs_value)

@@ -382,7 +380,7 @@ def test_pbs__windowed(sample_size, n_cohorts, chunks):
ds["sample_cohort"] = xr.DataArray(sample_cohorts, dims="samples")
cohort_names = [f"co_{i}" for i in range(n_cohorts)]
ds = ds.assign_coords({"cohorts_0": cohort_names, "cohorts_1": cohort_names})
ds = window(ds, size=25, step=25)
ds = window(ds, size=25)

ds = pbs(ds)
stat_pbs = ds["stat_pbs"].values
@@ -396,9 +394,7 @@ def test_pbs__windowed(sample_size, n_cohorts, chunks):
n_windows = ds.dims["windows"] - 1
ska_pbs_value = np.full([n_windows, n_cohorts, n_cohorts, n_cohorts], np.nan)
for i, j, k in itertools.combinations(range(n_cohorts), 3):
ska_pbs_value[:, i, j, k] = allel.pbs(
ac1, ac2, ac3, window_size=25, window_step=25
)
ska_pbs_value[:, i, j, k] = allel.pbs(ac1, ac2, ac3, window_size=25)

np.testing.assert_allclose(stat_pbs[:-1], ska_pbs_value)

@@ -418,7 +414,7 @@ def test_Garud_h(n_variants, n_samples, n_contigs, n_cohorts, chunks):
[np.full_like(subset, i) for i, subset in enumerate(subsets)]
)
ds["sample_cohort"] = xr.DataArray(sample_cohorts, dims="samples")
ds = window(ds, size=3, step=3)
ds = window(ds, size=3)

gh = Garud_h(ds)
h1 = gh.stat_Garud_h1.values
@@ -431,7 +427,7 @@ def test_Garud_h(n_variants, n_samples, n_contigs, n_cohorts, chunks):
gt = ds.call_genotype.values[:, sample_cohorts == c, :]
ska_gt = allel.GenotypeArray(gt)
ska_ha = ska_gt.to_haplotypes()
ska_h = allel.moving_garud_h(ska_ha, size=3, step=3)
ska_h = allel.moving_garud_h(ska_ha, size=3)

np.testing.assert_allclose(h1[:, c], ska_h[0])
np.testing.assert_allclose(h12[:, c], ska_h[1])
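The scikit-allel calls in these tests can drop their step arguments for the same reason as the sgkit calls: `allel.moving_statistic` (and the windowed helpers built on it, as well as `allel.pbs`'s `window_step`) defaults the step to the window size when it is not given, so the comparison windows stay aligned. A small sketch with synthetic values (not the tree-sequence data used in the tests), assuming that default:

import allel
import numpy as np

values = np.arange(10, dtype=float)
a = allel.moving_statistic(values, np.sum, size=5)          # step defaults to size
b = allel.moving_statistic(values, np.sum, size=5, step=5)  # explicit, equivalent
np.testing.assert_array_equal(a, b)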
10 changes: 10 additions & 0 deletions sgkit/tests/test_window.py
@@ -84,6 +84,16 @@ def test_window():
window(ds, 2, 2)


def test_window__default_step():
ds = simulate_genotype_call_dataset(n_variant=10, n_sample=3, seed=0)
assert not has_windows(ds)
ds = window(ds, 2)
assert has_windows(ds)
np.testing.assert_equal(ds[window_contig].values, [0, 0, 0, 0, 0])
np.testing.assert_equal(ds[window_start].values, [0, 2, 4, 6, 8])
np.testing.assert_equal(ds[window_stop].values, [2, 4, 6, 8, 10])


@pytest.mark.parametrize(
"n_variant, n_contig, window_contigs_exp, window_starts_exp, window_stops_exp",
[
1 change: 1 addition & 0 deletions sgkit/variables.py
@@ -37,6 +37,7 @@ def register_variable(cls, spec: Spec) -> Tuple[str, Spec]:
if spec.default_name in cls.registered_variables:
raise ValueError(f"`{spec.default_name}` already registered")
cls.registered_variables[spec.default_name] = spec
print(spec.__doc__)
return spec.default_name, spec

@classmethod
6 changes: 4 additions & 2 deletions sgkit/window.py
@@ -1,4 +1,4 @@
from typing import Any, Callable, Iterable, Tuple, Union
from typing import Any, Callable, Iterable, Optional, Tuple, Union

import dask.array as da
import numpy as np
@@ -15,7 +15,7 @@
def window(
ds: Dataset,
size: int,
step: int,
step: Optional[int] = None,
merge: bool = True,
) -> Dataset:
"""Add fixed-size windowing information to a dataset.
@@ -32,6 +32,7 @@ def window(
The window size (number of variants).
step
The distance (number of variants) between start positions of windows.
Defaults to ``size``.
merge
If True (the default), merge the input dataset and the computed
output variables into a single dataset, otherwise return only
@@ -47,6 +48,7 @@ def window(
- :data:`sgkit.variables.window_stop_spec` (windows):
The index values of window stop positions.
"""
step = step or size
n_variants = ds.dims["variants"]
n_contigs = len(ds.attrs["contigs"])
contig_ids = np.arange(n_contigs)
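The `step = step or size` fallback is the whole of the defaulting behaviour; the window bounds themselves are plain arange arithmetic per contig. A rough single-contig sketch of that arithmetic (hypothetical `n_variants`; the real implementation iterates over contigs and respects contig boundaries):

import numpy as np

n_variants, size, step = 10, 2, None
step = step or size                            # None falls back to the window size
starts = np.arange(0, n_variants, step)        # [0, 2, 4, 6, 8]
stops = np.clip(starts + size, 0, n_variants)  # [2, 4, 6, 8, 10]
print(starts, stops)  # matches the expectations in test_window__default_step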
