-
Notifications
You must be signed in to change notification settings - Fork 24
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Moving karyotype
to anoph
#702
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
@@ -0,0 +1,112 @@ | ||||||||||||||||||
import pandas as pd # type: ignore | ||||||||||||||||||
from pandas import CategoricalDtype | ||||||||||||||||||
import numpy as np # type: ignore | ||||||||||||||||||
import allel # type: ignore | ||||||||||||||||||
|
||||||||||||||||||
from numpydoc_decorator import doc | ||||||||||||||||||
from ..util import check_types, _karyotype_tags_n_alt | ||||||||||||||||||
from . import base_params | ||||||||||||||||||
from typing import Optional | ||||||||||||||||||
|
||||||||||||||||||
from .snp_data import AnophelesSnpData | ||||||||||||||||||
from .karyotype_params import inversion_param | ||||||||||||||||||
|
||||||||||||||||||
|
||||||||||||||||||
class AnophelesKaryotypeData(AnophelesSnpData):
    # Adds inversion karyotyping on top of the SNP data API: tag SNPs are
    # read from a packaged CSV file and compared with SNP calls to estimate
    # the number of inverted copies carried by each sample.
    #
    # NOTE(review): a reviewer suggested a slight change to this class name so
    # that it would still make sense if functions for calculating inversion
    # allele frequencies are added here later. The name is left unchanged so
    # that existing imports keep working.

    def __init__(
        self,
        inversion_tag_path: Optional[str] = None,
        **kwargs,
    ):
        # N.B., this class is designed to work cooperatively, and
        # so it's important that any remaining parameters are passed
        # to the superclass constructor.
        super().__init__(**kwargs)

        # Name of the tag SNP file, resolved against the package resources
        # in load_inversion_tags(). When it is not set, that method raises
        # FileNotFoundError.
        self._inversion_tag_path = inversion_tag_path

    @check_types
    @doc(
        summary="Load tag SNPs for a given inversion in Ag.",
    )
    def load_inversion_tags(self, inversion: inversion_param) -> pd.DataFrame:
        # needs to be modified depending on where we are hosting
        import importlib.resources
        from .. import resources

        # Guard clause: without a configured tag file there is nothing to load.
        if not self._inversion_tag_path:
            raise FileNotFoundError(
                "The file containing the inversion tags is missing."
            )

        with importlib.resources.path(resources, self._inversion_tag_path) as path:
            df_tag_snps = pd.read_csv(path, sep=",")

        # Keep only the rows for the requested inversion. reset_index() is
        # called without drop=True, so the original row labels are retained
        # in an "index" column, as before.
        return df_tag_snps.query(f"inversion == '{inversion}'").reset_index()

    @check_types
    @doc(
        summary="Infer karyotype from tag SNPs for a given inversion in Ag.",
    )
    def karyotype(
        self,
        inversion: inversion_param,
        sample_sets: Optional[base_params.sample_sets] = None,
        sample_query: Optional[base_params.sample_query] = None,
        sample_query_options: Optional[base_params.sample_query_options] = None,
    ) -> pd.DataFrame:
        # load tag snp data
        df_tagsnps = self.load_inversion_tags(inversion=inversion)

        # Fail early with a clear message if the tag file has no rows for
        # this inversion; otherwise the min/max below would be computed on
        # an empty column and yield a meaningless region string.
        if df_tagsnps.empty:
            raise ValueError(f"No tag SNPs found for inversion {inversion!r}.")

        inversion_pos = df_tagsnps["position"]
        inversion_alts = df_tagsnps["alt_allele"]
        # The contig is taken from the first two characters of the inversion
        # name, e.g. "2La" -> "2L".
        contig = inversion[0:2]

        # get snp calls for inversion region
        start, end = np.min(inversion_pos), np.max(inversion_pos)
        region = f"{contig}:{start}-{end}"

        ds_snps = self.snp_calls(
            region=region,
            sample_sets=sample_sets,
            sample_query=sample_query,
            sample_query_options=sample_query_options,
        )

        with self._spinner("Inferring karyotype from tag SNPs"):
            # access variables we need
            geno = allel.GenotypeDaskArray(ds_snps["call_genotype"].data)
            pos = allel.SortedIndex(ds_snps["variant_position"].values)
            samples = ds_snps["sample_id"].values
            alts = ds_snps["variant_allele"].values.astype(str)

            # subset to position of inversion tags
            mask = pos.locate_intersection(inversion_pos)[0]
            alts = alts[mask]
            geno = geno.compress(mask, axis=0).compute()

            # infer karyotype
            gn_alt = _karyotype_tags_n_alt(
                gt=geno, alts=alts, inversion_alts=inversion_alts
            )
            is_called = geno.is_called()

            # calculate mean genotype for each sample whilst masking missing calls
            av_gts = np.mean(np.ma.MaskedArray(gn_alt, mask=~is_called), axis=0)
            total_sites = np.sum(is_called, axis=0)

            df = pd.DataFrame(
                {
                    "sample_id": samples,
                    "inversion": inversion,
                    f"karyotype_{inversion}_mean": av_gts,
                    # round the genotypes then convert to int
                    f"karyotype_{inversion}": av_gts.round().astype(int),
                    "total_tag_snps": total_sites,
                },
            )
            # Allow filling missing values with "<NA>" visible placeholder.
            kt_dtype = CategoricalDtype(categories=[0, 1, 2, "<NA>"], ordered=True)
            df[f"karyotype_{inversion}"] = df[f"karyotype_{inversion}"].astype(kt_dtype)

        return df
Original file line number | Diff line number | Diff line change | ||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
@@ -0,0 +1,10 @@ | ||||||||||||||||||
"""Parameter definitions for karyotype analysis functions.""" | ||||||||||||||||||
|
||||||||||||||||||
from typing import Literal | ||||||||||||||||||
|
||||||||||||||||||
from typing_extensions import Annotated, TypeAlias | ||||||||||||||||||
|
||||||||||||||||||
inversion_param: TypeAlias = Annotated[ | ||||||||||||||||||
Literal["2La", "2Rb", "2Rc_gam", "2Rc_col", "2Rd", "2Rj"], | ||||||||||||||||||
"Name of inversion to infer karyotype for.", | ||||||||||||||||||
] | ||||||||||||||||||
Comment on lines
+7
to
+10
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. These values will only be valid for Ag. Af would have different possible values. Suggest broadening here to just a string.
Suggested change
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -75,3 +75,24 @@ def test_locate_region(region_raw): | |
assert region == Region("2RL", 48714463, 48715355) | ||
if region_raw == "2RL:24,630,355-24,633,221": | ||
assert region == Region("2RL", 24630355, 24633221) | ||
|
||
|
||
@pytest.mark.parametrize(
    "inversion",
    ["2La", "2Rb", "2Rc_col", "X_x"],
)
def test_karyotyping(inversion):
    af1 = setup_af1()

    # "X_x" is not an accepted inversion name, so a TypeError is expected.
    # The other names are expected to fail with FileNotFoundError
    # (presumably because Af1 has no inversion tag file configured).
    if inversion == "X_x":
        expected_error = TypeError
        sample_sets = "AG1000G-GH"
    else:
        expected_error = FileNotFoundError
        sample_sets = "1229-VO-GH-DADZIE-VMF00095"

    with pytest.raises(expected_error):
        af1.karyotype(
            inversion=inversion,
            sample_sets=sample_sets,
            sample_query=None,
        )
Comment on lines
+87
to
+98
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. All |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -159,19 +159,30 @@ def test_xpehh_gwss(): | |
assert_allclose(xpehh[:, 2][100], 0.4817561326426265) | ||
|
||
|
||
@pytest.mark.parametrize(
    "inversion",
    ["2La", "2Rb", "2Rc_col", "X_x"],
)
def test_karyotyping(inversion):
    # Post-change version of the test, reconstructed from the diff hunk: the
    # earlier unparametrized body that hard-coded "2La" is superseded by the
    # parametrized checks below.
    ag3 = setup_ag3(cohorts_analysis="20230516")

    if inversion == "X_x":
        # NOTE(review): a reviewer commented on the expected exception type
        # here ("Should be a ..."); the rest of the remark is not available,
        # so TypeError is kept as written.
        with pytest.raises(TypeError):
            ag3.karyotype(
                inversion=inversion, sample_sets="AG1000G-GH", sample_query=None
            )
    else:
        df = ag3.karyotype(
            inversion=inversion, sample_sets="AG1000G-GH", sample_query=None
        )
        assert isinstance(df, pd.DataFrame)
        expected_cols = [
            "sample_id",
            "inversion",
            f"karyotype_{inversion}_mean",
            f"karyotype_{inversion}",
            "total_tag_snps",
        ]
        assert set(df.columns) == set(expected_cols)
        assert all(df[f"karyotype_{inversion}"].isin([0, 1, 2]))
        assert all(df[f"karyotype_{inversion}_mean"].between(0, 2))
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If `_karyotype_tags_n_alt` is only used in this module, perhaps it should be moved here.