Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Moving karyotype to anoph #702

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions malariagen_data/af1.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,7 @@ def __init__(
taxon_colors=TAXON_COLORS,
virtual_contigs=None,
gene_names=None,
inversion_tag_path=None,
)

def __repr__(self):
Expand Down
94 changes: 2 additions & 92 deletions malariagen_data/ag3.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,11 @@

import dask
import pandas as pd # type: ignore
from pandas import CategoricalDtype
import numpy as np # type: ignore
import allel # type: ignore
import plotly.express as px # type: ignore

import malariagen_data
from .anopheles import AnophelesDataResource

from numpydoc_decorator import doc
from .util import check_types, _karyotype_tags_n_alt
from .anoph import base_params
from typing import Optional, Literal, Annotated, TypeAlias

# silence dask performance warnings
dask.config.set(**{"array.slicing.split_native_chunks": False}) # type: ignore
Expand All @@ -35,6 +28,7 @@
GENE_NAMES = {
"AGAP004707": "Vgsc/para",
}
INVERSION_TAG_PATH = "karyotype_tag_snps.csv"


def _setup_aim_palettes():
Expand Down Expand Up @@ -83,12 +77,6 @@ def _setup_aim_palettes():
}


inversion_param: TypeAlias = Annotated[
Literal["2La", "2Rb", "2Rc_gam", "2Rc_col", "2Rd", "2Rj"],
"Name of inversion to infer karyotype for.",
]


class Ag3(AnophelesDataResource):
"""Provides access to data from Ag3.x releases.

Expand Down Expand Up @@ -203,6 +191,7 @@ def __init__(
taxon_colors=TAXON_COLORS,
virtual_contigs=VIRTUAL_CONTIGS,
gene_names=GENE_NAMES,
inversion_tag_path=INVERSION_TAG_PATH,
)

# set up caches
Expand Down Expand Up @@ -355,82 +344,3 @@ def _results_cache_add_analysis_params(self, params):
super()._results_cache_add_analysis_params(params)
# override parent class to add AIM analysis
params["aim_analysis"] = self._aim_analysis

@check_types
@doc(
summary="Load tag SNPs for a given inversion in Ag.",
)
def load_inversion_tags(self, inversion: inversion_param) -> pd.DataFrame:
# needs to be modified depending on where we are hosting
import importlib.resources
from . import resources

with importlib.resources.path(resources, "karyotype_tag_snps.csv") as path:
df_tag_snps = pd.read_csv(path, sep=",")
return df_tag_snps.query(f"inversion == '{inversion}'").reset_index()

@check_types
@doc(
summary="Infer karyotype from tag SNPs for a given inversion in Ag.",
)
def karyotype(
self,
inversion: inversion_param,
sample_sets: Optional[base_params.sample_sets] = None,
sample_query: Optional[base_params.sample_query] = None,
sample_query_options: Optional[base_params.sample_query_options] = None,
) -> pd.DataFrame:
# load tag snp data
df_tagsnps = self.load_inversion_tags(inversion=inversion)
inversion_pos = df_tagsnps["position"]
inversion_alts = df_tagsnps["alt_allele"]
contig = inversion[0:2]

# get snp calls for inversion region
start, end = np.min(inversion_pos), np.max(inversion_pos)
region = f"{contig}:{start}-{end}"

ds_snps = self.snp_calls(
region=region,
sample_sets=sample_sets,
sample_query=sample_query,
sample_query_options=sample_query_options,
)

with self._spinner("Inferring karyotype from tag SNPs"):
# access variables we need
geno = allel.GenotypeDaskArray(ds_snps["call_genotype"].data)
pos = allel.SortedIndex(ds_snps["variant_position"].values)
samples = ds_snps["sample_id"].values
alts = ds_snps["variant_allele"].values.astype(str)

# subset to position of inversion tags
mask = pos.locate_intersection(inversion_pos)[0]
alts = alts[mask]
geno = geno.compress(mask, axis=0).compute()

# infer karyotype
gn_alt = _karyotype_tags_n_alt(
gt=geno, alts=alts, inversion_alts=inversion_alts
)
is_called = geno.is_called()

# calculate mean genotype for each sample whilst masking missing calls
av_gts = np.mean(np.ma.MaskedArray(gn_alt, mask=~is_called), axis=0)
total_sites = np.sum(is_called, axis=0)

df = pd.DataFrame(
{
"sample_id": samples,
"inversion": inversion,
f"karyotype_{inversion}_mean": av_gts,
# round the genotypes then convert to int
f"karyotype_{inversion}": av_gts.round().astype(int),
"total_tag_snps": total_sites,
},
)
# Allow filling missing values with "<NA>" visible placeholder.
kt_dtype = CategoricalDtype(categories=[0, 1, 2, "<NA>"], ordered=True)
df[f"karyotype_{inversion}"] = df[f"karyotype_{inversion}"].astype(kt_dtype)

return df
112 changes: 112 additions & 0 deletions malariagen_data/anoph/karyotype.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
import pandas as pd # type: ignore
from pandas import CategoricalDtype
import numpy as np # type: ignore
import allel # type: ignore

from numpydoc_decorator import doc
from ..util import check_types, _karyotype_tags_n_alt
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If _karyotype_tags_n_alt is only used in this module, perhaps it should be moved here.

from . import base_params
from typing import Optional

from .snp_data import AnophelesSnpData
from .karyotype_params import inversion_param


class AnophelesKaryotypeData(AnophelesSnpData):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
class AnophelesKaryotypeData(AnophelesSnpData):
class AnophelesKaryotypeAnalysis(AnophelesSnpData):

Nit, suggest slight naming change, then will hopefully also make sense if you later on add in here some functions for calculating inversion allele frequencies.

def __init__(
self,
inversion_tag_path: Optional[str] = None,
**kwargs,
):
# N.B., this class is designed to work cooperatively, and
# so it's important that any remaining parameters are passed
# to the superclass constructor.
super().__init__(**kwargs)

# If provided, this analysis version will override the
# default value provided in the release configuration.
Comment on lines +26 to +27
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
# If provided, this analysis version will override the
# default value provided in the release configuration.

Cruft.

self._inversion_tag_path = inversion_tag_path

@check_types
@doc(
summary="Load tag SNPs for a given inversion in Ag.",
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
summary="Load tag SNPs for a given inversion in Ag.",
summary="Load tag SNPs for a given inversion.",

)
def load_inversion_tags(self, inversion: inversion_param) -> pd.DataFrame:
# needs to be modified depending on where we are hosting
import importlib.resources
from .. import resources

if not self._inversion_tag_path:
raise FileNotFoundError(
"The file containing the inversion tags is missing."
)
Comment on lines +39 to +42
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FileNotFoundError isn't quite the right exception class here.

Suggested change
if not self._inversion_tag_path:
raise FileNotFoundError(
"The file containing the inversion tags is missing."
)
if self._inversion_tag_path is None:
raise NotImplementedError(
"No inversion tags are available for this data resource."
)

else:
with importlib.resources.path(resources, self._inversion_tag_path) as path:
df_tag_snps = pd.read_csv(path, sep=",")
return df_tag_snps.query(f"inversion == '{inversion}'").reset_index()

@check_types
@doc(
summary="Infer karyotype from tag SNPs for a given inversion in Ag.",
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
summary="Infer karyotype from tag SNPs for a given inversion in Ag.",
summary="Infer karyotype from tag SNPs for a given inversion.",

)
def karyotype(
self,
inversion: inversion_param,
sample_sets: Optional[base_params.sample_sets] = None,
sample_query: Optional[base_params.sample_query] = None,
sample_query_options: Optional[base_params.sample_query_options] = None,
) -> pd.DataFrame:
# load tag snp data
df_tagsnps = self.load_inversion_tags(inversion=inversion)
inversion_pos = df_tagsnps["position"]
inversion_alts = df_tagsnps["alt_allele"]
contig = inversion[0:2]

# get snp calls for inversion region
start, end = np.min(inversion_pos), np.max(inversion_pos)
region = f"{contig}:{start}-{end}"

ds_snps = self.snp_calls(
region=region,
sample_sets=sample_sets,
sample_query=sample_query,
sample_query_options=sample_query_options,
)

with self._spinner("Inferring karyotype from tag SNPs"):
# access variables we need
geno = allel.GenotypeDaskArray(ds_snps["call_genotype"].data)
pos = allel.SortedIndex(ds_snps["variant_position"].values)
samples = ds_snps["sample_id"].values
alts = ds_snps["variant_allele"].values.astype(str)

# subset to position of inversion tags
mask = pos.locate_intersection(inversion_pos)[0]
alts = alts[mask]
geno = geno.compress(mask, axis=0).compute()

# infer karyotype
gn_alt = _karyotype_tags_n_alt(
gt=geno, alts=alts, inversion_alts=inversion_alts
)
is_called = geno.is_called()

# calculate mean genotype for each sample whilst masking missing calls
av_gts = np.mean(np.ma.MaskedArray(gn_alt, mask=~is_called), axis=0)
total_sites = np.sum(is_called, axis=0)

df = pd.DataFrame(
{
"sample_id": samples,
"inversion": inversion,
f"karyotype_{inversion}_mean": av_gts,
# round the genotypes then convert to int
f"karyotype_{inversion}": av_gts.round().astype(int),
"total_tag_snps": total_sites,
},
)
# Allow filling missing values with "<NA>" visible placeholder.
kt_dtype = CategoricalDtype(categories=[0, 1, 2, "<NA>"], ordered=True)
df[f"karyotype_{inversion}"] = df[f"karyotype_{inversion}"].astype(kt_dtype)

return df
10 changes: 10 additions & 0 deletions malariagen_data/anoph/karyotype_params.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
"""Parameter definitions for karyotype analysis functions."""

from typing import Literal

from typing_extensions import Annotated, TypeAlias

inversion_param: TypeAlias = Annotated[
Literal["2La", "2Rb", "2Rc_gam", "2Rc_col", "2Rd", "2Rj"],
"Name of inversion to infer karyotype for.",
]
Comment on lines +7 to +10
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These values will only be valid for Ag. Af would have different possible values. Suggest broadening here to just a string.

Suggested change
inversion_param: TypeAlias = Annotated[
Literal["2La", "2Rb", "2Rc_gam", "2Rc_col", "2Rd", "2Rj"],
"Name of inversion to infer karyotype for.",
]
inversion_param: TypeAlias = Annotated[
str,
"Name of inversion to infer karyotype for.",
]

4 changes: 4 additions & 0 deletions malariagen_data/anopheles.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
plotly_params,
xpehh_params,
)
from .anoph.karyotype import AnophelesKaryotypeData
from .anoph.aim_data import AnophelesAimData
from .anoph.base import AnophelesBase
from .anoph.cnv_data import AnophelesCnvData
Expand Down Expand Up @@ -94,6 +95,7 @@ class AnophelesDataResource(
AnophelesPca,
PlinkConverter,
AnophelesIgv,
AnophelesKaryotypeData,
AnophelesAimData,
AnophelesHapData,
AnophelesSnpData,
Expand Down Expand Up @@ -138,6 +140,7 @@ def __init__(
taxon_colors: Optional[Mapping[str, str]],
virtual_contigs: Optional[Mapping[str, Sequence[str]]],
gene_names: Optional[Mapping[str, str]],
inversion_tag_path: Optional[str],
):
super().__init__(
url=url,
Expand Down Expand Up @@ -171,6 +174,7 @@ def __init__(
taxon_colors=taxon_colors,
virtual_contigs=virtual_contigs,
gene_names=gene_names,
inversion_tag_path=inversion_tag_path,
)

@property
Expand Down
21 changes: 21 additions & 0 deletions tests/integration/test_af1.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,3 +75,24 @@ def test_locate_region(region_raw):
assert region == Region("2RL", 48714463, 48715355)
if region_raw == "2RL:24,630,355-24,633,221":
assert region == Region("2RL", 24630355, 24633221)


@pytest.mark.parametrize(
"inversion",
["2La", "2Rb", "2Rc_col", "X_x"],
)
def test_karyotyping(inversion):
af1 = setup_af1()

if inversion == "X_x":
with pytest.raises(TypeError):
af1.karyotype(
inversion=inversion, sample_sets="AG1000G-GH", sample_query=None
)
else:
with pytest.raises(FileNotFoundError):
af1.karyotype(
inversion=inversion,
sample_sets="1229-VO-GH-DADZIE-VMF00095",
sample_query=None,
)
Comment on lines +87 to +98
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

All inversion parameter values should fail with NotImplementedError I think.

39 changes: 25 additions & 14 deletions tests/integration/test_ag3.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,19 +159,30 @@ def test_xpehh_gwss():
assert_allclose(xpehh[:, 2][100], 0.4817561326426265)


def test_karyotyping():
@pytest.mark.parametrize(
"inversion",
["2La", "2Rb", "2Rc_col", "X_x"],
)
def test_karyotyping(inversion):
ag3 = setup_ag3(cohorts_analysis="20230516")

df = ag3.karyotype(inversion="2La", sample_sets="AG1000G-GH", sample_query=None)

assert isinstance(df, pd.DataFrame)
expected_cols = [
"sample_id",
"inversion",
"karyotype_2La_mean",
"karyotype_2La",
"total_tag_snps",
]
assert set(df.columns) == set(expected_cols)
assert all(df["karyotype_2La"].isin([0, 1, 2]))
assert all(df["karyotype_2La_mean"].between(0, 2))
if inversion == "X_x":
with pytest.raises(TypeError):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should be a ValueError I think.

ag3.karyotype(
inversion=inversion, sample_sets="AG1000G-GH", sample_query=None
)
else:
df = ag3.karyotype(
inversion=inversion, sample_sets="AG1000G-GH", sample_query=None
)
assert isinstance(df, pd.DataFrame)
expected_cols = [
"sample_id",
"inversion",
f"karyotype_{inversion}_mean",
f"karyotype_{inversion}",
"total_tag_snps",
]
assert set(df.columns) == set(expected_cols)
assert all(df[f"karyotype_{inversion}"].isin([0, 1, 2]))
assert all(df[f"karyotype_{inversion}_mean"].between(0, 2))
Loading