malariagen · jonbrenas · Dec 11, 2024 · Dec 11, 2024 · Dec 11, 2024 · Dec 11, 2024
diff --git a/malariagen_data/af1.py b/malariagen_data/af1.py
@@ -123,6 +123,7 @@ def __init__(
             taxon_colors=TAXON_COLORS,
             virtual_contigs=None,
             gene_names=None,
+            inversion_tag_path=None,
         )
 
     def __repr__(self):

diff --git a/malariagen_data/ag3.py b/malariagen_data/ag3.py
@@ -2,18 +2,11 @@
 
 import dask
 import pandas as pd  # type: ignore
-from pandas import CategoricalDtype
-import numpy as np  # type: ignore
-import allel  # type: ignore
 import plotly.express as px  # type: ignore
 
 import malariagen_data
 from .anopheles import AnophelesDataResource
 
-from numpydoc_decorator import doc
-from .util import check_types, _karyotype_tags_n_alt
-from .anoph import base_params
-from typing import Optional, Literal, Annotated, TypeAlias
 
 # silence dask performance warnings
 dask.config.set(**{"array.slicing.split_native_chunks": False})  # type: ignore
@@ -35,6 +28,7 @@
 GENE_NAMES = {
     "AGAP004707": "Vgsc/para",
 }
+INVERSION_TAG_PATH = "karyotype_tag_snps.csv"
 
 
 def _setup_aim_palettes():
@@ -83,12 +77,6 @@ def _setup_aim_palettes():
 }
 
 
-inversion_param: TypeAlias = Annotated[
-    Literal["2La", "2Rb", "2Rc_gam", "2Rc_col", "2Rd", "2Rj"],
-    "Name of inversion to infer karyotype for.",
-]
-
-
 class Ag3(AnophelesDataResource):
     """Provides access to data from Ag3.x releases.
 
@@ -203,6 +191,7 @@ def __init__(
             taxon_colors=TAXON_COLORS,
             virtual_contigs=VIRTUAL_CONTIGS,
             gene_names=GENE_NAMES,
+            inversion_tag_path=INVERSION_TAG_PATH,
         )
 
         # set up caches
@@ -355,82 +344,3 @@ def _results_cache_add_analysis_params(self, params):
         super()._results_cache_add_analysis_params(params)
         # override parent class to add AIM analysis
         params["aim_analysis"] = self._aim_analysis
-
-    @check_types
-    @doc(
-        summary="Load tag SNPs for a given inversion in Ag.",
-    )
-    def load_inversion_tags(self, inversion: inversion_param) -> pd.DataFrame:
-        # needs to be modified depending on where we are hosting
-        import importlib.resources
-        from . import resources
-
-        with importlib.resources.path(resources, "karyotype_tag_snps.csv") as path:
-            df_tag_snps = pd.read_csv(path, sep=",")
-        return df_tag_snps.query(f"inversion == '{inversion}'").reset_index()
-
-    @check_types
-    @doc(
-        summary="Infer karyotype from tag SNPs for a given inversion in Ag.",
-    )
-    def karyotype(
-        self,
-        inversion: inversion_param,
-        sample_sets: Optional[base_params.sample_sets] = None,
-        sample_query: Optional[base_params.sample_query] = None,
-        sample_query_options: Optional[base_params.sample_query_options] = None,
-    ) -> pd.DataFrame:
-        # load tag snp data
-        df_tagsnps = self.load_inversion_tags(inversion=inversion)
-        inversion_pos = df_tagsnps["position"]
-        inversion_alts = df_tagsnps["alt_allele"]
-        contig = inversion[0:2]
-
-        # get snp calls for inversion region
-        start, end = np.min(inversion_pos), np.max(inversion_pos)
-        region = f"{contig}:{start}-{end}"
-
-        ds_snps = self.snp_calls(
-            region=region,
-            sample_sets=sample_sets,
-            sample_query=sample_query,
-            sample_query_options=sample_query_options,
-        )
-
-        with self._spinner("Inferring karyotype from tag SNPs"):
-            # access variables we need
-            geno = allel.GenotypeDaskArray(ds_snps["call_genotype"].data)
-            pos = allel.SortedIndex(ds_snps["variant_position"].values)
-            samples = ds_snps["sample_id"].values
-            alts = ds_snps["variant_allele"].values.astype(str)
-
-            # subset to position of inversion tags
-            mask = pos.locate_intersection(inversion_pos)[0]
-            alts = alts[mask]
-            geno = geno.compress(mask, axis=0).compute()
-
-            # infer karyotype
-            gn_alt = _karyotype_tags_n_alt(
-                gt=geno, alts=alts, inversion_alts=inversion_alts
-            )
-            is_called = geno.is_called()
-
-            # calculate mean genotype for each sample whilst masking missing calls
-            av_gts = np.mean(np.ma.MaskedArray(gn_alt, mask=~is_called), axis=0)
-            total_sites = np.sum(is_called, axis=0)
-
-            df = pd.DataFrame(
-                {
-                    "sample_id": samples,
-                    "inversion": inversion,
-                    f"karyotype_{inversion}_mean": av_gts,
-                    # round the genotypes then convert to int
-                    f"karyotype_{inversion}": av_gts.round().astype(int),
-                    "total_tag_snps": total_sites,
-                },
-            )
-            # Allow filling missing values with "<NA>" visible placeholder.
-            kt_dtype = CategoricalDtype(categories=[0, 1, 2, "<NA>"], ordered=True)
-            df[f"karyotype_{inversion}"] = df[f"karyotype_{inversion}"].astype(kt_dtype)
-
-        return df
diff --git a/malariagen_data/anoph/karyotype.py b/malariagen_data/anoph/karyotype.py
@@ -0,0 +1,112 @@
+import pandas as pd  # type: ignore
+from pandas import CategoricalDtype
+import numpy as np  # type: ignore
+import allel  # type: ignore
+
+from numpydoc_decorator import doc
+from ..util import check_types, _karyotype_tags_n_alt
+from . import base_params
+from typing import Optional
+
+from .snp_data import AnophelesSnpData
+from .karyotype_params import inversion_param
+
+
+class AnophelesKaryotypeData(AnophelesSnpData):
-class AnophelesKaryotypeData(AnophelesSnpData):
+class AnophelesKaryotypeAnalysis(AnophelesSnpData):
-class AnophelesKaryotypeData(AnophelesSnpData):
+class AnophelesKaryotypeAnalysis(AnophelesSnpData):
+    def __init__(
+        self,
+        inversion_tag_path: Optional[str] = None,
+        **kwargs,
+    ):
+        # N.B., this class is designed to work cooperatively, and
+        # so it's important that any remaining parameters are passed
+        # to the superclass constructor.
+        super().__init__(**kwargs)
+
+        # If provided, this analysis version will override the
+        # default value provided in the release configuration.
-        # If provided, this analysis version will override the
-        # default value provided in the release configuration.
-        # If provided, this analysis version will override the
-        # default value provided in the release configuration.
+        self._inversion_tag_path = inversion_tag_path
+
+    @check_types
+    @doc(
+        summary="Load tag SNPs for a given inversion in Ag.",
-        summary="Load tag SNPs for a given inversion in Ag.",
+        summary="Load tag SNPs for a given inversion.",
-        summary="Load tag SNPs for a given inversion in Ag.",
+        summary="Load tag SNPs for a given inversion.",
+    )
+    def load_inversion_tags(self, inversion: inversion_param) -> pd.DataFrame:
+        # needs to be modified depending on where we are hosting
+        import importlib.resources
+        from .. import resources
+
+        if not self._inversion_tag_path:
+            raise FileNotFoundError(
+                "The file containing the inversion tags is missing."
+            )
-        if not self._inversion_tag_path:
-            raise FileNotFoundError(
-                "The file containing the inversion tags is missing."
-            )
+        if self._inversion_tag_path is None:
+            raise NotImplementedError(
+                "No inversion tags are available for this data resource."
+            )
-        if not self._inversion_tag_path:
-            raise FileNotFoundError(
-                "The file containing the inversion tags is missing."
-            )
+        if self._inversion_tag_path is None:
+            raise NotImplementedError(
+                "No inversion tags are available for this data resource."
+            )
+        else:
+            with importlib.resources.path(resources, self._inversion_tag_path) as path:
+                df_tag_snps = pd.read_csv(path, sep=",")
+            return df_tag_snps.query(f"inversion == '{inversion}'").reset_index()
+
+    @check_types
+    @doc(
+        summary="Infer karyotype from tag SNPs for a given inversion in Ag.",
-        summary="Infer karyotype from tag SNPs for a given inversion in Ag.",
+        summary="Infer karyotype from tag SNPs for a given inversion.",
-        summary="Infer karyotype from tag SNPs for a given inversion in Ag.",
+        summary="Infer karyotype from tag SNPs for a given inversion.",
+    )
+    def karyotype(
+        self,
+        inversion: inversion_param,
+        sample_sets: Optional[base_params.sample_sets] = None,
+        sample_query: Optional[base_params.sample_query] = None,
+        sample_query_options: Optional[base_params.sample_query_options] = None,
+    ) -> pd.DataFrame:
+        # load tag snp data
+        df_tagsnps = self.load_inversion_tags(inversion=inversion)
+        inversion_pos = df_tagsnps["position"]
+        inversion_alts = df_tagsnps["alt_allele"]
+        contig = inversion[0:2]
+
+        # get snp calls for inversion region
+        start, end = np.min(inversion_pos), np.max(inversion_pos)
+        region = f"{contig}:{start}-{end}"
+
+        ds_snps = self.snp_calls(
+            region=region,
+            sample_sets=sample_sets,
+            sample_query=sample_query,
+            sample_query_options=sample_query_options,
+        )
+
+        with self._spinner("Inferring karyotype from tag SNPs"):
+            # access variables we need
+            geno = allel.GenotypeDaskArray(ds_snps["call_genotype"].data)
+            pos = allel.SortedIndex(ds_snps["variant_position"].values)
+            samples = ds_snps["sample_id"].values
+            alts = ds_snps["variant_allele"].values.astype(str)
+
+            # subset to position of inversion tags
+            mask = pos.locate_intersection(inversion_pos)[0]
+            alts = alts[mask]
+            geno = geno.compress(mask, axis=0).compute()
+
+            # infer karyotype
+            gn_alt = _karyotype_tags_n_alt(
+                gt=geno, alts=alts, inversion_alts=inversion_alts
+            )
+            is_called = geno.is_called()
+
+            # calculate mean genotype for each sample whilst masking missing calls
+            av_gts = np.mean(np.ma.MaskedArray(gn_alt, mask=~is_called), axis=0)
+            total_sites = np.sum(is_called, axis=0)
+
+            df = pd.DataFrame(
+                {
+                    "sample_id": samples,
+                    "inversion": inversion,
+                    f"karyotype_{inversion}_mean": av_gts,
+                    # round the genotypes then convert to int
+                    f"karyotype_{inversion}": av_gts.round().astype(int),
+                    "total_tag_snps": total_sites,
+                },
+            )
+            # Allow filling missing values with "<NA>" visible placeholder.
+            kt_dtype = CategoricalDtype(categories=[0, 1, 2, "<NA>"], ordered=True)
+            df[f"karyotype_{inversion}"] = df[f"karyotype_{inversion}"].astype(kt_dtype)
+
+        return df
diff --git a/malariagen_data/anoph/karyotype_params.py b/malariagen_data/anoph/karyotype_params.py
@@ -0,0 +1,10 @@
+"""Parameter definitions for karyotype analysis functions."""
+
+from typing import Literal
+
+from typing_extensions import Annotated, TypeAlias
+
+inversion_param: TypeAlias = Annotated[
+    Literal["2La", "2Rb", "2Rc_gam", "2Rc_col", "2Rd", "2Rj"],
+    "Name of inversion to infer karyotype for.",
+]
-inversion_param: TypeAlias = Annotated[
-    Literal["2La", "2Rb", "2Rc_gam", "2Rc_col", "2Rd", "2Rj"],
-    "Name of inversion to infer karyotype for.",
-]
+inversion_param: TypeAlias = Annotated[
+    str,
+    "Name of inversion to infer karyotype for.",
+]
-inversion_param: TypeAlias = Annotated[
-    Literal["2La", "2Rb", "2Rc_gam", "2Rc_col", "2Rd", "2Rj"],
-    "Name of inversion to infer karyotype for.",
-]
+inversion_param: TypeAlias = Annotated[
+    str,
+    "Name of inversion to infer karyotype for.",
+]
diff --git a/malariagen_data/anopheles.py b/malariagen_data/anopheles.py
@@ -29,6 +29,7 @@
     plotly_params,
     xpehh_params,
 )
+from .anoph.karyotype import AnophelesKaryotypeData
 from .anoph.aim_data import AnophelesAimData
 from .anoph.base import AnophelesBase
 from .anoph.cnv_data import AnophelesCnvData
@@ -94,6 +95,7 @@ class AnophelesDataResource(
     AnophelesPca,
     PlinkConverter,
     AnophelesIgv,
+    AnophelesKaryotypeData,
     AnophelesAimData,
     AnophelesHapData,
     AnophelesSnpData,
@@ -138,6 +140,7 @@ def __init__(
         taxon_colors: Optional[Mapping[str, str]],
         virtual_contigs: Optional[Mapping[str, Sequence[str]]],
         gene_names: Optional[Mapping[str, str]],
+        inversion_tag_path: Optional[str],
     ):
         super().__init__(
             url=url,
@@ -171,6 +174,7 @@ def __init__(
             taxon_colors=taxon_colors,
             virtual_contigs=virtual_contigs,
             gene_names=gene_names,
+            inversion_tag_path=inversion_tag_path,
         )
 
     @property

diff --git a/tests/integration/test_af1.py b/tests/integration/test_af1.py
@@ -75,3 +75,24 @@ def test_locate_region(region_raw):
         assert region == Region("2RL", 48714463, 48715355)
     if region_raw == "2RL:24,630,355-24,633,221":
         assert region == Region("2RL", 24630355, 24633221)
+
+
+@pytest.mark.parametrize(
+    "inversion",
+    ["2La", "2Rb", "2Rc_col", "X_x"],
+)
+def test_karyotyping(inversion):
+    af1 = setup_af1()
+
+    if inversion == "X_x":
+        with pytest.raises(TypeError):
+            af1.karyotype(
+                inversion=inversion, sample_sets="AG1000G-GH", sample_query=None
+            )
+    else:
+        with pytest.raises(FileNotFoundError):
+            af1.karyotype(
+                inversion=inversion,
+                sample_sets="1229-VO-GH-DADZIE-VMF00095",
+                sample_query=None,
+            )
diff --git a/tests/integration/test_ag3.py b/tests/integration/test_ag3.py
@@ -159,19 +159,30 @@ def test_xpehh_gwss():
     assert_allclose(xpehh[:, 2][100], 0.4817561326426265)
 
 
-def test_karyotyping():
+@pytest.mark.parametrize(
+    "inversion",
+    ["2La", "2Rb", "2Rc_col", "X_x"],
+)
+def test_karyotyping(inversion):
     ag3 = setup_ag3(cohorts_analysis="20230516")
 
-    df = ag3.karyotype(inversion="2La", sample_sets="AG1000G-GH", sample_query=None)
-
-    assert isinstance(df, pd.DataFrame)
-    expected_cols = [
-        "sample_id",
-        "inversion",
-        "karyotype_2La_mean",
-        "karyotype_2La",
-        "total_tag_snps",
-    ]
-    assert set(df.columns) == set(expected_cols)
-    assert all(df["karyotype_2La"].isin([0, 1, 2]))
-    assert all(df["karyotype_2La_mean"].between(0, 2))
+    if inversion == "X_x":
+        with pytest.raises(TypeError):
+            ag3.karyotype(
+                inversion=inversion, sample_sets="AG1000G-GH", sample_query=None
+            )
+    else:
+        df = ag3.karyotype(
+            inversion=inversion, sample_sets="AG1000G-GH", sample_query=None
+        )
+        assert isinstance(df, pd.DataFrame)
+        expected_cols = [
+            "sample_id",
+            "inversion",
+            f"karyotype_{inversion}_mean",
+            f"karyotype_{inversion}",
+            "total_tag_snps",
+        ]
+        assert set(df.columns) == set(expected_cols)
+        assert all(df[f"karyotype_{inversion}"].isin([0, 1, 2]))
+        assert all(df[f"karyotype_{inversion}_mean"].between(0, 2))