malariagen · alimanfoo · Dec 4, 2024 · Sep 2, 2024 · Sep 2, 2024 · Sep 3, 2024
diff --git a/malariagen_data/anoph/aim_data.py b/malariagen_data/anoph/aim_data.py
@@ -65,7 +65,11 @@ def _prep_aims_param(self, *, aims: aim_params.aims) -> str:
     @check_types
     @doc(
         summary="Access ancestry informative marker variants.",
-        returns="A dataset containing AIM positions and discriminating alleles.",
+        returns="""
+        A dataset with 2 dimensions: **variants** the number of AIM sites, and **alleles** which will always be 2, each representing one of the species. It contains 2 coordinates:
+        **variant_contig** has **variants** values and contains the chromosome arm of each AIM, and **variant_position** has **variants** values and contains the position of each AIM. It contains 1 data variable:
+        **variant_allele** has (**variants**, **alleles**) values and contains the discriminating alleles for each AIM.
+        """,
     )
     def aim_variants(self, aims: aim_params.aims) -> xr.Dataset:
         self._require_aim_analysis()
@@ -113,7 +117,16 @@ def _aim_calls_dataset(self, *, aims, sample_set):
             calls.
         """,
         returns="""
-            A dataset containing AIM SNP sites, alleles and genotype calls.
+        A dataset with 4 dimensions:
+        **variants** the number of AIM sites,
+        **samples** the number of samples,
+        **ploidy** the ploidy (2),
+        and **alleles** which will always be 2, each representing one of the species. It contains 3 coordinates:
+        **sample_id** has **samples** values and contains the identifier of each sample,
+        **variant_contig** has **variants** values and contains the chromosome arm of each AIM,
+        and **variant_position** has **variants** values and contains the position of each AIM. It contains 2 data variables:
+        **call_genotype** has (**variants**, **samples**, **ploidy**) values and contains both calls for each sample and each AIM,
+        **variant_allele** has (**variants**, **alleles**) values and contains the discriminating alleles for each AIM.
         """,
     )
     def aim_calls(

diff --git a/malariagen_data/anoph/base.py b/malariagen_data/anoph/base.py
@@ -397,7 +397,16 @@ def _read_sample_sets(self, *, single_release: str):
     @check_types
     @doc(
         summary="Access a dataframe of sample sets",
-        returns="A dataframe of sample sets, one row per sample set.",
+        returns="""A dataframe of sample sets, one row per sample set. It contains eight columns:
+         **sample_set** is the name of the sample set,
+         **sample_count** is the number of samples the sample set contains,
+         **study_id** is the identifier for the study that generated the sample set,
+         **study_url** is the URL of the study on the MalariaGEN website,
+         **terms_of_use_expiry_date** is the date when the terms of use expire,
+         **terms_of_use_url** is the URL of the terms of use,
+         **release** is the identifier of the release containing the sample set,
+         **unrestricted_use** indicates whether the sample set can be used without restriction (e.g., if the terms of use have expired).
+            """,
     )
     def sample_sets(
         self,
@@ -441,6 +450,7 @@ def sample_sets(
     @check_types
     @doc(
         summary="Find which release a sample set was included in.",
+        returns="The release the sample set is part of.",
     )
     def lookup_release(self, sample_set: base_params.sample_set) -> str:
         if self._cache_sample_set_to_release is None:
@@ -455,6 +465,7 @@ def lookup_release(self, sample_set: base_params.sample_set) -> str:
     @check_types
     @doc(
         summary="Find which study a sample set belongs to.",
+        returns="The study the sample set belongs to.",
     )
     def lookup_study(self, sample_set: base_params.sample_set) -> str:
         if self._cache_sample_set_to_study is None:
@@ -468,6 +479,7 @@ def lookup_study(self, sample_set: base_params.sample_set) -> str:
     @check_types
     @doc(
         summary="Find the study info for a sample set.",
+        returns="The info for the study the sample set belongs to.",
     )
     def lookup_study_info(self, sample_set: base_params.sample_set) -> dict:
         if self._cache_sample_set_to_study_info is None:
@@ -483,6 +495,7 @@ def lookup_study_info(self, sample_set: base_params.sample_set) -> dict:
     @check_types
     @doc(
         summary="Find the terms-of-use info for a sample set.",
+        returns="The terms-of-use info for the sample set.",
     )
     def lookup_terms_of_use_info(self, sample_set: base_params.sample_set) -> dict:
         if self._cache_sample_set_to_terms_of_use_info is None:

diff --git a/malariagen_data/anoph/cnv_data.py b/malariagen_data/anoph/cnv_data.py
@@ -170,7 +170,19 @@ def _cnv_hmm_dataset(self, *, contig, sample_set, inline_array, chunks):
     @check_types
     @doc(
         summary="Access CNV HMM data from CNV calling.",
-        returns="An xarray dataset of CNV HMM calls and associated data.",
+        returns="""A dataset with 2 dimensions:
+        `variants` the number of CNV regions in the selected region,
+        `samples` the number of samples. There are 4 coordinates:
+        `variant_position` has `variants` values and contains the initial position of each CNV region,
+        `variant_end` has `variants` values and contains the final position of each CNV region,
+        `variant_contig` has `variants` values and contains the contig of each CNV region,
+        `sample_id` has `samples` values and contains the identifier of each sample. It contains 5 data variables:
+        `call_CN`, it has (`variants`, `samples`) values and contains the number of copies for each sample and each CNV region,
+        `call_RawCov`, it has (`variants`, `samples`) values and contains the raw coverage for each sample and each CNV region,
+        `call_NormCov`, it has (`variants`, `samples`) values and contains the normalized coverage for each sample and each CNV region,
+        `sample_coverage_variance`, it has `samples` values and contains the variance of the coverage for each sample,
+        `sample_is_high_variance`, it has `samples` values and contains whether each sample has a high variance.
+        """,
     )
     def cnv_hmm(
         self,
@@ -377,7 +389,19 @@ def _cnv_coverage_calls_dataset(
     @check_types
     @doc(
         summary="Access CNV HMM data from genome-wide CNV discovery and filtering.",
-        returns="An xarray dataset of CNV alleles and genotypes.",
+        returns="""A dataset with 2 dimensions:
+        `variants` the number of CNV regions in the selected region,
+        `samples` the number of samples. There are 5 coordinates:
+        `variant_position` has `variants` values and contains the initial position of each CNV region,
+        `variant_end` has `variants` values and contains the final position of each CNV region,
+        `variant_contig` has `variants` values and contains the contig of each CNV region,
+        `variant_id` has `variants` values and contains the identifier for each CNV region,
+        `sample_id` has `samples` values and contains the identifier of each sample. It contains 4 data variables:
+        `variant_CIPOS`, it has `variants` values and contains the confidence interval for the start position for each CNV region,
+        `variant_CIEND`, it has `variants` values and contains the confidence interval for the end position for each CNV region,
+        `variant_filter_pass`, it has `variants` values and is True for each CNV region that passes quality filters,
+        `call_genotype`, it has (`variants`, `samples`) values and contains the coverage call for each sample and each CNV region.
+        """,
     )
     def cnv_coverage_calls(
         self,
@@ -533,7 +557,21 @@ def _cnv_discordant_read_calls_dataset(
     @check_types
     @doc(
         summary="Access CNV discordant read calls data.",
-        returns="An xarray dataset of CNV alleles and genotypes.",
+        returns="""A dataset with 2 dimensions:
+        `variants` the number of discordant read calls in the selected region,
+        `samples` the number of samples. There are 5 coordinates:
+        `variant_position` has `variants` values and contains the initial position of each discordant read call,
+        `variant_end` has `variants` values and contains the final position of each discordant read call,
+        `variant_id` has `variants` values and contains the identifier of each discordant read call,
+        `variant_contig` has `variants` values and contains the contig of each discordant read call,
+        `sample_id` has `samples` values and contains the identifier of each sample. It contains 6 data variables:
+        `variant_Region`, it has `variants` values and contains the identifier of the region covered by each discordant read call,
+        `variant_StartBreakpointMethod`, it has `variants` values and specifies how the start breakpoint was determined for each discordant read call,
+        `variant_EndBreakpointMethod`, it has `variants` values and specifies how the end breakpoint was determined for each discordant read call,
+        `call_genotype`, it has (`variants`, `samples`) values and contains the number of copies of each discordant read call for each sample,
+        `sample_coverage_variance`, it has `samples` values and contains the variance of the coverage for each sample,
+        `sample_is_high_variance`, it has `samples` values and contains whether each sample has a high variance.
+        """,
     )
     def cnv_discordant_read_calls(
         self,

diff --git a/malariagen_data/anoph/distance.py b/malariagen_data/anoph/distance.py
@@ -86,6 +86,7 @@ def __init__(self, **kwargs):
         summary="""
             Compute pairwise distances between samples using biallelic SNP genotypes.
         """,
+        returns=("dist", "samples", "n_snps_used"),
     )
     def biallelic_diplotype_pairwise_distances(
         self,
@@ -107,7 +108,9 @@ def biallelic_diplotype_pairwise_distances(
         random_seed: base_params.random_seed = 42,
         inline_array: base_params.inline_array = base_params.inline_array_default,
         chunks: base_params.chunks = base_params.native_chunks,
-    ) -> Tuple[np.ndarray, np.ndarray, int]:
+    ) -> Tuple[
+        distance_params.dist, distance_params.samples, distance_params.n_snps_used
+    ]:
         # Change this name if you ever change the behaviour of this function, to
         # invalidate any previously cached data.
         name = "biallelic_diplotype_pairwise_distances"
@@ -234,6 +237,7 @@ def _biallelic_diplotype_pairwise_distances(
         summary="""
             Construct a neighbour-joining tree between samples using biallelic SNP genotypes.
         """,
+        returns=("Z", "samples", "n_snps_used"),
     )
     def njt(
         self,
@@ -260,7 +264,7 @@ def njt(
         random_seed: base_params.random_seed = 42,
         inline_array: base_params.inline_array = base_params.inline_array_default,
         chunks: base_params.chunks = base_params.native_chunks,
-    ) -> Tuple[np.ndarray, np.ndarray, int]:
+    ) -> Tuple[distance_params.Z, distance_params.samples, distance_params.n_snps_used]:
         # Change this name if you ever change the behaviour of this function, to
         # invalidate any previously cached data.
         name = "njt_v1"

diff --git a/malariagen_data/anoph/distance_params.py b/malariagen_data/anoph/distance_params.py
@@ -2,6 +2,8 @@
 
 from typing_extensions import Annotated, TypeAlias
 
+import numpy as np
+
 distance_metric: TypeAlias = Annotated[
     Literal[
         "cityblock",
@@ -20,6 +22,32 @@
 
 default_nj_algorithm: nj_algorithm = "dynamic"
 
+dist: TypeAlias = Annotated[
+    np.ndarray,
+    """
+    A numpy array containing the distance between each pair of samples.
+    """,
+]
+
+Z: TypeAlias = Annotated[
+    np.ndarray,
+    """
+    A neighbour-joining tree encoded as a numpy array. Each row in the
+    array contains data for one internal node in the tree, in the order
+    in which they were created by the neighbour-joining algorithm.
+    Within each row there are five values: left child node identifier,
+    right child node identifier, distance to left child, distance to
+    right child, total number of leaves. This data structure is similar
+    to that returned by scipy's hierarchical clustering functions,
+    except that here we have two distance values for each internal node
+    rather than one because distances to the children may be different.
+    """,
+]
+
+samples: TypeAlias = Annotated[np.ndarray, "The list of the sample identifiers"]
+
+n_snps_used: TypeAlias = Annotated[int, "The number of SNPs used"]
+
 center_x: TypeAlias = Annotated[int | float, "X coordinate where plotting is centered."]
 
 center_y: TypeAlias = Annotated[int | float, "Y coordinate where plotting is centered."]

diff --git a/malariagen_data/anoph/fst_params.py b/malariagen_data/anoph/fst_params.py
@@ -26,7 +26,11 @@
 df_pairwise_fst: TypeAlias = Annotated[
     pd.DataFrame,
     """
-    A dataframe of pairwise Fst and standard error values.
+    A dataframe of pairwise Fst and standard error values. It has
+    4 columns:
+    `cohort1` and `cohort2` are the two cohorts,
+    `fst` is the value of the Fst between the two cohorts,
+    `se` is the standard error.
     """,
 ]
 

diff --git a/malariagen_data/anoph/genome_features.py b/malariagen_data/anoph/genome_features.py
@@ -119,7 +119,7 @@ def _prep_gff_attributes(
     @check_types
     @doc(
         summary="Access genome feature annotations.",
-        returns="A dataframe of genome annotations, one row per feature.",
+        returns="A dataframe of genome annotations, one row per feature. The dataframe follows the GFF3 format (https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md), including extra attributes `ID`, `Parent`, `Name` and `description` depending on the dataset.",
     )
     def genome_features(
         self,

diff --git a/malariagen_data/anoph/hap_data.py b/malariagen_data/anoph/hap_data.py
@@ -321,7 +321,17 @@ def _haplotypes_for_contig(
     @check_types
     @doc(
         summary="Access haplotype data.",
-        returns="A dataset of haplotypes and associated data.",
+        returns="""A dataset with 4 dimensions:
+        `variants` the number of sites in the selected region,
+        `alleles` the number of alleles (2),
+        `samples` the number of samples,
+        and `ploidy` the ploidy (2). There are 3 coordinates:
+        `variant_position` has `variants` values and contains the position of each site,
+        `variant_contig` has `variants` values and contains the contig of each site,
+        `sample_id` has `samples` values and contains the identifier of each sample. The data variables are:
+        `variant_allele`, it has (`variants`, `alleles`) values and contains the reference followed by the alternate allele for each site,
+        `call_genotype`, it has (`variants`, `samples`, `ploidy`) values and contains both calls for each site and each sample.
+        """,
     )
     def haplotypes(
         self,

diff --git a/malariagen_data/anoph/het_params.py b/malariagen_data/anoph/het_params.py
@@ -47,7 +47,13 @@
     pd.DataFrame,
     """
     A DataFrame where each row provides data about a single run of
-    homozygosity.
+    homozygosity. The columns are:
+    `sample_id` containing the identifier of the sample,
+    `contig` containing the contig,
+    `roh_start` containing the start of the run of homozygosity,
+    `roh_stop` containing the end of the run of homozygosity,
+    `roh_length` containing the length of the run of homozygosity,
+    `roh_is_marginal` containing whether the run of homozygosity is marginal.
     """,
 ]