Feat: Feature gene annotation (#132)

* annotate (in .uns['cnv']['var']) which window contains which genes * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * remove black magic comma * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * first pass at gene convolution annotation * sorting some conflicts * feature annotation working * adding test to calcualte gene average * Converting gene matrix into sparse matrix * updating gene position to using pandas * linting * adding more tests * Delete .vscode directory * adding scanpy to conf.py * changing scanpy.tl.pca to sc.tl.pca to get build docs to work * Saving CNV matrix as sparse matrix * adding back infercnv.py * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fixing linting error * editing code to appease ruff * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Converting gene conv to numpy array * Making the calculation of per gene values optional * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: redst4r <redst4r@web.de> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Gregor Sturm <gregor.sturm@i-med.ac.at> Co-authored-by: Gregor Sturm <mail@gregor-sturm.de>
icbi-lab · Jun 26, 2024 · 0073690 · 0073690
1 parent ad98e58
commit 0073690
Show file tree

Hide file tree

Showing 5 changed files with 391 additions and 33 deletions.
diff --git a/.gitignore b/.gitignore
@@ -28,5 +28,9 @@ __pycache__/
 /.idea/
 /.vscode/
 
+infercnv_env/
 # Notebooks
 .ipynb_checkpoints
+
+# Node.js
+node_modules/
diff --git a/pyproject.toml b/pyproject.toml
@@ -16,6 +16,9 @@ authors = [
 maintainers = [
     {name = "Gregor Sturm", email="gregor.sturm@i-med.ac.at"}
 ]
+contributors = [
+    {name = "Grant Neilson", email = "grant.neilson5@gmail.com"}
+]
 urls.Documentation = "https://infercnvpy.readthedocs.io/"
 urls.Source = "https://github.com/icbi-lab/infercnvpy"
 urls.Home-page = "https://github.com/icbi-lab/infercnvpy"
@@ -30,6 +33,7 @@ dependencies = [
     'pycairo>=1.20; sys_platform == "win32"',
     'leidenalg',
     'pyreadr',
+    'pytest-benchmark',
     # for debug logging (referenced from the issue template)
     "session-info",
     "ipython"

diff --git a/src/infercnvpy/tl/_infercnv.py b/src/infercnvpy/tl/_infercnv.py
@@ -4,6 +4,7 @@
 from multiprocessing import cpu_count
 
 import numpy as np
+import pandas as pd
 import scipy.ndimage
 import scipy.sparse
 from anndata import AnnData
@@ -30,7 +31,8 @@ def infercnv(
     inplace: bool = True,
     layer: str | None = None,
     key_added: str = "cnv",
-) -> None | tuple[dict, scipy.sparse.csr_matrix]:
+    calculate_gene_values: bool = False,
+) -> None | tuple[dict, scipy.sparse.csr_matrix, np.ndarray | None]:
     """Infer Copy Number Variation (CNV) by averaging gene expression over genomic regions.
 
     This method is heavily inspired by `infercnv <https://github.com/broadinstitute/inferCNV/>`_
@@ -79,6 +81,13 @@ def infercnv(
         Key under which the cnv matrix will be stored in adata if `inplace=True`.
         Will store the matrix in `adata.obsm["X_{key_added}"] and additional information
         in `adata.uns[key_added]`.
+    calculate_gene_values
+        If True per gene CNVs will be calculated and stored in `adata.layers["gene_values_{key_added}"]`.
+        As many genes will be included in each segment the resultant per gene value will be an average of the genes included in the segment.
+        Additionally not all genes will be included in the per gene CNV, due to the window size and step size not always being a multiple of
+        the number of genes. Any genes not included in the per gene CNV will be filled with NaN.
+        Note this will significantly increase the memory and computation time, it is recommended to decrease the chunksize to ~100 if this is set to True.
+
 
     Returns
     -------
@@ -108,7 +117,7 @@ def infercnv(
 
     var = tmp_adata.var.loc[:, ["chromosome", "start", "end"]]  # type: ignore
 
-    chr_pos, chunks = zip(
+    chr_pos, chunks, convolved_dfs = zip(
         *process_map(
             _infercnv_chunk,
             [expr[i : i + chunksize, :] for i in range(0, adata.shape[0], chunksize)],
@@ -118,19 +127,38 @@ def infercnv(
             itertools.repeat(window_size),
             itertools.repeat(step),
             itertools.repeat(dynamic_threshold),
+            itertools.repeat(calculate_gene_values),
             tqdm_class=tqdm,
             max_workers=cpu_count() if n_jobs is None else n_jobs,
         ),
         strict=False,
     )
+
     res = scipy.sparse.vstack(chunks)
 
+    chr_pos = chr_pos[0]
+
+    if calculate_gene_values:
+        per_gene_df = pd.concat(convolved_dfs, axis=0)
+        # Ensure the DataFrame has the correct row index
+        per_gene_df.index = adata.obs.index
+        # Ensure the per gene CNV matches the adata var (genes) index, any genes
+        # that are not included in the CNV will be filled with NaN
+        per_gene_df = per_gene_df.reindex(columns=adata.var_names, fill_value=np.nan)
+        # This needs to be a numpy array as colnames are too large to save in anndata
+        per_gene_mtx = per_gene_df.values
+    else:
+        per_gene_mtx = None
+
     if inplace:
         adata.obsm[f"X_{key_added}"] = res
-        adata.uns[key_added] = {"chr_pos": chr_pos[0]}
+        adata.uns[key_added] = {"chr_pos": chr_pos}
+
+        if calculate_gene_values:
+            adata.layers[f"gene_values_{key_added}"] = per_gene_mtx
 
     else:
-        return chr_pos[0], res
+        return chr_pos, res, per_gene_mtx
 
 
 def _natural_sort(l: Sequence):
@@ -148,8 +176,15 @@ def alphanum_key(key):
     return sorted(l, key=alphanum_key)
 
 
-def _running_mean(x: np.ndarray | scipy.sparse.spmatrix, n: int = 50, step: int = 10) -> np.ndarray:
-    """Compute a pyramidially weighted running mean.
+def _running_mean(
+    x: np.ndarray | scipy.sparse.spmatrix,
+    n: int = 50,
+    step: int = 10,
+    gene_list: list = None,
+    calculate_gene_values: bool = False,
+) -> tuple[np.ndarray, pd.DataFrame | None]:
+    """
+    Compute a pyramidially weighted running mean.
 
     Densifies the matrix. Use `step` and `chunksize` to save memory.
 
@@ -160,27 +195,112 @@ def _running_mean(x: np.ndarray | scipy.sparse.spmatrix, n: int = 50, step: int
     n
         Length of the running window
     step
-        only compute running windows ever `step` columns, e.g. if step is 10
-        0:100, 10:110, 20:120 etc. Saves memory.
+        only compute running windows every `step` columns, e.g. if step is 10
+        0:99, 10:109, 20:119 etc. Saves memory.
+    gene_list
+        List of gene names to be used in the convolution
+    calculate_gene_values
+        If True per gene CNVs will be calculated and stored in `adata.layers["gene_values_{key_added}"]`.
     """
     if n < x.shape[1]:  # regular convolution: the filter is smaller than the #genes
         r = np.arange(1, n + 1)
-
         pyramid = np.minimum(r, r[::-1])
-        smoothed_x = np.apply_along_axis(lambda row: np.convolve(row, pyramid, mode="same"), axis=1, arr=x) / np.sum(
-            pyramid
-        )
-        return smoothed_x[:, np.arange(0, smoothed_x.shape[1], step)]
+        smoothed_x = np.apply_along_axis(
+            lambda row: np.convolve(row, pyramid, mode="valid"),
+            axis=1,
+            arr=x,
+        ) / np.sum(pyramid)
+
+        ## get the indices of the genes used in the convolution
+        convolution_indices = get_convolution_indices(x, n)[np.arange(0, smoothed_x.shape[1], step)]
+        ## Pull out the genes used in the convolution
+        convolved_gene_names = gene_list[convolution_indices]
+        smoothed_x = smoothed_x[:, np.arange(0, smoothed_x.shape[1], step)]
+
+        if calculate_gene_values:
+            convolved_gene_values = _calculate_gene_averages(convolved_gene_names, smoothed_x)
+        else:
+            convolved_gene_values = None
 
-    else:  # there's less genes than the filtersize. just apply a single conv with a smaller filter (no sliding)
-        r = np.arange(1, x.shape[1] + 1)
-        pyramid = np.minimum(r, r[::-1])
-        smoothed_x = x @ pyramid.reshape(-1, 1)
-        smoothed_x = smoothed_x / pyramid.sum()
-        return smoothed_x
+        return smoothed_x, convolved_gene_values
+
+    else:  # If there is less genes than the window size, set the window size to the number of genes and perform a single convolution
+        n = x.shape[1]  # set the filter size to the number of genes
+        r = np.arange(1, n + 1)
+        ## As we are only doing one convolution the values should be equal
+        pyramid = np.array([1] * n)
+        smoothed_x = np.apply_along_axis(
+            lambda row: np.convolve(row, pyramid, mode="valid"),
+            axis=1,
+            arr=x,
+        ) / np.sum(pyramid)
+
+        if calculate_gene_values:
+            ## As all genes are used the convolution the values are identical for all genes
+            convolved_gene_values = pd.DataFrame(np.repeat(smoothed_x, len(gene_list), axis=1), columns=gene_list)
+        else:
+            convolved_gene_values = None
+
+        return smoothed_x, convolved_gene_values
 
 
-def _running_mean_by_chromosome(expr, var, window_size, step) -> tuple[dict, np.ndarray]:
+def _calculate_gene_averages(
+    convolved_gene_names: np.ndarray,
+    smoothed_x: np.ndarray,
+) -> pd.DataFrame:
+    """
+    Calculate the average value of each gene in the convolution
+
+    Parameters
+    ----------
+    convolved_gene_names
+        A numpy array with the gene names used in the convolution
+    smoothed_x
+        A numpy array with the smoothed gene expression values
+
+    Returns
+    -------
+    convolved_gene_values
+        A DataFrame with the average value of each gene in the convolution
+    """
+    ## create a dictionary to store the gene values per sample
+    gene_to_values = {}
+    # Calculate the number of genes in each convolution, will be same as the window size default=100
+    length = len(convolved_gene_names[0])
+    # Convert the flattened convolved gene names to a list
+    flatten_list = list(convolved_gene_names.flatten())
+
+    # For each sample in smoothed_x find the value for each gene and store it in a dictionary
+    for sample, row in enumerate(smoothed_x):
+        # Create sample level in the dictionary
+        if sample not in gene_to_values:
+            gene_to_values[sample] = {}
+        # For each gene in the flattened gene list find the value and store it in the dictionary
+        for i, gene in enumerate(flatten_list):
+            if gene not in gene_to_values[sample]:
+                gene_to_values[sample][gene] = []
+            # As the gene list has been flattend we can use the floor division of the index
+            # to get the correct position of the gene to get the value and store it in the dictionary
+            gene_to_values[sample][gene].append(row[i // length])
+
+    for sample in gene_to_values:
+        for gene in gene_to_values[sample]:
+            gene_to_values[sample][gene] = np.mean(gene_to_values[sample][gene])
+
+    convolved_gene_values = pd.DataFrame(gene_to_values).T
+    return convolved_gene_values
+
+
+def get_convolution_indices(x, n):
+    indices = []
+    for i in range(x.shape[1] - n + 1):
+        indices.append(np.arange(i, i + n))
+    return np.array(indices)
+
+
+def _running_mean_by_chromosome(
+    expr, var, window_size, step, calculate_gene_values
+) -> tuple[dict, np.ndarray, pd.DataFrame | None]:
     """Compute the running mean for each chromosome independently. Stack the resulting arrays ordered by chromosome.
 
     Parameters
@@ -206,16 +326,31 @@ def _running_mean_by_chromosome(expr, var, window_size, step) -> tuple[dict, np.
     """
     chromosomes = _natural_sort([x for x in var["chromosome"].unique() if x.startswith("chr") and x != "chrM"])
 
-    def _running_mean_for_chromosome(chr):
-        genes = var.loc[var["chromosome"] == chr].sort_values("start").index.values
-        tmp_x = expr[:, var.index.get_indexer(genes)]
-        return _running_mean(tmp_x, n=window_size, step=step)
+    running_means = [
+        _running_mean_for_chromosome(chr, expr, var, window_size, step, calculate_gene_values) for chr in chromosomes
+    ]
 
-    running_means = [_running_mean_for_chromosome(chr) for chr in chromosomes]
+    running_means, convolved_dfs = zip(*running_means, strict=False)
 
-    chr_start_pos = dict(zip(chromosomes, np.cumsum([0] + [x.shape[1] for x in running_means]), strict=False))
+    chr_start_pos = {}
+    for chr, i in zip(chromosomes, np.cumsum([0] + [x.shape[1] for x in running_means]), strict=False):
+        chr_start_pos[chr] = i
 
-    return chr_start_pos, np.hstack(running_means)
+    ## Concatenate the gene dfs
+    if calculate_gene_values:
+        convolved_dfs = pd.concat(convolved_dfs, axis=1)
+
+    return chr_start_pos, np.hstack(running_means), convolved_dfs
+
+
+def _running_mean_for_chromosome(chr, expr, var, window_size, step, calculate_gene_values):
+    genes = var.loc[var["chromosome"] == chr].sort_values("start").index.values
+    tmp_x = expr[:, var.index.get_indexer(genes)]
+    x_conv, convolved_gene_values = _running_mean(
+        tmp_x, n=window_size, step=step, gene_list=genes, calculate_gene_values=calculate_gene_values
+    )
+
+    return x_conv, convolved_gene_values
 
 
 def _get_reference(
@@ -264,7 +399,7 @@ def _get_reference(
     return reference
 
 
-def _infercnv_chunk(tmp_x, var, reference, lfc_cap, window_size, step, dynamic_threshold):
+def _infercnv_chunk(tmp_x, var, reference, lfc_cap, window_size, step, dynamic_threshold, calculate_gene_values=False):
     """The actual infercnv work is happening here.
 
     Process chunks of serveral thousand genes independently since this
@@ -291,17 +426,23 @@ def _infercnv_chunk(tmp_x, var, reference, lfc_cap, window_size, step, dynamic_t
     # Step 2 - clip log fold changes
     x_clipped = np.clip(x_centered, -lfc_cap, lfc_cap)
     # Step 3 - smooth by genomic position
-    chr_pos, x_smoothed = _running_mean_by_chromosome(x_clipped, var, window_size=window_size, step=step)
+    chr_pos, x_smoothed, conv_df = _running_mean_by_chromosome(
+        x_clipped, var, window_size=window_size, step=step, calculate_gene_values=calculate_gene_values
+    )
     # Step 4 - center by cell
-    x_cell_centered = x_smoothed - np.median(x_smoothed, axis=1)[:, np.newaxis]
+    x_res = x_smoothed - np.median(x_smoothed, axis=1)[:, np.newaxis]
+    if calculate_gene_values:
+        gene_res = conv_df - np.median(conv_df, axis=1)[:, np.newaxis]
+    else:
+        gene_res = None
 
-    # Noise filtering
-    x_res = x_cell_centered
     # step 5 - standard deviation based noise filtering
     if dynamic_threshold is not None:
         noise_thres = dynamic_threshold * np.std(x_res)
         x_res[np.abs(x_res) < noise_thres] = 0
+        if calculate_gene_values:
+            gene_res[np.abs(gene_res) < noise_thres] = 0
 
     x_res = scipy.sparse.csr_matrix(x_res)
 
-    return chr_pos, x_res
+    return chr_pos, x_res, gene_res
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -51,6 +51,56 @@ def adata_mock(request):
     return adata
 
 
+@pytest.fixture
+def adata_full_mock():
+    np.random.seed(0)  # Set the seed to a fixed value
+    obs = pd.DataFrame().assign(sample=["sample1", "sample2", "sample3", "sample4"])  # Added "sample4"
+    var = pd.DataFrame().assign(
+        gene=["gene1", "gene2", "gene3", "gene4", "gene5", "gene6", "gene7", "gene8", "gene9", "gene10"],
+        start=[100, 200, 300, 400, 500, 0, 100, 200, 300, 400],
+        end=[199, 299, 399, 499, 599, 99, 199, 299, 399, 499],
+        chromosome=["chr1", "chr1", "chr1", "chr1", "chr1", "chr2", "chr2", "chr2", "chr2", "chr2"],
+    )
+    var.index = var["gene"]
+    X = sp.csr_matrix(np.random.randint(low=0, high=50, size=(4, 10)))  # Changed size to (4, 10)
+    adata = sc.AnnData(X=X, obs=obs, var=var)
+
+    return adata
+
+
+@pytest.fixture
+def gene_res_actual():
+    df = pd.DataFrame(
+        {
+            "gene1": [0.75, -1.00, 0.00, 0.00],
+            "gene2": [0.00, 0.00, 0.75, 0.00],
+            "gene3": [0.000000, 0.000000, 0.91666667, 0.000000],
+            "gene4": [0.00, 0.00, 1.25, 0.00],
+            "gene5": [-0.75, 0.00, 1.25, 0.00],
+            "gene6": [0.000000, 0.000000, 0.000000, 0.921875],
+            "gene7": [0.000000, 0.000000, 0.000000, 0.703125],
+            "gene8": [0.0, 0.0, 0.0, 0.0],
+            "gene9": [0.0, 0.0, 0.0, 0.0],
+            "gene10": [0.75, 0.00, 0.00, 0.00],
+        }
+    )
+    df.index = df.index.astype(str)
+    return df
+
+
+@pytest.fixture
+def x_res_actual():
+    arr = np.array(
+        [
+            [1.00, 0.00, 0.00, 0.00, 0.00, 1.00],
+            [-1.00, 0.00, 0.00, 0.00, 0.00, 0.00],
+            [0.00, 1.25, 1.25, 0.00, 0.00, 0.00],
+            [0.00, 0.00, 0.00, 0.875, 0.00, 0.00],
+        ]
+    )
+    return arr
+
+
 @pytest.fixture(params=[np.array, sp.csr_matrix, sp.csc_matrix])
 def adata_ithgex(request):
     return sc.AnnData(