icbi-lab · grst · Jun 26, 2024 · Sep 25, 2022 · Oct 15, 2022 · Oct 17, 2022
diff --git a/.gitignore b/.gitignore
@@ -27,5 +27,9 @@ __pycache__/
 /.idea/
 /.vscode/
 
+infercnv_env/
 # Notebooks
 .ipynb_checkpoints
+
+# Node.js
+node_modules/
diff --git a/pyproject.toml b/pyproject.toml
@@ -16,6 +16,9 @@ authors = [
 maintainers = [
     {name = "Gregor Sturm", email="gregor.sturm@i-med.ac.at"}
 ]
+contributors = [
+    {name = "Grant Neilson", email = "grant.neilson5@gmail.com"}
+]
 urls.Documentation = "https://infercnvpy.readthedocs.io/"
 urls.Source = "https://github.com/icbi-lab/infercnvpy"
 urls.Home-page = "https://github.com/icbi-lab/infercnvpy"

diff --git a/src/infercnvpy/io/_genepos.py b/src/infercnvpy/io/_genepos.py
@@ -39,8 +39,8 @@ def genomic_position_from_gtf(
         If True, add the annotations directly to adata, otherwise return a dataframe.
     """
     gtf = gtfparse.read_gtf(
-        gtf_file, usecols=["seqname", "feature", "start", "end", "gene_id", "gene_name"]
-    ).to_pandas()
+        gtf_file, usecols=["seqname", "feature", "start", "end", "gene_id", "gene_name"], result_type="pandas"
+    )
     gtf = (
         gtf.loc[
             gtf["feature"] == "gene",

diff --git a/src/infercnvpy/tl/__init__.py b/src/infercnvpy/tl/__init__.py
@@ -41,24 +41,24 @@ def pca(
 ) -> Union[np.ndarray, None]:
     """Compute the PCA on the result of :func:`infercnvpy.tl.infercnv`.
 
-    Thin wrapper around :func:`scanpy.tl.pca`.
+    Thin wrapper around :func:`sc.tl.pca`.
 
     Parameters
     ----------
     adata
         annotated data matrix
     svd_solver
-        See :func:`scanpy.tl.pca`.
+        See :func:`sc.tl.pca`.
     zero_center
-        See :func:`scanpy.tl.pca`.
+        See :func:`sc.tl.pca`.
     inplace
         If True, store the result in adata.obsm. Otherwise return the PCA matrix.
     use_rep
         Key under which the result of infercnv is stored in adata
     key_added
         Key under which the result will be stored in adata.obsm if `inplace=True`.
     **kwargs
-        Additional arguments passed to :func:`scanpy.tl.pca`.
+        Additional arguments passed to :func:`sc.tl.pca`.
     """
     if f"X_{use_rep}" not in adata.obsm:
         raise KeyError(f"X_{use_rep} is not in adata.obsm. Did you run `tl.infercnv`?")

diff --git a/src/infercnvpy/tl/_infercnv.py b/src/infercnvpy/tl/_infercnv.py
@@ -5,6 +5,7 @@
 from typing import Union
 
 import numpy as np
+import pandas as pd
 import scipy.ndimage
 import scipy.sparse
 from anndata import AnnData
@@ -109,7 +110,7 @@ def infercnv(
 
     var = tmp_adata.var.loc[:, ["chromosome", "start", "end"]]  # type: ignore
 
-    chr_pos, chunks = zip(
+    chr_pos, chunks, convolved_dfs = zip(
         *process_map(
             _infercnv_chunk,
             [expr[i : i + chunksize, :] for i in range(0, adata.shape[0], chunksize)],
@@ -123,14 +124,20 @@ def infercnv(
             max_workers=cpu_count() if n_jobs is None else n_jobs,
         )
     )
+
     res = scipy.sparse.vstack(chunks)
+    chr_pos = chr_pos[0]
+    per_gene_df = pd.concat(convolved_dfs, axis=0)
+    # Ensure the DataFrame has the correct index
+    per_gene_df.index = adata.obs.index
 
     if inplace:
         adata.obsm[f"X_{key_added}"] = res
-        adata.uns[key_added] = {"chr_pos": chr_pos[0]}
+        adata.obsm[f"gene_values_{key_added}"] = per_gene_df
+        adata.uns[key_added] = {"chr_pos": chr_pos}
 
     else:
-        return chr_pos[0], res
+        return chr_pos, res, per_gene_df
 
 
 def _natural_sort(l: Sequence):
@@ -148,8 +155,14 @@ def alphanum_key(key):
     return sorted(l, key=alphanum_key)
 
 
-def _running_mean(x: Union[np.ndarray, scipy.sparse.spmatrix], n: int = 50, step: int = 10) -> np.ndarray:
-    """Compute a pyramidially weighted running mean.
+def _running_mean(
+    x: Union[np.ndarray, scipy.sparse.spmatrix],
+    n: int = 50,
+    step: int = 10,
+    gene_list: list = None,
+) -> np.ndarray:
+    """
+    Compute a pyramidially weighted running mean.
 
     Densifies the matrix. Use `step` and `chunksize` to save memory.
 
@@ -161,26 +174,83 @@ def _running_mean(x: Union[np.ndarray, scipy.sparse.spmatrix], n: int = 50, step
         Length of the running window
     step
         only compute running windows ever `step` columns, e.g. if step is 10
-        0:100, 10:110, 20:120 etc. Saves memory.
+        0:99, 10:109, 20:119 etc. Saves memory.
     """
     if n < x.shape[1]:  # regular convolution: the filter is smaller than the #genes
         r = np.arange(1, n + 1)
-
         pyramid = np.minimum(r, r[::-1])
-        smoothed_x = np.apply_along_axis(lambda row: np.convolve(row, pyramid, mode="same"), axis=1, arr=x) / np.sum(
-            pyramid
-        )
-        return smoothed_x[:, np.arange(0, smoothed_x.shape[1], step)]
+        smoothed_x = np.apply_along_axis(
+            lambda row: np.convolve(row, pyramid, mode="valid"),
+            axis=1,
+            arr=x,
+        ) / np.sum(pyramid)
 
-    else:  # there's less genes than the filtersize. just apply a single conv with a smaller filter (no sliding)
-        r = np.arange(1, x.shape[1] + 1)
-        pyramid = np.minimum(r, r[::-1])
-        smoothed_x = x @ pyramid.reshape(-1, 1)
-        smoothed_x = smoothed_x / pyramid.sum()
-        return smoothed_x
+        ## get the indices of the genes used in the convolution
+        convolution_indices = get_convolution_indices(x, n)[np.arange(0, smoothed_x.shape[1], step)]
+        ## Pull out the genes used in the convolution
+        convolved_gene_names = gene_list[convolution_indices]
+        smoothed_x = smoothed_x[:, np.arange(0, smoothed_x.shape[1], step)]
 
+        convolved_gene_values = _calculate_gene_averages(convolved_gene_names, smoothed_x)
 
-def _running_mean_by_chromosome(expr, var, window_size, step) -> tuple[dict, np.ndarray]:
+        return smoothed_x, convolved_gene_values
+
+    else:  # If there is less genes than the window size, set the window size to the number of genes and perform a single convolution
+        n = x.shape[1]  # set the filter size to the number of genes
+        r = np.arange(1, n + 1)
+        ## As we are only doing one convolution the values should be equal
+        pyramid = np.array([1] * n)
+        # Old code to make the pyramid
+        # pyramid = np.minimum(r, r[::-1])
+        smoothed_x = np.apply_along_axis(
+            lambda row: np.convolve(row, pyramid, mode="valid"),
+            axis=1,
+            arr=x,
+        ) / np.sum(pyramid)
+
+        ## As all genes are used the convolution the values are identical for all genes
+        convolved_gene_values = pd.DataFrame(np.repeat(smoothed_x, len(gene_list), axis=1), columns=gene_list)
+
+        return smoothed_x, convolved_gene_values
+
+
+def _calculate_gene_averages(convolved_gene_names, smoothed_x):
+    ## create a dictionary to store the gene values per sample
+    gene_to_values = {}
+    # Calculate the number of genes in each convolution, will be same as the window size default=100
+    length = len(convolved_gene_names[0])
+    # Convert the flattened convolved gene names to a list
+    flatten_list = list(convolved_gene_names.flatten())
+
+    # For each sample in smoothed_x find the value for each gene and store it in a dictionary
+    for sample, row in enumerate(smoothed_x):
+        # Create sample level in the dictionary
+        if sample not in gene_to_values:
+            gene_to_values[sample] = {}
+        # For each gene in the flattened gene list find the value and store it in the dictionary
+        for i, gene in enumerate(flatten_list):
+            if gene not in gene_to_values[sample]:
+                gene_to_values[sample][gene] = []
+            # As the gene list has been flattend we can use the floor division of the index
+            # to get the correct position of the gene to get the value and store it in the dictionary
+            gene_to_values[sample][gene].append(row[i // length])
+
+    for sample in gene_to_values:
+        for gene in gene_to_values[sample]:
+            gene_to_values[sample][gene] = np.mean(gene_to_values[sample][gene])
+
+    convolved_gene_values = pd.DataFrame(gene_to_values).T
+    return convolved_gene_values
+
+
+def get_convolution_indices(x, n):
+    indices = []
+    for i in range(x.shape[1] - n + 1):
+        indices.append(np.arange(i, i + n))
+    return np.array(indices)
+
+
+def _running_mean_by_chromosome(expr, var, window_size, step) -> tuple[dict, np.ndarray, pd.DataFrame]:
     """Compute the running mean for each chromosome independently. Stack the resulting arrays ordered by chromosome.
 
     Parameters
@@ -206,16 +276,24 @@ def _running_mean_by_chromosome(expr, var, window_size, step) -> tuple[dict, np.
     """
     chromosomes = _natural_sort([x for x in var["chromosome"].unique() if x.startswith("chr") and x != "chrM"])
 
-    def _running_mean_for_chromosome(chr):
-        genes = var.loc[var["chromosome"] == chr].sort_values("start").index.values
-        tmp_x = expr[:, var.index.get_indexer(genes)]
-        return _running_mean(tmp_x, n=window_size, step=step)
+    running_means = [_running_mean_for_chromosome(chr, expr, var, window_size, step) for chr in chromosomes]
+
+    running_means, convolved_dfs = zip(*running_means)
+
+    chr_start_pos = {chr: i for chr, i in zip(chromosomes, np.cumsum([0] + [x.shape[1] for x in running_means]))}  # noqa: C416
+
+    ## Concatenate the gene dfs
+    convolved_dfs = pd.concat(convolved_dfs, axis=1)
+
+    return chr_start_pos, np.hstack(running_means), convolved_dfs
 
-    running_means = [_running_mean_for_chromosome(chr) for chr in chromosomes]
 
-    chr_start_pos = dict(zip(chromosomes, np.cumsum([0] + [x.shape[1] for x in running_means])))
+def _running_mean_for_chromosome(chr, expr, var, window_size, step):
+    genes = var.loc[var["chromosome"] == chr].sort_values("start").index.values
+    tmp_x = expr[:, var.index.get_indexer(genes)]
+    x_conv, convolved_gene_values = _running_mean(tmp_x, n=window_size, step=step, gene_list=genes)
 
-    return chr_start_pos, np.hstack(running_means)
+    return x_conv, convolved_gene_values
 
 
 def _get_reference(
@@ -291,17 +369,17 @@ def _infercnv_chunk(tmp_x, var, reference, lfc_cap, window_size, step, dynamic_t
     # Step 2 - clip log fold changes
     x_clipped = np.clip(x_centered, -lfc_cap, lfc_cap)
     # Step 3 - smooth by genomic position
-    chr_pos, x_smoothed = _running_mean_by_chromosome(x_clipped, var, window_size=window_size, step=step)
+    chr_pos, x_smoothed, conv_df = _running_mean_by_chromosome(x_clipped, var, window_size=window_size, step=step)
     # Step 4 - center by cell
-    x_cell_centered = x_smoothed - np.median(x_smoothed, axis=1)[:, np.newaxis]
+    x_res = x_smoothed - np.median(x_smoothed, axis=1)[:, np.newaxis]
+    gene_res = conv_df - np.median(conv_df, axis=1)[:, np.newaxis]
 
-    # Noise filtering
-    x_res = x_cell_centered
     # step 5 - standard deviation based noise filtering
     if dynamic_threshold is not None:
         noise_thres = dynamic_threshold * np.std(x_res)
         x_res[np.abs(x_res) < noise_thres] = 0
+        gene_res[np.abs(gene_res) < noise_thres] = 0
 
     x_res = scipy.sparse.csr_matrix(x_res)
 
-    return chr_pos, x_res
+    return chr_pos, x_res, gene_res
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -51,6 +51,56 @@ def adata_mock(request):
     return adata
 
 
+@pytest.fixture
+def adata_full_mock():
+    np.random.seed(0)  # Set the seed to a fixed value
+    obs = pd.DataFrame().assign(sample=["sample1", "sample2", "sample3", "sample4"])  # Added "sample4"
+    var = pd.DataFrame().assign(
+        gene=["gene1", "gene2", "gene3", "gene4", "gene5", "gene6", "gene7", "gene8", "gene9", "gene10"],
+        start=[100, 200, 300, 400, 500, 0, 100, 200, 300, 400],
+        end=[199, 299, 399, 499, 599, 99, 199, 299, 399, 499],
+        chromosome=["chr1", "chr1", "chr1", "chr1", "chr1", "chr2", "chr2", "chr2", "chr2", "chr2"],
+    )
+    var.index = var["gene"]
+    X = sp.csr_matrix(np.random.randint(low=0, high=50, size=(4, 10)))  # Changed size to (4, 10)
+    adata = sc.AnnData(X=X, obs=obs, var=var)
+
+    return adata
+
+
+@pytest.fixture
+def gene_res_actual():
+    df = pd.DataFrame(
+        {
+            "gene1": [0.75, -1.00, 0.00, 0.00],
+            "gene2": [0.00, 0.00, 0.75, 0.00],
+            "gene3": [0.000000, 0.000000, 0.916667, 0.000000],
+            "gene4": [0.00, 0.00, 1.25, 0.00],
+            "gene5": [-0.75, 0.00, 1.25, 0.00],
+            "gene6": [0.000000, 0.000000, 0.000000, 0.921875],
+            "gene7": [0.000000, 0.000000, 0.000000, 0.703125],
+            "gene8": [0.0, 0.0, 0.0, 0.0],
+            "gene9": [0.0, 0.0, 0.0, 0.0],
+            "gene10": [0.75, 0.00, 0.00, 0.00],
+        }
+    )
+    df.index = df.index.astype(str)
+    return df
+
+
+@pytest.fixture
+def x_res_actual():
+    arr = np.array(
+        [
+            [1.00, 0.00, 0.00, 0.00, 0.00, 1.00],
+            [-1.00, 0.00, 0.00, 0.00, 0.00, 0.00],
+            [0.00, 1.25, 1.25, 0.00, 0.00, 0.00],
+            [0.00, 0.00, 0.00, 0.875, 0.00, 0.00],
+        ]
+    )
+    return arr
+
+
 @pytest.fixture(params=[np.array, sp.csr_matrix, sp.csc_matrix])
 def adata_ithgex(request):
     return sc.AnnData(