From fae25300900f4b287eb8f94de3b86b098e83d8e4 Mon Sep 17 00:00:00 2001
From: bfurtwa <bfurtwaengler@web.de>
Date: Thu, 21 Oct 2021 15:14:06 +0200
Subject: [PATCH] enable custom background in enrichment_test

---
 sceptre/sceptre.py | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/sceptre/sceptre.py b/sceptre/sceptre.py
index a79c6ef..267f5c6 100644
--- a/sceptre/sceptre.py
+++ b/sceptre/sceptre.py
@@ -1627,13 +1627,14 @@ def enrichment_test(
     adata: AnnData,
     gene_set: Sequence[str],
     categories: Sequence[str],
+    background: Union[pd.DataFrame, None] = None,
     sep: str = ";",
     key: str = "enrichment_test",
     pval_thesh: float = 0.05,
     use_raw: bool = True,
 ):
     """Perform a term enrichment test on the selected genes in selected term category.
-    All genes in the matrix are used as background.
+    If no background is provided, all genes in the matrix are used as background.
 
     Parameters
     ----------
@@ -1641,6 +1642,8 @@ def enrichment_test(
         The annotated data matrix.
     gene_set
         Genes to be tested against the background.
+    background
+        Optional DataFrame of background genes (rows) with one string column of terms per category.
     categories
         The names of the variables in `adata.var` that store the term annotations.
         E.g. `'Biological Process'`.
@@ -1674,6 +1677,8 @@ def enrichment_test(
             .astype(str)[~(ad.var[cat] == "nan")]
             .apply(lambda x: x.split(sep))
         )
+        # drop literal 'nan' strings from each gene's split term list
+        gene_terms = gene_terms.apply(lambda x: [t for t in x if t != 'nan'])
 
         all_gene_set_terms = pd.DataFrame(
             index=set([l for subl in gene_terms.loc[gene_set].values for l in subl])
@@ -1682,11 +1687,17 @@ def enrichment_test(
             x = (
                 gene_terms[gene_set].apply(lambda x: term in x).sum()
             )  # number of test proteins with the term
-            M = len(gene_terms)  # number of background proteins
+            if background is not None:
+                M = len(background) # number of background proteins
+            else:
+                M = len(gene_terms)  # number of background proteins
             N = len(gene_set)  # number of test proteins
-            n = gene_terms.apply(
-                lambda x: term in x
-            ).sum()  # number of background proteins with the term
+            if background is not None:
+                n = background[cat].str.contains(term, regex=False).sum() # number of background proteins with the term
+            else:
+                n = gene_terms.apply(
+                    lambda x: term in x
+                ).sum()  # number of background proteins with the term
             p = hypergeom.sf(x - 1, M, n, N)
             expected = n / M * N
             enrichment = x / expected