vanheeringen-lab · siebrenf · Jan 11, 2023 · Jan 11, 2023 · Jan 12, 2023 · Jan 12, 2023
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -11,6 +11,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 ### Added
 - `ananse influence` now accepts a whitelist of genes and/or interactions from the network
   - these will be added back into the influence network after selecting top edges.
+- experimental p300 ChIP-seq input (using H3K27ac models) for `ananse binding`
 
 ### Fixed
 - No region overlap (pandas str.replace() no longer uses regex by default)

diff --git a/ananse/commands/binding.py b/ananse/commands/binding.py
@@ -10,6 +10,7 @@ def binding(args):
         args.outdir,
         atac_bams=args.atac_bams,
         histone_bams=args.histone_bams,
+        p300_bams=args.p300_bams,
         cage_tpms=args.cage_tpms,
         columns=args.columns,
         regions=args.regions,

diff --git a/ananse/peakpredictor.py b/ananse/peakpredictor.py
@@ -42,6 +42,7 @@
 class PeakPredictor:
     atac_data = None
     histone_data = None
+    p300_data = None
     cage_data = None
     factor_models = {}
 
@@ -50,6 +51,7 @@ def __init__(
         reference=None,
         atac_bams=None,
         histone_bams=None,
+        p300_bams=None,
         cage_tpms=None,
         columns=None,
         regions=None,
@@ -66,11 +68,14 @@ def __init__(
             raise NotADirectoryError(f"Could not find {reference}")
         self.data_dir = reference
 
-        if atac_bams is None and histone_bams is None and cage_tpms is None:
+        if all(b is None for b in [atac_bams, histone_bams, p300_bams, cage_tpms]):
             raise ValueError(
-                "Need either ATAC-seq and/or H3K27ac BAM file(s), "
-                "ATAC-seq and/or H3K27ac coverage table(s), "
-                "or a CAGE bidirectional sites TPM file."
+                "Need either "
+                "- ATAC-seqor H3K27ac ChIP-seq BAM file(s), "
+                "- ATAC-seq, p300 or H3K27ac ChIP-seq coverage table(s), "
+                "- a CAGE bidirectional sites TPM table, "
+                "- a combination of ATAC and H3K27ac data. "
+                "See the documentation for examples."
             )
 
         if genome is None:
@@ -115,6 +120,12 @@ def __init__(
                 self.load_counts(histone_bams, columns, "H3K27ac")
             else:
                 self.load_histone(histone_bams, update_models=False)
+        # load p300 ChIP-seq data
+        if p300_bams is not None:
+            if _istable(p300_bams):
+                self.load_counts(p300_bams, columns, "p300")
+            else:
+                raise NotImplementedError
 
         self._set_model_type()
 
@@ -593,11 +604,11 @@ def load_counts(self, table, columns=None, attribute="ATAC"):
             (default: all columns)
         attribute : str, optional
             data type contained in the counts table
-            (options: ["ATAC", "H3K27ac"], default: "ATAC")
+            (options: ["ATAC", "H3K27ac", "p300"], default: "ATAC")
         """
         # error checking
-        if attribute not in ["ATAC", "H3K27ac"]:
-            raise ValueError("Attribute must be either ATAC or H3K27ac!")
+        if attribute not in ["ATAC", "H3K27ac", "p300"]:
+            raise ValueError("Attribute must be either ATAC, H3K27ac or p300!")
         logger.info(f"Loading {attribute} data")
         if isinstance(table, list):
             table = table[0]
@@ -655,6 +666,8 @@ def load_counts(self, table, columns=None, attribute="ATAC"):
             self.atac_data = self._normalize_reads(df, attribute)
         elif attribute == "H3K27ac":
             self.histone_data = self._normalize_reads(df, attribute)
+        elif attribute == "p300":
+            self.p300_data = self._normalize_reads(df, attribute)
 
     def load_atac(self, bams, update_models=True):
         """Load ATAC-seq counts from BAM files.
@@ -700,6 +713,9 @@ def _set_model_type(self):
                 cols += ["ATAC.relative"]
         if self.histone_data is not None:
             cols += ["H3K27ac"]
+        if self.p300_data is not None:
+            # no p300 models available
+            cols += ["H3K27ac"]
         if self.cage_data is not None:
             cols += ["CAGE"]
             if self.region_type == "CAGE":
@@ -708,9 +724,24 @@ def _set_model_type(self):
             cols += ["average", "dist"]
         cols = sorted(cols)
         logger.info(f"  Columns being used for model type: {cols}")
-        self._X_columns = cols
         self._model_type = "_".join(cols)
 
+        if self.p300_data is not None:
+            if (
+                self.histone_data is not None
+                or self.atac_data is not None
+                or self.cage_data is not None
+            ):
+                raise NotImplementedError("no idea how to combine these data types")
+
+            # The prediction models expect dataframe columns in a certain order
+            # since we use h3k27ac models for p300 data,
+            # make sure it is in the right place
+            i = cols.index("H3K27ac")
+            cols[i] = "p300"
+
+        self._X_columns = cols
+
         # Load models
         logger.info("Loading models")
         for fname in glob(os.path.join(self.data_dir, self._model_type, "*.pkl")):
@@ -774,6 +805,8 @@ def _load_data(self, factor):
             tmp = tmp.join(self.atac_data)
         if self.histone_data is not None:
             tmp = tmp.join(self.histone_data)
+        if self.p300_data is not None:
+            tmp = tmp.join(self.p300_data)
         if self.cage_data is not None:
             tmp = tmp.join(self.cage_data)
 
@@ -846,7 +879,7 @@ def predict_factor_activity(self, nregions=50_000):
 
         activity = pd.DataFrame()
         state = np.random.RandomState(567)  # Consistently select same regions
-        for df in (self.atac_data, self.histone_data, self.cage_data):
+        for df in (self.atac_data, self.histone_data, self.p300_data, self.cage_data):
             if df is None:
                 continue
 
@@ -975,6 +1008,7 @@ def predict_peaks(
     outdir,
     atac_bams=None,
     histone_bams=None,
+    p300_bams=None,
     cage_tpms=None,
     columns=None,
     regions=None,
@@ -1108,9 +1142,10 @@ def predict_peaks(
     p = PeakPredictor(
         reference=reference,
         atac_bams=atac_bams,
-        columns=columns,
         histone_bams=histone_bams,
+        p300_bams=p300_bams,
         cage_tpms=cage_tpms,
+        columns=columns,
         regions=regions,
         genome=genome,
         pfmfile=pfmfile,
@@ -1128,6 +1163,9 @@ def predict_peaks(
         if p.histone_data is not None:
             hdf.put(key="_h3k27ac", value=p.histone_data, format="table")
 
+        if p.p300_data is not None:
+            hdf.put(key="_p300", value=p.p300_data, format="table")
+
         if p.cage_data is not None:
             hdf.put(key="_cage", value=p.cage_data, format="table")
 

diff --git a/scripts/ananse b/scripts/ananse
@@ -71,6 +71,15 @@ if __name__ == "__main__":
         default=None,
         nargs="+",
     )
+    group.add_argument(
+        "-P",
+        "--p300-counts",
+        dest="p300_bams",
+        metavar="CTS",
+        help="Experimental: p300 ChIP-seq counts table with reads per peak",
+        default=None,
+        # nargs="+",
+    )
     group.add_argument(
         "-C",
         "--cage-tpms",

diff --git a/tests/test_04_peakpredictor.py b/tests/test_04_peakpredictor.py
@@ -209,7 +209,7 @@ def test__jaccard_motif_graph(peakpredictor):
 def test_command_binding(outdir, genome):
     Args = namedtuple(
         "args",
-        "outdir atac_bams histone_bams cage_tpms columns regions reference tfs genome pfmfile pfmscorefile jaccard_cutoff ncore",
+        "outdir atac_bams histone_bams p300_bams cage_tpms columns regions reference tfs genome pfmfile pfmscorefile jaccard_cutoff ncore",
     )
     out_dir = os.path.join(outdir, "binding")
     bed = os.path.join(outdir, "bed3.bed")
@@ -268,6 +268,7 @@ def test_command_binding(outdir, genome):
         outdir=out_dir,
         atac_bams=["tests/data/GRCz11_chr9/chr9.bam"],
         histone_bams=None,
+        p300_bams=None,
         cage_tpms=None,
         columns=None,
         regions=[bed],  # ["tests/data/GRCz11_chr9/GRCz11_chr9_regions.bed"],