diff --git a/CHANGELOG.md b/CHANGELOG.md index 43451ea..5c45fe3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Added - `ananse influence` now accepts a whitelist of genes and/or interactions from the network - these will be added back into the influence network after selecting top edges. +- experimental p300 ChIP-seq input (using H3K27ac models) for `ananse binding` ### Fixed - No region overlap (pandas str.replace() no longer uses regex by default) diff --git a/ananse/commands/binding.py b/ananse/commands/binding.py index 0fecb85..8e14442 100644 --- a/ananse/commands/binding.py +++ b/ananse/commands/binding.py @@ -10,6 +10,7 @@ def binding(args): args.outdir, atac_bams=args.atac_bams, histone_bams=args.histone_bams, + p300_bams=args.p300_bams, cage_tpms=args.cage_tpms, columns=args.columns, regions=args.regions, diff --git a/ananse/peakpredictor.py b/ananse/peakpredictor.py index 1e9649c..73dee0d 100644 --- a/ananse/peakpredictor.py +++ b/ananse/peakpredictor.py @@ -42,6 +42,7 @@ class PeakPredictor: atac_data = None histone_data = None + p300_data = None cage_data = None factor_models = {} @@ -50,6 +51,7 @@ def __init__( reference=None, atac_bams=None, histone_bams=None, + p300_bams=None, cage_tpms=None, columns=None, regions=None, @@ -66,11 +68,14 @@ def __init__( raise NotADirectoryError(f"Could not find {reference}") self.data_dir = reference - if atac_bams is None and histone_bams is None and cage_tpms is None: + if all(b is None for b in [atac_bams, histone_bams, p300_bams, cage_tpms]): raise ValueError( - "Need either ATAC-seq and/or H3K27ac BAM file(s), " - "ATAC-seq and/or H3K27ac coverage table(s), " - "or a CAGE bidirectional sites TPM file." + "Need either " + "- ATAC-seqor H3K27ac ChIP-seq BAM file(s), " + "- ATAC-seq, p300 or H3K27ac ChIP-seq coverage table(s), " + "- a CAGE bidirectional sites TPM table, " + "- a combination of ATAC and H3K27ac data. " + "See the documentation for examples." ) if genome is None: @@ -115,6 +120,12 @@ def __init__( self.load_counts(histone_bams, columns, "H3K27ac") else: self.load_histone(histone_bams, update_models=False) + # load p300 ChIP-seq data + if p300_bams is not None: + if _istable(p300_bams): + self.load_counts(p300_bams, columns, "p300") + else: + raise NotImplementedError self._set_model_type() @@ -593,11 +604,11 @@ def load_counts(self, table, columns=None, attribute="ATAC"): (default: all columns) attribute : str, optional data type contained in the counts table - (options: ["ATAC", "H3K27ac"], default: "ATAC") + (options: ["ATAC", "H3K27ac", "p300"], default: "ATAC") """ # error checking - if attribute not in ["ATAC", "H3K27ac"]: - raise ValueError("Attribute must be either ATAC or H3K27ac!") + if attribute not in ["ATAC", "H3K27ac", "p300"]: + raise ValueError("Attribute must be either ATAC, H3K27ac or p300!") logger.info(f"Loading {attribute} data") if isinstance(table, list): table = table[0] @@ -655,6 +666,8 @@ def load_counts(self, table, columns=None, attribute="ATAC"): self.atac_data = self._normalize_reads(df, attribute) elif attribute == "H3K27ac": self.histone_data = self._normalize_reads(df, attribute) + elif attribute == "p300": + self.p300_data = self._normalize_reads(df, attribute) def load_atac(self, bams, update_models=True): """Load ATAC-seq counts from BAM files. @@ -700,6 +713,9 @@ def _set_model_type(self): cols += ["ATAC.relative"] if self.histone_data is not None: cols += ["H3K27ac"] + if self.p300_data is not None: + # no p300 models available + cols += ["H3K27ac"] if self.cage_data is not None: cols += ["CAGE"] if self.region_type == "CAGE": @@ -708,9 +724,24 @@ def _set_model_type(self): cols += ["average", "dist"] cols = sorted(cols) logger.info(f" Columns being used for model type: {cols}") - self._X_columns = cols self._model_type = "_".join(cols) + if self.p300_data is not None: + if ( + self.histone_data is not None + or self.atac_data is not None + or self.cage_data is not None + ): + raise NotImplementedError("no idea how to combine these data types") + + # The prediction models expect dataframe columns in a certain order + # since we use h3k27ac models for p300 data, + # make sure it is in the right place + i = cols.index("H3K27ac") + cols[i] = "p300" + + self._X_columns = cols + # Load models logger.info("Loading models") for fname in glob(os.path.join(self.data_dir, self._model_type, "*.pkl")): @@ -774,6 +805,8 @@ def _load_data(self, factor): tmp = tmp.join(self.atac_data) if self.histone_data is not None: tmp = tmp.join(self.histone_data) + if self.p300_data is not None: + tmp = tmp.join(self.p300_data) if self.cage_data is not None: tmp = tmp.join(self.cage_data) @@ -846,7 +879,7 @@ def predict_factor_activity(self, nregions=50_000): activity = pd.DataFrame() state = np.random.RandomState(567) # Consistently select same regions - for df in (self.atac_data, self.histone_data, self.cage_data): + for df in (self.atac_data, self.histone_data, self.p300_data, self.cage_data): if df is None: continue @@ -975,6 +1008,7 @@ def predict_peaks( outdir, atac_bams=None, histone_bams=None, + p300_bams=None, cage_tpms=None, columns=None, regions=None, @@ -1108,9 +1142,10 @@ def predict_peaks( p = PeakPredictor( reference=reference, atac_bams=atac_bams, - columns=columns, histone_bams=histone_bams, + p300_bams=p300_bams, cage_tpms=cage_tpms, + columns=columns, regions=regions, genome=genome, pfmfile=pfmfile, @@ -1128,6 +1163,9 @@ def predict_peaks( if p.histone_data is not None: hdf.put(key="_h3k27ac", value=p.histone_data, format="table") + if p.p300_data is not None: + hdf.put(key="_p300", value=p.p300_data, format="table") + if p.cage_data is not None: hdf.put(key="_cage", value=p.cage_data, format="table") diff --git a/scripts/ananse b/scripts/ananse index 0f00cd5..f02f71c 100755 --- a/scripts/ananse +++ b/scripts/ananse @@ -71,6 +71,15 @@ if __name__ == "__main__": default=None, nargs="+", ) + group.add_argument( + "-P", + "--p300-counts", + dest="p300_bams", + metavar="CTS", + help="Experimental: p300 ChIP-seq counts table with reads per peak", + default=None, + # nargs="+", + ) group.add_argument( "-C", "--cage-tpms", diff --git a/tests/test_04_peakpredictor.py b/tests/test_04_peakpredictor.py index 3088996..f1fc588 100644 --- a/tests/test_04_peakpredictor.py +++ b/tests/test_04_peakpredictor.py @@ -209,7 +209,7 @@ def test__jaccard_motif_graph(peakpredictor): def test_command_binding(outdir, genome): Args = namedtuple( "args", - "outdir atac_bams histone_bams cage_tpms columns regions reference tfs genome pfmfile pfmscorefile jaccard_cutoff ncore", + "outdir atac_bams histone_bams p300_bams cage_tpms columns regions reference tfs genome pfmfile pfmscorefile jaccard_cutoff ncore", ) out_dir = os.path.join(outdir, "binding") bed = os.path.join(outdir, "bed3.bed") @@ -268,6 +268,7 @@ def test_command_binding(outdir, genome): outdir=out_dir, atac_bams=["tests/data/GRCz11_chr9/chr9.bam"], histone_bams=None, + p300_bams=None, cage_tpms=None, columns=None, regions=[bed], # ["tests/data/GRCz11_chr9/GRCz11_chr9_regions.bed"],