Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

accept p300 ChIP-seq #202

Open
wants to merge 13 commits into
base: develop
Choose a base branch
from
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
### Added
- `ananse influence` now accepts a whitelist of genes and/or interactions from the network
- these will be added back into the influence network after selecting top edges.
- experimental p300 ChIP-seq input (using H3K27ac models) for `ananse binding`

### Fixed
- No region overlap (pandas str.replace() no longer uses regex by default)
Expand Down
1 change: 1 addition & 0 deletions ananse/commands/binding.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ def binding(args):
args.outdir,
atac_bams=args.atac_bams,
histone_bams=args.histone_bams,
p300_bams=args.p300_bams,
cage_tpms=args.cage_tpms,
columns=args.columns,
regions=args.regions,
Expand Down
58 changes: 48 additions & 10 deletions ananse/peakpredictor.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
class PeakPredictor:
atac_data = None
histone_data = None
p300_data = None
cage_data = None
factor_models = {}

Expand All @@ -50,6 +51,7 @@ def __init__(
reference=None,
atac_bams=None,
histone_bams=None,
p300_bams=None,
cage_tpms=None,
columns=None,
regions=None,
Expand All @@ -66,11 +68,14 @@ def __init__(
raise NotADirectoryError(f"Could not find {reference}")
self.data_dir = reference

if atac_bams is None and histone_bams is None and cage_tpms is None:
if all(b is None for b in [atac_bams, histone_bams, p300_bams, cage_tpms]):
raise ValueError(
"Need either ATAC-seq and/or H3K27ac BAM file(s), "
"ATAC-seq and/or H3K27ac coverage table(s), "
"or a CAGE bidirectional sites TPM file."
"Need either "
"- ATAC-seqor H3K27ac ChIP-seq BAM file(s), "
"- ATAC-seq, p300 or H3K27ac ChIP-seq coverage table(s), "
"- a CAGE bidirectional sites TPM table, "
"- a combination of ATAC and H3K27ac data. "
"See the documentation for examples."
)

if genome is None:
Expand Down Expand Up @@ -115,6 +120,12 @@ def __init__(
self.load_counts(histone_bams, columns, "H3K27ac")
else:
self.load_histone(histone_bams, update_models=False)
# load p300 ChIP-seq data
if p300_bams is not None:
if _istable(p300_bams):
self.load_counts(p300_bams, columns, "p300")
else:
raise NotImplementedError

self._set_model_type()

Expand Down Expand Up @@ -593,11 +604,11 @@ def load_counts(self, table, columns=None, attribute="ATAC"):
(default: all columns)
attribute : str, optional
data type contained in the counts table
(options: ["ATAC", "H3K27ac"], default: "ATAC")
(options: ["ATAC", "H3K27ac", "p300"], default: "ATAC")
"""
# error checking
if attribute not in ["ATAC", "H3K27ac"]:
raise ValueError("Attribute must be either ATAC or H3K27ac!")
if attribute not in ["ATAC", "H3K27ac", "p300"]:
raise ValueError("Attribute must be either ATAC, H3K27ac or p300!")
logger.info(f"Loading {attribute} data")
if isinstance(table, list):
table = table[0]
Expand Down Expand Up @@ -655,6 +666,8 @@ def load_counts(self, table, columns=None, attribute="ATAC"):
self.atac_data = self._normalize_reads(df, attribute)
elif attribute == "H3K27ac":
self.histone_data = self._normalize_reads(df, attribute)
elif attribute == "p300":
self.p300_data = self._normalize_reads(df, attribute)

def load_atac(self, bams, update_models=True):
"""Load ATAC-seq counts from BAM files.
Expand Down Expand Up @@ -700,6 +713,9 @@ def _set_model_type(self):
cols += ["ATAC.relative"]
if self.histone_data is not None:
cols += ["H3K27ac"]
if self.p300_data is not None:
# no p300 models available
cols += ["H3K27ac"]
if self.cage_data is not None:
cols += ["CAGE"]
if self.region_type == "CAGE":
Expand All @@ -708,9 +724,24 @@ def _set_model_type(self):
cols += ["average", "dist"]
cols = sorted(cols)
logger.info(f" Columns being used for model type: {cols}")
self._X_columns = cols
self._model_type = "_".join(cols)

if self.p300_data is not None:
if (
self.histone_data is not None
or self.atac_data is not None
or self.cage_data is not None
):
raise NotImplementedError("no idea how to combine these data types")

# The prediction models expect dataframe columns in a certain order
# since we use h3k27ac models for p300 data,
# make sure it is in the right place
i = cols.index("H3K27ac")
cols[i] = "p300"

self._X_columns = cols

# Load models
logger.info("Loading models")
for fname in glob(os.path.join(self.data_dir, self._model_type, "*.pkl")):
Expand Down Expand Up @@ -774,6 +805,8 @@ def _load_data(self, factor):
tmp = tmp.join(self.atac_data)
if self.histone_data is not None:
tmp = tmp.join(self.histone_data)
if self.p300_data is not None:
tmp = tmp.join(self.p300_data)
if self.cage_data is not None:
tmp = tmp.join(self.cage_data)

Expand Down Expand Up @@ -846,7 +879,7 @@ def predict_factor_activity(self, nregions=50_000):

activity = pd.DataFrame()
state = np.random.RandomState(567) # Consistently select same regions
for df in (self.atac_data, self.histone_data, self.cage_data):
for df in (self.atac_data, self.histone_data, self.p300_data, self.cage_data):
if df is None:
continue

Expand Down Expand Up @@ -975,6 +1008,7 @@ def predict_peaks(
outdir,
atac_bams=None,
histone_bams=None,
p300_bams=None,
cage_tpms=None,
columns=None,
regions=None,
Expand Down Expand Up @@ -1108,9 +1142,10 @@ def predict_peaks(
p = PeakPredictor(
reference=reference,
atac_bams=atac_bams,
columns=columns,
histone_bams=histone_bams,
p300_bams=p300_bams,
cage_tpms=cage_tpms,
columns=columns,
regions=regions,
genome=genome,
pfmfile=pfmfile,
Expand All @@ -1128,6 +1163,9 @@ def predict_peaks(
if p.histone_data is not None:
hdf.put(key="_h3k27ac", value=p.histone_data, format="table")

if p.p300_data is not None:
hdf.put(key="_p300", value=p.p300_data, format="table")

if p.cage_data is not None:
hdf.put(key="_cage", value=p.cage_data, format="table")

Expand Down
9 changes: 9 additions & 0 deletions scripts/ananse
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,15 @@ if __name__ == "__main__":
default=None,
nargs="+",
)
group.add_argument(
"-P",
"--p300-counts",
dest="p300_bams",
metavar="CTS",
help="Experimental: p300 ChIP-seq counts table with reads per peak",
default=None,
# nargs="+",
)
group.add_argument(
"-C",
"--cage-tpms",
Expand Down
3 changes: 2 additions & 1 deletion tests/test_04_peakpredictor.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,7 +209,7 @@ def test__jaccard_motif_graph(peakpredictor):
def test_command_binding(outdir, genome):
Args = namedtuple(
"args",
"outdir atac_bams histone_bams cage_tpms columns regions reference tfs genome pfmfile pfmscorefile jaccard_cutoff ncore",
"outdir atac_bams histone_bams p300_bams cage_tpms columns regions reference tfs genome pfmfile pfmscorefile jaccard_cutoff ncore",
)
out_dir = os.path.join(outdir, "binding")
bed = os.path.join(outdir, "bed3.bed")
Expand Down Expand Up @@ -268,6 +268,7 @@ def test_command_binding(outdir, genome):
outdir=out_dir,
atac_bams=["tests/data/GRCz11_chr9/chr9.bam"],
histone_bams=None,
p300_bams=None,
cage_tpms=None,
columns=None,
regions=[bed], # ["tests/data/GRCz11_chr9/GRCz11_chr9_regions.bed"],
Expand Down
Loading