forked from broadinstitute/ABC-Enhancer-Gene-Prediction
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpeaks.py
executable file
·91 lines (72 loc) · 4.35 KB
/
peaks.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import pandas as pd
import numpy as np
import os
import os.path
from tools import *
from neighborhoods import *
def make_candidate_regions_from_summits(macs_peaks, accessibility_file, genome_sizes, regions_includelist, regions_blocklist, n_enhancers, peak_extend, outdir):
## Generate enhancer regions from MACS summits
# 1. Count reads in DHS peaks
# 2. Take top N regions, get summits, extend summits, merge
outfile = os.path.join(outdir, os.path.basename(macs_peaks) + ".candidateRegions.bed")
raw_counts_outfile = os.path.join(outdir, os.path.basename(macs_peaks) + "." + os.path.basename(accessibility_file) + ".Counts.bed")
if regions_includelist:
includelist_command = "(bedtools intersect -a {regions_includelist} -b {genome_sizes}.bed -wa | cut -f 1-3 && cat) |"
else:
includelist_command = ""
if regions_blocklist:
blocklist_command = "bedtools intersect -v -wa -a stdin -b {regions_blocklist} | "
else:
blocklist_command = ""
#1. Count DHS/ATAC reads in candidate regions
run_count_reads(accessibility_file, raw_counts_outfile, macs_peaks, genome_sizes, use_fast_count=True)
#2. Take top N regions, get summits, extend summits, merge, remove blocklist, add includelist, sort and merge
#use -sorted in intersect command? Not worth it, both files are small
command = "bedtools sort -i {raw_counts_outfile} -faidx {genome_sizes} | bedtools merge -i stdin -c 4 -o max | sort -nr -k 4 | head -n {n_enhancers} |" + \
"bedtools intersect -b stdin -a {macs_peaks} -wa |" + \
"awk '{{print $1 \"\\t\" $2 + $10 \"\\t\" $2 + $10}}' |" + \
"bedtools slop -i stdin -b {peak_extend} -g {genome_sizes} |" + \
"bedtools sort -i stdin -faidx {genome_sizes} |" + \
"bedtools merge -i stdin | " + \
blocklist_command + \
"cut -f 1-3 | " + includelist_command + \
"bedtools sort -i stdin -faidx {genome_sizes} | bedtools merge -i stdin > {outfile}"
command = command.format(**locals())
p = Popen(command, stdout=PIPE, stderr=PIPE, shell=True)
print("Running: " + command)
(stdoutdata, stderrdata) = p.communicate()
err = str(stderrdata, 'utf-8')
return stdoutdata
def make_candidate_regions_from_peaks(macs_peaks, accessibility_file, genome_sizes, regions_includelist, regions_blocklist, n_enhancers, peak_extend, minPeakWidth, outdir):
## Generate enhancer regions from MACS narrowPeak - do not use summits
outfile = os.path.join(outdir, os.path.basename(macs_peaks) + ".candidateRegions.bed")
raw_counts_outfile = os.path.join(outdir, os.path.basename(macs_peaks) + os.path.basename(accessibility_file) + ".Counts.bed")
if regions_includelist:
includelist_command = "(bedtools intersect -a {regions_includelist} -b {genome_sizes}.bed -wa | cut -f 1-3 && cat) |"
else:
includelist_command = ""
if regions_blocklist:
blocklist_command = "bedtools intersect -v -wa -a stdin -b {regions_blocklist} | "
else:
blocklist_command = ""
#1. Count DHS/ATAC reads in candidate regions
run_count_reads(accessibility_file, raw_counts_outfile, macs_peaks, genome_sizes, use_fast_count=True)
#2. Take top N regions, extend peaks (min size 500), merge, remove blocklist, add includelist, sort and merge
#use -sorted in intersect command? Not worth it, both files are small
command = "bedtools sort -i {raw_counts_outfile} -faidx {genome_sizes} | bedtools merge -i stdin -c 4 -o max | sort -nr -k 4 | head -n {n_enhancers} |" + \
"bedtools intersect -b stdin -a {macs_peaks} -wa |" + \
"bedtools slop -i stdin -b {peak_extend} -g {genome_sizes} |" + \
"awk '{{ l=$3-$2; if (l < {minPeakWidth}) {{ $2 = $2 - int(({minPeakWidth}-l)/2); $3 = $3 + int(({minPeakWidth}-l)/2) }} print $1 \"\\t\" $2 \"\\t\" $3}}' |" + \
"bedtools sort -i stdin -faidx {genome_sizes} |" + \
"bedtools merge -i stdin | " + \
blocklist_command + \
"cut -f 1-3 | " + includelist_command + \
"bedtools sort -i stdin -faidx {genome_sizes} | bedtools merge -i stdin > {outfile}"
command = command.format(**locals())
p = Popen(command, stdout=PIPE, stderr=PIPE, shell=True)
print("Running: " + command)
(stdoutdata, stderrdata) = p.communicate()
err = str(stderrdata, 'utf-8')
if not err == '':
raise RuntimeError("Command failed.")
return stdoutdata