Skip to content

Commit

Permalink
Merge pull request #4 from Xinglab/dev-2
Browse files Browse the repository at this point in the history
Added peak annotator and genome data downloader
  • Loading branch information
zj-zhang authored Jul 22, 2019
2 parents e7a82ed + 8a8e50b commit 66b2996
Show file tree
Hide file tree
Showing 3 changed files with 261 additions and 1 deletion.
71 changes: 71 additions & 0 deletions CLAM/download_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
import os
import sys
import subprocess
import peak_annotator


def parser(args):
"""DOCSTRING
Args
Returns
"""
try:
genome = args.genome
download_genome(genome)
except KeyboardInterrupt():
sys.exit(0)


def download_genome(genome):
curr_dir = os.path.abspath('.')

admin = (os.getuid() == 0)
cmd = []
home = os.environ['HOME']
if admin:
profile = '/etc/profile'
else:
profile = '{home}/.bashrc'.format(home=home)

if not os.path.isdir('{home}/.clam_data'.format(home=home)):
os.mkdir('{home}/.clam_data'.format(home=home))
os.chdir('{home}/.clam_data'.format(home=home))

if 'CLAM_DAT' not in os.environ or not os.environ['CLAM_DAT'] == '{home}/.clam_data'.format(home=home):
cmd.append('echo "export CLAM_DAT=\'{clam_data}\'" >> {profile}'.format(
clam_data=os.path.abspath('.'), profile=profile))
cmd.append('source {profile}'.format(profile=profile))
os.environ['CLAM_DAT'] = os.path.abspath('.')

if not check_genome_data(genome):
cmd.append('chmod -R 755 {home}/.clam_data'.format(home=home))
cmd.append(
'wget https://raw.githubusercontent.com/wkdeng/clam_data/master/{genome}.zip'.format(genome=genome))
cmd.append('unzip -o {genome}.zip'.format(genome=genome))
cmd.append('rm {genome}.zip'.format(genome=genome))
for item in cmd:
subprocess.call(item, shell=True, executable='/bin/bash')
print 'Download finished'
os.chdir(curr_dir)

def check_genome_data(genome):
if not os.path.isdir(os.environ['CLAM_DAT'] + '/' + genome):
return False
if not os.path.exists(os.environ['CLAM_DAT'] + '/' + genome + '/3UTRs.bed'):
return False
if not os.path.exists(os.environ['CLAM_DAT'] + '/' + genome + '/5UTRs.bed'):
return False
if not os.path.exists(os.environ['CLAM_DAT'] + '/' + genome + '/cds.bed'):
return False
if not os.path.exists(os.environ['CLAM_DAT'] + '/' + genome + '/exons.bed'):
return False
if not os.path.exists(os.environ['CLAM_DAT'] + '/' + genome + '/introns.bed'):
return False
if not os.path.exists(os.environ['CLAM_DAT'] + '/' + genome + '/proximal200_intron.bed'):
return False
if not os.path.exists(os.environ['CLAM_DAT'] + '/' + genome + '/proximal500_intron.bed'):
return False
return True

if __name__ == '__main__':
download_genome('hg38')
145 changes: 145 additions & 0 deletions CLAM/peak_annotator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
import sys
import os
import pybedtools
import argparse as ap
import logging
import download_data

'''
Assign peaks to genomic regions
Zijun Zhang
8.1.2018
10.25.2018: wrapped to a function with document
DWK
modified to output annotation file
6.12.2019
'''

# pylint: disable-msg=too-many-function-args
# pylint: disable-msg=unexpected-keyword-arg


def parser(args):
"""DOCSTRING
Args
Returns
"""
try:
peak_in = args.peak_in
genome = args.genome
out_file = args.out_file
if 'CLAM_DAT' not in os.environ or not download_data.check_genome_data(genome):
print "Unable to locate CLAM data folder for genomic regions, will try to download."
print "Downloading..."
download_data.download_genome(genome)
genome_data = os.environ['CLAM_DAT']
intersect_gtf_regions(
peak_in, out_file, os.path.join(genome_data, genome))
except KeyboardInterrupt():
sys.exit(0)


def intersect_gtf_regions(peak_fp, outfn, gtf_dir):
'''function: intersect_gtf_regions(peak_fp, outfn, gtf_dir)
Intersect a peak BED file with a list of genomic region annotations (e.g. start/stop codon, UTR, intron),
output the peak-region annotations.
:param peak_fp: filepath to a BED-format peakquit
:param outfn: filepath to output count file, has to end with ".txt"; annotation will be "NNN.annot.txt"
'''
# input arguments

# make pybedtools objects
print "Loading peaks..."
peaks = pybedtools.BedTool(peak_fp)
print "Peak file loaded."
print "Loading genome annotation..."
ref_dict = {
'exon': pybedtools.BedTool(os.path.join(gtf_dir, 'exons.bed')),
'3UTR': pybedtools.BedTool(os.path.join(gtf_dir, '3UTRs.bed')),
'5UTR': pybedtools.BedTool(os.path.join(gtf_dir, '5UTRs.bed')),
'cds': pybedtools.BedTool(os.path.join(gtf_dir, 'cds.bed')),
'intron': pybedtools.BedTool(os.path.join(gtf_dir, 'introns.bed')),
'proximal200': pybedtools.BedTool(os.path.join(gtf_dir, 'proximal200_intron.bed')),
'proximal500': pybedtools.BedTool(os.path.join(gtf_dir, 'proximal500_intron.bed'))
}
print "Genome annotation loaded."

# # process reference for use
target = {
"3UTR": ref_dict['3UTR'],
"5UTR": ref_dict['5UTR'],
"CDS": ref_dict['cds'],
"other_exon": ref_dict['exon']-ref_dict['3UTR']-ref_dict['5UTR']-ref_dict['cds'],
"px200_intron": ref_dict['proximal200'],
"px500_intron": ref_dict['proximal500'].subtract(ref_dict['proximal200']),
"distal_intron": ref_dict['intron'].subtract(ref_dict['exon']).subtract(ref_dict['proximal500'])
}
category_list = ['3UTR', '5UTR', 'CDS',
'other_exon', "px200_intron", "px500_intron", "distal_intron"]
init = True

print "Intersecting peaks with genome annotation..."
for cat in category_list:
bed_arr = []
for interval in target[cat]:
bed_arr.append('\t'.join([str(x) for x in interval.fields]))
bed_arr[-1] = bed_arr[-1] + '\t' + cat
bed_arr = list(dict.fromkeys(bed_arr))
for i in range(len(bed_arr)):
bed_arr[i] = bed_arr[i].split('\t')
target[cat] = pybedtools.BedTool(bed_arr)

if init:
init = False
result_bed = peaks.intersect(target[cat], wa=True, wb=True)
else:
result_bed = result_bed.cat(peaks.intersect(
target[cat], wa=True, wb=True), postmerge=False)
result_bed = result_bed.sort()

print "Preparing output..."
result_bed.saveas(outfn + '_')
prepend = ['## Annotation peaks to genomic regions, all intersected genomic regions are presented.',
'## CLAM version: 1.2.0',
'## Column 1: Peak chromosome',
'## Column 2: Peak start',
'## Column 3: Peak end',
'## Column 4: Peak name',
'## Column 5: Peak score',
'## Column 6: Peak strand',
'## Column 7: Peak signal value',
'## Column 8: Peak pValue',
'## Column 9: Peak qValue',
'## Column 10: Point-source called for this peak',
'## Column 11: Genomic region chromosome',
'## Column 12: Genomic region start',
'## Column 13: Genomic region end',
'## Column 14: Gene ID',
'## Column 15: Quality score',
'## Column 16: Genomic region strand',
'## Column 17: Genomic region type']
if os.path.exists(outfn):
os.remove(outfn)
for line in prepend:
cmd = 'echo "{prepend}" >> {outfn}'.format(
prepend=line, outfn=outfn)
os.system(cmd)
os.system('cat {outtmp} >> {outfn}'.format(
outtmp=outfn + '_', outfn=outfn))
os.remove(outfn+'_')
print "DONE"


if __name__ == '__main__':
# peak_fp, genome, outfn = sys.argv[1], sys.argv[2], sys.argv[3]
os.chdir('/mnt/h/yi_lab/m6a/src/scripts/peakComposition')
peak_in, genome, out_file = 'narrow_peak.unique.bed', 'mm10', 'annotate_peak.bed'
if 'CLAM_DAT' not in os.environ or not download_data.check_genome_data(genome):
print "Unable to find CLAM data folder for genomic regions, please try to download it using download_genome command."
print "Downloading..."
download_data.download_genome(genome)
genome_data = os.environ['CLAM_DAT']
intersect_gtf_regions(
peak_in, out_file, os.path.join(genome_data, genome))
46 changes: 45 additions & 1 deletion bin/CLAM
Original file line number Diff line number Diff line change
Expand Up @@ -64,9 +64,18 @@ def main():
from CLAM import peakcaller
#print args
peakcaller.parser( args )
elif subcommand == 'permutation_callpeak':

elif subcommand == 'permutation_callpeak':
from CLAM import permutation_peakcaller
permutation_peakcaller.parser( args )

elif subcommand == 'peak_annotator':
from CLAM import peak_annotator
peak_annotator.parser(args)

elif subcommand == 'data_downloader':
from CLAM import download_data
download_data.parser(args)


def setup_logger():
Expand Down Expand Up @@ -121,6 +130,12 @@ def get_arg_parser():

# permutation_callpeak
add_permutation_callpeak_parser(subparsers)

# peak_annotator
add_peak_annotator_parser(subparsers)

# data_downloader
add_data_downloader_parser(subparsers)

return argparser

Expand Down Expand Up @@ -277,6 +292,35 @@ def add_permutation_callpeak_parser( subparsers ):
return


def add_peak_annotator_parser(subparsers):
ag_anno = subparsers.add_parser(
"peak_annotator", help="CLAM peak annotator: assign peaks to genomic regions")

# input/output
ag_anno.add_argument("-i", "--input", dest="peak_in", type=str, required=True,
help="Input peak file")

ag_anno.add_argument("-g", "--genome", dest="genome", choices=('hg19', 'hg38', 'mm10'), type=str, required=True,
help="Genome version (hg19, hg38, mm10 avaiable)")

ag_anno.add_argument("-o", "--out-file", dest="out_file", type=str, required=True,
help="Output file")

return


def add_data_downloader_parser(subparsers):
ag_down = subparsers.add_parser(
"data_downloader", help="CLAM data downloader: download data of genomic regions")

# input/output
ag_down.add_argument("-g", "--genome", dest="genome", choices=('hg19', 'hg38', 'mm10'), type=str, required=True,
help="Genome version (hg19, hg38, mm10 avaiable)")

return



if __name__ == '__main__':
try:
main()
Expand Down

0 comments on commit 66b2996

Please sign in to comment.