forked from miha-skalic/motif_scan
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscanRBPmotifs
156 lines (142 loc) · 7.21 KB
/
scanRBPmotifs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
#!/usr/bin/env python
# Jan 29, 2016
#
# Calculate motif scores for all PWMs in a given set of sequences in FASTA format
#
# This tool was motivated by the RNA RBP motif scanning tool from CISBP-RNA:
# http://cisbp-rna.ccbr.utoronto.ca/TFTools.php
#
# and branched and heavily modified from the https://github.com/miha-skalic/motif_scan
#
# April 2018, version UMR7216 Paris Diderot
# Transformed by Costas Bouyioukos cbouyio@gmail.column
import sys
import time
import glob
import os
import argparse
import multiprocessing
import pandas as pd
from Bio import motifs, SeqIO
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
# Command line arguments declaration.
parser = argparse.ArgumentParser(description = "Scan a sequence fasta file to predict binding sites from already known motifs of RnaBPs.")
parser.add_argument("infh", type = argparse.FileType('r'), metavar ="input_file", help = "A fasta file containing the m(or_other)RNA sequences.")
parser.add_argument("outfh", type = argparse.FileType('w'), metavar = "output_file", default = sys.stdout, help = "A table with the results. (or STDOUT)")
parser.add_argument('-d', type = str, dest = "pwm_dir", default = os.path.dirname(os.path.abspath(__file__)) + "/motifs/pwms_all_motifs", help = "Directory containing PWMs. (Default './motifs/pwms_all_motifs')")
parser.add_argument('-r', '--rbpinfo', type = str, dest = 'rbpinfo', default = os.path.dirname(os.path.abspath(__file__)) + "/motifs/RBP_Information.txt", help = "RBP information file for adding meta data to results. (Defaulr: './motifs/RBP_Information.txt')")
parser.add_argument('-p', '--pseudocount', type = float, dest = "pseudocount", default = 0.5, help = "Pseudocount for normalizing PWM. (Default: 0.5)")
parser.add_argument('-t', '--type', type = str, dest = 'seqtype', default = "DNA", help = "Alphabet of input sequence (DNA or RNA). (Default: DNA)")
parser.add_argument('-m', '--minscore', type = float, dest = 'minscore', default = 5, help = "Minimum score for motif hits. (Default: 6)")
parser.add_argument('-s', '--seq', type = str, dest = 'testseq', default = None, help = "Supply a test sequence to scan (FASTA files will be ignored).")
parser.add_argument('-c', type = int, default = 1, dest = "cores", metavar = "CORES", help = "Number of processing cores. (Default: 1)")
parser.add_argument('-x', '--excel', action = "store_true", dest = "excel", default = False, help = "Format the RBP_ID column with HYPERLINK(url) for importing into Excel")
parser.add_argument('-v', '--version', action = "version", version = '%(prog)s version 0.2.66')
## Funcitons
# FIXME all the print statement, follow the python3 format.
def load_motifs(db, *args):
"""Load all motifs from a given directory.
Will look for all *.txt files and put all motifs in a dictionary of PSSMs.
"""
tic = time.time()
motifs = {}
print >> sys.stderr, "Loading motifs..."
for file in glob.glob(db + "/*.txt"):
try:
id = os.path.splitext(os.path.basename(file))[0]
motifs[id] = pwm2pssm(file, *args)
except:
continue
#print >> sys.stderr, "\b.", sys.stderr.flush() #FIXME I do not know why this returns None.
toc = time.time()
print >> sys.stderr, "done in %0.3f seconds!" % (float(toc - tic))
return motifs
def pwm2pssm(file, pseudocount):
"""Convert load PWM and covernt it to PSSM (calculate the log_odds).
"""
pwm = pd.read_table(file)
# Assuming we are doing RNA motif scanning. Need to replace U with T as Biopython's motif scanner only does DNA
pwm.rename(columns = {'U':'T'}, inplace=True)
pwm = pwm.drop("Pos", 1).to_dict(orient = 'list')
pwm = motifs.Motif(alphabet = IUPAC.IUPACUnambiguousDNA(), counts = pwm)
#FIXME USE ONLY for human genome.
pseudocount = {'A':0.59, 'C': 0.41, 'G': 0.41, 'T': 0.59}
pwm = pwm.counts.normalize(pseudocount)
# Can optionally add background, but for now assuming uniform probability
#FIXME put he background as a parameter!
background = 0.41
pssm = pwm.log_odds()
return(pssm)
def scan(pssm, seqrecord, minscore, motif_id):
"""Scan a sequence for hits of a single RBP.
"""
results = []
for position, score in pssm.search(seqrecord.seq, threshold = minscore, both = False):
end_position = position + len(pssm.consensus)
geneName = seqrecord.id.split("|")[0]
transcriptName = seqrecord.id.split("|")[1]
values = [motif_id, geneName, transcriptName, position + 1, end_position, str(seqrecord.seq[position:end_position].transcribe()), round(score, 3)]
results.append(values)
return results
def collect(x, db, excel):
"""Finilise results into a DataFrame for output.
"""
# Get metadata
columns = ["RBP_ID", "Motif_ID", "DBID", "RBP_Name", "RBP_Status", "Family_Name", "RBDs", "RBP_Species"]
meta = pd.read_table(db).loc[:, columns]
meta = meta[meta['RBP_Species'].isin(['Homo_sapiens', 'Mus_musculus'])]
meta['RBP_Name'] = meta['RBP_Name'].str.upper()
if excel:
meta['RBP_ID'] = "=HYPERLINK(\"http://cisbp-rna.ccbr.utoronto.ca/TFreport.php?searchTF=" + meta['RBP_ID'] + "\")"
# Create DataFrame from motif hits
hits = pd.DataFrame(x, columns = ['Motif_ID', 'Trarget_Gene', 'Target_TR', 'Start', 'End', 'Sequence', 'Score'])
# Merge metadata with hits
pdm = pd.merge(meta, hits).sort_values(['Start', 'Motif_ID'])
#pdms = pdm.to_string(header = False, index = False)
return pdm
def scan_all(pssms, seqrecord, opts):
"""Scan a sequence for all PSSM motifs of RBPs.
"""
# FIXME resolve the issue with parallelisation of many sequences. "Number of open files". Consult the multiprocessing module for that and monitor the number of open files too.
hits = []
tasks = []
p = multiprocessing.Pool(opts.cores)
for motif_id, pssm in pssms.iteritems():
tasks.append((pssm, seqrecord, opts.minscore, motif_id,))
results = [p.apply_async(scan, t) for t in tasks]
for r in results:
hits.extend(r.get())
# Collect results
print >> sys.stderr, "Getting metadata and finalizing... "
final = collect(hits, opts.rbpinfo, opts.excel)
print >> sys.stderr, "done for seq %s" % (seqrecord.id)
return final
def main():
# When called as a script instantiate the options-arguments object.
optsArgs = parser.parse_args()
# Load PWMs
pssms = load_motifs(optsArgs.pwm_dir, optsArgs.pseudocount)
if optsArgs.testseq is None :
tic = time.time()
print >> sys.stderr, "Scanning sequences..."
print >> optsArgs.outfh, "RBP_ID\tMotif_ID\tDBID\tRBP_Name\tRBP_Status\tFamily_Name\tRBDs\tRBP_Species\tTarget_Gene\tTarget_TR\tStart\tEnd\tSequence\tScore"
for seqrecord in SeqIO.parse(optsArgs.infh, "fasta"):
seq = seqrecord.seq
if optsArgs.seqtype == "RNA":
seq = seq.back_transcribe()
seq.alphabet = IUPAC.IUPACUnambiguousDNA()
final = scan_all(pssms, seqrecord, optsArgs)
print final.to_csv(optsArgs.outfh, sep="\t", index = False, header = False)
toc = time.time()
print >> sys.stderr, "done in %0.2f seconds!" % (float(toc - tic))
else:
if optsArgs.seqtype == 'RNA':
seq = Seq(opts.testseq, IUPAC.IUPACUnambiguousRNA()).back_transcribe()
seq.alphabet = IUPAC.IUPACUnambiguousDNA()
else:
seq = Seq(opts.testseq, IUPAC.IUPACUnambiguousDNA())
final = scan_all(pssms, seq, optsArgs)
print final.to_csv(optsArgs.outfh, sep="\t", index = False)
if __name__ == '__main__':
main()