-
Notifications
You must be signed in to change notification settings - Fork 10
/
run.py
284 lines (214 loc) · 12.1 KB
/
run.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
import sys
import types
import networkx as nx
from optparse import OptionParser, OptionGroup
# Local package imports
import PathLinker as pl
import ksp_Astar as ksp
from PageRank import pagerank, writePageRankWeights
def main(args):
usage = '''
PathLinker.py [options] NETWORK NODE_TYPES
REQUIRED arguments:
NETWORK - A tab-delimited file with one directed interaction per
line. Each line should have at least 2 columns: tail, head. Edges
are directed from tail to head. This file can have a third column
specifying the edge weight, which is required unless the --PageRank
option is used (see --PageRank help for a note on these weights).
To run PathLinker on an unweighted graph, set all edge weights
to 1 in the input network.
NODE_TYPES - A tab-delimited file denoting nodes as receptors or TRs. The first
column is the node name, the second is the node type, either 'source'
(or 'receptor') or 'target' (or 'tr' or 'tf'). Nodes which are neither receptors nor TRs may
be omitted from this file or may be given a type which is neither 'source'
nor 'target'.
'''
parser = OptionParser(usage=usage)
# General Options
parser.add_option('-o', '--output', type='string', default='out', metavar='STR',\
help='A string to prepend to all output files. (default="out")')
parser.add_option('', '--write-paths', action='store_true', default=False,\
help='If given, also output a list of paths found by KSP in addition to the ranked edges.')
parser.add_option('', '--no-log-transform', action='store_true', default=False,\
help='Normally input edge weights are log-transformed. This option disables that step.')
parser.add_option('', '--largest-connected-component', action='store_true', default=False,\
help='Run PathLinker on only the largest weakly connected component of the graph. May provide performance speedup.')
parser.add_option('', '--edge-penalty', type='float', default=1.0,\
help='Factor by which to divide every edge weight. The effect of this option is to penalize the score of every path by a factor equal to (the number of edges in the path)^(this factor). (default=1.0)')
# Random Walk Group
group = OptionGroup(parser, 'Random Walk Options')
group.add_option('', '--PageRank', action='store_true', default=False,\
help='Run the PageRank algorithm to generate edge visitation flux values, which are then used as weights for KSP. A weight column in the network file is not needed if this option is given, as the PageRank visitation fluxes are used for edge weights in KSP. If a weight column is given, these weights are interpreted as a weighted PageRank graph.')
group.add_option('-q', '--q-param', action='store', type='float', default=0.5,\
help='The value of q indicates the probability that the random walker teleports back to a source node during the random walk process. (default=0.5)')
group.add_option('-e', '--epsilon', action='store', type='float', default=0.0001,\
help='A small value used to test for convergence of the iterative implementation of PageRank. (default=0.0001)')
group.add_option('', '--max-iters', action='store', type='int', default=500,\
help='Maximum number of iterations to run the PageRank algorithm. (default=500)')
parser.add_option_group(group)
# k shortest paths Group
group = OptionGroup(parser, 'k Shortest Paths Options')
group.add_option('-k', '--k-param', type='int', default=100,\
help='The number of shortest paths to find. (default=100)')
group.add_option('','--allow-mult-targets', action='store_true', default=False,\
help='By default, PathLinker will remove outgoing edges from targets to ensure that there is only one target on each path. If --allow-mult-targets is specified, these edges are not removed.')
group.add_option('','--allow-mult-sources', action='store_true', default=False,\
help='By default, PathLinker will remove incoming edges to sources to ensure that there is only one source on each path. If --allow-mult-sources is specified, these edges are not removed.')
group.add_option('', '--include-tied-paths', action='store_true', default=False,\
help='Return the k shortest paths as well as all paths with length equal to that of the kth path.')
parser.add_option_group(group)
# parse the command line arguments
(opts, args) = parser.parse_args()
# get the required arguments
num_req_args = 2
if len(args)!=num_req_args:
parser.print_help()
sys.exit('\nERROR: PathLinker.py requires %d positional arguments, %d given.' %(num_req_args, len(args)))
NETWORK_FILE = args[0]
NODE_VALUES_FILE = args[1]
# Validate options
if(opts.PageRank and opts.no_log_transform):
sys.exit('\nERROR: Options --PageRank and --no-log-transform should not be used together. PageRank weights are probabilities, and must be log-transformed to have an additive interpretation.')
net = None
## Read the network from file
with open(NETWORK_FILE, 'r') as network_file:
net = pl.readNetworkFile(network_file, opts.PageRank)
# Print info about the network
print(nx.info(net))
# Operate on only the largest connected component
if(opts.largest_connected_component):
conn_comps = nx.weakly_connected_component_subgraphs(net)
# This is the only portion of the program which prevents
# compatibility between Python 2 & 3. In 2, this object is a
# generator, but in 3 it is a list. Just check the type and
# handle accordingly to provide cross-compatibility.
if(isinstance(conn_comps, types.GeneratorType)):
net = next(conn_comps)
elif(isinstance(conn_comps, list)):
net = conn_comps[0]
else:
print("Compatibility error between NetworkX and Python versions. Connected components object from NetworkX does not have acceptable type.")
exit(-1)
print("\n Using only the largest weakly connected component:\n"+nx.info(net))
# Read the sources and targets on which to run PageRank and KSP
sources = set()
targets = set()
# Read the receptors and TRs file
print("Reading sources and targets from " + NODE_VALUES_FILE)
for line in open(NODE_VALUES_FILE, 'r').readlines():
items = [x.strip() for x in line.rstrip().split('\t')]
# Skip empty lines and lines beginning with '#' comments
if line=='':
continue
if line[0]=='#':
continue
if items[1] in ['source', 'receptor']:
sources.add(items[0])
elif items[1] in ['target', 'tr', 'tf']:
targets.add(items[0])
print('\nRead %d sources and %d targets' % (len(sources), len(targets)))
# Remove sources and targets that don't appear in the network, and do some sanity checks on sets
sources = set([s for s in sources if s in net])
targets = set([t for t in targets if t in net])
print('\tAfter removing sources and targets that are not in the network: %d sources and %d targets.' %(len(sources), len(targets)))
if len(sources)==0:
sys.exit('ERROR: No sources are in the network.')
if len(targets)==0:
sys.exit('ERROR: No targets are in the network.')
if len(sources.intersection(targets))>0:
sys.exit('ERROR: %d proteins are listed as both a source and target.' %(len(sources.intersection(targets))))
# Run PageRank on the network
# (if opts.PageRank == false, the weights were read from a file above)
if(opts.PageRank):
PR_PARAMS = {'q' : opts.q_param,\
'eps' : opts.epsilon,\
'maxIters' : opts.max_iters}
print('\nRunning PageRank on net.(q=%f)' %(opts.q_param))
# The initial weights are entirely on the source nodes, so this
# corresponds to a random walk that teleports back to the sources.
weights = dict.fromkeys(sources, 1.0)
prFinal = pagerank(net, weights, **PR_PARAMS)
# Write node visitation probabilities
# (function imported from PageRank)
writePageRankWeights(prFinal,filename='%s-node-pagerank.txt' % (opts.output))
# Weight the edges by the flux from the nodes
pl.calculateFluxEdgeWeights(net, prFinal)
# Write edge fluxes
pl.printEdgeFluxes('%s-edge-fluxes.txt' % (opts.output), net)
## Prepare the network to run KSP
# Remove improper edges from the sources and targets. This portion
# must be performed before the log transformation, so that the
# renormalization within accounts for the probability lost to the
# removed edges. These transformations are executed by default;
# to prevent them, use the opts.allow_mult_sources or opts.allow_mult_targets
# arguments.
if not opts.allow_mult_sources:
pl.modifyGraphForKSP_removeEdgesToSources(net, sources)
if not opts.allow_mult_targets:
pl.modifyGraphForKSP_removeEdgesFromTargets(net, targets)
# Apply the user specified edge penalty
pl.applyEdgePenalty(net, opts.edge_penalty)
# Transform the edge weights with a log transformation
if(not opts.no_log_transform):
pl.logTransformEdgeWeights(net)
# Add a super source and super sink. Performed after the
# transformations so that the edges can be given an additive
# weight of 0 and thus not affect the resulting path cost.
pl.modifyGraphForKSP_addSuperSourceSink(
net, sources, targets, weightForArtificialEdges = 0)
## Run the pathfinding algorithm
print('\nComputing the k=%d shortest simple paths.' %(opts.k_param))
paths = ksp.k_shortest_paths_yen(net, 'source', 'sink', opts.k_param, weight='ksp_weight', clip=not opts.include_tied_paths)
if len(paths)==0:
sys.exit('\tERROR: Targets are not reachable from the sources.')
## Use the results of KSP to rank edges
# Un-does the logarithmic transformation on the path lengths to
# make the path length in terms of the original edge weights
if not opts.no_log_transform:
paths = pl.undoLogTransformPathLengths(paths)
# Prepare the k shortest paths for output to flat files
pathgraph = nx.DiGraph()
for k,path in enumerate(paths, 1):
# Process the edges in this path
edges = []
for i in range(len(path)-1):
t = path[i][0]
h = path[i+1][0]
# Skip edges that have already been seen in an earlier path
if pathgraph.has_edge(t, h):
continue
# Skip edges that include our artificial supersource or
# supersink
if t=='source' or h=='sink':
continue
# This is a new edge. Add it to the list and note which k it
# appeared in.
else:
edges.append( (t,h,{
'ksp_id':k,
'ksp_weight':net.edge[t][h]['ksp_weight'],
'path_cost': path[-1][1]}) )
# Add all new, good edges from this path to the network
pathgraph.add_edges_from(edges)
# Each node is ranked by the first time it appears in a path.
# Identify these by check for any nodes in the graph which do
# not have 'ksp_id' attribute, meaning they were just added
# from this path.
for n in pathgraph.nodes():
if 'ksp_id' not in pathgraph.node[n]:
pathgraph.node[n]['ksp_id'] = k
## Write out the results to file
# Write a list of all edges encountered, ranked by the path they
# first appeared in.
kspGraphOutfile = '%sk-%d-ranked-edges.txt' %(opts.output, opts.k_param)
pl.printKSPGraph(kspGraphOutfile, pathgraph)
print('\nKSP results are in "%s"' %(kspGraphOutfile))
# Write a list of all paths found by the ksp algorithm, if
# requested.
if(opts.write_paths):
kspOutfile = '%sk-%d-paths.txt' %(opts.output, opts.k_param)
pl.printKSPPaths(kspOutfile, paths)
print('KSP paths are in "%s"' %(kspOutfile))
print('\nFinished!')
if __name__ == '__main__':
main(sys.argv)