Merge pull request #13 from stjude-biohackathon/sid_wip_networkhelpers
Add network module.
Showing 3 changed files with 310 additions and 0 deletions.
@@ -0,0 +1,88 @@
import os
import logging
import argparse

import pandas as pd

SCRIPT_PATH = os.path.abspath(__file__)
FORMAT = '[%(asctime)s] %(levelname)s %(message)s'
l = logging.getLogger()
lh = logging.StreamHandler()
lh.setFormatter(logging.Formatter(FORMAT))
l.addHandler(lh)
l.setLevel(logging.INFO)
debug = l.debug; info = l.info; warning = l.warning; error = l.error


def parse_bed(_infile):
    df = pd.read_csv(_infile, sep='\t')
    df = df[['gene', 'motif']]  # these column names are mocked.
    df2 = df.dropna()
    # Split comma-separated entries and explode so each row is one (gene, motif) pair.
    df2 = df2.assign(gene=df2['gene'].str.split(','),
                     motif=df2['motif'].str.split(',')).explode('gene').explode('motif').reset_index(drop=True)
    # Returns a list of tuples, e.g.
    # [('A', 'BAR'), ('B', 'BAR'), ('C', 'BAR'), ('FOO', 'X'), ('FOO', 'Y'), ('FOO', 'Z'), ('JANE1', 'DOE')]
    return list(df2.itertuples(index=False, name=None))


def network_jaccard(edgelist1, edgelist2):
    '''
    Inputs:
        edgelist1: output from parse_bed for sampleX
        edgelist2: output from parse_bed for sampleY
    Output:
        float: Jaccard similarity index of the two edge sets
    Ref:
        https://medium.com/rapids-ai/similarity-in-graphs-jaccard-versus-the-overlap-coefficient-610e083b877d
    '''
    # Assumes the edge lists contain no duplicate edges.
    edgeintersection = len(set(edgelist1).intersection(edgelist2))
    edgeunion = (len(edgelist1) + len(edgelist2)) - edgeintersection

    jaccard = edgeintersection / edgeunion
    print(jaccard)
    return jaccard

DESCRIPTION = '''
Compare two network graph objects from multiple samples
using the Jaccard similarity index.
'''

EPILOG = '''
'''


class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter,
                      argparse.RawDescriptionHelpFormatter):
    pass


parser = argparse.ArgumentParser(description=DESCRIPTION, epilog=EPILOG,
                                 formatter_class=CustomFormatter)

parser.add_argument('edgelist1')
parser.add_argument('-v', '--verbose', action='store_true',
                    help='Set logging level to DEBUG')

args = parser.parse_args()

if args.verbose:
    l.setLevel(logging.DEBUG)

debug('%s begin', SCRIPT_PATH)

e1 = parse_bed(args.edgelist1)
# e1 looks like:
# [('A', 'BAR'), ('B', 'BAR'), ('C', 'BAR'), ('FOO', 'X'), ('FOO', 'Y'), ('FOO', 'Z'), ('JANE1', 'DOE')]
# Reparse the same file and subset it, just for testing.
e2 = parse_bed(args.edgelist1)
e2 = e2[0:3]
# e2 is then:
# [('A', 'BAR'), ('B', 'BAR'), ('C', 'BAR')]

network_jaccard(e1, e2)  # should print 0.42857142857142855
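For reference, the expected value can be checked by hand; a minimal sketch of the same calculation on the mocked edge lists from the comments above (illustrative values, not real data):

# Minimal sketch of the Jaccard calculation on the mocked edge lists above
# (illustrative values only).
e1 = [('A', 'BAR'), ('B', 'BAR'), ('C', 'BAR'),
      ('FOO', 'X'), ('FOO', 'Y'), ('FOO', 'Z'), ('JANE1', 'DOE')]
e2 = e1[0:3]

intersection = len(set(e1).intersection(e2))  # 3 shared edges
union = len(e1) + len(e2) - intersection      # 7 + 3 - 3 = 7
print(intersection / union)                   # 3 / 7 = 0.42857142857142855
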
@@ -0,0 +1,31 @@
import click


# Command group
@click.group(name='CRCminer')
def cli_tools():
    """Tool related commands"""
    pass


@cli_tools.command(name='mine', help='mines for CRC')
@click.option('--fasta', type=click.Path(), help='FASTA file')
@click.option('--enhancer', type=click.Path(), help='ROSE2 output of annotated (super)enhancers')
@click.option('--mapping', type=click.Path(), help='Motif ID to gene ID mapping file.')
def test(fasta, enhancer, mapping):
    pass


@cli_tools.command(name='compare', help='compare two networks')
@click.option('--edgelist1', type=click.Path(), help='edgelist from sampleX')
@click.option('--edgelist2', type=click.Path(), help='edgelist from sampleY')
def test2(edgelist1, edgelist2):
    pass


@cli_tools.command(name='report', help='report visualization')
@click.option('--indir', type=click.Path(), help='input directory of CRCminer results')
def test3(indir):
    pass


# TODO: add more descriptive help text


if __name__ == '__main__':
    cli_tools()
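A minimal sketch of exercising these commands without a shell, using click's CliRunner; the module name and file paths here are assumptions for illustration, not part of the repository:

# Minimal sketch, assuming this module is importable as crcminer_cli
# (module name and the .bed paths are hypothetical).
from click.testing import CliRunner
from crcminer_cli import cli_tools

runner = CliRunner()
result = runner.invoke(cli_tools, ['compare',
                                   '--edgelist1', 'sampleX_edges.bed',
                                   '--edgelist2', 'sampleY_edges.bed'])
print(result.exit_code)  # 0 on success; the command body is still a stub (pass)
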
@@ -0,0 +1,191 @@
import os
import logging
import argparse
from itertools import product

import networkx as nx
import pandas as pd


SCRIPT_PATH = os.path.abspath(__file__)
FORMAT = '[%(asctime)s] %(levelname)s %(message)s'
l = logging.getLogger()
lh = logging.StreamHandler()
lh.setFormatter(logging.Formatter(FORMAT))
l.addHandler(lh)
l.setLevel(logging.INFO)
debug = l.debug; info = l.info; warning = l.warning; error = l.error


def parse_bed(_infile):
    '''
    Input: motif bed file
        chr   st     en     gene   motif
        chr1  10     20     A,B,C  BAR
        chr1  11     21     FOO    X,Y,Z
        chr1  1111   1121   JANE1  DOE
        chr1  11131  11421  FOO1
        chr1  11151  11521  BAR1
    Output:
        edgelist: [['A', 'BAR'], ['B', 'BAR'], ['C', 'BAR'], ['FOO', 'X'], ['FOO', 'Y'], ['FOO', 'Z'], ['JANE1', 'DOE']]
    '''
    df = pd.read_csv(_infile, sep='\t')
    df = df[['gene', 'motif']]  # these column names are mocked.
    df2 = df.dropna()
    # Split comma-separated entries and explode so each row is one (gene, motif) pair.
    df2 = df2.assign(gene=df2['gene'].str.split(','),
                     motif=df2['motif'].str.split(',')).explode('gene').explode('motif').reset_index(drop=True)
    _list = df2.values.tolist()  # edge list as list of [gene, motif] pairs

    return _list


def networkX_helpers(input_TFbed):
    '''
    Input:
        an edge list of (source, target) pairs, i.e. the kind of list you could
        pass to my_graph.add_edges_from([('A', 'B'),
                                         ('A', 'D'),
                                         ('A', 'E'),
                                         ('A', 'G'),
                                         ('A', 'H'),
                                         ('B', 'C'),
                                         ('B', 'E'),
                                         ('C', 'I'),
                                         ('C', 'K'),
                                         ('C', 'L'),
                                         ('D', 'A'),
                                         ('D', 'C'),
                                         ('D', 'H'),
                                         ('D', 'I'),
                                         ('A', 'A'), ('H', 'H'), ('D', 'D'), ('H', 'A'), ('H', 'D')])
    Output:
        CSV files with in-degree/out-degree counts and CRC TF clique fractions.
    '''
    # Create a networkx graph object
    info("Initializing Graph")
    _graph = nx.DiGraph()

    # Add edges to the graph object
    info("Adding edges to graph object")
    _graph.add_edges_from(input_TFbed)

    info("Calculating in-degree and out-degree stats")

    out_degree_dict = {node: deg for (node, deg) in _graph.out_degree()}
    in_degree_dict = {node: deg for (node, deg) in _graph.in_degree()}

    out_degree = pd.DataFrame(data={'TF': list(out_degree_dict.keys()), 'Out': list(out_degree_dict.values())})
    in_degree = pd.DataFrame(data={'TF': list(in_degree_dict.keys()), 'In': list(in_degree_dict.values())})

    NetworkMetricsOutput = pd.merge(out_degree, in_degree, left_on="TF", right_on="TF", how="outer")
    # Total degree
    NetworkMetricsOutput['Total'] = NetworkMetricsOutput['Out'] + NetworkMetricsOutput['In']

    info("Fetching self-loops")
    # Self-loops: TFs that regulate themselves
    selfLoops = list(nx.nodes_with_selfloops(_graph))

    info("Fetching interconnected self-loop pairs")
    # Pairs of self-regulating TFs that also regulate each other.
    nodePairs = []
    for ele in product(selfLoops, repeat=2):
        if ele[0] != ele[1] and _graph.has_edge(ele[0], ele[1]) and _graph.has_edge(ele[1], ele[0]):
            nodePairs.append(ele)

    info("Fetching self-regulating cliques")
    unDirGraph = nx.from_edgelist(nodePairs)
    cliqueGen = nx.find_cliques_recursive(unDirGraph)
    cliqueList = list(cliqueGen)

    # All cliques of the full graph - not needed right now (SV)
    # cliqueGen_ALL = list(nx.find_cliques_recursive(_graph))

    debug("Number of candidate cliques: %d", len(cliqueList))
    info("Scoring the CRCs")
    # Score each CRC clique as the sum of out-degrees of its TFs divided by
    # the number of TFs in the clique.
    cliqueRanking = []
    outDegreeDict = _graph.out_degree()

    for crcs in cliqueList:
        score = 0
        for gene in crcs:
            score += outDegreeDict[gene]
        score = score / len(crcs)
        if score > 0 and len(crcs) > 2:
            cliqueRanking.append((crcs, score))

    sortedRankedCliques = pd.DataFrame(sorted(cliqueRanking, reverse=True, key=lambda x: x[1]))

    factorEnrichmentDict = dict.fromkeys(selfLoops, 0)

    info("Calculating enrichment of each TF in a CRC clique")
    # Enrichment of each TF = (number of CRC cliques containing the TF) / (number of ranked CRC cliques)
    for crcClique in cliqueList:
        for TF in crcClique:
            factorEnrichmentDict[TF] += 1

    factorRankingTable = dict.fromkeys(selfLoops, 0)
    for TF in selfLoops:
        factorRankingTable[TF] = factorEnrichmentDict[TF] / float(len(cliqueRanking))

    FactorRank = pd.DataFrame(data={'TF': list(factorRankingTable.keys()),
                                    'TF_CliqueFraction': list(factorRankingTable.values())})

    # TFSpecificCliques = nx.cliques_containing_node(unDirGraph)  # currently unused

    NetworkMetricsOutput = pd.merge(NetworkMetricsOutput, FactorRank, left_on="TF", right_on="TF", how="outer")

    info("Writing to file")
    NetworkMetricsOutput.to_csv('TF_Degrees.csv', index=False)
    sortedRankedCliques.to_csv('Putative_CRC_Cliques.csv', index=False, header=False)

    # print(nx.cliques_containing_node(unDirGraph, "RFX3"))
    # print(len(nx.cliques_containing_node(unDirGraph, "RFX3")))

    return _graph


DESCRIPTION = '''
Create networkx graph objects from a motif BED file and compute network metrics.
'''

EPILOG = '''
'''


class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter,
                      argparse.RawDescriptionHelpFormatter):
    pass


parser = argparse.ArgumentParser(description=DESCRIPTION, epilog=EPILOG,
                                 formatter_class=CustomFormatter)

parser.add_argument('arg', help='motif BED file')
parser.add_argument('-v', '--verbose', action='store_true',
                    help='Set logging level to DEBUG')

args = parser.parse_args()

if args.verbose:
    l.setLevel(logging.DEBUG)

debug('%s begin', SCRIPT_PATH)

edge_list = parse_bed(args.arg)
networkX_helpers(edge_list)
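A minimal sketch of the self-loop, degree, and clique-scoring logic above, run on the toy edge list from the networkX_helpers docstring (illustrative data only, not real TF networks):

# Minimal sketch of the self-loop / clique-scoring logic, using the toy
# edge list from the networkX_helpers docstring (illustrative data only).
import networkx as nx

edges = [('A', 'B'), ('A', 'D'), ('A', 'E'), ('A', 'G'), ('A', 'H'),
         ('B', 'C'), ('B', 'E'), ('C', 'I'), ('C', 'K'), ('C', 'L'),
         ('D', 'A'), ('D', 'C'), ('D', 'H'), ('D', 'I'),
         ('A', 'A'), ('H', 'H'), ('D', 'D'), ('H', 'A'), ('H', 'D')]

g = nx.DiGraph()
g.add_edges_from(edges)

self_loops = list(nx.nodes_with_selfloops(g))  # ['A', 'D', 'H']
print(dict(g.out_degree(self_loops)))          # {'A': 6, 'D': 5, 'H': 3}

# A, D and H each regulate themselves and one another, so they form a single
# clique; its score is the mean out-degree of its members: (6 + 5 + 3) / 3 = 4.67.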