Merge pull request #13 from stjude-biohackathon/sid_wip_networkhelpers
Add network module.
j-andrews7 authored May 5, 2023
2 parents 35922b3 + 4555e99 commit e9d995b
Showing 3 changed files with 310 additions and 0 deletions.
88 changes: 88 additions & 0 deletions crcminer/compare.py
@@ -0,0 +1,88 @@

import os
import logging
import argparse

SCRIPT_PATH = os.path.abspath(__file__)
FORMAT = '[%(asctime)s] %(levelname)s %(message)s'
l = logging.getLogger()
lh = logging.StreamHandler()
lh.setFormatter(logging.Formatter(FORMAT))
l.addHandler(lh)
l.setLevel(logging.INFO)
debug = l.debug; info = l.info; warning = l.warning; error = l.error

def parse_bed(_infile):
    import pandas as pd
    df = pd.read_csv(_infile, sep='\t')
    df = df[['gene', 'motif']]  # these column names are mocked.
    df2 = df.dropna()
    df2 = df2.assign(gene=df2['gene'].str.split(','),
                     motif=df2['motif'].str.split(',')).explode('gene').explode('motif').reset_index(drop=True)
    # print(df2.head())
    # Return the edge list as a list of (gene, motif) tuples, e.g.
    # [('A', 'BAR'), ('B', 'BAR'), ('C', 'BAR'), ('FOO', 'X'), ('FOO', 'Y'), ('FOO', 'Z'), ('JANE1', 'DOE')]
    return list(df2.itertuples(index=False, name=None))




def network_jaccard(edgelist1, edgelist2):
    '''
    Inputs:
        edgelist1: output from parse_bed for sampleX
        edgelist2: output from parse_bed for sampleY
    Output:
        float: Jaccard similarity index of the two edge lists
    Ref:
        https://medium.com/rapids-ai/similarity-in-graphs-jaccard-versus-the-overlap-coefficient-610e083b877d
    '''
    edgeintersection = len(set(edgelist1).intersection(edgelist2))
    edgeunion = len(set(edgelist1).union(edgelist2))

    jaccard = edgeintersection / edgeunion
    print(jaccard)
    return jaccard
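
# Worked example of network_jaccard on toy edge lists (hypothetical data, not part
# of the pipeline): the two lists share 1 edge out of 3 distinct edges, so the
# Jaccard index is 1/3.
# network_jaccard([('A', 'B'), ('B', 'C'), ('C', 'A')], [('A', 'B')])  # ~0.333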

DESCRIPTION = '''
Compare two network graph objects from different samples
using the Jaccard similarity index.
'''

EPILOG = '''
'''

class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter,
                      argparse.RawDescriptionHelpFormatter):
    pass


parser = argparse.ArgumentParser(description=DESCRIPTION, epilog=EPILOG,
                                 formatter_class=CustomFormatter)

parser.add_argument('edgelist1')
parser.add_argument('-v', '--verbose', action='store_true',
                    help='Set logging level to DEBUG')

args = parser.parse_args()

if args.verbose:
    l.setLevel(logging.DEBUG)

debug('%s begin', SCRIPT_PATH)

e1 = parse_bed(args.edgelist1)
# print(e1)
# [('A', 'BAR'), ('B', 'BAR'), ('C', 'BAR'), ('FOO', 'X'), ('FOO', 'Y'), ('FOO', 'Z'), ('JANE1', 'DOE')]
# Copy the same edge list and subset it for testing.
e2 = parse_bed(args.edgelist1)
e2 = e2[0:3]  # subsetting
# print(e2)
# [('A', 'BAR'), ('B', 'BAR'), ('C', 'BAR')]

network_jaccard(e1, e2)  # 3 shared edges / 7 total edges, should print 0.42857142857142855

31 changes: 31 additions & 0 deletions crcminer/crcminer.py
@@ -0,0 +1,31 @@
import click


# Command group
@click.group(name='CRCminer')
def cli_tools():
    """Tool related commands"""
    pass


@cli_tools.command(name='mine', help='mines for CRC')
@click.option('--fasta', type=click.Path(), help='fasta')
@click.option('--enhancer', type=click.Path(), help='ROSE2 output of annotated (super)enhancers')
@click.option('--mapping', type=click.Path(), help='Motif ID to gene ID mapping file.')
def test(fasta, enhancer, mapping):
    pass


@cli_tools.command(name='compare', help='compare two networks')
@click.option('--edgelist1', type=click.Path(), help='edgelist from sampleX')
@click.option('--edgelist2', type=click.Path(), help='edgelist from sampleY')
def test2(edgelist1, edgelist2):
    pass


@cli_tools.command(name='report', help='report visualization')
@click.option('--indir', type=click.Path(), help='input directory of CRCminer results')
def test3(indir):
    pass


# TODO: add more description
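
# Example invocations (hypothetical file names; the subcommands above are still stubs):
#   python crcminer.py mine --fasta genome.fa --enhancer rose2_superenhancers.txt --mapping motif_to_gene.tsv
#   python crcminer.py compare --edgelist1 sampleX_edges.bed --edgelist2 sampleY_edges.bed
#   python crcminer.py report --indir crcminer_results/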

if __name__ == '__main__':
    cli_tools()
191 changes: 191 additions & 0 deletions crcminer/network.py
@@ -0,0 +1,191 @@
import os
import logging
import argparse
import networkx as nx
import pandas as pd


SCRIPT_PATH = os.path.abspath(__file__)
FORMAT = '[%(asctime)s] %(levelname)s %(message)s'
l = logging.getLogger()
lh = logging.StreamHandler()
lh.setFormatter(logging.Formatter(FORMAT))
l.addHandler(lh)
l.setLevel(logging.INFO)
debug = l.debug; info = l.info; warning = l.warning; error = l.error



def parse_bed(_infile):
    '''
    Input: motif bed file
        chr   st     en     gene   motif
        chr1  10     20     A,B,C  BAR
        chr1  11     21     FOO    X,Y,Z
        chr1  1111   1121   JANE1  DOE
        chr1  11131  11421  FOO1
        chr1  11151  11521  BAR1
    Output:
        edgelist: [['A', 'BAR'], ['B', 'BAR'], ['C', 'BAR'], ['FOO', 'X'], ['FOO', 'Y'], ['FOO', 'Z'], ['JANE1', 'DOE']]
    '''
    df = pd.read_csv(_infile, sep='\t')
    df = df[['gene', 'motif']]  # these column names are mocked.
    df2 = df.dropna()
    df2 = df2.assign(gene=df2['gene'].str.split(','),
                     motif=df2['motif'].str.split(',')).explode('gene').explode('motif').reset_index(drop=True)
    # print(df2.head())
    _list = df2.values.tolist()  # edgelist

    return _list
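
# Note on the assign/explode step in parse_bed: a row with comma-separated entries,
# e.g. gene 'A,B,C' with motif 'BAR', is expanded into one edge per (gene, motif)
# pair: ['A', 'BAR'], ['B', 'BAR'], ['C', 'BAR']. Rows with a missing gene or motif
# are dropped by dropna() before the expansion.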


def networkX_helpers(input_TFbed):
    '''
    Input is a node edge list, e.g. the edges that would be added with
    my_graph.add_edges_from([('A', 'B'),
                             ('A', 'D'),
                             ('A', 'E'),
                             ('A', 'G'),
                             ('A', 'H'),
                             ('B', 'C'),
                             ('B', 'E'),
                             ('C', 'I'),
                             ('C', 'K'),
                             ('C', 'L'),
                             ('D', 'A'),
                             ('D', 'C'),
                             ('D', 'H'),
                             ('D', 'I'),
                             ('A', 'A'), ('H', 'H'), ('D', 'D'), ('H', 'A'), ('H', 'D')])
    Output:
        text file with in-degree, out-degree counts and CRC TF clique fractions.
    '''
    # Create a networkx graph object
    info("Initializing Graph")
    _graph = nx.DiGraph()

    # Add edges to the graph object
    info("Add edges to graph object")
    _graph.add_edges_from(input_TFbed)

    info("Calculating in-degree out-degree stats")

    # degrees_dict = {node: deg for (node, deg) in _graph.degree()}
    out_degree_dict = {node: deg for (node, deg) in _graph.out_degree()}
    in_degree_dict = {node: deg for (node, deg) in _graph.in_degree()}

    out_degree = pd.DataFrame(data={'TF': out_degree_dict.keys(), 'Out': out_degree_dict.values()})
    in_degree = pd.DataFrame(data={'TF': in_degree_dict.keys(), 'In': in_degree_dict.values()})

    NetworkMetricsOutput = pd.merge(out_degree, in_degree, left_on="TF", right_on="TF", how="outer")
    # TOTAL DEGREE
    NetworkMetricsOutput['Total'] = NetworkMetricsOutput['Out'] + NetworkMetricsOutput['In']

    info("Fetching self loops")
    # Self loops
    # autoregulatoryLoops = nx.selfloop_edges(_graph)  # not needed?

    selfLoops = list(nx.nodes_with_selfloops(_graph))

    info("Fetching mutually regulating self-loop pairs")
    from itertools import product
    nodePairs = []
    for ele in list(product(selfLoops, repeat=2)):
        if ele[0] != ele[1] and _graph.has_edge(ele[0], ele[1]) and _graph.has_edge(ele[1], ele[0]):
            nodePairs.append(ele)

    info("Fetching self-regulating cliques")
    unDirGraph = nx.from_edgelist(nodePairs)
    cliqueGen = nx.find_cliques_recursive(unDirGraph)
    cliqueList = list(cliqueGen)

    # I am not sure what this does at this point, but
    # this is a placeholder until SV confirms.
    # All cliques - not needed right now (SV)
    # cliqueGen_ALL = list(nx.find_cliques_recursive(_graph))

    info("Found %d self-regulating cliques", len(cliqueList))
info("Scores all the CRC's")
'''
## SCORING THE CRCs using sum outdegree for each TF and dividing by the number of TFs in the clique
'''
cliqueRanking = []
outDegreeDict = _graph.out_degree()

for crcs in cliqueList:
score = 0
for gene in crcs:
score += outDegreeDict[gene]
score = score/len(crcs)
if score > 0 and len(crcs) > 2:
cliqueRanking.append((crcs, score))


sortedRankedCliques = pd.DataFrame(sorted(cliqueRanking, reverse=True, key=lambda x:x[1]))
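
    # Worked example using the edge list from the docstring above: the self-looping
    # TFs A, D, and H all regulate one another, giving a single clique {A, D, H}.
    # Their out-degrees (including self-loops) are 6, 5, and 3, so the clique scores
    # (6 + 5 + 3) / 3 = 14/3 ≈ 4.67 and is kept because it has more than 2 TFs.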

    factorEnrichmentDict = dict.fromkeys(selfLoops, 0)

    info("Calculate enrichment of each TF in a CRC clique")
    # Enrichment of each TF is calculated as
    # (number of CRC cliques containing the given TF) / (number of ranked CRC cliques).
    for crcClique in cliqueList:
        for TF in crcClique:
            factorEnrichmentDict[TF] += 1

    factorRankingTable = dict.fromkeys(selfLoops, 0)
    for TF in selfLoops:
        factorRankingTable[TF] = factorEnrichmentDict[TF] / float(len(cliqueRanking))

    FactorRank = pd.DataFrame(data={'TF': factorRankingTable.keys(), 'TF_CliqueFraction': factorRankingTable.values()})
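
    # For the docstring example this gives a clique fraction of 1.0 for each of
    # A, D, and H: each appears in 1 of the 1 ranked clique(s).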

    TFSpecificCliques = nx.cliques_containing_node(unDirGraph)

    NetworkMetricsOutput = pd.merge(NetworkMetricsOutput, FactorRank, left_on="TF", right_on="TF", how="outer")

    info("Write to file")
    NetworkMetricsOutput.to_csv('TF_Degrees.csv', index=False)
    sortedRankedCliques.to_csv('Putative_CRC_Cliques.csv', index=False, header=False)

    # print(nx.cliques_containing_node(unDirGraph, "RFX3"))
    # print(len(nx.cliques_containing_node(unDirGraph, "RFX3")))
    # print(TFSpecificCliques)

    return _graph


DESCRIPTION = '''
Create a networkx graph from a motif edge list and compute CRC network metrics.
'''



EPILOG = '''
'''

class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter,
                      argparse.RawDescriptionHelpFormatter):
    pass


parser = argparse.ArgumentParser(description=DESCRIPTION, epilog=EPILOG,
                                 formatter_class=CustomFormatter)

parser.add_argument('arg')
parser.add_argument('-v', '--verbose', action='store_true',
                    help='Set logging level to DEBUG')

args = parser.parse_args()

if args.verbose:
    l.setLevel(logging.DEBUG)

debug('%s begin', SCRIPT_PATH)

edge_list = parse_bed(args.arg)
networkX_helpers(edge_list)
