diff --git a/src/CombineNearbyInteraction.py b/src/CombineNearbyInteraction.py index e49e13e..7a069a5 100644 --- a/src/CombineNearbyInteraction.py +++ b/src/CombineNearbyInteraction.py @@ -1,10 +1,19 @@ #!/usr/bin/env python """ -Created: November 14,2017 - Combines nearby interactions generated by FitHiChIP by a connected component labeling algorithm + + +Timeline +---------- +November 2017: Creation of the first draft +Feb 2018: Modification to incorporate window size and top K% filtering +July 2018: Fixed the K (=100) and window size (2X2), for 5 Kb bin size +Nov 2018: Added support for different columns of q-value, p-value, contact count, and also provided support for + increasing or decreasing order of sorting, to make it applicable for a wide range of significance values. + + Author: Sourya Bhattacharyya Vijay-Ay lab, LJI """ @@ -16,16 +25,6 @@ import networkx as nx from Queue import heapq -# few define variables - -# defined when q-value is 0 -Log10_QVAL_THR = 500 - -# # neighborhood of a bin -# # to not include any interactions -# # we are using 25 Kb (5 bins in 5 Kb resolution) -# BIN_NEIGHBOR_DISTTHR = 25000 - #========================================================== # this function returns the element below which top K % of # all the elements reside @@ -81,14 +80,18 @@ def _GetQVal(self, count=1): #=============================================== def main(): parser = OptionParser() #(usage=usage) - parser.add_option("--InpFile", dest="InpFile", help="Input interaction file") + parser.add_option("--InpFile", dest="InpFile", help="Input interaction file. Assumed that the first six fields contain two interacting chromosome information.") parser.add_option("--headerInp", dest="headerInp", type="int", help="If 1, indicates that input interaction file has a header line (such as field names). Default 1.") parser.add_option("--OutFile", dest="OutFile", help="Output merged interaction file") parser.add_option("--binsize", dest="binsize", type="int", help="Size of bins employed. DEFAULT 5 Kb.") parser.add_option("--conn", dest="connectivity_rule", type="int", help="Rule of connectivity ( 8 or 4). DEFAULT 8.") - parser.add_option("--percent", dest="TopPctElem", type="int", help="Percentage of elements to be selected from each connected component. Currently we recommend three values: a) 0: if specified 0, only the topmost representative of each of the connected component will be selected. This is the default value. b) 50: If specified as 50, top 50% of the elements in both statistical significance and contact count will be considered for inclusion, subject to the bin and neighborhood contraints, and c) 25: If specified as 25, top 25% of the elements in both statistical significance and contact count will be considered for inclusion, subject to the bin and neighborhood contraints. Default value is 100, means that all interactions will be considered.") - parser.add_option("--Neigh", dest="NeighborHoodBin", type="int", help="Positive integer (default: 2 with 5 Kb bin size) means that within 2x2 neighborhood of 5 Kb bins, there will be only one interaction. Applicable only if the percent parameter is non-zero. User may vary this value if bin size is different.") - parser.set_defaults(InpFile=None, OutFile=None, binsize=5000, connectivity_rule=8, headerInp=1, TopPctElem=100, NeighborHoodBin=2) + parser.add_option("--percent", dest="TopPctElem", type="int", help="Percentage of elements to be selected from each connected component. Default: 100, means all loops would be considered. If specified as 0, only the most significant loops from each component would be selected. For any number x between 0 and 100, top x% of the loops in a component, considering both statistical significance and contact count, would considered for inclusion, subject to the bin and neighborhood contraints.") + parser.add_option("--Neigh", dest="NeighborHoodBin", type="int", help="Positive integer (default: 2 with 5 Kb bin size) means that if a loop is included in the final set, loops involving within 2x2 neighborhood of both the bins would be discarded. Applicable only if --percent > 0. Difference in bin size other than 5000 may prompt user to change this value.") + parser.add_option("--cccol", dest="CCCol", type="int", help="Column number storing the contact count. Default: 7.") + parser.add_option("--qcol", dest="QValCol", type="int", help="Column number storing the q-value (or any measure of statistical significance). Default: 0, means the last column of the given interaction file. Any non-zero value would prompt the user to check the corresponding column.") + parser.add_option("--pcol", dest="PValCol", type="int", help="Column number storing the p-value (or any measure of statistical significance). Default: 0, means the second last column of the given interaction file. Any non-zero value would prompt the user to check the corresponding column.") + parser.add_option("--order", dest="SortOrder", type="int", help="Binary variable indicating the sorting order of the given significance values. Default 0, means sorting is done by ascending order. If specified 1, sorting is done by descending (reverse) order.") + parser.set_defaults(InpFile=None, OutFile=None, binsize=5000, connectivity_rule=8, headerInp=1, TopPctElem=100, NeighborHoodBin=2, QValCol=0, PValCol=0, SortOrder=0, CCCol=7) (options, args) = parser.parse_args() #=========================== @@ -111,13 +114,39 @@ def main(): TopPctElem = int(options.TopPctElem) NeighborHoodBinThr = (int(options.NeighborHoodBin)) * bin_size + # parameters regarding significance and sorting order of statistical significance values + QValCol=int(options.QValCol) + PValCol=int(options.PValCol) + CCCol=int(options.CCCol) + SortOrder=int(options.SortOrder) + + #==================== + # fix the columns containing P and Q-values + # by reading the first line of the input interaction file + fp_in = open(InpFile, 'r') + l = fp_in.readline() + contents = l.rstrip().split() + if (QValCol == 0): + QValCol = len(contents) + + if (PValCol == 0): + PValCol = (len(contents) - 1) + + fp_in.close() + #==================== + # print the parameters - if 0: - print '\n *** bin_size: ', bin_size - print '\n *** headerInp: ', headerInp - print '\n *** connectivity_rule: ', connectivity_rule - print '\n *** TopPctElem: ', TopPctElem - print '\n *** NeighborHoodBinThr: ', NeighborHoodBinThr + if 1: + print '****** Merge filtering of adjacent loops is enabled *****' + print '***** within function of merged filtering - printing the parameters ***' + print '*** bin_size: ', bin_size + print '*** headerInp: ', headerInp + print '*** connectivity_rule: ', connectivity_rule + print '*** TopPctElem: ', TopPctElem + print '*** NeighborHoodBinThr: ', NeighborHoodBinThr + print '*** QValCol: ', QValCol + print '*** PValCol: ', PValCol + print '*** SortOrder: ', SortOrder # open the output file # if input interaction file has header information, @@ -129,8 +158,10 @@ def main(): l = fp_in.readline() contents = l.rstrip().split() # write the header corresponding to the chromosomes, contact count, P value and the Q value - fp_outInt.write(contents[0] + '\t' + contents[1] + '\t' + contents[2] + '\t' + contents[3] + '\t' + contents[4] + '\t' + contents[5] + '\t' + contents[6] + '\t' + contents[len(contents) - 2] + '\t' + contents[len(contents) - 1] + '\t' + 'bin1_low' + '\t' + 'bin1_high' + '\t' + 'bin2_low' + '\t' + 'bin2_high' + '\t' + 'sumCC' + '\t' + 'StrongConn') + fp_outInt.write(contents[0] + '\t' + contents[1] + '\t' + contents[2] + '\t' + contents[3] + '\t' + contents[4] + '\t' + contents[5] + '\t' + contents[CCCol-1] + '\t' + contents[PValCol - 1] + '\t' + contents[QValCol - 1] + '\t' + 'bin1_low' + '\t' + 'bin1_high' + '\t' + 'bin2_low' + '\t' + 'bin2_high' + '\t' + 'sumCC' + '\t' + 'StrongConn') fp_in.close() + else: + fp_outInt.write('chr1' + '\t' + 'start1' + '\t' + 'end1' + '\t' + 'chr2' + '\t' + 'start2' + '\t' + 'end2' + '\t' + 'CC' + '\t' + 'p' + '\t' + 'fdr' + '\t' + 'bin1_low' + '\t' + 'bin1_high' + '\t' + 'bin2_low' + '\t' + 'bin2_high' + '\t' + 'sumCC' + '\t' + 'StrongConn') # list of chromosomes to be experimented TargetChrList = [] @@ -142,7 +173,7 @@ def main(): # output directory OutDir = os.path.dirname(os.path.realpath(OutFile)) - if 0: + if 1: print 'OutDir: ', str(OutDir) #========================================= @@ -150,7 +181,7 @@ def main(): #========================================= for chridx in range(len(TargetChrList)): curr_chr = TargetChrList[chridx] - if 0: + if 1: print 'Processing the chromosome: ', str(curr_chr) # extract the interactions of current chromosome from the complete set of interactions @@ -172,24 +203,24 @@ def main(): if 0: print 'Extracted interactions for the current chromosome' - # extract also the max span of interactions (6th column maximum element) - # so as to estimate the matrix size - temp_log_file = OutDir + '/Temp.log' - sys_cmd = "cat " + str(tempchrdumpfile) + " | cut -f6 | sort -nr > " + str(temp_log_file) - os.system(sys_cmd) + # # extract also the max span of interactions (6th column maximum element) + # # so as to estimate the matrix size + # temp_log_file = OutDir + '/Temp.log' + # sys_cmd = "cat " + str(tempchrdumpfile) + " | cut -f6 | sort -nr > " + str(temp_log_file) + # os.system(sys_cmd) - # max_coord = int((subprocess.Popen(sys_cmd, shell=True)).stdout.read()) - with open(temp_log_file, 'r') as fp_in: - l = fp_in.readline() - max_coord = int((l.rstrip()).split()[0]) + # # determine the maximum coordinate + # with open(temp_log_file, 'r') as fp_in: + # l = fp_in.readline() + # max_coord = int((l.rstrip()).split()[0]) - sys.stdout.flush() # test + # sys.stdout.flush() # test - # number of bins (matrix dimension) - nbins = (max_coord / bin_size) - if 0: - print 'max_coord of the interactions: ', str(max_coord) - print 'nbins: ', str(nbins) + # # number of bins (matrix dimension) + # nbins = (max_coord / bin_size) + # if 0: + # print 'max_coord of the interactions: ', str(max_coord) + # print 'nbins: ', str(nbins) # create a graph which will store the interactions G = nx.Graph() @@ -211,7 +242,7 @@ def main(): curr_key = (bin2, bin1) # assign the key to the dictionary # add the contact count, P value and Q value information as well - CurrChrDict.setdefault(curr_key, Interaction(int(linecontents[6]), float(linecontents[len(linecontents) - 2]), float(linecontents[len(linecontents) - 1]))) + CurrChrDict.setdefault(curr_key, Interaction(int(linecontents[CCCol-1]), float(linecontents[PValCol - 1]), float(linecontents[QValCol - 1]))) # add the node to the given graph as well G.add_node(curr_key) if 0: @@ -243,7 +274,7 @@ def main(): # check the edges of G edgelist = list(G.edges()) - if 0: + if 1: print 'No of nodes of G: ', G.number_of_nodes() print 'No of edges of G: ', G.number_of_edges() print 'Number of connected components of G: ', nx.number_connected_components(G) @@ -315,8 +346,10 @@ def main(): #================================================== # approach 1 : # if TopPctElem = 0 then - # get the bin having min P and Q values and corresponding bin pairs + # get the bin having maximum statistical significance and corresponding bin pairs # ties are resolved by maximum contact count + # if SortOrder = 0, maximum statistical significance === min P and Q values + # if SortOrder = 1, maximum statistical significance === max P and Q (equivalent) measures #================================================== if (TopPctElem == 0): @@ -325,24 +358,19 @@ def main(): curr_cc = CurrChrDict[curr_key]._GetCC() curr_pval = CurrChrDict[curr_key]._GetPVal() curr_qval = CurrChrDict[curr_key]._GetQVal() - if (curr_qval > 0): - curr_logqval = (-1) * math.log(curr_qval, 10) - else: - curr_logqval = Log10_QVAL_THR # a default constant - - # curr_conn_comp_CCList.append(curr_cc) - # curr_conn_comp_QValList.append(curr_qval) - # curr_conn_comp_LogQValList.append(curr_logqval) curr_key_bin1_mid = (((curr_key[0] - 1) * bin_size) + (curr_key[0] * bin_size)) / 2 curr_key_bin2_mid = (((curr_key[1] - 1) * bin_size) + (curr_key[1] * bin_size)) / 2 if 0: - print ' Connected component index: ', j, ' curr_key: ', curr_key, ' bin 1 mid: ', curr_key_bin1_mid, ' bin 2 mid: ', curr_key_bin2_mid, ' CC: ', curr_cc, ' Pval: ', curr_pval, ' Qval: ', curr_qval, ' -Log10 Qval: ', curr_logqval + print ' Connected component index: ', j, ' curr_key: ', curr_key, ' bin 1 mid: ', curr_key_bin1_mid, ' bin 2 mid: ', curr_key_bin2_mid, ' CC: ', curr_cc, ' Pval: ', curr_pval, ' Qval: ', curr_qval if (j == 0): # first index + rep_bin_key = curr_key + elif (SortOrder == 0) and (curr_pval < CurrChrDict[rep_bin_key]._GetPVal()) and (curr_qval < CurrChrDict[rep_bin_key]._GetQVal()): + # current element has higher statistical significance (lower P or Q value when SortOrder = 0) rep_bin_key = curr_key - elif (curr_pval < CurrChrDict[rep_bin_key]._GetPVal()) and (curr_qval < CurrChrDict[rep_bin_key]._GetQVal()): - # current element has lower P or Q value + elif (SortOrder == 1) and (curr_pval > CurrChrDict[rep_bin_key]._GetPVal()) and (curr_qval > CurrChrDict[rep_bin_key]._GetQVal()): + # current element has higher statistical significance (higher P or Q value when SortOrder = 1) rep_bin_key = curr_key elif (curr_pval == CurrChrDict[rep_bin_key]._GetPVal()) and (curr_qval == CurrChrDict[rep_bin_key]._GetQVal()) and (curr_cc > CurrChrDict[rep_bin_key]._GetCC()): # current element has equal P and Q values @@ -373,14 +401,13 @@ def main(): #================================================== if (TopPctElem > 0) and (TopPctElem < 100): - # this is a list which would act as a max-priority queue - # used to process one connected component at a time + # list to store the q-values and the CC values for individual bin pairs + # for their sequential extraction Curr_Comp_Tuple_List = [] # lists storing different attributes curr_conn_comp_CCList = [] curr_conn_comp_QValList = [] - curr_conn_comp_LogQValList = [] # process individual elements within this connected component for j in range(len(curr_comp_list)): @@ -388,21 +415,19 @@ def main(): curr_cc = CurrChrDict[curr_key]._GetCC() curr_pval = CurrChrDict[curr_key]._GetPVal() curr_qval = CurrChrDict[curr_key]._GetQVal() - if (curr_qval > 0): - curr_logqval = (-1) * math.log(curr_qval, 10) - else: - curr_logqval = Log10_QVAL_THR # a default constant - curr_conn_comp_CCList.append(curr_cc) curr_conn_comp_QValList.append(curr_qval) - curr_conn_comp_LogQValList.append(curr_logqval) - # create a sublist - # Note: ordering is very important - # first element: log 10 q value - # if this is equal, second element would be used - # important: as the heap is min heap by default, we use negative signs for both of the keys - subl = [((-1) * curr_logqval), ((-1) * curr_cc), curr_key[0], curr_key[1]] + # create a min-heap structure + # first element: q-value + # if SortOrder = 0, lower: better: use the same sign when insering in the heap + # if SortOrder = 1, higher: better: reverse the sign when insering in the heap + # for ties, second element (contact count) - higher: better + # so, use negative signs + if (SortOrder == 0): + subl = [curr_qval, ((-1) * curr_cc), curr_key[0], curr_key[1]] + else: + subl = [((-1) * curr_qval), ((-1) * curr_cc), curr_key[0], curr_key[1]] # insert the element in the designated queue heapq.heappush(Curr_Comp_Tuple_List, subl) @@ -410,17 +435,15 @@ def main(): # first get the maximum / minimum values from these lists max_cc = max(curr_conn_comp_CCList) min_qval = min(curr_conn_comp_QValList) - max_logqval = max(curr_conn_comp_LogQValList) # now obtain the values of top K % elements # from these lists # where K = 50 means it is median custom_cc = custom_percent(curr_conn_comp_CCList, TopPctElem, 2) - custom_qval = custom_percent(curr_conn_comp_QValList, TopPctElem, 1) - custom_logqval = custom_percent(curr_conn_comp_LogQValList, TopPctElem, 2) + custom_qval = custom_percent(curr_conn_comp_QValList, TopPctElem, (SortOrder+1)) if 0: - print ' --> current connected component: max CC: ', max_cc, ' min Q val: ', min_qval, ' max log-Q val: ', max_logqval, ' top K (TopPctElem): ', TopPctElem, ' custom_cc threshold: ', custom_cc, ' custom_qval threshold: ', custom_qval, ' custom_logqval threshold: ', custom_logqval + print ' --> current connected component: max CC: ', max_cc, ' min Q val: ', min_qval, ' top K (TopPctElem): ', TopPctElem, ' custom_cc threshold: ', custom_cc, ' custom_qval threshold: ', custom_qval # this list stores the candidate interactions # from this particular connected component @@ -451,7 +474,7 @@ def main(): # modified condition - sourya # consider those interactions having # significance value > K percentile - if ((curr_elem[0] * (-1)) < custom_logqval): + if ((SortOrder == 0) and (curr_elem[0] > custom_qval)) or ((SortOrder == 1) and (curr_elem[0] < custom_qval)): break #=================================== @@ -500,7 +523,7 @@ def main(): pval = CurrChrDict[rep_bin_key]._GetPVal() qval = CurrChrDict[rep_bin_key]._GetQVal() - if 0: + if 1: print '**** Selected bin key: ', rep_bin_key, ' start bin mid: ', (rep_bin1_low + rep_bin1_high)/2, ' end bin mid: ', (rep_bin2_low + rep_bin2_high) / 2, ' cc: ', cc, ' pval: ', pval, ' qval: ', qval # write the interaction in the specified output file @@ -531,14 +554,18 @@ def main(): curr_conn_comp_CCList.append(curr_cc) curr_conn_comp_QValList.append(curr_qval) - # create a sublist - # Note: ordering is very important - # first element: q-value (lower: better) + # create a min-heap structure + # first element: q-value + # if SortOrder = 0, lower: better: use the same sign when insering in the heap + # if SortOrder = 1, higher: better: reverse the sign when insering in the heap # for ties, second element (contact count) - higher: better - # important: as the heap is min heap by default, we use negative signs for both of the keys - subl = [curr_qval, ((-1) * curr_cc), curr_key[0], curr_key[1]] - - # insert the element in the designated queue + # so, use negative signs + if (SortOrder == 0): + subl = [curr_qval, ((-1) * curr_cc), curr_key[0], curr_key[1]] + else: + subl = [((-1) * curr_qval), ((-1) * curr_cc), curr_key[0], curr_key[1]] + + # insert the element in the designated queue (min-heap property) heapq.heappush(Curr_Comp_Tuple_List, subl) # this list stores the candidate interactions @@ -551,6 +578,8 @@ def main(): # now extract elements from the constructed queue while (len(Curr_Comp_Tuple_List) > 0): + # extract the first element from the min-heap + # element with the lowest value curr_elem = heapq.heappop(Curr_Comp_Tuple_List) if 0: print 'extracted element from heap: ', curr_elem @@ -614,16 +643,16 @@ def main(): # after processing all the chromosomes, now close the output interaction file fp_outInt.close() - # remove the temp log file which stores the max coordinate for a chromosome - sys_cmd = "rm " + str(temp_log_file) - os.system(sys_cmd) + # # remove the temp log file which stores the max coordinate for a chromosome + # sys_cmd = "rm " + str(temp_log_file) + # os.system(sys_cmd) # remove the temporary chromosome specific interaction dump file sys_cmd = "rm " + str(tempchrdumpfile) os.system(sys_cmd) - if 0: - print 'End of merging nearby interactions !!! ' + if 1: + print '==================== End of merge filtering adjacent interactions !!! ======================' #=============================================== if __name__ == "__main__":