porthomclPairsOrthologs.py

#!/usr/bin/python
import time
from datetime import datetime
import sys, os
import gzip
import random, math
from optparse import OptionParser


options = None
DEBUG = False

def readTaxonList(filename):

	taxon_list = []
	taxon_list_file = open(filename)
	for line in taxon_list_file:
		line = line.strip()
		if line:
			taxon_list += [line]
	taxon_list_file.close()
	return taxon_list


def readBestHit(query_taxon, file_name):
	best_hits = {}
	with open(file_name) as best_hit_file:
		for line in best_hit_file:
			cols = line.strip().split('\t')
			
			
			query_id = query_taxon+'|'+cols[0]

			subject_id = cols[1]

			subject_taxon = subject_id.split('|')[0]

			try: 
				a = best_hits[subject_taxon]
			except:
				best_hits[subject_taxon] = {}

			try:
				best_hits[subject_taxon][query_id] += [(subject_id, int(cols[2]), float(cols[3]))]
			except:
				best_hits[subject_taxon][query_id] = [(subject_id, int(cols[2]), float(cols[3]))]
	return best_hits


def memory_usage_resource():
	import resource
	rusage_denom = 1024.
	if sys.platform == 'darwin':
		# ... it seems that in OSX the output is different units ...
		rusage_denom = rusage_denom * rusage_denom
	mem = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / rusage_denom
	return round(mem,0)


def log(s):
	global options
	print >> sys.stderr, s
	if options.logfile:
		l = open(options.logfile, 'a')
		l.write(s+'\n')
		l.close()


if __name__ == '__main__':
	usage = "This is STEP 5.2 of PorthoMcl.\n\nusage: %prog options\n"
	parser = OptionParser(usage)

	parser.add_option("-t", "--taxonlist", dest="taxonlistfile", help="A single column file containing the list of taxon to work with")
	parser.add_option("-x", "--index", dest="index", help="An integer number identifying which taxon to work on [1-size_of_taxon_list]" , type='int')

	parser.add_option('-b', '--inBestHitFolder', dest='inBestHitFolder', help='Input folder that contains Best Hit files (bh files)')

	parser.add_option('-o', '--outOrthologFolder', dest='outOrthologFolder', help='Folder that will store Orthologous pairs (ort files) in')
	parser.add_option("-l", "--logfile", dest="logfile", help="log file (optional, if not supplied STDERR will be used)")

	parser.add_option('', '--OverwiteOutput', dest='OverwiteOutput', help='If the output file exists, overwrite it. (default=process terminates)', default=False, action="store_true")
	parser.add_option('', '--KeepUnnormalizedScore', dest='KeepUnnormalizedScore', help='Write the un-normalize scores for ortholog pairs (default=False)', default=False, action="store_true")
	

	#
	
	(options, args) = parser.parse_args()


	if len(args) != 0 or not options.taxonlistfile or not options.inBestHitFolder or not options.index or not options.outOrthologFolder:
		parser.error("incorrect arguments.\n\t\tUse -h to get more information or refer to the MANUAL.md")


	#print best_hit

	log('{2} | Orthology | {0} | {1} | {3} | {4} MB | {5}'.format(1 , 'reading taxon list', options.index, '' ,memory_usage_resource(), datetime.now() ))

	taxon_list = readTaxonList(options.taxonlistfile)

	if options.index <= 0 or options.index > len(taxon_list):
		log('{2} | Orthology | {0} | {1} | {3} | {4} MB | {5}'.format('ERROR' , 'Error in index', options.index, '', memory_usage_resource(), datetime.now() ))
		exit()


	taxon1s = taxon_list[options.index - 1]

	# if not options.OverwiteOutput and os.path.exists(os.path.join(options.outputfolder, taxon1s + '.ort.tsv')):
	# 	exit(0)


	log('{2} | Orthology | {0} | {1} | {3} | {4} MB | {5}'.format(2 , 'Reading Best hit (bh file)', options.index, taxon1s , memory_usage_resource(), datetime.now() ))

	taxon1_filename =  os.path.join(options.inBestHitFolder , taxon1s + '.bh.tsv')
	taxon1_dic = readBestHit(taxon1s, taxon1_filename)

	# for taxon2s in sorted(taxon1_dic.keys()):
	# 	print taxon2s
	# 	for query_id in taxon1_dic[taxon2s]:
	# 		print '\t\t', query_id, ':   ',
	# 		for (s_id, ev_exp, ev_mnt) in taxon1_dic[taxon2s][query_id]:
	# 			print s_id, ',',
	# 		print 

	# exit()

	log('{2} | Orthology | {0} | {1} | {3} | {4} MB | {5}'.format(2.1 , 'Size: ' + str(len(taxon1_dic)) + ' taxons.', options.index, taxon1s , memory_usage_resource(), datetime.now() ))

	orthologs = []
	orthologs_index = 0

	taxon2_index = 0

	for taxon2s in sorted(taxon1_dic.keys()):

		if taxon1s < taxon2s:
			taxon2_filename =  os.path.join(options.inBestHitFolder , taxon2s + '.bh.tsv')
		
			taxon2_index += 1
			
			if not os.path.exists(taxon2_filename):
				log('{2} | Orthology | {0} | {1} | {3} | {4} MB | {5}'.format('3.' + str(taxon2_index) , 'ERROR: Targeted Best hit ('+taxon2s+'.bh.tsv) Does not exists', options.index, taxon1s , memory_usage_resource(), datetime.now() ))
				continue			

			taxon2_dic = readBestHit(taxon2s, taxon2_filename)

			log('{2} | Orthology | {0} | {1} | {3} | {4} MB | {5}'.format('3.' + str(taxon2_index) , 'Reading Targeted Best hit ('+taxon2s+'.bh.tsv)', options.index, taxon1s , memory_usage_resource(), datetime.now() ))
			
			taxon1_taxon2_score_sum = 0
			taxon1_taxon2_score_count = 0
			taxon1_taxon2_notfound_count = 0

			for seq1 in taxon1_dic[taxon2s]:

				for (seq2, evalue_exp1, evalue_mant1) in taxon1_dic[taxon2s][seq1]:

					try:
						for (seq0, evalue_exp2, evalue_mant2) in taxon2_dic[taxon1s][seq2]:

							unnormalized_score = None
							if evalue_mant1 < 0.01 or evalue_mant2 < 0.01:
								unnormalized_score = (evalue_exp1 + evalue_exp2) / -2 
							else:
								unnormalized_score = (math.log10(evalue_mant1 * evalue_mant2) + evalue_exp2 + evalue_exp1) / -2

							orthologs += [ [seq1, seq2, unnormalized_score, None] ]

							taxon1_taxon2_score_sum +=	unnormalized_score	
							taxon1_taxon2_score_count += 1
							
							if DEBUG:
								print '\t' , seq1 ,'(',len(taxon1_dic[taxon2s][seq1]),')', '<----->' , seq2 , str(unnormalized_score)

					except KeyError:
						taxon1_taxon2_notfound_count += 1
						if DEBUG:
							print '\t', seq1, '(',len(taxon1_dic[taxon2s][seq1]),')', '<----> NOTHING'
							

			log('{2} | Orthology | {0} | {1} | {3} | {4} MB | {5}'.format('4.' + str(taxon2_index) , 'Orthologs: ' + str( taxon1_taxon2_score_count) , options.index, taxon1s , memory_usage_resource(), datetime.now() ))
			log('{2} | Orthology | {0} | {1} | {3} | {4} MB | {5}'.format('4.' + str(taxon2_index), 'No hits: ' + str( taxon1_taxon2_notfound_count) , options.index, taxon1s , memory_usage_resource(), datetime.now() ))
			
			average = 1
			if taxon1_taxon2_score_count>0:
				average = taxon1_taxon2_score_sum/taxon1_taxon2_score_count

			if DEBUG:
				print  taxon1s,'<->',taxon2s, 'Count:',taxon1_taxon2_score_count ,'Average:' ,average
				print

			
			for i in xrange(taxon1_taxon2_score_count):
				if average>0:
					orthologs[orthologs_index+i][3] = orthologs[orthologs_index+i][2] / average 
				else:
					orthologs[orthologs_index+i][3] = 1

			orthologs_index += taxon1_taxon2_score_count

	out_f = open (os.path.join(options.outOrthologFolder , taxon1s + '.ort.tsv'), 'w')
	if options.KeepUnnormalizedScore:
		out_f.write('query_id\tsubject_id\tunnormalized_score\tnormalized_score\n')
	for ortholog in orthologs:
		out_f.write(ortholog[0] + '\t')
		out_f.write(ortholog[1] + '\t')
		if options.KeepUnnormalizedScore:
			out_f.write(str(ortholog[2]) +'\t')
		else:
			ortholog[3] = int( ortholog[3] * 1000 + .5) / 1000.0 
		out_f.write(str(ortholog[3]) + '\n')

	out_f.close()
	log('{2} | Orthology | {0} | {1} | {3} | {4} MB | {5}'.format(5, 'Finished' , options.index, taxon1s , memory_usage_resource(), datetime.now() ))