-
Notifications
You must be signed in to change notification settings - Fork 20
/
Copy pathTrapnell_wrapper.py
71 lines (61 loc) · 3.14 KB
/
Trapnell_wrapper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import os
import getopt
import sys
import numpy as np
# Parse the options from command line
try:
opts, args = getopt.getopt(sys.argv[1:],"i:n:k:t:",["idir=","njobs=","hacked-kallisto-path=","reference-transcriptome="])
except getopt.GetoptError:
print ("getopterrror")
print ('usage is : \n python Trapnell_wrapper.py -i input_SRA_dir -k path-to-hacked-kallisto -t path-to-human-reference-transcriptome [-n number-of-processes-to-use]')
sys.exit(1)
# The four options used in this pipeline are
SRA_dir=''
num_proc=1
kallipso_path=''
ref_transcriptome=''
for opt,arg in opts:
if opt in ("-i", "--idir"):
SRA_dir=arg
elif opt in ("-n","--njobs"):
num_proc=int(arg)
elif opt in ("-k","--hacked-kallisto-path"):
kallipso_path=arg
elif opt in ("-t","--reference-transcriptome"):
ref_transcriptome=arg
# Error message if function not called correctly
if (not SRA_dir) or (not kallipso_path) or (not ref_transcriptome):
print ('usage is : \n python Trapnell_wrapper.py -i input_SRA_dir -k path-to-hacked-kallisto -t path-to-human-reference-transcriptome [-n number-of-processes-to-use]')
sys.exit(1)
# Execute process_SRA, which converts SRA to fastq
print('Extracting reads from SRAs...')
os.system('mkdir -p ./reads/')
os.system('rm -f ./reads/*')
os.system('python process_SRA.py -i '+SRA_dir+' -o ./reads/ -n '+str(num_proc))
# This removes files that Trapnell didn't use. (Specific for analysis of this dataset)
print('Removing files that Trapnell throws...')
relevant_fls=np.loadtxt('Files_to_keep.txt',dtype=str)
fls_to_remove=[cells for cells in os.listdir('./reads/') if cells.split('_')[0] not in relevant_fls ]
for fl in fls_to_remove:
os.system('rm '+'./reads/'+fl)
# Use kallisto to build index. should return same results as hacked kallisto
print('Generating the Kallisto index (with hacked kallisto)...')
os.system('mkdir -p ./kallisto_index')
os.system('rm -f ./kallisto_index/*')
index_path='./kallisto_index/Trapnell_index.idx'
os.system(kallipso_path+' index -i '+index_path+' '+ref_transcriptome)
metadata_cmd=kallipso_path+' metadata '+index_path
os.system(metadata_cmd)
num_ec = sum(1 for line in open('./kallisto_index/Trapnell_index.idx_ecmap.txt'))
print(num_ec)
# Use hacked kallisto to generate TCCs (output is .class file that contains counts of ECs)
print('Generating TCC (with hacked kallisto)...')
os.system('mkdir -p ./transcript_compatibility_counts/')
os.system('rm -f ./transcript_compatibility_counts/*')
os.system('python get_pseudoalignments_paired_end.py -i ./reads/ -o ./transcript_compatibility_counts/ -k '+kallipso_path+ ' -t '+ index_path +' -n '+ str(num_proc))
# Process the .class file into a matrix. normalize such that each row sums to 1
print('Generating TCC distribution...')
os.system('python get_tcc_dist.py -i ./transcript_compatibility_counts/ -m '+str(num_ec)+' -t ./Trapnell_TCC.dat -d ./Trapnell_TCC_distribution.dat')
# Process the TCC distribution matrix to get pairwise distributions
print('Generating pairwise distances...')
os.system('python get_pairwise_distances.py ./Trapnell_TCC_distribution.dat ./Trapnell_TCC_pairwise_distance.dat '+str(num_proc))