-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathrnaSeqDump.py
executable file
·189 lines (153 loc) · 7.82 KB
/
rnaSeqDump.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from collections import defaultdict
import requests
from argparse import ArgumentParser
import logging
import re
import json
from time import sleep
logger = logging.getLogger()
logger.setLevel(logging.INFO)
ch = logging.StreamHandler()
ch.setFormatter(logging.Formatter('%(levelname)s - %(asctime)s - %(message)s'))
logger.addHandler(ch)
class RnaSeqParams(object):
def __init__(self, args, Session):
self.args = args
self.Session = Session
self.organismList = self.getOrganismList()
self.experimentNodes = self.getExperimentNodes()
def _parseTree(self, json, array):
if len(json['children']) == 0:
array.append(json['data']['term'])
else:
for child in json['children']:
self._parseTree(child, array)
def getOrganismList(self):
logger.info("Retrieving organism list")
url = ('{0}/service/record-types/transcript/searches/GenesByTaxon'.format(self.Session.baseUrl))
res = self.Session.session.get(url, verify=True)
sleep(0.5)
j = self.Session.getDataResponse(res, url)
organismArray = []
for parameter in j['searchData']['parameters']:
if parameter['displayName'] == 'Organism':
self._parseTree(parameter['vocabulary'], organismArray)
return organismArray
def getExperimentNodes(self):
logger.info("Retrieving experiments and nodes")
url = ('{0}/service/record-types/transcript'.format(self.Session.baseUrl))
res = self.Session.session.get(url, verify=True)
sleep(0.5)
j = self.Session.getDataResponse(res, url)
datasetNodes = defaultdict(list)
for key in j['attributes']:
if 'help' in key.keys():
# We could use a better way than this to figure out if a dataset is RNAseq...
# Needs to be done in web services
if re.match(r'^Transcript', key['help']):
dataset = re.search('\[.*?\]', key['help']).group(0)
dataset = dataset.split(':')[1].replace(']','').lstrip()
datasetNodes[dataset].append(key['name'])
return datasetNodes
class RnaSeqDumper(object):
def __init__(self, Session, RnaSeqParams, outputDir):
self.Session = Session
self.RnaSeqParams = RnaSeqParams
self.outputDir = outputDir
organismList = ['"'+ organism + '"' for organism in self.RnaSeqParams.organismList]
organismList = '[{0}]'.format(','.join(organismList))
for experiment, sampleList in self.RnaSeqParams.experimentNodes.items():
jsonPayLoad = self._buildPayLoad(sampleList, organismList)
self._writeData(experiment, jsonPayLoad)
def _buildPayLoad(self, sampleList, organismList):
payLoad = {'searchConfig':{'parameters':{'organism': organismList}}}
payLoad['searchConfig']['wdkWeight'] = 10
sampleList = ['primary_key'] + sampleList
payLoad['reportConfig'] = {'attributes' : sampleList, 'includeHeader' : 'true', 'attachmentType' : 'plain', 'applyFilter': 'false'}
jsonPayLoad = json.dumps(payLoad)
return jsonPayLoad
def _getData(self, jsonPayLoad, experiment):
logger.info("Starting next experiment:")
logger.info('Attempting to retrieve RNAseq data for experiment \"{0}\"'.format(experiment))
url = ('{0}/service/record-types/transcript/searches/GenesByTaxon/reports/attributesTabular'.format(self.Session.baseUrl))
logger.info("Sending a POST request to {0}".format(url))
logger.info("JSON payload:\t{0}".format(jsonPayLoad))
res = self.Session.session.post(url, jsonPayLoad, headers={'Content-Type': 'application/json'}, stream=True)
data = self.Session.getDataResponse(res, url, dataType="text")
return data
def _writeData(self, experiment, payLoad):
data = self._getData(payLoad, experiment)
fileName = experiment.replace(' ', '_').replace('/', '-')
fileName = '{0}/{1}.txt'.format(self.outputDir, fileName)
logger.info('Writing data from experiment \"{0}\" to file {1}\n\n'.format(experiment, fileName))
try:
outFile = open(fileName, 'w')
for line in data.iter_lines():
line = line.decode('utf-8').rstrip()
data = line.split('\t')
# At the moment, we cannot determine from the web services which organism an experiment belongs to
# So this retrieves the data for all organisms, and then discards rows where all the values are N/A
# We should fix this in web services
if all(elem == 'N/A' for elem in data[1:len(data)]):
next
else:
outFile.write(line + '\n')
except FileNotFoundError as e:
logger.error('Cannot open file {0} for writing\n\n{1}'.format(fileName, e))
raise SystemExit()
outFile.close()
class Session(object):
def __init__(self, project):
self.webAppMapper = {'amoebadb': 'amoeba', 'cryptodb': 'cryptodb', 'fungidb': 'fungidb', 'giardiadb': 'giardiadb', \
'hostdb': 'hostdb', 'microsporidiadb': 'micro', 'piroplasmadb': 'piro', 'plasmodb': 'plasmo', \
'toxodb': 'toxo', 'tritrypdb': 'tritrypdb', 'trichdb': 'trichdb', 'vectorbase': 'vectorbase'}
self.baseUrl = self._getBaseUrl(project)
self.session = self._getSession(project)
def _getBaseUrl(self, project):
webApp = self._getWebApp(project)
baseUrl = "https://{0}.org/{1}".format(project, webApp)
return baseUrl
def _getSession(self, project):
logger.info("Attempting to connect to {0}".format(self.baseUrl))
try:
s = requests.session()
s.get(self.baseUrl)
except requests.exceptions.ConnectionError as e:
logger.error("Cannot connect to {0}. Please check the project name '{1}' is correct and try again.\n\n{2}".format(self.baserUrl, project, e))
raise SystemExit()
logger.info("Connection succeeded")
return s
def _getWebApp (self, project):
if project.lower() in self.webAppMapper:
return self.webAppMapper[project.lower()]
else:
logger.error("Cannot find webapp for {0}. Please check that {0} is a valid VEuPathDB project\n".format(project))
raise SystemExit()
def getDataResponse(self, res, url, dataType='json'):
if (res.ok):
if dataType == 'json':
d = res.json()
elif dataType == 'text':
d = res
else:
logger.error("Data type {0} is not recognised as a download type".format(dataType))
raise SystemExit()
else:
try:
res.raise_for_status()
except requests.exceptions.HTTPError as e:
logger.error("Cannot retrieve file from url: {0}. Please check the URL is correct. In case of an outage at VEuPathDB, please try again later.\n\n{1}".format(url, e))
raise SystemExit()
logger.info("Data successfully retrieved")
return d
if __name__ == '__main__':
parser = ArgumentParser()
parser.add_argument('--project', required=True, help='VEuPathDB project from which you wish to download RNA sequence data, e.g., PlasmoDB. For downloads from multiple projects, use a comma separated list, e.g, CryptoDB,ToxoDB')
parser.add_argument('--outputDir', required=True, help='Directory for output files')
args = parser.parse_args()
for project in args.project.split(','):
session = Session(project)
rnaSeqParams = RnaSeqParams(args, session)
RnaSeqDumper(session, rnaSeqParams, args.outputDir)