-
Notifications
You must be signed in to change notification settings - Fork 13
/
Copy pathniid_upload.py
154 lines (148 loc) · 5.96 KB
/
niid_upload.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
import os, re, time, datetime, csv, sys, json
from upload import upload
from rethinkdb import r
from Bio import SeqIO
import argparse
import subprocess
import unicodedata
from parse import parse
from upload import parser
sys.path.append('') # need to import from base
from base.rethink_io import rethink_io
from vdb.flu_upload import flu_upload
import logging
# logger = logging.getLogger()
# Extend the shared argument parser imported from `upload` with an
# --assay_type option (default 'hi'); the value is forwarded to read_niid()
# and written into the assay_type column of the generated tsv.
parser.add_argument('--assay_type', default='hi')
def read_niid(path, fstem, subtype, assay_type):
    '''
    Convert xls tables to csv tables, then parse to flat tsv

    Looks for ``<path>/<fstem>`` with one of the extensions ``.xls``,
    ``.xlsm`` or ``.xlsx``.  If several exist, the last one in that order is
    used.  The workbook is converted to ``data/tmp/<fstem>.csv`` and that csv
    is then parsed into a flat tsv by parse_niid_matrix_to_tsv().

    If no workbook is found a warning is printed and nothing is converted
    (previously this case was completely silent).

    path       -- directory containing the workbook
    fstem      -- workbook file name without extension
    subtype    -- subtype label passed through to parse_niid_matrix_to_tsv()
    assay_type -- assay type label written to every tsv row
    '''
    extensions = ['.xls', '.xlsm', '.xlsx']
    real_file = ''
    ind = ''
    for ext in extensions:
        candidate = path + '/' + fstem + ext
        if os.path.isfile(candidate):
            # No break: a later extension deliberately overrides an earlier one
            real_file = candidate
            ind = ext
    if real_file == '':
        print("WARNING: no workbook found for " + path + '/' + fstem
              + " (tried " + ", ".join(extensions) + "); nothing converted")
        return
    print("real_file: " + real_file)
    convert_xls_to_csv(path, fstem, ind)
    fname = "data/tmp/{}.csv".format(fstem)
    parse_niid_matrix_to_tsv(fname, path, subtype, assay_type)
def convert_xls_to_csv(path, fstem, ind):
    '''
    Dump one sheet of the workbook ``<path>/<fstem><ind>`` to
    ``data/tmp/<fstem>.csv``.

    path  -- directory containing the workbook
    fstem -- workbook file name without extension (also names the csv)
    ind   -- workbook extension including the leading dot, e.g. '.xlsx'

    NOTE(review): indentation was lost in the reviewed copy of this file;
    this assumes the ``return`` sits inside the sheet loop, i.e. only the
    first sheet is exported -- confirm against the original.
    '''
    import xlrd
    wb_name = path + '/' + fstem + ind
    workbook = xlrd.open_workbook(filename=wb_name, encoding_override="cp1252")
    # The output directory is hard-coded; make sure it exists before writing
    os.makedirs('data/tmp', exist_ok=True)
    for sheet in workbook.sheets():
        rows = [
            [sheet.cell_value(i, j) for j in range(sheet.ncols)]
            for i in range(sheet.nrows)
        ]
        # newline='' lets the csv module control row endings; without it
        # Windows writes '\r\r\n' and the csv reads back with empty rows
        with open('data/tmp/%s.csv'%(fstem), 'w', newline='') as f:
            csv.writer(f).writerows(rows)
        return
# Layout of the titer matrix per subtype, as 0-based indices into the csv
# grid: (serum_id_row_index, start_row, virus_id_col_index, start_col).
_NIID_MATRIX_LAYOUT = {
    "h3n2": (5, 7, 1, 4),
    "h1n1pdm": (5, 6, 1, 4),
    "vic": (4, 5, 1, 4),
    "yam": (3, 5, 1, 4),
}

# Prefix ("A" or "B") prepended to serum strain names that lack it.
_NIID_FLUTYPE = {"h3n2": "A", "h1n1pdm": "A", "vic": "B", "yam": "B"}


def _parse_niid_serum_header(raw_serum_id, flutype):
    '''
    Split one serum header cell into (serum_strain, serum_id, serum_passage).
    '''
    # Header cells are wrapped over several lines; drop all line breaks/spaces
    serum_id = re.sub(r'[\r\n ]+', '', raw_serum_id)
    # The strain name is everything before the passage/reassortant marker,
    # or, failing that, before the "No." serum number
    m = re.search(r'^(\S+)(egg|cell|siat|hck|nib121|ivr|\(bvr)', serum_id, re.IGNORECASE)
    if m is None:
        m = re.search(r'^(\S+)(no\.)', serum_id, re.IGNORECASE)
    serum_strain = ""
    if m:
        serum_strain = m.group(1)
        # NOTE(review): assumes the prefixing was nested under `if m:` in the
        # original (indentation was lost), so an unparsed strain stays empty
        if not serum_strain.startswith(flutype + "/"):
            serum_strain = flutype + "/" + serum_strain
    serum_passage = "unknown"
    m = re.search(r'(egg)', serum_id, re.IGNORECASE)
    if m:
        serum_passage = m.group(1)
    # Checked second on purpose: a cell marker overrides an egg marker
    m = re.search(r'(cell|siat|hck)', serum_id, re.IGNORECASE)
    if m:
        serum_passage = m.group(1)
    return serum_strain, serum_id, serum_passage


def parse_niid_matrix_to_tsv(fname, original_path, subtype, assay_type):
    '''
    Flatten a titer matrix csv into ``data/tmp/<name>.tsv`` with one row per
    (virus, serum) cell.

    fname         -- path of the csv to read; its basename minus the last
                     four characters (the '.csv') names the tsv
    original_path -- unused; kept so existing callers keep working
    subtype       -- one of 'h3n2', 'h1n1pdm', 'vic', 'yam' (any case)
    assay_type    -- value written to the assay_type column of every row

    Raises ValueError for an unsupported subtype, before any file is touched.
    '''
    # Fix: the original assigned the lower-cased value to a misspelt, unused
    # name, so mixed-case subtypes were never normalised
    subtype = subtype.lower()
    if subtype not in _NIID_MATRIX_LAYOUT:
        raise ValueError(
            "unsupported subtype %r, expected one of: %s"
            % (subtype, ", ".join(sorted(_NIID_MATRIX_LAYOUT))))
    flutype = _NIID_FLUTYPE[subtype]
    serum_id_row_index, start_row, virus_id_col_index, start_col = _NIID_MATRIX_LAYOUT[subtype]

    src_id = fname.split('/')[-1]
    source = "niid_%s"%(src_id)
    with open(fname) as infile:
        mat = list(csv.reader(infile))

    header = ["virus_strain", "serum_strain","serum_id", "titer", "source", "virus_passage", "virus_passage_category", "serum_passage", "serum_passage_category", "assay_type"]
    with open('data/tmp/%s.tsv'%(src_id[:-4]), 'w') as outfile:
        outfile.write("%s\n" % ("\t".join(header)))
        if len(mat) <= start_row:
            # No data rows: leave a header-only tsv
            return
        # Serum columns do not depend on the virus row, so parse them once
        serum_columns = [
            (j,) + _parse_niid_serum_header(mat[serum_id_row_index][j], flutype)
            for j in range(start_col, len(mat[0]))
        ]
        if not serum_columns:
            return
        virus_passage_category = ''
        serum_passage_category = ''
        for i in range(start_row, len(mat)):
            virus_strain = mat[i][virus_id_col_index]
            virus_passage = mat[i][2]
            for j, serum_strain, serum_id, serum_passage in serum_columns:
                # Normalize U+ff1c '<' to U+003c '<'
                titer = unicodedata.normalize('NFKC', mat[i][j])
                # Allow either "< 10" or "<10"
                titer = re.sub(r'< ', '<', titer)
                line = "%s\n" % ("\t".join([ virus_strain, serum_strain, serum_id, titer, source, virus_passage, virus_passage_category, serum_passage, serum_passage_category, assay_type]))
                outfile.write(line)
def determine_subtype(original_path):
    '''
    Derive the subtype label from the directory names in original_path.

    The path is lower-cased and split on '/'; the first entry of the table
    below whose directory name equals one of the components wins.  Returns
    "UnknownSubtype" when none of them is present.
    '''
    components = original_path.lower().split('/')
    # (directory name, subtype label), in priority order
    directory_to_subtype = (
        ('h3n2', 'h3n2'),
        ('h1n1pdm', 'h1n1pdm'),
        ('victoria', 'vic'),
        ('yamagata', 'yam'),
    )
    for directory, label in directory_to_subtype:
        if directory in components:
            return label
    return "UnknownSubtype"
if __name__=="__main__":
    # Script entry point: convert the NIID workbook named by --path/--fstem
    # into data/tmp/<fstem>.tsv, then hand that tsv to tdb/elife_upload.py.
    import shlex  # local import; only used to echo the command line

    args = parser.parse_args()
    if args.path is None:
        args.path = "data/"
    if args.database is None:
        args.database = "niid_tdb"
    if not os.path.isdir(args.path):
        os.makedirs(args.path)
    subtype = determine_subtype(args.path)
    read_niid(args.path, args.fstem, subtype, args.assay_type)

    # Pass the arguments as a list and skip the shell entirely: parentheses,
    # spaces or other shell metacharacters in fstem/database then need no
    # escaping and cannot be interpreted as shell syntax.
    command = [
        "python", "tdb/elife_upload.py",
        "-db", args.database,
        "--subtype", subtype,
        "--path", "data/tmp/",
        "--fstem", args.fstem,
    ]
    if args.preview:
        command.append("--preview")
    # Echo a shell-quoted form so the command can be copied and re-run by hand
    print(" ".join(shlex.quote(part) for part in command))
    subprocess.call(command)