Skip to content

Commit 15fcceb

Browse files
authored
Merge pull request #2056 from antgonza/fix-1913
fix #1913
2 parents 131dd6a + dfe2e83 commit 15fcceb

File tree

4 files changed

+89
-23
lines changed

4 files changed

+89
-23
lines changed

qiita_db/metadata_template/test/test_util.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -359,7 +359,8 @@ def test_get_pgsql_reserved_words(self):
359359
"has_physical_specimen\thost_subject_id\tint_column\tlatitude\tlongitude\t"
360360
"physical_location\trequired_sample_info_status\tsample_type\t"
361361
"str_column\n"
362-
"2.Sample1 \t05/29/2014 12:24:51\tTest Sample 1\tTrue\tTrue\t"
362+
"2.Sample1 \t05/29/2014 12:24:51\tTest Sample 1\t"
363+
'"True\t"\t"\nTrue"\t'
363364
"NotIdentified\t1\t42.42\t41.41\tlocation1\treceived\ttype1\t"
364365
"Value for sample 1\n"
365366
"2.Sample2 \t05/29/2014 12:24:51\t"

qiita_db/metadata_template/util.py

Lines changed: 27 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -60,16 +60,13 @@ def prefix_sample_names_with_id(md_template, study_id):
6060
md_template.index.name = None
6161

6262

63-
def load_template_to_dataframe(fn, strip_whitespace=True, index='sample_name'):
63+
def load_template_to_dataframe(fn, index='sample_name'):
6464
"""Load a sample/prep template or a QIIME mapping file into a data frame
6565
6666
Parameters
6767
----------
6868
fn : str or file-like object
6969
filename of the template to load, or an already open template file
70-
strip_whitespace : bool, optional
71-
Defaults to True. Whether or not to strip whitespace from values in the
72-
input file
7370
index : str, optional
7471
Defaults to 'sample_name'. The index to use in the loaded information
7572
@@ -110,19 +107,6 @@ def load_template_to_dataframe(fn, strip_whitespace=True, index='sample_name'):
110107
if not holdfile:
111108
raise ValueError('Empty file passed!')
112109

113-
# Strip all values in the cells in the input file, if requested
114-
if strip_whitespace:
115-
for pos, line in enumerate(holdfile):
116-
holdfile[pos] = '\t'.join(d.strip(" \r\x0b\x0c")
117-
for d in line.split('\t'))
118-
119-
# get and clean the controlled columns
120-
cols = holdfile[0].split('\t')
121-
controlled_cols = {'sample_name'}
122-
controlled_cols.update(qdb.metadata_template.constants.CONTROLLED_COLS)
123-
holdfile[0] = '\t'.join(c.lower() if c.lower() in controlled_cols else c
124-
for c in cols)
125-
126110
if index == "#SampleID":
127111
# We're going to parse a QIIME mapping file. We are going to first
128112
# parse it with the QIIME function so we can remove the comments
@@ -133,11 +117,29 @@ def load_template_to_dataframe(fn, strip_whitespace=True, index='sample_name'):
133117
# The QIIME parser fixes the index and removes the #
134118
index = 'SampleID'
135119

136-
# Check that we don't have duplicate columns
137-
col_names = [c.lower() for c in holdfile[0].strip().split('\t')]
138-
if len(set(col_names)) != len(col_names):
139-
raise qdb.exceptions.QiitaDBDuplicateHeaderError(
140-
find_duplicates(col_names))
120+
# Strip all values in the cells in the input file
121+
for pos, line in enumerate(holdfile):
122+
cols = line.split('\t')
123+
if pos == 0 and index != 'SampleID':
124+
# get and clean the controlled columns
125+
ccols = {'sample_name'}
126+
ccols.update(qdb.metadata_template.constants.CONTROLLED_COLS)
127+
newcols = [
128+
c.lower().strip() if c.lower().strip() in ccols
129+
else c.strip()
130+
for c in cols]
131+
132+
# while we are here, let's check for duplicate column headers
133+
if len(set(newcols)) != len(newcols):
134+
raise qdb.exceptions.QiitaDBDuplicateHeaderError(
135+
find_duplicates(newcols))
136+
else:
137+
# .strip will remove odd chars, newlines, tabs and multiple
138+
# spaces but we need to read a new line at the end of the
139+
# line(+'\n')
140+
newcols = [d.strip(" \r\x0b\x0c\n") for d in cols]
141+
142+
holdfile[pos] = '\t'.join(newcols) + '\n'
141143

142144
# index_col:
143145
# is set as False, otherwise it is cast as a float and we want a string
@@ -158,6 +160,9 @@ def load_template_to_dataframe(fn, strip_whitespace=True, index='sample_name'):
158160
index_col=False,
159161
comment='\t',
160162
converters={index: lambda x: str(x).strip()})
163+
# remove newlines and tabs from fields
164+
template.replace(to_replace='[\t\n\r\x0b\x0c]+', value='',
165+
regex=True, inplace=True)
161166
except UnicodeDecodeError:
162167
# Find row number and col number for utf-8 encoding errors
163168
headers = holdfile[0].strip().split('\t')

qiita_db/support_files/patches/48.sql

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
-- Jan 20, 2017
2+
-- see py file
3+
4+
SELECT 1;
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
# replacing all \t and \n with a space, as those chars break QIIME
2+
3+
from qiita_db.study import Study
4+
from qiita_db.sql_connection import TRN
5+
6+
7+
def searcher(df):
    """Find the columns of a data frame that hold a tab or a newline

    Parameters
    ----------
    df : pandas.DataFrame
        The data frame to inspect

    Returns
    -------
    list
        The labels of the columns, in the order they appear in `df`, that
        have at least one value containing a tab or a newline character
    """
    search = r"\t|\n"

    found = []
    for col in df:
        try:
            hit = df[col].str.contains(search, na=False, regex=True).any()
        except AttributeError:
            # pandas only exposes the .str accessor on string-like columns;
            # any other column cannot hold a tab or a newline, so there is
            # nothing to report for it
            continue
        if hit:
            found.append(col)

    return found
12+
13+
14+
# collect the studies whose status is private, public or sandbox; the
# statuses are queried and merged in exactly that order
studies = Study.get_by_status('private')
for status in ('public', 'sandbox'):
    studies = studies.union(Study.get_by_status(status))
16+
17+
# Look for the offending values with pandas, which is much easier and faster
# than doing it in pgsql; keep in mind that to_dataframe actually transforms
# what is stored in the db
to_fix = []
for study in studies:
    sample_template = study.sample_template
    if sample_template is None:
        # nothing else is inspected for a study without a sample template
        continue

    bad_cols = searcher(sample_template.to_dataframe())
    if bad_cols:
        to_fix.append((sample_template, bad_cols))

    # the prep templates of the study are checked after its sample template
    for prep_template in study.prep_templates():
        if prep_template is None:
            continue

        bad_cols = searcher(prep_template.to_dataframe())
        if bad_cols:
            to_fix.append((prep_template, bad_cols))
35+
36+
37+
# Now fix the database and regenerate the files. Both statements are
# str.format templates taking: {0} the table prefix of the template, {1} the
# template id and {2} the column to clean
tabs_sql = """UPDATE qiita.{0}{1}
SET {2} = replace({2}, chr(9), ' ')"""
enters_sql = """UPDATE qiita.{0}{1}
SET {2} = regexp_replace(
{2}, E'[\\n\\r\\u2028]+', ' ', 'g' )"""

for infofile, cols in to_fix:
    with TRN:
        for col in cols:
            # every tab (chr(9)) is replaced by a space
            TRN.add(tabs_sql.format(
                infofile._table_prefix, infofile.id, col))

            # every run of enters (\n, \r, U+2028) is replaced by a space
            TRN.add(enters_sql.format(
                infofile._table_prefix, infofile.id, col))

        # all the updates queued for this template are run together
        TRN.execute()

    infofile.generate_files()

0 commit comments

Comments
 (0)