@@ -60,16 +60,13 @@ def prefix_sample_names_with_id(md_template, study_id):
60
60
md_template .index .name = None
61
61
62
62
63
- def load_template_to_dataframe (fn , strip_whitespace = True , index = 'sample_name' ):
63
+ def load_template_to_dataframe (fn , index = 'sample_name' ):
64
64
"""Load a sample/prep template or a QIIME mapping file into a data frame
65
65
66
66
Parameters
67
67
----------
68
68
fn : str or file-like object
69
69
filename of the template to load, or an already open template file
70
- strip_whitespace : bool, optional
71
- Defaults to True. Whether or not to strip whitespace from values in the
72
- input file
73
70
index : str, optional
74
71
Defaults to 'sample_name'. The index to use in the loaded information
75
72
@@ -110,19 +107,6 @@ def load_template_to_dataframe(fn, strip_whitespace=True, index='sample_name'):
110
107
if not holdfile :
111
108
raise ValueError ('Empty file passed!' )
112
109
113
- # Strip all values in the cells in the input file, if requested
114
- if strip_whitespace :
115
- for pos , line in enumerate (holdfile ):
116
- holdfile [pos ] = '\t ' .join (d .strip (" \r \x0b \x0c " )
117
- for d in line .split ('\t ' ))
118
-
119
- # get and clean the controlled columns
120
- cols = holdfile [0 ].split ('\t ' )
121
- controlled_cols = {'sample_name' }
122
- controlled_cols .update (qdb .metadata_template .constants .CONTROLLED_COLS )
123
- holdfile [0 ] = '\t ' .join (c .lower () if c .lower () in controlled_cols else c
124
- for c in cols )
125
-
126
110
if index == "#SampleID" :
127
111
# We're going to parse a QIIME mapping file. We are going to first
128
112
# parse it with the QIIME function so we can remove the comments
@@ -133,11 +117,29 @@ def load_template_to_dataframe(fn, strip_whitespace=True, index='sample_name'):
133
117
# The QIIME parser fixes the index and removes the #
134
118
index = 'SampleID'
135
119
136
- # Check that we don't have duplicate columns
137
- col_names = [c .lower () for c in holdfile [0 ].strip ().split ('\t ' )]
138
- if len (set (col_names )) != len (col_names ):
139
- raise qdb .exceptions .QiitaDBDuplicateHeaderError (
140
- find_duplicates (col_names ))
120
+ # Strip all values in the cells in the input file
121
+ for pos , line in enumerate (holdfile ):
122
+ cols = line .split ('\t ' )
123
+ if pos == 0 and index != 'SampleID' :
124
+ # get and clean the controlled columns
125
+ ccols = {'sample_name' }
126
+ ccols .update (qdb .metadata_template .constants .CONTROLLED_COLS )
127
+ newcols = [
128
+ c .lower ().strip () if c .lower ().strip () in ccols
129
+ else c .strip ()
130
+ for c in cols ]
131
+
132
+ # while we are here, let's check for duplicate column headers
133
+ if len (set (newcols )) != len (newcols ):
134
+ raise qdb .exceptions .QiitaDBDuplicateHeaderError (
135
+ find_duplicates (newcols ))
136
+ else :
137
+ # strip() drops surrounding spaces, \r, \x0b, \x0c and \n from
138
+ # each cell, including the end-of-line newline, so the newline
139
+ # is re-added when the line is rebuilt below (+'\n')
140
+ newcols = [d .strip (" \r \x0b \x0c \n " ) for d in cols ]
141
+
142
+ holdfile [pos ] = '\t ' .join (newcols ) + '\n '
141
143
142
144
# index_col:
143
145
# is set as False, otherwise it is cast as a float and we want a string
@@ -158,6 +160,9 @@ def load_template_to_dataframe(fn, strip_whitespace=True, index='sample_name'):
158
160
index_col = False ,
159
161
comment = '\t ' ,
160
162
converters = {index : lambda x : str (x ).strip ()})
163
+ # remove tabs, newlines and other control whitespace (\r, \x0b, \x0c) from fields
164
+ template .replace (to_replace = '[\t \n \r \x0b \x0c ]+' , value = '' ,
165
+ regex = True , inplace = True )
161
166
except UnicodeDecodeError :
162
167
# Find row number and col number for utf-8 encoding errors
163
168
headers = holdfile [0 ].strip ().split ('\t ' )
0 commit comments