Merge pull request #2056 from antgonza/fix-1913

josenavas · web-flow · commit 15fcceb12770 · 2017-01-24T07:39:50.000-08:00
fix #1913
diff --git a/qiita_db/metadata_template/test/test_util.py b/qiita_db/metadata_template/test/test_util.py
@@ -359,7 +359,8 @@ def test_get_pgsql_reserved_words(self):
     "has_physical_specimen\thost_subject_id\tint_column\tlatitude\tlongitude\t"
     "physical_location\trequired_sample_info_status\tsample_type\t"
     "str_column\n"
-    "2.Sample1         \t05/29/2014 12:24:51\tTest Sample 1\tTrue\tTrue\t"
+    "2.Sample1         \t05/29/2014 12:24:51\tTest Sample 1\t"
+    '"True\t"\t"\nTrue"\t'
     "NotIdentified\t1\t42.42\t41.41\tlocation1\treceived\ttype1\t"
     "Value for sample 1\n"
     "2.Sample2  \t05/29/2014 12:24:51\t"
diff --git a/qiita_db/metadata_template/util.py b/qiita_db/metadata_template/util.py
@@ -60,16 +60,13 @@ def prefix_sample_names_with_id(md_template, study_id):
         md_template.index.name = None
 
 
-def load_template_to_dataframe(fn, strip_whitespace=True, index='sample_name'):
+def load_template_to_dataframe(fn, index='sample_name'):
     """Load a sample/prep template or a QIIME mapping file into a data frame
 
     Parameters
     ----------
     fn : str or file-like object
         filename of the template to load, or an already open template file
-    strip_whitespace : bool, optional
-        Defaults to True. Whether or not to strip whitespace from values in the
-        input file
     index : str, optional
         Defaults to 'sample_name'. The index to use in the loaded information
 
@@ -110,19 +107,6 @@ def load_template_to_dataframe(fn, strip_whitespace=True, index='sample_name'):
     if not holdfile:
         raise ValueError('Empty file passed!')
 
-    # Strip all values in the cells in the input file, if requested
-    if strip_whitespace:
-        for pos, line in enumerate(holdfile):
-            holdfile[pos] = '\t'.join(d.strip(" \r\x0b\x0c")
-                                      for d in line.split('\t'))
-
-    # get and clean the controlled columns
-    cols = holdfile[0].split('\t')
-    controlled_cols = {'sample_name'}
-    controlled_cols.update(qdb.metadata_template.constants.CONTROLLED_COLS)
-    holdfile[0] = '\t'.join(c.lower() if c.lower() in controlled_cols else c
-                            for c in cols)
-
     if index == "#SampleID":
         # We're going to parse a QIIME mapping file. We are going to first
         # parse it with the QIIME function so we can remove the comments
@@ -133,11 +117,29 @@ def load_template_to_dataframe(fn, strip_whitespace=True, index='sample_name'):
         # The QIIME parser fixes the index and removes the #
         index = 'SampleID'
 
-    # Check that we don't have duplicate columns
-    col_names = [c.lower() for c in holdfile[0].strip().split('\t')]
-    if len(set(col_names)) != len(col_names):
-        raise qdb.exceptions.QiitaDBDuplicateHeaderError(
-            find_duplicates(col_names))
+    # Strip all values in the cells in the input file
+    for pos, line in enumerate(holdfile):
+        cols = line.split('\t')
+        if pos == 0 and index != 'SampleID':
+            # get and clean the controlled columns
+            ccols = {'sample_name'}
+            ccols.update(qdb.metadata_template.constants.CONTROLLED_COLS)
+            newcols = [
+                c.lower().strip() if c.lower().strip() in ccols
+                else c.strip()
+                for c in cols]
+
+            # while we are here, let's check for duplicate column headers
+            if len(set(newcols)) != len(newcols):
+                raise qdb.exceptions.QiitaDBDuplicateHeaderError(
+                    find_duplicates(newcols))
+        else:
+            # .strip removes surrounding spaces, newlines and other odd
+            # whitespace chars from each cell, so we need to re-add the
+            # newline at the end of the line (+ '\n' below)
+            newcols = [d.strip(" \r\x0b\x0c\n") for d in cols]
+
+        holdfile[pos] = '\t'.join(newcols) + '\n'
 
     # index_col:
     #   is set as False, otherwise it is cast as a float and we want a string
@@ -158,6 +160,9 @@ def load_template_to_dataframe(fn, strip_whitespace=True, index='sample_name'):
             index_col=False,
             comment='\t',
             converters={index: lambda x: str(x).strip()})
+        # remove newlines and tabs from fields
+        template.replace(to_replace='[\t\n\r\x0b\x0c]+', value='',
+                         regex=True, inplace=True)
     except UnicodeDecodeError:
         # Find row number and col number for utf-8 encoding errors
         headers = holdfile[0].strip().split('\t')
diff --git a/qiita_db/support_files/patches/48.sql b/qiita_db/support_files/patches/48.sql
@@ -0,0 +1,4 @@
+-- Jan 20, 2017
+-- see py file
+
+SELECT 1;
diff --git a/qiita_db/support_files/patches/python_patches/48.py b/qiita_db/support_files/patches/python_patches/48.py
@@ -0,0 +1,56 @@
+# replacing all \t and \n with a space as those chars break QIIME
+
+from qiita_db.study import Study
+from qiita_db.sql_connection import TRN
+
+
+def searcher(df):
+    search = r"\t|\n"
+
+    return [col for col in df
+            if df[col].str.contains(search, na=False, regex=True).any()]
+
+
+studies = Study.get_by_status('private').union(
+    Study.get_by_status('public')).union(Study.get_by_status('sandbox'))
+
+# we start the search using pandas as it is much easier and faster
+# than using pgsql. remember that to_dataframe actually transforms what's
+# in the db
+to_fix = []
+for s in studies:
+    st = s.sample_template
+    if st is None:
+        continue
+    cols = searcher(st.to_dataframe())
+    if cols:
+        to_fix.append((st, cols))
+
+    for pt in s.prep_templates():
+        if pt is None:
+            continue
+        cols = searcher(pt.to_dataframe())
+        if cols:
+            to_fix.append((pt, cols))
+
+
+# now let's fix the database and regenerate the files
+for infofile, cols in to_fix:
+    with TRN:
+        for col in cols:
+            # removing tabs
+            sql = """UPDATE qiita.{0}{1}
+                        SET {2} = replace({2}, chr(9), ' ')""".format(
+                            infofile._table_prefix, infofile.id, col)
+            TRN.add(sql)
+
+            # removing line breaks (\n, \r and \u2028)
+            sql = """UPDATE qiita.{0}{1}
+                        SET {2} = regexp_replace(
+                            {2}, E'[\\n\\r\\u2028]+', ' ', 'g' )""".format(
+                            infofile._table_prefix, infofile.id, col)
+            TRN.add(sql)
+
+        TRN.execute()
+
+    infofile.generate_files()