qiita-spots · ElDeveloper · Aug 21, 2017 · Aug 16, 2017 · Aug 16, 2017 · Aug 17, 2017
diff --git a/qiita_db/metadata_template/test/test_util.py b/qiita_db/metadata_template/test/test_util.py
@@ -149,7 +149,7 @@ def test_load_template_to_dataframe_lowercase(self):
 
     def test_load_template_to_dataframe_non_utf8(self):
         bad = EXP_SAMPLE_TEMPLATE.replace('Test Sample 2', 'Test Sample\x962')
-        with self.assertRaises(qdb.exceptions.QiitaDBError):
+        with self.assertRaises(ValueError):
             qdb.metadata_template.util.load_template_to_dataframe(
                 StringIO(bad))
 
@@ -387,20 +387,20 @@ def test_get_pgsql_reserved_words(self):
 
 EXP_SAMPLE_TEMPLATE_SPACES_EMPTY_ROW = (
     "sample_name\tcollection_timestamp\tdescription\thas_extracted_data\t"
-    "has_physical_specimen\thost_subject_id\tint_column\tlatitude\tlongitude\t"
-    "physical_location\trequired_sample_info_status\tsample_type\t"
-    "str_column\n"
-    "2.Sample1         \t2014-05-29 12:24:51\tTest Sample 1\tTrue\tTrue\t"
+    "has_physical_specimen\thost_subject_id\tint_column\tlatitude\t"
+    "longitude\t   physical_location\trequired_sample_info_status"
+    "\tsample_type\tstr_column\n"
+    "   2.Sample1         \t2014-05-29 12:24:51\tTest Sample 1\tTrue\tTrue\t"
     "NotIdentified\t1\t42.42\t41.41\tlocation1\treceived\ttype1\t"
     "Value for sample 1\n"
-    "2.Sample2  \t2014-05-29 12:24:51\t"
+    " 2.Sample2  \t2014-05-29 12:24:51\t"
     "Test Sample 2\tTrue\tTrue\tNotIdentified\t2\t4.2\t1.1\tlocation1\t"
     "received\ttype1\tValue for sample 2\n"
     "2.Sample3\t2014-05-29 12:24:51\tTest Sample 3\tTrue\t"
     "True\tNotIdentified\t3\t4.8\t4.41\tlocation1\treceived\ttype1\t"
     "Value for sample 3\n"
     "\t\t\t\t\t\t\t\t\t\t\t\t\n"
-    "\t\t\t\t\t\t\t\t\t\t\t\t\n")
+    "\t\t\t\t\t\t\t\t\t\t   \t\t\n")
 
 EXP_ST_SPACES_EMPTY_COLUMN = (
     "sample_name\tcollection_timestamp\tdescription\thas_extracted_data\t"

diff --git a/qiita_db/metadata_template/util.py b/qiita_db/metadata_template/util.py
@@ -7,9 +7,9 @@
 # -----------------------------------------------------------------------------
 
 from __future__ import division
-from collections import defaultdict
 from future.utils import PY3, viewitems
 from six import StringIO
+from string import printable
 
 import pandas as pd
 import numpy as np
@@ -103,7 +103,27 @@ def load_template_to_dataframe(fn, index='sample_name'):
     # Load in file lines
     holdfile = None
     with open_file(fn, mode='U') as f:
+        errors = {}
         holdfile = f.readlines()
+        # flag cells with characters outside string.printable (ASCII only)
+        for row, line in enumerate(holdfile):
+            for col, block in enumerate(line.split('\t')):
+                tblock = ''.join([c for c in block if c in printable])
+                if len(block) != len(tblock):
+                    tblock = ''.join([c if c in printable else '&#128062;'
+                                      for c in block])
+                    if tblock not in errors:
+                        errors[tblock] = []
+                    errors[tblock].append('(%d, %d)' % (row, col))
+        if bool(errors):
+            raise ValueError(
+                "There are invalid (non UTF-8) characters in your information "
+                "file. The offending fields and their location (row, column) "
+                "are listed below, invalid characters are represented using "
+                "&#128062;: %s" % '; '.join(
+                    ['"%s" = %s' % (k, ', '.join(v))
+                     for k, v in viewitems(errors)]))
+
     if not holdfile:
         raise ValueError('Empty file passed!')
 
@@ -137,7 +157,7 @@ def load_template_to_dataframe(fn, index='sample_name'):
             # .strip will remove odd chars, newlines, tabs and multiple
             # spaces but we need to read a new line at the end of the
             # line(+'\n')
-            newcols = [d.strip(" \r\x0b\x0c\n") for d in cols]
+            newcols = [d.strip(" \r\n") for d in cols]
 
         holdfile[pos] = '\t'.join(newcols) + '\n'
 
@@ -149,34 +169,19 @@ def load_template_to_dataframe(fn, index='sample_name'):
     # comment:
     #   using the tab character as "comment" we remove rows that are
     #   constituted only by delimiters i. e. empty rows.
-    try:
-        template = pd.read_csv(
-            StringIO(''.join(holdfile)),
-            sep='\t',
-            dtype=str,
-            encoding='utf-8',
-            infer_datetime_format=False,
-            keep_default_na=False,
-            index_col=False,
-            comment='\t',
-            converters={index: lambda x: str(x).strip()})
-        # remove newlines and tabs from fields
-        template.replace(to_replace='[\t\n\r\x0b\x0c]+', value='',
-                         regex=True, inplace=True)
-    except UnicodeDecodeError:
-        # Find row number and col number for utf-8 encoding errors
-        headers = holdfile[0].strip().split('\t')
-        errors = defaultdict(list)
-        for row, line in enumerate(holdfile, 1):
-            for col, cell in enumerate(line.split('\t')):
-                try:
-                    cell.encode('utf-8')
-                except UnicodeError:
-                    errors[headers[col]].append(row)
-        lines = ['%s: row(s) %s' % (header, ', '.join(map(str, rows)))
-                 for header, rows in viewitems(errors)]
-        raise qdb.exceptions.QiitaDBError(
-            'Non UTF-8 characters found in columns:\n' + '\n'.join(lines))
+    template = pd.read_csv(
+        StringIO(''.join(holdfile)),
+        sep='\t',
+        dtype=str,
+        encoding='utf-8',
+        infer_datetime_format=False,
+        keep_default_na=False,
+        index_col=False,
+        comment='\t',
+        converters={index: lambda x: str(x).strip()})
+    # remove newlines and tabs from fields
+    template.replace(to_replace='[\t\n\r\x0b\x0c]+', value='',
+                     regex=True, inplace=True)
 
     initial_columns = set(template.columns)
 

diff --git a/qiita_ware/test/test_dispatchable.py b/qiita_ware/test/test_dispatchable.py
@@ -84,8 +84,11 @@ def test_create_sample_template_nonutf8(self):
                   'sample_info_utf8_error.txt')
         obs = create_sample_template(fp, Study(1), False)
         exp = {'status': 'danger',
-               'message': u"Non UTF-8 characters found in columns:"
-                          u"\n\ufffdcollection_timestamp: row(s) 1"}
+               'message': 'There are invalid (non UTF-8) characters in your '
+                          'information file. The offending fields and their '
+                          'location (row, column) are listed below, invalid '
+                          'characters are represented using &#128062;: '
+                          '"&#128062;collection_timestamp" = (0, 13)'}
         self.assertEqual(obs, exp)
 
     def test_update_sample_template(self):