qiita-spots · josenavas · Aug 23, 2017 · Aug 16, 2017 · Aug 16, 2017 · Aug 16, 2017
diff --git a/qiita_db/artifact.py b/qiita_db/artifact.py
@@ -573,11 +573,12 @@ def delete(cls, artifact_id):
                      WHERE artifact_id = %s"""
             qdb.sql_connection.TRN.add(sql, [artifact_id])
 
-            # If the artifact doesn't have parents, we move the files to the
-            # uploads folder. We also need to nullify the column in the prep
-            # template table
-            if not instance.parents:
-                qdb.util.move_filepaths_to_upload_folder(study.id, filepaths)
+            # If the artifact has no parents and belongs to a study (study is
+            # None for an analysis), we move the files to the uploads folder.
+            # We also need to nullify the column in the prep template table
+            if not instance.parents and study is not None:
+                qdb.util.move_filepaths_to_upload_folder(
+                    study.id, filepaths)
 
                 sql = """UPDATE qiita.prep_template
                          SET artifact_id = NULL

diff --git a/qiita_db/metadata_template/base_metadata_template.py b/qiita_db/metadata_template/base_metadata_template.py
@@ -1124,7 +1124,7 @@ def extend(self, md_template):
             self.validate(self.columns_restrictions)
             self.generate_files()
 
-    def update(self, md_template):
+    def _update(self, md_template):
         r"""Update values in the template
 
         Parameters
@@ -1143,22 +1143,19 @@ def update(self, md_template):
             passed md_template
         """
         with qdb.sql_connection.TRN:
-            # Clean and validate the metadata template given
-            new_map = self._clean_validate_template(
-                md_template, self.study_id, current_columns=self.categories())
             # Retrieving current metadata
             current_map = self.to_dataframe()
 
             # simple validations of sample ids and column names
-            samples_diff = set(new_map.index).difference(current_map.index)
+            samples_diff = set(md_template.index).difference(current_map.index)
             if samples_diff:
                 raise qdb.exceptions.QiitaDBError(
                     'The new template differs from what is stored '
                     'in database by these samples names: %s'
                     % ', '.join(samples_diff))
 
-            if not set(current_map.columns).issuperset(new_map.columns):
-                columns_diff = set(new_map.columns).difference(
+            if not set(current_map.columns).issuperset(md_template.columns):
+                columns_diff = set(md_template.columns).difference(
                     current_map.columns)
                 raise qdb.exceptions.QiitaDBError(
                     'Some of the columns in your template are not present in '
@@ -1168,15 +1165,16 @@ def update(self, md_template):
 
             # In order to speed up some computation, let's compare only the
             # common columns and rows. current_map.columns and
-            # current_map.index are supersets of new_map.columns and
-            # new_map.index, respectivelly, so this will not fail
-            current_map = current_map[new_map.columns].loc[new_map.index]
+            # current_map.index are supersets of md_template.columns and
+            # md_template.index, respectively, so this will not fail
+            current_map = current_map[
+                md_template.columns].loc[md_template.index]
 
             # Get the values that we need to change
             # diff_map is a DataFrame that hold boolean values. If a cell is
-            # True, means that the new_map is different from the current_map
-            # while False means that the cell has the same value
-            diff_map = current_map != new_map
+            # True, it means that md_template differs from current_map,
+            # while False means that the cell has the same value
+            diff_map = current_map != md_template
             # ne_stacked holds a MultiIndexed DataFrame in which the first
             # level of indexing is the sample_name and the second one is the
             # columns. We only have 1 column, which holds if that
@@ -1195,8 +1193,8 @@ def update(self, md_template):
             changed.index.names = ['sample_name', 'column']
             # the combination of np.where and boolean indexing produces
             # a numpy array with only the values that actually changed
-            # between the current_map and new_map
-            changed_to = new_map.values[np.where(diff_map)]
+            # between the current_map and md_template
+            changed_to = md_template.values[np.where(diff_map)]
 
             # to_update is a MultiIndexed DataFrame, in which the index 0 is
             # the samples and the index 1 is the columns, we define these
@@ -1235,12 +1233,57 @@ def update(self, md_template):
                     """.format(self._table_name(self._id), sql_eq_cols,
                                single_value, sql_cols)
             for sample in samples_to_update:
-                sample_vals = [new_map[col][sample] for col in cols_to_update]
+                sample_vals = [md_template[col][sample]
+                               for col in cols_to_update]
                 sample_vals.insert(0, sample)
                 qdb.sql_connection.TRN.add(sql, sample_vals)
 
             qdb.sql_connection.TRN.execute()
 
+    def update(self, md_template):
+        r"""Update values in the template
+
+        Parameters
+        ----------
+        md_template : DataFrame
+            The metadata template file contents indexed by samples ids
+
+        Raises
+        ------
+        QiitaDBError
+            If md_template and db do not have the same sample ids
+            If md_template and db do not have the same column headers
+            If self.can_be_updated is not True
+        QiitaDBWarning
+            If there are no differences between the contents of the DB and the
+            passed md_template
+        """
+        with qdb.sql_connection.TRN:
+            # Clean and validate the metadata template given
+            new_map = self._clean_validate_template(
+                md_template, self.study_id, current_columns=self.categories())
+            self._update(new_map)
+            self.validate(self.columns_restrictions)
+            self.generate_files()
+
+    def extend_and_update(self, md_template):
+        """Performs the update and extend operations at once
+
+        Parameters
+        ----------
+        md_template : DataFrame
+            The metadata template contents indexed by sample ids
+
+        See Also
+        --------
+        update
+        extend
+        """
+        with qdb.sql_connection.TRN:
+            md_template = self._clean_validate_template(
+                md_template, self.study_id, current_columns=self.categories())
+            self._common_extend_steps(md_template)
+            self._update(md_template)
             self.validate(self.columns_restrictions)
             self.generate_files()
 

diff --git a/qiita_db/metadata_template/test/test_prep_template.py b/qiita_db/metadata_template/test/test_prep_template.py
@@ -1280,8 +1280,7 @@ def test_extend_update(self):
         self.metadata['str_column']['SKB7.640196'] = 'NEW VAL'
 
         npt.assert_warns(
-            qdb.exceptions.QiitaDBWarning, pt.extend, self.metadata)
-        pt.update(self.metadata)
+            qdb.exceptions.QiitaDBWarning, pt.extend_and_update, self.metadata)
 
         sql = "SELECT * FROM qiita.prep_{0}".format(pt.id)
         obs = [dict(o) for o in self.conn_handler.execute_fetchall(sql)]

diff --git a/qiita_db/metadata_template/test/test_sample_template.py b/qiita_db/metadata_template/test/test_sample_template.py
@@ -1732,8 +1732,8 @@ def test_extend_update(self):
         md_ext['TOT_NITRO'] = pd.Series(['val1', 'val2', 'val3', 'val4'],
                                         index=md_ext.index)
 
-        npt.assert_warns(qdb.exceptions.QiitaDBWarning, st.extend, md_ext)
-        st.update(md_ext)
+        npt.assert_warns(qdb.exceptions.QiitaDBWarning, st.extend_and_update,
+                         md_ext)
         exp_sample_ids = {"%s.Sample1" % st.id, "%s.Sample2" % st.id,
                           "%s.Sample3" % st.id, "%s.Sample4" % st.id}
         self.assertEqual(st._get_sample_ids(), exp_sample_ids)

diff --git a/qiita_db/metadata_template/test/test_util.py b/qiita_db/metadata_template/test/test_util.py
@@ -149,7 +149,7 @@ def test_load_template_to_dataframe_lowercase(self):
 
     def test_load_template_to_dataframe_non_utf8(self):
         bad = EXP_SAMPLE_TEMPLATE.replace('Test Sample 2', 'Test Sample\x962')
-        with self.assertRaises(qdb.exceptions.QiitaDBError):
+        with self.assertRaises(ValueError):
             qdb.metadata_template.util.load_template_to_dataframe(
                 StringIO(bad))
 
@@ -387,20 +387,20 @@ def test_get_pgsql_reserved_words(self):
 
 EXP_SAMPLE_TEMPLATE_SPACES_EMPTY_ROW = (
     "sample_name\tcollection_timestamp\tdescription\thas_extracted_data\t"
-    "has_physical_specimen\thost_subject_id\tint_column\tlatitude\tlongitude\t"
-    "physical_location\trequired_sample_info_status\tsample_type\t"
-    "str_column\n"
-    "2.Sample1         \t2014-05-29 12:24:51\tTest Sample 1\tTrue\tTrue\t"
+    "has_physical_specimen\thost_subject_id\tint_column\tlatitude\t"
+    "longitude\t   physical_location\trequired_sample_info_status"
+    "\tsample_type\tstr_column\n"
+    "   2.Sample1         \t2014-05-29 12:24:51\tTest Sample 1\tTrue\tTrue\t"
     "NotIdentified\t1\t42.42\t41.41\tlocation1\treceived\ttype1\t"
     "Value for sample 1\n"
-    "2.Sample2  \t2014-05-29 12:24:51\t"
+    " 2.Sample2  \t2014-05-29 12:24:51\t"
     "Test Sample 2\tTrue\tTrue\tNotIdentified\t2\t4.2\t1.1\tlocation1\t"
     "received\ttype1\tValue for sample 2\n"
     "2.Sample3\t2014-05-29 12:24:51\tTest Sample 3\tTrue\t"
     "True\tNotIdentified\t3\t4.8\t4.41\tlocation1\treceived\ttype1\t"
     "Value for sample 3\n"
     "\t\t\t\t\t\t\t\t\t\t\t\t\n"
-    "\t\t\t\t\t\t\t\t\t\t\t\t\n")
+    "\t\t\t\t\t\t\t\t\t\t   \t\t\n")
 
 EXP_ST_SPACES_EMPTY_COLUMN = (
     "sample_name\tcollection_timestamp\tdescription\thas_extracted_data\t"

diff --git a/qiita_db/metadata_template/util.py b/qiita_db/metadata_template/util.py
@@ -7,9 +7,9 @@
 # -----------------------------------------------------------------------------
 
 from __future__ import division
-from collections import defaultdict
 from future.utils import PY3, viewitems
 from six import StringIO
+from string import printable
 
 import pandas as pd
 import numpy as np
@@ -103,7 +103,27 @@ def load_template_to_dataframe(fn, index='sample_name'):
     # Load in file lines
     holdfile = None
     with open_file(fn, mode='U') as f:
+        errors = {}
         holdfile = f.readlines()
+        # flag characters outside string.printable (i.e. not printable ASCII)
+        for row, line in enumerate(holdfile):
+            for col, block in enumerate(line.split('\t')):
+                tblock = ''.join([c for c in block if c in printable])
+                if len(block) != len(tblock):
+                    tblock = ''.join([c if c in printable else '&#128062;'
+                                      for c in block])
+                    if tblock not in errors:
+                        errors[tblock] = []
+                    errors[tblock].append('(%d, %d)' % (row, col))
+        if bool(errors):
+            raise ValueError(
+                "There are invalid (non UTF-8) characters in your information "
+                "file. The offending fields and their location (row, column) "
+                "are listed below, invalid characters are represented using "
+                "&#128062;: %s" % '; '.join(
+                    ['"%s" = %s' % (k, ', '.join(v))
+                     for k, v in viewitems(errors)]))
+
     if not holdfile:
         raise ValueError('Empty file passed!')
 
@@ -137,7 +157,7 @@ def load_template_to_dataframe(fn, index='sample_name'):
             # .strip will remove odd chars, newlines, tabs and multiple
             # spaces but we need to read a new line at the end of the
             # line(+'\n')
-            newcols = [d.strip(" \r\x0b\x0c\n") for d in cols]
+            newcols = [d.strip(" \r\n") for d in cols]
 
         holdfile[pos] = '\t'.join(newcols) + '\n'
 
@@ -149,34 +169,19 @@ def load_template_to_dataframe(fn, index='sample_name'):
     # comment:
     #   using the tab character as "comment" we remove rows that are
     #   constituted only by delimiters i. e. empty rows.
-    try:
-        template = pd.read_csv(
-            StringIO(''.join(holdfile)),
-            sep='\t',
-            dtype=str,
-            encoding='utf-8',
-            infer_datetime_format=False,
-            keep_default_na=False,
-            index_col=False,
-            comment='\t',
-            converters={index: lambda x: str(x).strip()})
-        # remove newlines and tabs from fields
-        template.replace(to_replace='[\t\n\r\x0b\x0c]+', value='',
-                         regex=True, inplace=True)
-    except UnicodeDecodeError:
-        # Find row number and col number for utf-8 encoding errors
-        headers = holdfile[0].strip().split('\t')
-        errors = defaultdict(list)
-        for row, line in enumerate(holdfile, 1):
-            for col, cell in enumerate(line.split('\t')):
-                try:
-                    cell.encode('utf-8')
-                except UnicodeError:
-                    errors[headers[col]].append(row)
-        lines = ['%s: row(s) %s' % (header, ', '.join(map(str, rows)))
-                 for header, rows in viewitems(errors)]
-        raise qdb.exceptions.QiitaDBError(
-            'Non UTF-8 characters found in columns:\n' + '\n'.join(lines))
+    template = pd.read_csv(
+        StringIO(''.join(holdfile)),
+        sep='\t',
+        dtype=str,
+        encoding='utf-8',
+        infer_datetime_format=False,
+        keep_default_na=False,
+        index_col=False,
+        comment='\t',
+        converters={index: lambda x: str(x).strip()})
+    # remove newlines and tabs from fields
+    template.replace(to_replace='[\t\n\r\x0b\x0c]+', value='',
+                     regex=True, inplace=True)
 
     initial_columns = set(template.columns)
 

diff --git a/qiita_db/private.py b/qiita_db/private.py
@@ -47,8 +47,27 @@ def build_analysis_files(job):
             j.submit()
             sleep(1)
 
+    # The validator jobs no longer finish the job automatically so we need
+    # to release the validators here
+    job.release_validators()
 
-TASK_DICT = {'build_analysis_files': build_analysis_files}
+
+def release_validators(job):
+    """Waits until all the validators of a job are completed
+
+    Parameters
+    ----------
+    job : qiita_db.processing_job.ProcessingJob
+        The processing job with the information of the parent job
+    """
+    with qdb.sql_connection.TRN:
+        qdb.processing_job.ProcessingJob(
+            job.parameters.values['job']).release_validators()
+        job._set_status('success')
+
+
+TASK_DICT = {'build_analysis_files': build_analysis_files,
+             'release_validators': release_validators}
 
 
 def private_task(job_id):