Skip to content

fix #2211 #2240

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Aug 21, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 7 additions & 7 deletions qiita_db/metadata_template/test/test_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,7 @@ def test_load_template_to_dataframe_lowercase(self):

def test_load_template_to_dataframe_non_utf8(self):
bad = EXP_SAMPLE_TEMPLATE.replace('Test Sample 2', 'Test Sample\x962')
with self.assertRaises(qdb.exceptions.QiitaDBError):
with self.assertRaises(ValueError):
qdb.metadata_template.util.load_template_to_dataframe(
StringIO(bad))

Expand Down Expand Up @@ -387,20 +387,20 @@ def test_get_pgsql_reserved_words(self):

EXP_SAMPLE_TEMPLATE_SPACES_EMPTY_ROW = (
"sample_name\tcollection_timestamp\tdescription\thas_extracted_data\t"
"has_physical_specimen\thost_subject_id\tint_column\tlatitude\tlongitude\t"
"physical_location\trequired_sample_info_status\tsample_type\t"
"str_column\n"
"2.Sample1 \t2014-05-29 12:24:51\tTest Sample 1\tTrue\tTrue\t"
"has_physical_specimen\thost_subject_id\tint_column\tlatitude\t"
"longitude\t physical_location\trequired_sample_info_status"
"\tsample_type\tstr_column\n"
" 2.Sample1 \t2014-05-29 12:24:51\tTest Sample 1\tTrue\tTrue\t"
"NotIdentified\t1\t42.42\t41.41\tlocation1\treceived\ttype1\t"
"Value for sample 1\n"
"2.Sample2 \t2014-05-29 12:24:51\t"
" 2.Sample2 \t2014-05-29 12:24:51\t"
"Test Sample 2\tTrue\tTrue\tNotIdentified\t2\t4.2\t1.1\tlocation1\t"
"received\ttype1\tValue for sample 2\n"
"2.Sample3\t2014-05-29 12:24:51\tTest Sample 3\tTrue\t"
"True\tNotIdentified\t3\t4.8\t4.41\tlocation1\treceived\ttype1\t"
"Value for sample 3\n"
"\t\t\t\t\t\t\t\t\t\t\t\t\n"
"\t\t\t\t\t\t\t\t\t\t\t\t\n")
"\t\t\t\t\t\t\t\t\t\t \t\t\n")

EXP_ST_SPACES_EMPTY_COLUMN = (
"sample_name\tcollection_timestamp\tdescription\thas_extracted_data\t"
Expand Down
65 changes: 35 additions & 30 deletions qiita_db/metadata_template/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,9 @@
# -----------------------------------------------------------------------------

from __future__ import division
from collections import defaultdict
from future.utils import PY3, viewitems
from six import StringIO
from string import printable

import pandas as pd
import numpy as np
Expand Down Expand Up @@ -103,7 +103,27 @@ def load_template_to_dataframe(fn, index='sample_name'):
# Load in file lines
holdfile = None
with open_file(fn, mode='U') as f:
errors = {}
holdfile = f.readlines()
# here we are checking for non printable chars AKA non UTF-8 chars
for row, line in enumerate(holdfile):
for col, block in enumerate(line.split('\t')):
tblock = ''.join([c for c in block if c in printable])
if len(block) != len(tblock):
tblock = ''.join([c if c in printable else '🐾'
for c in block])
if tblock not in errors:
errors[tblock] = []
errors[tblock].append('(%d, %d)' % (row, col))
if bool(errors):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just out of curiosity: why this specific call to bool? AFAIK, this is not Pythonic, since an empty list/dict already evaluates to False.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't understand your link. If you check the actual example, it is not using the call to bool. In the first part of the post it uses it to demonstrate the behavior, but in the actual code the call to bool is not used.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right, I guess it shows that it works as expected with or without the call to bool. Do you want me to change it?

raise ValueError(
"There are invalid (non UTF-8) characters in your information "
"file. The offending fields and their location (row, column) "
"are listed below, invalid characters are represented using "
"🐾: %s" % '; '.join(
['"%s" = %s' % (k, ', '.join(v))
for k, v in viewitems(errors)]))

if not holdfile:
raise ValueError('Empty file passed!')

Expand Down Expand Up @@ -137,7 +157,7 @@ def load_template_to_dataframe(fn, index='sample_name'):
# .strip will remove odd chars, newlines, tabs and multiple
# spaces but we need to read a new line at the end of the
# line(+'\n')
newcols = [d.strip(" \r\x0b\x0c\n") for d in cols]
newcols = [d.strip(" \r\n") for d in cols]

holdfile[pos] = '\t'.join(newcols) + '\n'

Expand All @@ -149,34 +169,19 @@ def load_template_to_dataframe(fn, index='sample_name'):
# comment:
# using the tab character as "comment" we remove rows that are
# constituted only by delimiters i. e. empty rows.
try:
template = pd.read_csv(
StringIO(''.join(holdfile)),
sep='\t',
dtype=str,
encoding='utf-8',
infer_datetime_format=False,
keep_default_na=False,
index_col=False,
comment='\t',
converters={index: lambda x: str(x).strip()})
# remove newlines and tabs from fields
template.replace(to_replace='[\t\n\r\x0b\x0c]+', value='',
regex=True, inplace=True)
except UnicodeDecodeError:
# Find row number and col number for utf-8 encoding errors
headers = holdfile[0].strip().split('\t')
errors = defaultdict(list)
for row, line in enumerate(holdfile, 1):
for col, cell in enumerate(line.split('\t')):
try:
cell.encode('utf-8')
except UnicodeError:
errors[headers[col]].append(row)
lines = ['%s: row(s) %s' % (header, ', '.join(map(str, rows)))
for header, rows in viewitems(errors)]
raise qdb.exceptions.QiitaDBError(
'Non UTF-8 characters found in columns:\n' + '\n'.join(lines))
template = pd.read_csv(
StringIO(''.join(holdfile)),
sep='\t',
dtype=str,
encoding='utf-8',
infer_datetime_format=False,
keep_default_na=False,
index_col=False,
comment='\t',
converters={index: lambda x: str(x).strip()})
# remove newlines and tabs from fields
template.replace(to_replace='[\t\n\r\x0b\x0c]+', value='',
regex=True, inplace=True)

initial_columns = set(template.columns)

Expand Down
7 changes: 5 additions & 2 deletions qiita_ware/test/test_dispatchable.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,8 +84,11 @@ def test_create_sample_template_nonutf8(self):
'sample_info_utf8_error.txt')
obs = create_sample_template(fp, Study(1), False)
exp = {'status': 'danger',
'message': u"Non UTF-8 characters found in columns:"
u"\n\ufffdcollection_timestamp: row(s) 1"}
'message': 'There are invalid (non UTF-8) characters in your '
'information file. The offending fields and their '
'location (row, column) are listed below, invalid '
'characters are represented using 🐾: '
'"🐾collection_timestamp" = (0, 13)'}
self.assertEqual(obs, exp)

def test_update_sample_template(self):
Expand Down
Loading