Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: support decimal option in PythonParser #12933 #13189

Closed
wants to merge 10 commits into from
38 changes: 37 additions & 1 deletion asv_bench/benchmarks/parser_vb.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,18 @@ def time_read_csv_default_converter(self):
read_csv(StringIO(self.data), sep=',', header=None, float_precision=None)


class read_csv_default_converter_with_decimal(object):
    """Benchmark ``read_csv`` on ';'-separated floats that use ',' as decimal mark."""

    goal_time = 0.2

    def setup(self):
        """Build the CSV text: five three-field rows, repeated 200 times."""
        # Adjacent literals are concatenated by the compiler; every row ends
        # with a newline followed by a single space.
        five_rows = (
            '0,1213700904466425978256438611;'
            '0,0525708283766902484401839501;'
            '0,4174092731488769913994474336\n '
            '0,4096341697147408700274695547;'
            '0,1587830198973579909349496119;'
            '0,1292545832485494372576795285\n '
            '0,8323255650024565799327547210;'
            '0,9694902427379478160318626578;'
            '0,6295047811546814475747169126\n '
            '0,4679375305798131323697930383;'
            '0,2963942381834381301075609371;'
            '0,5268936082160610157032465394\n '
            '0,6685382761849776311890991564;'
            '0,6721207066140679753374342908;'
            '0,6519975277021627935170045020\n '
        )
        self.data = five_rows * 200

    def time_read_csv_default_converter_with_decimal(self):
        """Parse the prepared text with ``sep=';'`` and ``decimal=','``."""
        buf = StringIO(self.data)
        read_csv(buf, sep=';', header=None, float_precision=None,
                 decimal=',')


class read_csv_precise_converter(object):
goal_time = 0.2

Expand Down Expand Up @@ -109,4 +121,28 @@ def setup(self):
self.data = (self.data * 200)

def time_read_table_multiple_date_baseline(self):
read_table(StringIO(self.data), sep=',', header=None, parse_dates=[1])
read_table(StringIO(self.data), sep=',', header=None, parse_dates=[1])


class read_csv_default_converter_python_engine(object):
    """Benchmark ``read_csv`` with ``engine='python'`` on ','-separated floats."""

    goal_time = 0.2

    def setup(self):
        """Build the CSV text: five three-field rows, repeated 200 times."""
        # Adjacent literals are concatenated by the compiler; every row ends
        # with a newline followed by a single space.
        five_rows = (
            '0.1213700904466425978256438611,'
            '0.0525708283766902484401839501,'
            '0.4174092731488769913994474336\n '
            '0.4096341697147408700274695547,'
            '0.1587830198973579909349496119,'
            '0.1292545832485494372576795285\n '
            '0.8323255650024565799327547210,'
            '0.9694902427379478160318626578,'
            '0.6295047811546814475747169126\n '
            '0.4679375305798131323697930383,'
            '0.2963942381834381301075609371,'
            '0.5268936082160610157032465394\n '
            '0.6685382761849776311890991564,'
            '0.6721207066140679753374342908,'
            '0.6519975277021627935170045020\n '
        )
        self.data = five_rows * 200

    def time_read_csv_default_converter(self):
        """Parse the prepared text through the python engine."""
        buf = StringIO(self.data)
        read_csv(buf, sep=',', header=None, float_precision=None,
                 engine='python')


class read_csv_default_converter_with_decimal_python_engine(object):
    """Benchmark ``read_csv`` with ``engine='python'`` and a ',' decimal mark."""

    goal_time = 0.2

    def setup(self):
        """Build the CSV text: five three-field rows, repeated 200 times."""
        # Adjacent literals are concatenated by the compiler; every row ends
        # with a newline followed by a single space.
        five_rows = (
            '0,1213700904466425978256438611;'
            '0,0525708283766902484401839501;'
            '0,4174092731488769913994474336\n '
            '0,4096341697147408700274695547;'
            '0,1587830198973579909349496119;'
            '0,1292545832485494372576795285\n '
            '0,8323255650024565799327547210;'
            '0,9694902427379478160318626578;'
            '0,6295047811546814475747169126\n '
            '0,4679375305798131323697930383;'
            '0,2963942381834381301075609371;'
            '0,5268936082160610157032465394\n '
            '0,6685382761849776311890991564;'
            '0,6721207066140679753374342908;'
            '0,6519975277021627935170045020\n '
        )
        self.data = five_rows * 200

    def time_read_csv_default_converter_with_decimal(self):
        """Parse the prepared text through the python engine with ``decimal=','``."""
        buf = StringIO(self.data)
        read_csv(buf, sep=';', header=None, float_precision=None,
                 decimal=',', engine='python')
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v0.18.2.txt
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,8 @@ Other enhancements

pd.Timestamp(year=2012, month=1, day=1, hour=8, minute=30)

- ``pd.read_csv()`` with ``engine='python'`` has gained support for the ``decimal`` option (:issue:`12933`)

.. _whatsnew_0182.api:

API changes
Expand Down
37 changes: 30 additions & 7 deletions pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -348,6 +348,7 @@ def _read(filepath_or_buffer, kwds):
'keep_default_na': True,
'thousands': None,
'comment': None,
'decimal': b'.',

# 'engine': 'c',
'parse_dates': False,
Expand Down Expand Up @@ -383,7 +384,6 @@ def _read(filepath_or_buffer, kwds):
'error_bad_lines': True,
'warn_bad_lines': True,
'dtype': None,
'decimal': b'.',
'float_precision': None
}

Expand All @@ -404,7 +404,6 @@ def _read(filepath_or_buffer, kwds):
'error_bad_lines',
'warn_bad_lines',
'dtype',
'decimal',
'float_precision',
])

Expand Down Expand Up @@ -1582,6 +1581,7 @@ def __init__(self, f, **kwds):
self.converters = kwds['converters']

self.thousands = kwds['thousands']
self.decimal = kwds['decimal']
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please check and update io.rst and the docstring. IIRC, we note in an option's description when it is not supported by a particular engine (so that note can now be removed).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, that might not be the case, but please check.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jreback I can't find any reference in io.rst to a particular engine in connection with the ``decimal`` option.

self.comment = kwds['comment']
self._comment_lines = []

Expand Down Expand Up @@ -1639,6 +1639,15 @@ def __init__(self, f, **kwds):
else:
self._no_thousands_columns = None

if len(self.decimal) != 1:
raise ValueError('Only length-1 decimal markers supported')

if self.thousands is None:
self.nonnum = re.compile('[^-^0-9^%s]+' % self.decimal)
else:
self.nonnum = re.compile('[^-^0-9^%s^%s]+' % (self.thousands,
self.decimal))

def _set_no_thousands_columns(self):
# Create a set of column ids that are not to be stripped of thousands
# operators.
Expand Down Expand Up @@ -2050,22 +2059,35 @@ def _check_empty(self, lines):
def _check_thousands(self, lines):
    """Strip the thousands separator from numeric-looking string fields.

    Parameters
    ----------
    lines : list of lists
        Rows of field values.

    Returns
    -------
    list of lists
        ``lines`` itself when ``self.thousands`` is None; otherwise the
        result of ``_search_replace_num_columns`` with the separator
        replaced by the empty string.
    """
    if self.thousands is None:
        return lines

    # No pattern is compiled here: the helper filters on ``self.nonnum``,
    # so a per-call local regex would be dead work.
    return self._search_replace_num_columns(lines=lines,
                                            search=self.thousands,
                                            replace='')

def _search_replace_num_columns(self, lines, search, replace):
    """Replace ``search`` with ``replace`` in numeric-looking string fields.

    A field is rewritten only when all of the following hold:

    * it is a string,
    * it contains ``search``,
    * its column index is not in ``self._no_thousands_columns``,
    * after stripping surrounding whitespace, ``self.nonnum`` finds no match
      in it.

    Every other field is copied through unchanged.

    Parameters
    ----------
    lines : list of lists
        Rows of field values.
    search : str
        Marker to look for (thousands separator or decimal marker).
    replace : str
        Text substituted for every occurrence of ``search``.

    Returns
    -------
    list of lists
        Newly built rows; ``lines`` is not modified.
    """
    ret = []
    for row in lines:
        new_row = []
        for i, x in enumerate(row):
            if (not isinstance(x, compat.string_types) or
                    search not in x or
                    (self._no_thousands_columns and
                     i in self._no_thousands_columns) or
                    self.nonnum.search(x.strip())):
                new_row.append(x)
            else:
                # The match is tested on the stripped text, but the
                # replacement keeps the field's original whitespace.
                new_row.append(x.replace(search, replace))
        ret.append(new_row)
    return ret

def _check_decimal(self, lines):
    """Rewrite a custom decimal marker to '.' in numeric-looking fields.

    Parameters
    ----------
    lines : list of lists
        Rows of field values.

    Returns
    -------
    list of lists
        ``lines`` itself when the marker is already '.' (or equals the
        parser default); otherwise the result of
        ``_search_replace_num_columns`` with the marker replaced by '.'.
    """
    # The replacement target is '.', so a '.' marker never needs a pass over
    # the data. The explicit str check matters on Python 3, where a
    # user-supplied '.' does not compare equal to a bytes default.
    # NOTE(review): the default appears as b'.' in this changeset - confirm.
    if self.decimal == '.' or self.decimal == _parser_defaults['decimal']:
        return lines

    return self._search_replace_num_columns(lines=lines,
                                            search=self.decimal,
                                            replace='.')

def _clear_buffer(self):
self.buf = []

Expand Down Expand Up @@ -2249,7 +2271,8 @@ def _get_lines(self, rows=None):
lines = self._check_comments(lines)
if self.skip_blank_lines:
lines = self._check_empty(lines)
return self._check_thousands(lines)
lines = self._check_thousands(lines)
return self._check_decimal(lines)


def _make_date_converter(date_parser=None, dayfirst=False,
Expand Down
45 changes: 0 additions & 45 deletions pandas/io/tests/parser/c_parser_only.py
Original file line number Diff line number Diff line change
Expand Up @@ -353,17 +353,6 @@ def test_disable_bool_parsing(self):
result = self.read_csv(StringIO(data), dtype=object, na_filter=False)
self.assertEqual(result['B'][2], '')

def test_euro_decimal_format(self):
    """Columns written with a ',' decimal mark come back with float dtype."""
    data = """Id;Number1;Number2;Text1;Text2;Number3
1;1521,1541;187101,9543;ABC;poi;4,738797819
2;121,12;14897,76;DEF;uyt;0,377320872
3;878,158;108013,434;GHI;rez;2,735694704"""

    parsed = self.read_csv(StringIO(data), sep=';', decimal=',')
    for column in ('Number1', 'Number2', 'Number3'):
        self.assertEqual(parsed[column].dtype, float)

def test_custom_lineterminator(self):
data = 'a,b,c~1,2,3~4,5,6'

Expand Down Expand Up @@ -444,40 +433,6 @@ def test_raise_on_no_columns(self):
data = "\n\n\n"
self.assertRaises(ValueError, self.read_csv, StringIO(data))

def test_1000_sep_with_decimal(self):
    """Thousands and decimal markers work together, with either character in either role."""
    data = """A|B|C
1|2,334.01|5
10|13|10.
"""
    data_with_odd_sep = """A|B|C
1|2.334,01|5
10|13|10,
"""
    expected = DataFrame({
        'A': [1, 10],
        'B': [2334.01, 13],
        'C': [5, 10.]
    })

    # Sanity-check the dtypes of the reference frame first.
    for column, dtype in (('A', 'int64'), ('B', 'float'), ('C', 'float')):
        tm.assert_equal(getattr(expected, column).dtype, dtype)

    # (text, thousands marker, decimal marker)
    cases = ((data, ',', '.'),
             (data_with_odd_sep, '.', ','))
    for text, thousands, decimal in cases:
        for reader in (self.read_csv, self.read_table):
            df = reader(StringIO(text), sep='|',
                        thousands=thousands, decimal=decimal)
            tm.assert_frame_equal(df, expected)

def test_grow_boundary_at_cap(self):
# See gh-12494
#
Expand Down
53 changes: 49 additions & 4 deletions pandas/io/tests/parser/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,10 +41,10 @@ def test_empty_decimal_marker(self):
1|2,334|5
10|13|10.
"""
# C parser: supports only length-1 decimals
# Python parser: 'decimal' not supported yet
self.assertRaises(ValueError, self.read_csv,
StringIO(data), decimal='')
# Parsers support only length-1 decimals
msg = 'Only length-1 decimal markers supported'
with tm.assertRaisesRegexp(ValueError, msg):
self.read_csv(StringIO(data), decimal='')

def test_read_csv(self):
if not compat.PY3:
Expand Down Expand Up @@ -1236,3 +1236,48 @@ def test_iteration_open_handle(self):
result = self.read_table(f, squeeze=True, header=None)
expected = Series(['DDD', 'EEE', 'FFF', 'GGG'], name=0)
tm.assert_series_equal(result, expected)

def test_1000_sep_with_decimal(self):
    """Thousands and decimal markers work together, with either character in either role."""
    data = """A|B|C
1|2,334.01|5
10|13|10.
"""
    data_with_odd_sep = """A|B|C
1|2.334,01|5
10|13|10,
"""
    expected = DataFrame({
        'A': [1, 10],
        'B': [2334.01, 13],
        'C': [5, 10.]
    })

    # Sanity-check the dtypes of the reference frame first.
    for column, dtype in (('A', 'int64'), ('B', 'float'), ('C', 'float')):
        tm.assert_equal(getattr(expected, column).dtype, dtype)

    # (text, thousands marker, decimal marker)
    cases = ((data, ',', '.'),
             (data_with_odd_sep, '.', ','))
    for text, thousands, decimal in cases:
        for reader in (self.read_csv, self.read_table):
            df = reader(StringIO(text), sep='|',
                        thousands=thousands, decimal=decimal)
            tm.assert_frame_equal(df, expected)

def test_euro_decimal_format(self):
    """Columns written with a ',' decimal mark come back with float dtype."""
    data = """Id;Number1;Number2;Text1;Text2;Number3
1;1521,1541;187101,9543;ABC;poi;4,738797819
2;121,12;14897,76;DEF;uyt;0,377320872
3;878,158;108013,434;GHI;rez;2,735694704"""

    parsed = self.read_csv(StringIO(data), sep=';', decimal=',')
    for column in ('Number1', 'Number2', 'Number3'):
        self.assertEqual(parsed[column].dtype, float)